## Thanks to Arsalan Esmaili (UW PhD student) for the implementation.
### If you have questions about this notebook, it is highly recommended that you direct them to him (arsalan@uw.edu) or to Stack Overflow/Reddit.

In [1]:
%pip install selenium

Collecting selenium
  Downloading selenium-4.26.1-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.27.0-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting websocket-client~=1.8 (from selenium)
  Downloading websocket_client-1.8.0-py3-none-any.whl.metadata (8.0 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.26.1-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m41.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownlo

In [2]:
import os
#os.chdir('')  # change to the directory where you want to save the Excel file (crash_data.xlsx)

In [5]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import re

# Address of the interactive AV crash map that the rest of the notebook scrapes
AV_CRASHES_URL = "https://www.avcrashes.net/"

# Start a Chrome session and load the map so markers can be clicked by hand
driver = webdriver.Chrome()
driver.get(AV_CRASHES_URL)

# Function to extract crash details
def extract_crash_details():
    """Scrape the crash detail panel currently open in the browser into a flat dict.

    Uses the module-level ``driver``; the user is expected to have clicked a
    marker on the map before this is called.

    Returns:
        dict | None: ``None`` if waiting for the detail panel fails (for example
        it does not become visible within the 20-second timeout). Otherwise a
        dict with one string per labelled field (e.g. "Country", "Vehicle type")
        plus "Date", "Time", "Google Street View", "Latitude" and "Longitude".
        Any value that cannot be read is stored as "N/A".
    """
    details = {}
    wait = WebDriverWait(driver, 20)  # timeout in seconds, shared by every lookup below

    # Wait for the detail panel to be present and visible
    try:
        wait.until(EC.visibility_of_element_located((By.XPATH, "//div[contains(@class, 'MuiBox-root') and contains(@class, 'css-0')]")))
        print("Detail panel found.")
    except Exception as e:
        print(f"Error waiting for detail panel: {e}")
        return None

    # Labels of the fields to extract (you can see these elements by right clicking on the
    # website and selecting 'Inspect'). Each label is also used as the key in the result.
    labels = (
        "Country",
        "State",
        "Incident description",
        "Vehicle model",
        "Mode",
        "Damage description",
        "Injury description",
        "Vehicle status",
        "Fault",
        "Weather",
        "Time of Day",
        "Road Conditions",
        "Driverless",
        "Collision Type",
        "Vehicle type",
    )

    for label in labels:
        try:
            element = wait.until(EC.presence_of_element_located((By.XPATH, f"//h6[text()='{label}']/following-sibling::p")))
            value = element.text  # read once so the stored and the logged value cannot differ
            details[label] = value
            print(f"Extracted {label}: {value}")
        except Exception as e:
            print(f"Error extracting {label}: {e}")
            details[label] = "N/A"

    # Extract Date and Time
    try:
        date_time_element = wait.until(EC.presence_of_element_located((By.XPATH, "//header//span[contains(@class, 'MuiTypography-root') and contains(@class, 'MuiTypography-caption')]")))
        # maxsplit=1 keeps a multi-token time such as "3:52 PM" in one piece, and a
        # caption holding only a date still yields the date instead of failing outright.
        parts = date_time_element.text.split(maxsplit=1)
        if not parts:
            raise ValueError("date/time caption is empty")
        date = parts[0]
        time_ = parts[1] if len(parts) > 1 else "N/A"
        details["Date"] = date
        details["Time"] = time_
        print(f"Extracted Date: {date}, Time: {time_}")
    except Exception as e:
        print(f"Error extracting Date and Time: {e}")
        details["Date"] = "N/A"
        details["Time"] = "N/A"

    # Extract Google Street View link and coordinates
    try:
        street_view_element = driver.find_element(By.XPATH, "//a[contains(@href, 'google.com/maps')]")
        details["Google Street View"] = street_view_element.get_attribute("href")
        print(f"Extracted Google Street View: {details['Google Street View']}")

        # Extract coordinates from the link if available. The fractional part is optional
        # and the separator may be a literal or a percent-encoded comma.
        coords = re.search(
            r"viewpoint=(?P<lat>-?\d+(?:\.\d+)?)(?:,|%2C)(?P<lon>-?\d+(?:\.\d+)?)",
            details["Google Street View"],
            re.IGNORECASE,
        )
        if coords:
            details["Latitude"] = coords.group("lat")
            details["Longitude"] = coords.group("lon")
            print(f"Extracted Coordinates: {details['Latitude']}, {details['Longitude']}")
        else:
            details["Latitude"] = "N/A"
            details["Longitude"] = "N/A"
    except Exception as e:
        print(f"Error extracting Google Street View link or coordinates: {e}")
        details["Google Street View"] = "N/A"
        details["Latitude"] = "N/A"
        details["Longitude"] = "N/A"

    return details

# Function to wait for the user to click markers and collect details
def collect_data_for_all_clicks():
    """Interactively collect crash details, one clicked marker at a time.

    Each round prompts the user; pressing Enter scrapes the open detail panel
    via ``extract_crash_details()`` (up to three attempts), and typing ``q``
    (any case, surrounding whitespace ignored) ends the session. Ctrl+C or a
    closed input stream also ends the session without discarding what has
    already been gathered.

    Returns:
        list: the detail dicts collected so far, in the order they were scraped.
    """
    crash_data = []
    max_attempts = 3

    try:
        while True:
            user_input = input("Click on a marker and press Enter to collect the information, or type 'q' to quit: ")
            # strip() so that a stray space around the 'q' still quits instead of scraping
            if user_input.strip().lower() == 'q':
                break

            details = None

            for attempt in range(1, max_attempts + 1):
                try:
                    details = extract_crash_details()
                except Exception as e:
                    print(f"Error during extraction: {e}")

                if details:
                    crash_data.append(details)
                    print("Information collected.")
                    break

                retries = max_attempts - attempt
                if retries > 0:
                    print(f"Retrying... ({retries} retries left)")

            if not details:
                print("Failed to collect information after multiple attempts. Please try another marker.")

            print("Press Enter to continue or type 'q' to quit.")
    except KeyboardInterrupt:
        print("Data collection stopped by user.")
    except EOFError:
        # input() has nothing left to read; return what was gathered instead of losing it.
        print("Input stream closed; stopping data collection.")

    return crash_data

# Collect crash data and save it; the finally clause closes the browser even if
# collection or saving raises.
try:
    crash_data = collect_data_for_all_clicks()

    # Create a pandas dataframe and save to Excel
    df = pd.DataFrame(crash_data)
    if crash_data:
        df.to_excel("crash_data.xlsx", index=False)
        print("Data saved to crash_data.xlsx")
    else:
        # Writing an empty frame would replace a crash_data.xlsx from an earlier session with nothing.
        print("No crash data collected; crash_data.xlsx was not written.")
finally:
    # Close the WebDriver
    driver.quit()

Detail panel found.
Extracted Country: United States
Extracted State: Arizona
Extracted Incident description: The AV driver was proceeding straight at 65 mph, upon reaching to the dashboard, the steering wheel moved fully to the left, the AV driver tried to correct this movement, however, the AV crossed the leftmost lane and hit the median in the middle.
Extracted Vehicle model: International LT 625
Extracted Mode: Autonomous
Extracted Damage description: Front left corner
Extracted Injury description: No injury
Extracted Vehicle status: Moving
Extracted Fault: AV technology
Extracted Weather: Clear
Extracted Time of Day: Daylight
Extracted Road Conditions: Dry
Extracted Driverless: No
Extracted Collision Type: Hit object
Extracted Vehicle type: Semi-trailer truck
Extracted Date: 05/04/2022, Time: 15:52
Extracted Google Street View: https://www.google.com/maps/@?api=1&map_action=pano&viewpoint=32.241143,-110.986024
Extracted Coordinates: 32.241143, -110.986024
Information collected.
Pr