In [2]:
import requests
from pprint import pprint
import pandas as pd
from datetime import datetime
import concurrent.futures

In [3]:
#Base URL for API call
base_url = "https://crashviewer.nhtsa.dot.gov/CrashAPI/crashes/"

fatal_cases = []

# writing a for loop for all 50 states and from 2018 - 2021 so it doesn't exceed API call limit
for state in range(50):
    for y_since_2018 in range(4):
        response  = requests.get(base_url + f"GetCaseList?states={state}&fromYear={2018 + y_since_2018}&toYear={2018 + y_since_2018}&minNumOfVehicles=0&maxNumOfVehicles=10&format=json")
        data = response.json()
        fatal_cases += data["Results"][0]

In [4]:
df = pd.DataFrame(fatal_cases)
df.head()

Unnamed: 0,CountyName,CrashDate,Fatals,Peds,Persons,St_Case,State,StateName,TotalVehicles
0,TALLADEGA (121),/Date(1515150000000-0500)/,1,0,1,10001,1,Alabama,2
1,WALKER (127),/Date(1515390480000-0500)/,2,0,2,10002,1,Alabama,1
2,CHILTON (21),/Date(1515469800000-0500)/,1,0,2,10003,1,Alabama,2
3,BALDWIN (3),/Date(1515520920000-0500)/,1,0,2,10004,1,Alabama,1
4,JEFFERSON (73),/Date(1516363740000-0500)/,1,0,2,10005,1,Alabama,2


In [5]:
#filter out the problematic dates
df_filtered = df[df["CrashDate"] != "/Date(-62135578800000-0500)/"]
df_filtered.loc[:, "CrashDate"] = df_filtered["CrashDate"].apply(lambda x: datetime.fromtimestamp(int(x[6:19])/1000))

df_filtered = df_filtered.sort_values(by = "CrashDate")

df_filtered = df_filtered.reset_index(drop=True)

df_filtered.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.loc[:, "CrashDate"] = df_filtered["CrashDate"].apply(lambda x: datetime.fromtimestamp(int(x[6:19])/1000))


Unnamed: 0,CountyName,CrashDate,Fatals,Peds,Persons,St_Case,State,StateName,TotalVehicles
0,RIVERSIDE (65),2017-12-31 23:01:00,1,0,4,63463,6,California,2
1,LAKE (69),2017-12-31 23:15:00,1,0,1,120156,12,Florida,1
2,DOUGLAS (55),2017-12-31 23:20:00,1,0,2,310001,31,Nebraska,2
3,GREGG (183),2017-12-31 23:22:00,1,0,1,480010,48,Texas,1
4,DALLAS (113),2017-12-31 23:30:00,2,0,2,480006,48,Texas,2


In [6]:
# list to hold crash case details
crash_details = []

def worker(lower, upper):
    for idx in range(lower, upper):
        row = df_filtered.iloc[idx]
        state_case = row["St_Case"]
        state_case_year = row["CrashDate"].strftime('%Y')  
        state_number = row["State"]

        # Information displaying throughout the pulling of data
        if idx % 500 == 0:
            print(f"Starting the {idx}th run")

        try:
            base_url = "https://crashviewer.nhtsa.dot.gov/CrashAPI/crashes/"
            response = requests.get(base_url + f"GetCaseDetails?stateCase={state_case}&caseYear={state_case_year}&state={state_number}&format=json")
            data = response.json()

            # Info that we need to pull from the JSON API
            crash_result = data["Results"][0][0]["CrashResultSet"]
            case_year = crash_result["CaseYear"]
            day_week = crash_result["DAY_WEEK"]
            day_week_name = crash_result["DAY_WEEKNAME"]
            hour = crash_result["HOUR"]
            latitude = crash_result["LATITUDE"]
            longitude = crash_result["LONGITUD"]
            month_name = crash_result["MonthName"]
            route_name = crash_result["ROUTENAME"]
            state_name = crash_result["STATENAME"]
            st_case = crash_result["ST_CASE"]

            # Add the data to the crash_details list
            crash_details.append({
                "CaseYear": case_year,
                "DAY_WEEK": day_week,
                "DAY_WEEKNAME": day_week_name,
                "HOUR": hour,
                "LATITUDE": latitude,
                "LONGITUD": longitude,
                "MonthName": month_name,
                "ROUTENAME": route_name,
                "STATENAME": state_name,
                "ST_CASE": st_case
            })
            
        except Exception as e:
            print(f"Error on index {idx}: {e}")

# Define the number of workers and calculate the range of indices they will process
num_workers = 10
total_rows = len(df_filtered)
chunk_size = total_rows // num_workers
ranges = [(i * chunk_size, (i + 1) * chunk_size) for i in range(num_workers)]
ranges[-1] = (ranges[-1][0], total_rows)  # Adjust last range to include all rows

# Execute the worker function using concurrent threads
with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
    futures = [executor.submit(worker, lower, upper) for lower, upper in ranges]
    concurrent.futures.wait(futures)

# After all threads complete, the crash_details list will contain the required information. This data can be halted and exported without fully completing the run.  

    

Starting the 0th run
Starting the 53500th run
Starting the 107000th run
Starting the 13500th run
Starting the 67000th run
Starting the 120500th run
Starting the 27000th run
Starting the 80500th run
Starting the 40500th run
Starting the 94000th run
Starting the 500th run
Starting the 54000th run
Starting the 14000th run
Starting the 107500th run
Starting the 67500th run
Starting the 27500th run
Starting the 121000th run
Starting the 81000th run
Starting the 41000th run
Starting the 94500th run
Starting the 1000th run
Starting the 54500th run
Starting the 14500th run
Starting the 108000th run
Starting the 68000th run
Starting the 28000th run
Starting the 121500th run
Starting the 81500th run
Starting the 41500th run
Starting the 1500th run
Starting the 95000th run
Starting the 55000th run
Starting the 15000th run
Starting the 108500th run
Starting the 68500th run
Starting the 28500th run
Starting the 82000th run
Starting the 122000th run
Starting the 42000th run
Starting the 2000th run
S

KeyboardInterrupt: 

Starting the 8500th run
Starting the 48500th run
Starting the 75000th run


In [None]:
df = pd.DataFrame(crash_details)

# filename for the CSV file
filename = 'crash_details.csv'

# Save DataFrame to CSV file, w/out including the index
df.to_csv(filename, index=False)
