In [26]:

import requests
from pprint import pprint
import pandas as pd
from datetime import datetime

In [27]:
#Base URL for API call
base_url = "https://crashviewer.nhtsa.dot.gov/CrashAPI/crashes/"

fatal_cases = []

# writing a for loop for all 50 states and from 2018 - 2021 so it doesn't exceed API call limit
for state in range(50):
    for y_since_2018 in range(4):
        response  = requests.get(base_url + f"GetCaseList?states={state}&fromYear={2018 + y_since_2018}&toYear={2018 + y_since_2018}&minNumOfVehicles=0&maxNumOfVehicles=10&format=json")
        data = response.json()
        fatal_cases += data["Results"][0]

In [59]:
df = pd.DataFrame(fatal_cases)
df.head()


Unnamed: 0,CountyName,CrashDate,Fatals,Peds,Persons,St_Case,State,StateName,TotalVehicles
0,TALLADEGA (121),/Date(1515150000000-0500)/,1,0,1,10001,1,Alabama,2
1,WALKER (127),/Date(1515390480000-0500)/,2,0,2,10002,1,Alabama,1
2,CHILTON (21),/Date(1515469800000-0500)/,1,0,2,10003,1,Alabama,2
3,BALDWIN (3),/Date(1515520920000-0500)/,1,0,2,10004,1,Alabama,1
4,JEFFERSON (73),/Date(1516363740000-0500)/,1,0,2,10005,1,Alabama,2


In [63]:
#filter out the problematic dates
df_filtered = df[df["CrashDate"] != "/Date(-62135578800000-0500)/"]
df_filtered.loc[:, "CrashDate"] = df_filtered["CrashDate"].apply(lambda x: datetime.fromtimestamp(int(x[6:19])/1000))

df_filtered = df_filtered.sort_values(by = "CrashDate")

df_filtered = df_filtered.reset_index(drop=True)

df_filtered.head()



Unnamed: 0,CountyName,CrashDate,Fatals,Peds,Persons,St_Case,State,StateName,TotalVehicles
0,RIVERSIDE (65),2017-12-31 23:01:00,1,0,4,63463,6,California,2
1,LAKE (69),2017-12-31 23:15:00,1,0,1,120156,12,Florida,1
2,DOUGLAS (55),2017-12-31 23:20:00,1,0,2,310001,31,Nebraska,2
3,GREGG (183),2017-12-31 23:22:00,1,0,1,480010,48,Texas,1
4,DALLAS (113),2017-12-31 23:30:00,2,0,2,480006,48,Texas,2


In [68]:
import concurrent.futures

people_involved = []

def worker(lower, upper):
    for idx in range(lower, upper):
        row = df_filtered.iloc[idx, :]
        state_case = row["St_Case"]
        state_case_year = row["CrashDate"].strftime('%Y')
        state_number = row["State"]

        # Information during the long list of API calls
        if idx % 500 == 0:
            print(f"Starting the {idx}th run")

        try:
            response = requests.get(base_url + f"GetCaseDetails?stateCase={state_case}&caseYear={state_case_year}&state={state_number}&format=json")
            data = response.json()

            #Get the list of people who are involved in the crash
            persons_data = data["Results"][0][0]["CrashResultSet"]['Vehicles'][0]['Persons']

            for person in persons_data:
                st_case = person['ST_CASE']
                age = person["AGE"]
                sex = person["SEX"]
                race = person['RACENAME']
                seat_pos = person["SEAT_POSNAME"]
                air_bag_deployed = False if person["AIR_BAGNAME"] == "Not Deployed" else True
                fatal = False if person["DEATH_DA"] == "88" else True
                isDrug = person['DRUGSNAME']
                isDrunk = person['DRINKINGNAME']
                BAC = person["ALC_RESNAME"]
                case_date = person['CaseYear'] + '-' + person["MONTH"].zfill(2) + '-' + person["DAY"].zfill(2)
                make_name = person['MAKENAME']
                mod_name = person['MAK_MODNAME']
                mod_year = person['MOD_YEAR']
                state = row["StateName"]
                county = person['COUNTYNAME']

                people_involved.append(
                    {
                        "St Case Num": st_case,
                        "Age": age,
                        "Sex": sex,
                        "Race": race,
                        "Seat Pos": seat_pos,
                        "AirBag Deployed": air_bag_deployed,
                        "Is Fatal": fatal,
                        "Is Drug": isDrug,
                        "Is Drunk": isDrunk,
                        "BAC": BAC,
                        "Case Date": case_date,
                        "Make Name": make_name,
                        "Model Name": mod_name,
                        "Model Year": mod_year,
                        "State": state,
                        "County": county
                    }
                )
        except Exception as e:
            print(idx)
            print(row)
            print("Cannot retrieve data. The following error occurred", e)

# Number of workers (threads) to run in parallel
num_workers = 10

# Calculate the range for each worker
total_rows = len(df_filtered)
chunk_size = total_rows // num_workers
ranges = [(i * chunk_size, (i + 1) * chunk_size) for i in range(num_workers)]
# Adjust the last worker's upper range to cover remaining rows
ranges[-1] = (ranges[-1][0], total_rows)

# Create a ThreadPoolExecutor
with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
    # Submit jobs to the executor
    futures = [executor.submit(worker, lower, upper) for lower, upper in ranges]

    # Wait for all futures to complete
    concurrent.futures.wait(futures)

# At this point, all workers have finished and populated the people_involved list

Starting the 0th run
Starting the 53500th run
Starting the 107000th run
Starting the 13500th run
Starting the 67000th run
Starting the 120500th run
Starting the 27000th run
Starting the 80500th run
Starting the 40500th run
Starting the 94000th run
Starting the 500th run
Starting the 54000th run
