In [2]:
import os
import re
from datetime import datetime

import pandas as pd
import requests

<h3>1. Filtering out 2017 data</h3>

In [7]:
# Load the crash list and drop every crash dated 2017.
df_crash_list = pd.read_csv("../Resources/CrashList.csv", index_col=0)

# Parse the whole column in one vectorised call instead of running strptime
# row by row; the explicit format keeps the parsing as strict as before.
df_crash_list = df_crash_list[
    pd.to_datetime(df_crash_list["CrashDate"], format="%Y-%m-%d %H:%M:%S").dt.year != 2017
]

<h3>2. Latitude and longitude of each unique (county, state) pair</h3>

In [None]:
# Keep only the location and fatality columns, then remove any parenthesised
# text from the county name.
df_locations_fatalities = df_crash_list[["CountyName", "StateName", "Fatals"]].copy()

# The pattern also consumes the whitespace in front of the parenthesis and the
# result is stripped, so no stray blank is left behind in the cleaned name.
# The .str accessor passes missing values through instead of raising.
df_locations_fatalities["CountyName"] = (
    df_locations_fatalities["CountyName"]
    .str.replace(r'\s*\([^)]*\)', '', regex=True)
    .str.strip()
)
df_locations_fatalities.head()

In [None]:
# Collapse the crash rows to one row per (county, state) and turn the group
# keys back into ordinary columns.
# NOTE(review): .count() gives the number of rows with a non-null "Fatals"
# value per location, not the total of "Fatals" -- confirm this is the
# intended measure (.sum() would give total fatalities).
df_fatalities_by_loc = (
    df_locations_fatalities
    .groupby(["CountyName", "StateName"])
    .count()
    .reset_index()
)

# Create the coordinate columns up front so they exist (as NaN) even when
# every single lookup fails.
df_fatalities_by_loc["lat"] = float("nan")
df_fatalities_by_loc["long"] = float("nan")

# Loop-invariant request settings.
base_url = "https://api.geoapify.com/v1/geocode/search?"
# Read the key from the environment so it is never stored in the notebook;
# falls back to the empty string when the variable is unset.
GEOAPIFY_API_KEY = os.environ.get("GEOAPIFY_API_KEY", "")
# Upper bound (seconds) for each HTTP call so one stalled request cannot hang
# the whole loop.
GEOCODE_TIMEOUT_SECONDS = 10

if not GEOAPIFY_API_KEY:
    print("GEOAPIFY_API_KEY is not set; geocoding requests are likely to be rejected.")

# For each unique location, find longitude and latitude using geoapify
for idx, location in df_fatalities_by_loc.iterrows():
    county = location["CountyName"]
    state = location["StateName"]

    params = {
        "text": f"{county}, {state}",
        "apiKey": GEOAPIFY_API_KEY
    }

    try:
        res = requests.get(base_url, params=params, timeout=GEOCODE_TIMEOUT_SECONDS)
        # Surface HTTP errors (bad key, rate limit, ...) as such instead of as
        # a confusing KeyError on the response body.
        res.raise_for_status()
        data = res.json()

        # Use the first feature returned for the query
        properties = data["features"][0]["properties"]

        lat = properties["lat"]
        lon = properties["lon"]

        df_fatalities_by_loc.at[idx, "lat"] = lat
        df_fatalities_by_loc.at[idx, "long"] = lon
    except Exception as e:
        # Best effort: report which location failed and keep going, leaving
        # its coordinates as NaN.
        print(f"Oh no, we ran into a problem geocoding '{county}, {state}':", e)

df_fatalities_by_loc



In [None]:
# Write the per-location table, including the looked-up coordinates, to disk
df_fatalities_by_loc.to_csv(path_or_buf="../Resources/County_State_Fatalities_18-19.csv")

<h3>3. Converting Excel Population Data to CSV</h3>

In [3]:
# Load the population workbook and export it unchanged as CSV.
# to_csv returns None when it is given a path, so keep the loaded frame in
# `xlsx` and write it out in a separate statement.
xlsx = pd.read_excel("../Resources/population.xlsx")
xlsx.to_csv("../Resources/population.csv")

In [32]:
# Read the CSV export of the population spreadsheet.
# NOTE(review): the processed table is later written back to this same path,
# so re-create the raw export from the Excel file before re-running this cell.
df = pd.read_csv("../Resources/population.csv", index_col=0)

# Skip the first four rows (positions 0-3) and renumber the rest from zero
df = df.iloc[4:, :].reset_index(drop=True)

df = df.rename(columns = {'table with row headers in column A and column headers in rows 3 through 4 (leading dots indicate sub-parts)': "County",
                'Unnamed: 1': "Base Estimate",
                'Unnamed: 2': "2020",
                'Unnamed: 3': "2021",
                'Unnamed: 4': "2022"})

# Discard every row that has a missing value in any column
df = df.dropna()


# Trailing designation to remove from an area name. Matching is case-sensitive
# on purpose: a capitalised "City" that is part of the name itself is kept.
# NOTE(review): extend the list if the sheet uses other designations.
_AREA_DESIGNATION = re.compile(
    r"\s+(?:County|Parish|Borough|Census Area|Municipality|"
    r"City and Borough|Planning Region|city)$"
)


def _split_area_label(label):
    """Return ``(state, county)`` parsed from a ``".<area>, <state>"`` label.

    The text after the last comma is the state. The text before it, without
    the leading dots and without a trailing designation such as "County", is
    the county, so multi-word names are kept whole. A label that contains no
    comma is used for both parts instead of raising.
    """
    area, comma, state = label.rpartition(",")
    if not comma:
        area = state = label
    county = _AREA_DESIGNATION.sub("", area.lstrip(".").strip())
    return state.strip(), county


# Derive the State and County columns from the combined label
area_parts = [_split_area_label(label) for label in df["County"]]
df["State"] = [state for state, _ in area_parts]
df["County"] = [county for _, county in area_parts]

# Rearrange the columns
df = df[["State", "County", "Base Estimate", "2020", "2021", "2022"]]

df.head(10)

Unnamed: 0,State,County,Base Estimate,2020,2021,2022
0,Alabama,Autauga,58802,58902,59210.0,59759.0
1,Alabama,Baldwin,231761,233219,239361.0,246435.0
2,Alabama,Barbour,25224,24960,24539.0,24706.0
3,Alabama,Bibb,22300,22183,22370.0,22005.0
4,Alabama,Blount,59130,59102,59085.0,59512.0
5,Alabama,Bullock,10362,10296,10280.0,10202.0
6,Alabama,Butler,19055,19031,18855.0,18650.0
7,Alabama,Calhoun,116444,116239,115677.0,115788.0
8,Alabama,Chambers,34774,34645,34446.0,34088.0
9,Alabama,Cherokee,24979,24972,25026.0,25302.0


In [33]:
# Persist the cleaned population table; this replaces the file at the same
# path that was exported from the Excel workbook.
df.to_csv(path_or_buf="../Resources/population.csv")