In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

In [4]:
og = pd.read_csv("US_Accidents_March23.csv", index_col="City")
print(og.shape)
print(og.isna().sum())

(7728394, 45)
ID                             0
Source                         0
Severity                       0
Start_Time                     0
End_Time                       0
Start_Lat                      0
Start_Lng                      0
End_Lat                  3402762
End_Lng                  3402762
Distance(mi)                   0
Description                    5
Street                     10869
County                         0
State                          0
Zipcode                     1915
Country                        0
Timezone                    7808
Airport_Code               22635
Weather_Timestamp         120228
Temperature(F)            163853
Wind_Chill(F)            1999019
Humidity(%)               174144
Pressure(in)              140679
Visibility(mi)            177098
Wind_Direction            175206
Wind_Speed(mph)           571233
Precipitation(in)        2203586
Weather_Condition         173459
Amenity                        0
Bump                         

In [8]:
# some initial cleaning
columns_to_use = ['Severity', 'Start_Time', 'Temperature(F)', 'Wind_Chill(F)', 'Weather_Condition', 
       'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)', 'Precipitation(in)', 'Humidity(%)', 
       "Start_Lat", "Start_Lng"]
# df = og[columns_to_use]
df = og

In [13]:
Fresno = df.loc["Fresno"]         # getting 2 new dataframes, one for accidents in Fresno and one for St Louis
StLouis = df.loc["St Louis"]

Fresno = Fresno.reset_index(drop=True)
StLouis = StLouis.reset_index(drop=True)

# some cleaning first for Fresno
print(Fresno.shape)
# print(Fresno.isna().sum())

# get rid of duplicates
Fresno = Fresno.drop_duplicates(subset=['Start_Lat', 'Start_Lng', 'Start_Time'])

# fill nan values
missing = Fresno.isna().sum()
Fresno = Fresno.ffill().bfill()    # makes sense to forward and back fill because days that are close to each other will have similar values

# save mostly cleaned data to a smaller csv
Fresno.to_csv("Fresno_accidents.csv.zip", compression={'method': 'zip'})

(17200, 45)


In [14]:
# now similar cleaning for StLouis
print(StLouis.shape)
# print(Fresno.isna().sum())

# get rid of duplicates
StLouis = StLouis.drop_duplicates(subset=['Start_Lat', 'Start_Lng', 'Start_Time'])

# fill nan values
missing = StLouis.isna().sum()
StLouis = StLouis.ffill().bfill()    # makes sense to forward and back fill because days that are close to each other will have similar values

# save mostly cleaned data to a smaller csv
StLouis.to_csv("StLouis_accidents.csv.zip")

(1782, 45)


  StLouis = StLouis.ffill().bfill()    # makes sense to forward and back fill because days that are close to each other will have similar values


### Data Analysis below here

In [15]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor


In [16]:
Fresno = pd.read_csv("Fresno_accidents.csv")
StLouis = pd.read_csv("StLouis_accidents.csv")
print(Fresno.columns)

Index(['Unnamed: 0', 'Severity', 'Start_Time', 'Temperature(F)',
       'Wind_Chill(F)', 'Weather_Condition', 'Pressure(in)', 'Visibility(mi)',
       'Wind_Speed(mph)', 'Precipitation(in)', 'Humidity(%)', 'Start_Lat',
       'Start_Lng'],
      dtype='object')


In [None]:
# weather analysis for Fresno
Fresno = pd.read_csv("Fresno_accidents.csv")

new = Fresno[['Severity', 'Temperature(F)', 'Wind_Chill(F)', 'Weather_Condition', 'Pressure(in)', 'Visibility(mi)',
       'Wind_Speed(mph)', 'Precipitation(in)', 'Humidity(%)', 'Start_Lat', 'Start_Lng']]

Fresno = pd.get_dummies(new, "Weather_Condition")
print(Fresno.shape)

# get train and test data
shuffled = Fresno.sample(frac=1, random_state=21)
groups = np.array_split(shuffled, 3)

X = groups[0].to_numpy()
X_train, y_train = X[:, 1:], X[:, 0]

forest = RandomForestRegressor(oob_score=True)
forest.fit(X_train, y_train)
print(forest.oob_score_)


(16289, 40)


  return bound(*args, **kwds)


0.00826514968679426


In [None]:
# weather analysis for Fresno without Weather_Condition
Fresno = pd.read_csv("Fresno_accidents.csv")

Fresno = Fresno[['Severity', 'Temperature(F)', 'Wind_Chill(F)', 'Pressure(in)', 'Visibility(mi)',
       'Wind_Speed(mph)', 'Precipitation(in)', 'Humidity(%)']]

# Fresno = pd.get_dummies(new, "Weather_Condition")
# print(Fresno.shape)

# get train and test data
shuffled = Fresno.sample(frac=1, random_state=21)
groups = np.array_split(shuffled, 3)

X = groups[0].to_numpy()
X_train, y_train = X[:, 1:], X[:, 0]

forest = RandomForestRegressor(oob_score=True)
forest.fit(X_train, y_train)
print(forest.oob_score_)
print(Fresno.columns[1:])
print(forest.feature_importances_)

  return bound(*args, **kwds)


0.019654710344931825
Index(['Temperature(F)', 'Wind_Chill(F)', 'Pressure(in)', 'Visibility(mi)',
       'Wind_Speed(mph)', 'Precipitation(in)', 'Humidity(%)'],
      dtype='object')
[0.15482003 0.15574655 0.24081907 0.03749221 0.1269818  0.04300838
 0.24113195]


In [44]:
# weather analysis for St Louis without Weather_Condition
StLouis = pd.read_csv("StLouis_accidents.csv")

new = StLouis[['Severity', 'Temperature(F)', 'Wind_Chill(F)', 'Pressure(in)', 'Visibility(mi)',
       'Wind_Speed(mph)', 'Precipitation(in)', 'Humidity(%)']]

StLouis = pd.get_dummies(new, "Weather_Condition")
# print(StLouis.shape)

# get train and test data
shuffled = StLouis.sample(frac=1, random_state=21)
groups = np.array_split(shuffled, 3)

X = groups[0].to_numpy()
X_train, y_train = X[:, 1:], X[:, 0]

forest = RandomForestRegressor(oob_score=True)
forest.fit(X_train, y_train)
print(forest.oob_score_)
print(StLouis.columns[1:])
print(forest.feature_importances_)


  return bound(*args, **kwds)


-0.12229399445460265
Index(['Temperature(F)', 'Wind_Chill(F)', 'Pressure(in)', 'Visibility(mi)',
       'Wind_Speed(mph)', 'Precipitation(in)', 'Humidity(%)'],
      dtype='object')
[0.1512614  0.1250726  0.29309669 0.03099853 0.17322171 0.01649788
 0.20985118]


Again pressure and humidity were the greatest predictors and precipitation and visibility were the least


## Do I know what this is saying? What labels are they predicting and how?