In [28]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import numpy as np

Build a tree-based model to classify the severity of accidents from the conditions under which they occurred.

- Choose features that reflect common conditions people face when driving.
- Try individual trees built on specific, hand-picked feature sets, and a random forest built on randomly chosen feature subsets.
- Maybe look at combinations of weather and infrastructure features.
- What does the difference in performance between these models tell us?

In [16]:
# load the raw accident records; column selection and null handling happen in the later cells
accidents_csv = "US_Accidents_March23.csv"
df = pd.read_csv(accidents_csv)

In [17]:
# list every column of the raw table so the subsets used in the later cells can be picked by name
print(df.columns)

Index(['ID', 'Source', 'Severity', 'Start_Time', 'End_Time', 'Start_Lat',
       'Start_Lng', 'End_Lat', 'End_Lng', 'Distance(mi)', 'Description',
       'Street', 'City', 'County', 'State', 'Zipcode', 'Country', 'Timezone',
       'Airport_Code', 'Weather_Timestamp', 'Temperature(F)', 'Wind_Chill(F)',
       'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Direction',
       'Wind_Speed(mph)', 'Precipitation(in)', 'Weather_Condition', 'Amenity',
       'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
       'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal',
       'Turning_Loop', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight'],
      dtype='object')


In [18]:
# weather-based analysis: keep only the target, time/location, the weather readings
# and the day/night indicators
to_keep = [
    "Severity",
    "Start_Time",
    "Start_Lng",
    "Start_Lat",
    "Temperature(F)",
    "Wind_Chill(F)",
    "Humidity(%)",
    "Pressure(in)",
    "Visibility(mi)",
    "Wind_Direction",
    "Wind_Speed(mph)",
    "Precipitation(in)",
    "Weather_Condition",
    "Sunrise_Sunset",
    "Civil_Twilight",
    "Nautical_Twilight",
    "Astronomical_Twilight",
]
weather = df.loc[:, to_keep]
print(weather.shape)
# discard every row with a missing value; about 2/3 of the table survives,
# which seems like an acceptable loss
weather = weather.dropna()
print(weather.shape)

(7728394, 17)
(5217919, 17)


In [22]:
# feature based analysis: the target plus the road-infrastructure flags
# (maybe add later 'Street', 'City', 'County', 'State', 'Zipcode', 'Start_Lat', 'Start_Lng')
# The flag list is defined once and reused for both the selection and the encoding,
# so the two can no longer drift apart.
flag_columns = ['Amenity',
       'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
       'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal',
       'Turning_Loop']
columns = ['Severity'] + flag_columns
feature_data = df[columns]
shape_before_dropna = feature_data.shape
feature_data = feature_data.dropna()
# same print order as before: shape without nulls first, then the full shape
print(feature_data.shape)
print(shape_before_dropna)
# in the recorded run both shapes are identical, i.e. these columns contain no nulls
# and dropna() removes nothing

# one-hot encoding true/false values
# dtype is pinned to uint8: pandas >= 2.0 would otherwise emit bool dummies, and a frame
# mixing bool columns with the integer Severity column turns into an object array on
# to_numpy(); uint8 keeps the matrix numeric (and matches the pre-2.0 default).
feature_data = pd.get_dummies(feature_data, columns=flag_columns, dtype="uint8")
# later cells slice the target by position (column 0), so make that assumption explicit
assert feature_data.columns[0] == 'Severity', "Severity must stay the first column"

(7728394, 14)
(7728394, 14)


In [33]:
# Draw the training set and four disjoint held-out test sets.
# A seeded generator is shared by all draws so that the split -- and therefore every
# score computed from it -- is reproducible across kernel restarts.
SPLIT_SEED = 42
TRAIN_SIZE = 2000000
TEST_SIZE = 20000
split_rng = np.random.RandomState(SPLIT_SEED)

# get a random subset of the data to train on
sample = feature_data.sample(n=TRAIN_SIZE, random_state=split_rng)
# NOTE: this is the full column index, i.e. it still includes the 'Severity' target
features = sample.columns

# get testing data by dropping: each test set is sampled from the rows that neither
# the training set nor any earlier test set has used, so all five sets are disjoint
test_options = feature_data.drop(sample.index)
test1 = test_options.sample(n=TEST_SIZE, random_state=split_rng)
test2_options = test_options.drop(test1.index)
test2 = test2_options.sample(n=TEST_SIZE, random_state=split_rng)
test3_options = test2_options.drop(test2.index)
test3 = test3_options.sample(n=TEST_SIZE, random_state=split_rng)
test4_options = test3_options.drop(test3.index)
test4 = test4_options.sample(n=TEST_SIZE, random_state=split_rng)

# from here on the sets are plain arrays: column 0 is the target, the rest are features
test1 = test1.to_numpy()
test2 = test2.to_numpy()
test3 = test3.to_numpy()
test4 = test4.to_numpy()
sample = sample.to_numpy()


In [34]:
# Fit a random forest on the training sample: column 0 is the target, the rest are features.
X_train, y_train = sample[:, 1:], sample[:, 0]
# random_state makes the bootstrap draws (and so the scores) reproducible;
# n_jobs=-1 builds the trees on all available cores without changing the result.
# NOTE(review): the stated goal is to *classify* severity and RandomForestClassifier is
# imported but unused -- confirm that a regressor is really what is wanted here.
forest = RandomForestRegressor(oob_score=True, random_state=0, n_jobs=-1)
forest.fit(X_train, y_train)
# for a regressor the out-of-bag score is an R^2 value, not an accuracy
print(f"oob score: {forest.oob_score_}")

# check with testing data (feature columns only; the labels are compared in the next cell)
predictions1, predictions2, predictions3, predictions4 = (
    forest.predict(held_out[:, 1:]) for held_out in (test1, test2, test3, test4)
)

oob score: 0.022169184153396082


In [35]:
# Score the predictions on each held-out set.
# The quantity computed is the mean absolute error (MAE): the average distance, in severity
# levels, between prediction and label. It is an error, so LOWER is better; the list keeps
# the name `accuracy` only so that code referring to it keeps working.
prediction_labels = [(predictions1, test1[:, 0]), (predictions2, test2[:, 0]), (predictions3, test3[:, 0]), (predictions4, test4[:, 0])]
accuracy = [
    # np.mean divides by the size of *this* set (previously every set was divided by
    # len(test1)); the float casts keep the arithmetic numeric even for object-dtype labels
    float(np.mean(np.abs(np.asarray(predicted, dtype=float) - np.asarray(actual, dtype=float))))
    for predicted, actual in prediction_labels
]
print(accuracy)

[0.35244474445916923, 0.3510827807272703, 0.3525106711804229, 0.34987961762853775]


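The out-of-bag score (an R² value for a regressor) is only about 0.02, so these road-feature flags explain almost none of the variation in severity. Note that the values printed above are mean absolute errors of roughly 0.35 severity levels, not accuracies, so on their own they do not show poor performance. Overall, the features used do not seem to be good indicators of accident severity.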