In [47]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV
%run -i 'functions.py'
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from pprint import pprint

In [8]:
data = pd.read_csv('final_data.csv')

In [9]:
data = drop(data, 'Unnamed: 0')

In [10]:
data.head()

Unnamed: 0,accident_severity,number_of_vehicles,number_of_casualties,day_of_week,time,road_class,road_type,speed_limit,junction_control,crossguard,ped_crossway,light_conditions,weather_conditions,road_surface_conditions,special_conditions_at_site,road_hazard,urban_or_rural_area,did_police_officer_attend_scene_of_accident,coordinates,vehicle_type,trailer,vehicle_activity,in_restricted_lane,vehicle_action,vehicle_loc_post_impact,impact_type,was_vehicle_left_hand_drive?,journey_purpose_of_driver,sex_of_driver,age_driver,age_bin_driver,propulsion,age_of_vehicle,socioeconomic_status,driver_residence,car_status,season
0,minor,2,1,Monday,midnight,A_road,single_lane,20,yield,0,ped_light,1,clear,1,0,0,1,1,"(51.529746, -0.102474)",taxi,0,turn_left,0,none,on_road,front,1,work,1,40,7,hybrid/electric,6,low,city,poor,winter
1,severe,2,1,Monday,late_night,B_road,two_lane,30,traffic_light,0,ped_light,1,rain,0,0,0,1,1,"(51.530179, 0.037828)",car,0,turn_right,0,none,on_road,front,1,Unknown,1,27,6,petrol,1,middle,city,good,winter
2,severe,2,1,Monday,early_morning,C_road,single_lane,30,yield,0,none,1,clear,1,0,0,1,1,"(51.514545, -0.199239)",taxi,0,exiting,0,none,on_road,front,1,work,1,66,10,electric,5,low,city,poor,winter
3,minor,3,1,Monday,early_morning,A_road,one_way,30,yield,0,none,1,clear,1,0,0,1,1,"(51.475091, -0.032886)",taxi,0,driving,0,none,on_road,back,1,work,2,36,7,hybrid/electric,2,low,city,poor,winter
4,minor,2,1,Monday,midnight,Unclassified,single_lane,30,yield,0,cross_walk,1,clear,1,0,0,1,1,"(51.56325, -0.311872)",car,0,turn_right,0,none,on_road,front,1,Unknown,1,24,5,petrol,9,low,city,okay,winter


## Prelude
* `accident_severity` is the target
* Data needs to be prepared differently for different models
    * RandomForestClassifier will take in categorical variables without the use of dummy variables, however, all values will need to be numerical.
* Feature selection needs to be completed
* Need to check if there are any valuable polynomial features

## Preparation of data for RFC
* Use `pd.factorize(data[column])[0] + 1`



In [14]:
# Not sure how to turn this into a function
data['accident_severity'] = factorize(data, 'accident_severity')
data['day_of_week'] = factorize(data, 'day_of_week')
data['time'] = factorize(data, 'time')
data['road_class'] = factorize(data, 'road_class')
data['junction_control'] = factorize(data, 'junction_control')
data['ped_crossway'] = factorize(data, 'ped_crossway')
data['weather_conditions'] = factorize(data, 'weather_conditions')
data['vehicle_type'] = factorize(data, 'vehicle_type')
data['vehicle_activity'] = factorize(data, 'vehicle_activity')
data['vehicle_action'] = factorize(data, 'vehicle_action')
data['vehicle_loc_post_impact'] = factorize(data, 'vehicle_loc_post_impact')
data['impact_type'] = factorize(data, 'impact_type')
data['journey_purpose_of_driver'] = factorize(data, 'journey_purpose_of_driver')
data['propulsion'] = factorize(data, 'propulsion')
data['socioeconomic_status'] = factorize(data, 'socioeconomic_status')
data['driver_residence'] = factorize(data, 'driver_residence')
data['car_status'] = factorize(data, 'car_status')
data['season'] = factorize(data, 'season')
data['road_type'] = factorize(data, 'road_type')

In [15]:
data.head()

Unnamed: 0,accident_severity,number_of_vehicles,number_of_casualties,day_of_week,time,road_class,road_type,speed_limit,junction_control,crossguard,ped_crossway,light_conditions,weather_conditions,road_surface_conditions,special_conditions_at_site,road_hazard,urban_or_rural_area,did_police_officer_attend_scene_of_accident,coordinates,vehicle_type,trailer,vehicle_activity,in_restricted_lane,vehicle_action,vehicle_loc_post_impact,impact_type,was_vehicle_left_hand_drive?,journey_purpose_of_driver,sex_of_driver,age_driver,age_bin_driver,propulsion,age_of_vehicle,socioeconomic_status,driver_residence,car_status,season
0,1,2,1,1,1,1,1,20,1,0,1,1,1,1,0,0,1,1,"(51.529746, -0.102474)",1,0,1,0,1,1,1,1,1,1,40,7,1,6,1,1,1,1
1,2,2,1,1,2,2,2,30,2,0,1,1,2,0,0,0,1,1,"(51.530179, 0.037828)",2,0,2,0,1,1,1,1,2,1,27,6,2,1,2,1,2,1
2,2,2,1,1,3,3,1,30,1,0,2,1,1,1,0,0,1,1,"(51.514545, -0.199239)",1,0,3,0,1,1,1,1,1,1,66,10,3,5,1,1,1,1
3,1,3,1,1,3,1,3,30,1,0,2,1,1,1,0,0,1,1,"(51.475091, -0.032886)",1,0,4,0,1,1,2,1,1,2,36,7,1,2,1,1,1,1
4,1,2,1,1,1,4,1,30,1,0,3,1,1,1,0,0,1,1,"(51.56325, -0.311872)",2,0,2,0,1,1,1,1,2,1,24,5,2,9,1,1,3,1


In [16]:
rf_data = drop(data, ['coordinates', 'age_bin_driver'])

### Baseline RandomForestClassifier Model

In [19]:
# Instantiating RFC
rfc = RandomForestClassifier(random_state=42)

In [22]:
# Going to run my first baseline model using cross validation
# Will use explicit training and test data for future runs
# Baseline model
X = rf_data.drop(columns=['accident_severity'], axis=1)
y = rf_data['accident_severity']
rfc.fit(X,y)
cross_validate(rfc, X, y, cv=3)

{'fit_time': array([0.75496697, 0.74396777, 0.75893712]),
 'score_time': array([0.07007504, 0.07576513, 0.07632899]),
 'test_score': array([0.79940505, 0.80570109, 0.79952045])}

Interesting... Time to look at the model but with TTS

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
rfc2 = RandomForestClassifier(random_state=42)
rfc2.fit(X_train, y_train)
test_pred = rfc2.predict(X_test)
accuracy_score(y_test, test_pred)

0.8038480513073508

In [44]:
# Precision scores with different hyperparameters
print('Micro:', precision_score(y_test, test_pred, average='micro'))
print('Macro:', precision_score(y_test, test_pred, average='macro'))
print('Weighted:', precision_score(y_test, test_pred, average='weighted'))

Micro: 0.8038480513073508
Macro: 0.40467088166051274
Weighted: 0.7320446239657274


In [45]:
# Recall scores with different hyperparameters
print('Micro:', recall_score(y_test, test_pred, average='micro'))
print('Macro:', recall_score(y_test, test_pred, average='macro'))
print('Weighted:', recall_score(y_test, test_pred, average='weighted'))

Micro: 0.8038480513073508
Macro: 0.348223482822238
Weighted: 0.8038480513073508


In [46]:
# F1 scores with different hyperparameters
print('Micro:', f1_score(y_test, test_pred, average='micro'))
print('Macro:', f1_score(y_test, test_pred, average='macro'))
print('Weighted:', f1_score(y_test, test_pred, average='weighted'))

Micro: 0.8038480513073508
Macro: 0.3341866892391991
Weighted: 0.7403785721258269


### Looking into RFECV (recursive feature selection)

In [49]:
rfe1 = RFECV(estimator = rfc2, step=1, min_features_to_select = 10, cv=3)
rfe1.fit(X,y)
mod1_feats = pd.DataFrame(rfe1.ranking_)

In [53]:
cols = list(X.columns)
mod1_feats['variable'] = cols
mod1_feats = rename_column(mod1_feats, 0, 'Ranking')
mod1_feats

Unnamed: 0,Ranking,variable
0,1,number_of_vehicles
1,1,number_of_casualties
2,1,day_of_week
3,1,time
4,1,road_class
5,1,road_type
6,1,speed_limit
7,1,junction_control
8,9,crossguard
9,1,ped_crossway
