In [223]:
# Base imports
import pandas as pd
import numpy as np
from numpy import mean
import glob

# Visual imports
import matplotlib.pyplot as plt
import seaborn as sns

# Import statements required for Plotly 
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

#SciKit learn library imports
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, log_loss, classification_report)
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import RepeatedStratifiedKFold

# Imbalanced import
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

# AutoML import
from supervised.automl import AutoML

#IO library (for Decision tree vis.)
from IPython.display import Image as PImage
from subprocess import check_call
from PIL import Image, ImageDraw, ImageFont
import re

# suppress all warnings
import warnings
warnings.filterwarnings("ignore")

In [224]:
# data in load
pd.set_option('display.max_columns', None) 

accidents = pd.read_csv('../dataset/Road Safety Data - Accidents 2019.csv')
print('Records:', accidents.shape[0], '\nColumns:', accidents.shape[1])
accidents.head()

Records: 117536 
Columns: 32


Unnamed: 0,Accident_Index,Location_Easting_OSGR,Location_Northing_OSGR,Longitude,Latitude,Police_Force,Accident_Severity,Number_of_Vehicles,Number_of_Casualties,Date,Day_of_Week,Time,Local_Authority_(District),Local_Authority_(Highway),1st_Road_Class,1st_Road_Number,Road_Type,Speed_limit,Junction_Detail,Junction_Control,2nd_Road_Class,2nd_Road_Number,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Special_Conditions_at_Site,Carriageway_Hazards,Urban_or_Rural_Area,Did_Police_Officer_Attend_Scene_of_Accident,LSOA_of_Accident_Location
0,2019010128300,528218.0,180407.0,-0.153842,51.508057,1,3,2,3,18/02/2019,2,17:50,1,E09000033,3,4202,1,30,1,2,3,4202,0,5,1,1,1,0,0,1,3,E01004762
1,2019010152270,530219.0,172463.0,-0.127949,51.436208,1,3,2,1,15/01/2019,3,21:45,9,E09000022,3,23,2,30,0,-1,-1,0,-1,-1,4,1,1,0,0,1,3,E01003117
2,2019010155191,530222.0,182543.0,-0.124193,51.526795,1,3,2,1,01/01/2019,3,01:50,2,E09000007,4,504,6,30,3,4,6,0,0,0,4,1,1,0,0,1,1,E01000943
3,2019010155192,525531.0,184605.0,-0.191044,51.546387,1,2,1,1,01/01/2019,3,01:20,2,E09000007,4,510,6,20,3,4,4,510,0,0,4,1,1,0,0,1,1,E01000973
4,2019010155194,524920.0,184004.0,-0.200064,51.541121,1,3,2,2,01/01/2019,3,00:40,28,E09000005,3,4003,6,30,6,4,6,0,0,0,4,1,1,0,0,1,1,E01000546


In [225]:
accidents.columns

Index(['Accident_Index', 'Location_Easting_OSGR', 'Location_Northing_OSGR',
       'Longitude', 'Latitude', 'Police_Force', 'Accident_Severity',
       'Number_of_Vehicles', 'Number_of_Casualties', 'Date', 'Day_of_Week',
       'Time', 'Local_Authority_(District)', 'Local_Authority_(Highway)',
       '1st_Road_Class', '1st_Road_Number', 'Road_Type', 'Speed_limit',
       'Junction_Detail', 'Junction_Control', '2nd_Road_Class',
       '2nd_Road_Number', 'Pedestrian_Crossing-Human_Control',
       'Pedestrian_Crossing-Physical_Facilities', 'Light_Conditions',
       'Weather_Conditions', 'Road_Surface_Conditions',
       'Special_Conditions_at_Site', 'Carriageway_Hazards',
       'Urban_or_Rural_Area', 'Did_Police_Officer_Attend_Scene_of_Accident',
       'LSOA_of_Accident_Location'],
      dtype='object')

In [226]:
#accidents.info()

In [227]:
accidents['Date']= pd.to_datetime(accidents['Date'])

In [228]:
# slice first and second string from time column
accidents['Hour'] = accidents['Time'].str[0:2]

# convert new column to numeric datetype
accidents['Hour'] = pd.to_numeric(accidents['Hour'])

# drop null values in our new column
accidents = accidents.dropna(subset=['Hour'])

# cast to integer values
accidents['Hour'] = accidents['Hour'].astype('int')

In [229]:
# define a function that turns the hours into daytime groups
def when_was_it(hour):
    if hour >= 5 and hour < 10:
        return "1"
    elif hour >= 10 and hour < 15:
        return "2"
    elif hour >= 15 and hour < 19:
        return "3"
    elif hour >= 19 and hour < 23:
        return "4"
    else:
        return "5"

In [230]:
# create a little dictionary to later look up the groups I created
daytime_groups = {1: 'Morning: Between 5 and 10', 
                  2: 'Office Hours: Between 10 and 15', 
                  3: 'Afternoon Rush: Between 15 and 19', 
                  4: 'Evening: Between 19 and 23', 
                  5: 'Night: Between 23 and 5'}

In [231]:
# apply this function to our temporary hour column
accidents['Daytime'] = accidents['Hour'].apply(when_was_it)
accidents[['Time', 'Hour', 'Daytime']].head()

Unnamed: 0,Time,Hour,Daytime
0,17:50,17,3
1,21:45,21,4
2,01:50,1,5
3,01:20,1,5
4,00:40,0,5


In [232]:
# drop old time column and temporary hour column
accidents = accidents.drop(columns=['Time', 'Hour'])

In [233]:
print('Proportion of Missing Values in Accidents Table:', 
      round(accidents.isna().sum().sum()/len(accidents), 3), '%')

Proportion of Missing Values in Accidents Table: 0.05 %


In [234]:
# Define a dictionary for the target mapping
# Use the pandas apply method to numerically encode our Did_Police_Officer_Attend_Scene_of_Accident target variable
accidents['Did_Police_Officer_Attend_Scene_of_Accident_numerical'] = accidents['Did_Police_Officer_Attend_Scene_of_Accident'].values.reshape(-1,1)

In [235]:
# creating a list of only numerical categories for correlation plotting
    
numerical = [
#            u'Location_Easting_OSGR', 
#            u'Location_Northing_OSGR',
#            u'Longitude',
#            u'Latitude',
            u'Police_Force',
            u'Accident_Severity', 
            u'Number_of_Vehicles',
            u'Number_of_Casualties', 
            u'Day_of_Week', 
            u'Local_Authority_(District)', 
            u'1st_Road_Class',
            u'1st_Road_Number',
            u'Road_Type',
            u'Speed_limit',
            u'Junction_Detail',
            u'Junction_Control',
            u'2nd_Road_Class',
            u'2nd_Road_Number',
            u'Pedestrian_Crossing-Human_Control',
            u'Pedestrian_Crossing-Physical_Facilities',
            u'Light_Conditions',
            u'Weather_Conditions', 
            u'Road_Surface_Conditions',
            u'Special_Conditions_at_Site', 
            u'Carriageway_Hazards',
            u'Urban_or_Rural_Area', 
            u'Did_Police_Officer_Attend_Scene_of_Accident'
]
data = [
    go.Heatmap(
        z=accidents[numerical].astype(float).corr().values, # Generating the Pearson correlation
        x=accidents[numerical].columns.values,
        y=accidents[numerical].columns.values,
        colorscale='Viridis',
        reversescale = False,
#         text = True ,
        opacity = 1.0
        
    )
]

layout = go.Layout(
    title='Pearson Correlation of numerical features',
    xaxis = dict(ticks='', nticks=36),
    yaxis = dict(ticks='' ),
    width = 900, height = 700,
    
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='labelled-heatmap')

In [236]:
# leaving out highly correlated features from final numerical features before model learning
numerical = [
#            u'Location_Easting_OSGR', 
#            u'Location_Northing_OSGR',
#            u'Longitude',
#            u'Latitude',
#            u'Police_Force',
            u'Accident_Severity', 
            u'Number_of_Vehicles',
            u'Number_of_Casualties', 
            u'Day_of_Week', 
#           u'Local_Authority_(District)', 
##>>            u'1st_Road_Class',
##>>            u'1st_Road_Number',
            u'Road_Type',
#            u'Speed_limit',
#            u'Junction_Detail',
#            u'Junction_Control',
#            u'2nd_Road_Class',
##>>            u'2nd_Road_Number',
            u'Pedestrian_Crossing-Human_Control',
            u'Pedestrian_Crossing-Physical_Facilities',
            u'Light_Conditions',
            u'Weather_Conditions', 
            u'Road_Surface_Conditions',
            u'Special_Conditions_at_Site', 
            u'Carriageway_Hazards',
#            u'Urban_or_Rural_Area', 
            u'Did_Police_Officer_Attend_Scene_of_Accident'
]

#g = sns.pairplot(df[numerical], hue='Did_Police_Officer_Attend_Scene_of_Accident', palette='seismic', diag_kind = 'kde',diag_kws=dict(shade=True))
#g.set(xticklabels=[])

In [237]:
# drop columns we don't need (geographical features as well as highly correlated ones)
#accidents = accidents.drop(columns=['2nd_Road_Class', '2nd_Road_Number',
#                                    'Location_Easting_OSGR', 'Location_Northing_OSGR', 
#                                    'Longitude', 'Latitude', 'LSOA_of_Accident_Location',
#                                    'Pedestrian_Crossing-Human_Control', 
#                                    'Pedestrian_Crossing-Physical_Facilities'])

In [238]:
# dealing with empty values remaining
# drop remaining records with NaN's
accidents = accidents.dropna()

# check if we have no NaN's anymore
accidents.isna().sum().sum()

0

In [239]:
accidents.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 111760 entries, 0 to 111851
Data columns (total 33 columns):
 #   Column                                                 Non-Null Count   Dtype         
---  ------                                                 --------------   -----         
 0   Accident_Index                                         111760 non-null  object        
 1   Location_Easting_OSGR                                  111760 non-null  float64       
 2   Location_Northing_OSGR                                 111760 non-null  float64       
 3   Longitude                                              111760 non-null  float64       
 4   Latitude                                               111760 non-null  float64       
 5   Police_Force                                           111760 non-null  int64         
 6   Accident_Severity                                      111760 non-null  int64         
 7   Number_of_Vehicles                                     1

In [240]:
# cast categorical features - currently stored as string data - to their proper data format
for col in ['Accident_Severity', 'Daytime', 'Speed_limit', 'Urban_or_Rural_Area']:
    accidents[col] = accidents[col].astype('category')
    
accidents.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 111760 entries, 0 to 111851
Data columns (total 33 columns):
 #   Column                                                 Non-Null Count   Dtype         
---  ------                                                 --------------   -----         
 0   Accident_Index                                         111760 non-null  object        
 1   Location_Easting_OSGR                                  111760 non-null  float64       
 2   Location_Northing_OSGR                                 111760 non-null  float64       
 3   Longitude                                              111760 non-null  float64       
 4   Latitude                                               111760 non-null  float64       
 5   Police_Force                                           111760 non-null  int64         
 6   Accident_Severity                                      111760 non-null  category      
 7   Number_of_Vehicles                                     1

In [241]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
num_cols = accidents.select_dtypes(include=numerics)
num_cols.columns

Index(['Location_Easting_OSGR', 'Location_Northing_OSGR', 'Longitude',
       'Latitude', 'Police_Force', 'Number_of_Vehicles',
       'Number_of_Casualties', 'Day_of_Week', 'Local_Authority_(District)',
       '1st_Road_Class', '1st_Road_Number', 'Road_Type', 'Junction_Detail',
       'Junction_Control', '2nd_Road_Class', '2nd_Road_Number',
       'Pedestrian_Crossing-Human_Control',
       'Pedestrian_Crossing-Physical_Facilities', 'Light_Conditions',
       'Weather_Conditions', 'Road_Surface_Conditions',
       'Special_Conditions_at_Site', 'Carriageway_Hazards',
       'Did_Police_Officer_Attend_Scene_of_Accident',
       'Did_Police_Officer_Attend_Scene_of_Accident_numerical'],
      dtype='object')

In [242]:
# outlier detection using boxplots
#sns.set(style='darkgrid')
#fig, axes = plt.subplots(2,1, figsize=(10,4))

#for ax, col in zip(axes, num_cols):
#    accidents.boxplot(column=col, grid=False, vert=False, ax=ax)
#    plt.tight_layout();

In [243]:
# Empty list to store columns with categorical data
categorical = []
for col, value in accidents.iteritems():
    if value.dtype == 'object':
        categorical.append(col)

In [244]:
categorical # accident index will be dropped, how about Local authority_(Highway)?? <<<

['Accident_Index', 'Local_Authority_(Highway)', 'LSOA_of_Accident_Location']

In [245]:
# dropping 'Accident_Index' and 'LSOA_of_Accident_Location' given index as well as georelation 
categorical = ['Daytime', 'Speed_limit', 'Urban_or_Rural_Area', 'Local_Authority_(Highway)']
categorical

['Daytime', 'Speed_limit', 'Urban_or_Rural_Area', 'Local_Authority_(Highway)']

In [246]:
numerical

['Accident_Severity',
 'Number_of_Vehicles',
 'Number_of_Casualties',
 'Day_of_Week',
 'Road_Type',
 'Pedestrian_Crossing-Human_Control',
 'Pedestrian_Crossing-Physical_Facilities',
 'Light_Conditions',
 'Weather_Conditions',
 'Road_Surface_Conditions',
 'Special_Conditions_at_Site',
 'Carriageway_Hazards',
 'Did_Police_Officer_Attend_Scene_of_Accident']

In [249]:
# Store the categorical data in a dataframe called Did_Police_Officer_Attend_Scene_of_Accident_cat
accidents_cat = accidents[categorical]
# df_cat = df_cat.drop(['Did_Police_Officer_Attend_Scene_of_Accident'], axis=1) # Dropping the target column

In [251]:
# converting categorical indicators to dummy indicator variables
accidents_cat = pd.get_dummies(accidents_cat)
accidents_cat.head(5)

Unnamed: 0,Daytime_1,Daytime_2,Daytime_3,Daytime_4,Daytime_5,Speed_limit_-1,Speed_limit_20,Speed_limit_30,Speed_limit_40,Speed_limit_50,Speed_limit_60,Speed_limit_70,Urban_or_Rural_Area_1,Urban_or_Rural_Area_2,Local_Authority_(Highway)_E06000001,Local_Authority_(Highway)_E06000002,Local_Authority_(Highway)_E06000003,Local_Authority_(Highway)_E06000004,Local_Authority_(Highway)_E06000005,Local_Authority_(Highway)_E06000006,Local_Authority_(Highway)_E06000007,Local_Authority_(Highway)_E06000008,Local_Authority_(Highway)_E06000009,Local_Authority_(Highway)_E06000010,Local_Authority_(Highway)_E06000011,Local_Authority_(Highway)_E06000012,Local_Authority_(Highway)_E06000013,Local_Authority_(Highway)_E06000014,Local_Authority_(Highway)_E06000015,Local_Authority_(Highway)_E06000016,Local_Authority_(Highway)_E06000017,Local_Authority_(Highway)_E06000018,Local_Authority_(Highway)_E06000019,Local_Authority_(Highway)_E06000020,Local_Authority_(Highway)_E06000021,Local_Authority_(Highway)_E06000022,Local_Authority_(Highway)_E06000023,Local_Authority_(Highway)_E06000024,Local_Authority_(Highway)_E06000025,Local_Authority_(Highway)_E06000026,Local_Authority_(Highway)_E06000027,Local_Authority_(Highway)_E06000028,Local_Authority_(Highway)_E06000029,Local_Authority_(Highway)_E06000030,Local_Authority_(Highway)_E06000031,Local_Authority_(Highway)_E06000032,Local_Authority_(Highway)_E06000033,Local_Authority_(Highway)_E06000034,Local_Authority_(Highway)_E06000035,Local_Authority_(Highway)_E06000036,Local_Authority_(Highway)_E06000037,Local_Authority_(Highway)_E06000038,Local_Authority_(Highway)_E06000039,Local_Authority_(Highway)_E06000040,Local_Authority_(Highway)_E06000041,Local_Authority_(Highway)_E06000042,Local_Authority_(Highway)_E06000043,Local_Authority_(Highway)_E06000044,Local_Authority_(Highway)_E06000045,Local_Authority_(Highway)_E06000046,Local_Authority_(Highway)_E06000047,Local_Authority_(Highway)_E06000048,Local_Authority_(Highway)_E06000049,Local_Authority_(Highway)_E06000050,Local_Authority_(Highway)_E06000051,Local_Authority_(Highway)_E06000052,Local_Authority_(Highway)_E06000053,Local_Authority_(Highway)_E06000054,Local_Authority_(Highway)_E06000055,Local_Authority_(Highway)_E06000056,Local_Authority_(Highway)_E08000001,Local_Authority_(Highway)_E08000002,Local_Authority_(Highway)_E08000003,Local_Authority_(Highway)_E08000004,Local_Authority_(Highway)_E08000005,Local_Authority_(Highway)_E08000006,Local_Authority_(Highway)_E08000007,Local_Authority_(Highway)_E08000008,Local_Authority_(Highway)_E08000009,Local_Authority_(Highway)_E08000010,Local_Authority_(Highway)_E08000011,Local_Authority_(Highway)_E08000012,Local_Authority_(Highway)_E08000013,Local_Authority_(Highway)_E08000014,Local_Authority_(Highway)_E08000015,Local_Authority_(Highway)_E08000016,Local_Authority_(Highway)_E08000017,Local_Authority_(Highway)_E08000018,Local_Authority_(Highway)_E08000019,Local_Authority_(Highway)_E08000020,Local_Authority_(Highway)_E08000021,Local_Authority_(Highway)_E08000022,Local_Authority_(Highway)_E08000023,Local_Authority_(Highway)_E08000024,Local_Authority_(Highway)_E08000025,Local_Authority_(Highway)_E08000026,Local_Authority_(Highway)_E08000027,Local_Authority_(Highway)_E08000028,Local_Authority_(Highway)_E08000029,Local_Authority_(Highway)_E08000030,Local_Authority_(Highway)_E08000031,Local_Authority_(Highway)_E08000032,Local_Authority_(Highway)_E08000033,Local_Authority_(Highway)_E08000034,Local_Authority_(Highway)_E08000035,Local_Authority_(Highway)_E08000036,Local_Authority_(Highway)_E09000001,Local_Authority_(Highway)_E09000002,Local_Authority_(Highway)_E09000003,Local_Authority_(Highway)_E09000004,Local_Authority_(Highway)_E09000005,Local_Authority_(Highway)_E09000006,Local_Authority_(Highway)_E09000007,Local_Authority_(Highway)_E09000008,Local_Authority_(Highway)_E09000009,Local_Authority_(Highway)_E09000010,Local_Authority_(Highway)_E09000011,Local_Authority_(Highway)_E09000012,Local_Authority_(Highway)_E09000013,Local_Authority_(Highway)_E09000014,Local_Authority_(Highway)_E09000015,Local_Authority_(Highway)_E09000016,Local_Authority_(Highway)_E09000017,Local_Authority_(Highway)_E09000018,Local_Authority_(Highway)_E09000019,Local_Authority_(Highway)_E09000020,Local_Authority_(Highway)_E09000021,Local_Authority_(Highway)_E09000022,Local_Authority_(Highway)_E09000023,Local_Authority_(Highway)_E09000024,Local_Authority_(Highway)_E09000025,Local_Authority_(Highway)_E09000026,Local_Authority_(Highway)_E09000027,Local_Authority_(Highway)_E09000028,Local_Authority_(Highway)_E09000029,Local_Authority_(Highway)_E09000030,Local_Authority_(Highway)_E09000031,Local_Authority_(Highway)_E09000032,Local_Authority_(Highway)_E09000033,Local_Authority_(Highway)_E10000002,Local_Authority_(Highway)_E10000003,Local_Authority_(Highway)_E10000006,Local_Authority_(Highway)_E10000007,Local_Authority_(Highway)_E10000008,Local_Authority_(Highway)_E10000009,Local_Authority_(Highway)_E10000011,Local_Authority_(Highway)_E10000012,Local_Authority_(Highway)_E10000013,Local_Authority_(Highway)_E10000014,Local_Authority_(Highway)_E10000015,Local_Authority_(Highway)_E10000016,Local_Authority_(Highway)_E10000017,Local_Authority_(Highway)_E10000018,Local_Authority_(Highway)_E10000019,Local_Authority_(Highway)_E10000020,Local_Authority_(Highway)_E10000021,Local_Authority_(Highway)_E10000023,Local_Authority_(Highway)_E10000024,Local_Authority_(Highway)_E10000025,Local_Authority_(Highway)_E10000027,Local_Authority_(Highway)_E10000028,Local_Authority_(Highway)_E10000029,Local_Authority_(Highway)_E10000030,Local_Authority_(Highway)_E10000031,Local_Authority_(Highway)_E10000032,Local_Authority_(Highway)_E10000034,Local_Authority_(Highway)_EHEATHROW,Local_Authority_(Highway)_W06000001,Local_Authority_(Highway)_W06000002,Local_Authority_(Highway)_W06000003,Local_Authority_(Highway)_W06000004,Local_Authority_(Highway)_W06000005,Local_Authority_(Highway)_W06000006,Local_Authority_(Highway)_W06000008,Local_Authority_(Highway)_W06000009,Local_Authority_(Highway)_W06000010,Local_Authority_(Highway)_W06000011,Local_Authority_(Highway)_W06000012,Local_Authority_(Highway)_W06000013,Local_Authority_(Highway)_W06000014,Local_Authority_(Highway)_W06000015,Local_Authority_(Highway)_W06000016,Local_Authority_(Highway)_W06000018,Local_Authority_(Highway)_W06000019,Local_Authority_(Highway)_W06000020,Local_Authority_(Highway)_W06000021,Local_Authority_(Highway)_W06000022,Local_Authority_(Highway)_W06000023,Local_Authority_(Highway)_W06000024
0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [252]:
# Store the numerical features to a dataframe Did_Police_Officer_Attend_Scene_of_Accident_num
accidents_num = accidents[numerical]

In [253]:
# Concat the two dataframes together columnwise
# accidents_final = pd.concat([df_num, df_cat], axis=1)
accidents_model = pd.concat([accidents_num], axis=1)

In [254]:
# removing Accident ids
accidents_model.reset_index(drop=True, inplace=True)

In [255]:
accidents_model

Unnamed: 0,Accident_Severity,Number_of_Vehicles,Number_of_Casualties,Day_of_Week,Road_Type,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Special_Conditions_at_Site,Carriageway_Hazards,Did_Police_Officer_Attend_Scene_of_Accident
0,3,2,3,2,1,0,5,1,1,1,0,0,3
1,3,2,1,3,2,-1,-1,4,1,1,0,0,3
2,3,2,1,3,6,0,0,4,1,1,0,0,1
3,2,1,1,3,6,0,0,4,1,1,0,0,1
4,3,2,2,3,6,0,0,4,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
111755,1,3,2,7,6,0,0,1,1,1,0,0,1
111756,1,3,2,1,6,0,0,1,1,1,0,0,1
111757,1,1,1,1,6,0,0,4,2,2,0,0,1
111758,1,1,2,7,6,0,0,1,1,1,0,0,1


In [256]:
# define our features 
features = accidents_model.drop(['Did_Police_Officer_Attend_Scene_of_Accident'], axis=1)

# define our target
target = accidents_model[['Did_Police_Officer_Attend_Scene_of_Accident']]

In [257]:
features.columns

Index(['Accident_Severity', 'Number_of_Vehicles', 'Number_of_Casualties',
       'Day_of_Week', 'Road_Type', 'Pedestrian_Crossing-Human_Control',
       'Pedestrian_Crossing-Physical_Facilities', 'Light_Conditions',
       'Weather_Conditions', 'Road_Surface_Conditions',
       'Special_Conditions_at_Site', 'Carriageway_Hazards'],
      dtype='object')

In [258]:
# Observing otentially imbalanced classes
accidents_model['Did_Police_Officer_Attend_Scene_of_Accident'].value_counts(normalize=True)

1    0.691813
2    0.219488
3    0.088699
Name: Did_Police_Officer_Attend_Scene_of_Accident, dtype: float64

In [259]:
# Split data into train and test sets as well as for validation and testing
# X_train, X_test, y_train, y_test
train, test, target_train, target_val = train_test_split(features, 
                                                         target, 
                                                         train_size= 0.80,
                                                         random_state=0);
#train, test, target_train, target_val = StratifiedShuffleSplit(features, target, random_state=0);

In [260]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 89408 entries, 1846 to 68268
Data columns (total 12 columns):
 #   Column                                   Non-Null Count  Dtype   
---  ------                                   --------------  -----   
 0   Accident_Severity                        89408 non-null  category
 1   Number_of_Vehicles                       89408 non-null  int64   
 2   Number_of_Casualties                     89408 non-null  int64   
 3   Day_of_Week                              89408 non-null  int64   
 4   Road_Type                                89408 non-null  int64   
 5   Pedestrian_Crossing-Human_Control        89408 non-null  int64   
 6   Pedestrian_Crossing-Physical_Facilities  89408 non-null  int64   
 7   Light_Conditions                         89408 non-null  int64   
 8   Weather_Conditions                       89408 non-null  int64   
 9   Road_Surface_Conditions                  89408 non-null  int64   
 10  Special_Conditions_at_Site     

In [261]:
# AutoML as a starting point
automl = AutoML()
automl.fit(train, target_train)
#automl.fit(smote_train, smote_target)

Linear algorithm was disabled.
AutoML directory: AutoML_11
The task is multiclass_classification with evaluation metric logloss
AutoML will use algorithms: ['Baseline', 'Decision Tree', 'Random Forest', 'Xgboost', 'Neural Network']
AutoML will ensemble availabe models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'ensemble']
* Step simple_algorithms will try to check up to 2 models
1_Baseline logloss 0.803537 trained in 2.14 seconds
2_DecisionTree logloss 0.697166 trained in 28.08 seconds
* Step default_algorithms will try to check up to 3 models
3_Default_Xgboost logloss 0.627087 trained in 30.8 seconds
4_Default_NeuralNetwork logloss 0.649007 trained in 39.25 seconds
5_Default_RandomForest logloss 0.680462 trained in 13.81 seconds
* Step ensemble will try to check up to 1 model
Ensemble logloss 0.627087 trained in 0.94 seconds
AutoML fit time: 121.53 seconds
AutoML best model: 3_Default_Xgboost


AutoML()

In [262]:
#generating AutoML predictions
predictions = automl.predict(test)

In [263]:
# potential need to deal with imbalanced class
oversampler = SMOTE(random_state=0)
smote_train, smote_target = oversampler.fit_resample(train,target_train)

In [264]:
seed = 0   # We set our random seed to zero for reproducibility
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 1000,
#   'warm_start': True, 
    'max_features': 0.3,
    'max_depth': 4,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'random_state' : seed,
    'verbose': 0,
#    'criterion': 'entropy', ## using Entropy
    'class_weight':'balanced' # alternative strategy to SMOTE
}

In [265]:
rf = RandomForestClassifier(**rf_params)

In [266]:
rf.fit(train,target_train)
#rf.fit(smote_train,smote_target)
print("Fitting of Random Forest finished")

Fitting of Random Forest finished


In [267]:
rf_predictions = rf.predict(test)
print("Predictions finished")

Predictions finished


In [268]:
print("Accuracy score: {}".format(accuracy_score(target_val, rf_predictions)))
print("="*80)
print(classification_report(target_val, rf_predictions))

Accuracy score: 0.5267984967788117
              precision    recall  f1-score   support

           1       0.84      0.47      0.60     15525
           2       0.28      0.71      0.40      4857
           3       0.83      0.53      0.64      1970

    accuracy                           0.53     22352
   macro avg       0.65      0.57      0.55     22352
weighted avg       0.72      0.53      0.56     22352



In [269]:
# cross-validation with F1 score (more appropriate to imbalanced classes)
cross_val_score(rf, train,target_train, scoring='f1_macro', n_jobs=-1)

array([0.55159095, 0.55122112, 0.55186777, 0.54072174, 0.55219586])

In [270]:
# Scatter plot 
trace = go.Scatter(
    y = rf.feature_importances_,
    x = accidents_model.columns.values,
    mode='markers',
    marker=dict(
        sizemode = 'diameter',
        sizeref = 1,
        size = 13,
        #size= rf.feature_importances_,
        #color = np.random.randn(500), #set color equal to a variable
        color = rf.feature_importances_,
        colorscale='Portland',
        showscale=True
    ),
    text = accidents_model.columns.values
)
data = [trace]

layout= go.Layout(
    autosize= True,
    title= 'Random Forest Feature Importance',
    hovermode= 'closest',
     xaxis= dict(
         ticklen= 5,
         showgrid=False,
        zeroline=False,
        showline=False
     ),
    yaxis=dict(
        title= 'Feature Importance',
        showgrid=False,
        zeroline=False,
        ticklen= 5,
        gridwidth= 2
    ),
    showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter2010')

In [271]:
# (Ensemble learner) Gradient Boosting Parameters
gb_params ={
    'n_estimators': 1500,
    'max_features': 0.9,
    'learning_rate' : 0.25,
    'max_depth': 4,
    'min_samples_leaf': 2,
    'subsample': 1,
    'max_features' : 'sqrt',
    'random_state' : seed,
    'verbose': 0,
}

In [272]:
gb = GradientBoostingClassifier(**gb_params)

In [273]:
# Fit the model to our SMOTEd train and target
#gb.fit(smote_train, smote_target)
gb.fit(train,target_train)

GradientBoostingClassifier(learning_rate=0.25, max_depth=4, max_features='sqrt',
                           min_samples_leaf=2, n_estimators=1500,
                           random_state=0, subsample=1)

In [274]:
# Get our predictions
gb_predictions = gb.predict(test)
print("Predictions have finished")

Predictions have finished


In [275]:
print(accuracy_score(target_val, gb_predictions))
print(classification_report(target_val, gb_predictions))

0.747986757337151
              precision    recall  f1-score   support

           1       0.75      0.98      0.85     15525
           2       0.56      0.09      0.16      4857
           3       0.88      0.54      0.67      1970

    accuracy                           0.75     22352
   macro avg       0.73      0.54      0.56     22352
weighted avg       0.72      0.75      0.68     22352



In [276]:
# cross-validation with F1 score (more appropriate to imbalanced classes)
cross_val_score(gb, train,target_train, scoring='f1_macro', n_jobs=-1)

array([0.54681622, 0.55723821, 0.5495797 , 0.54759759, 0.54627858])

In [277]:
# Scatter plot 
trace = go.Scatter(
    y = gb.feature_importances_,
    x = accidents_model.columns.values,
    mode='markers',
    marker=dict(
        sizemode = 'diameter',
        sizeref = 1,
        size = 13,
        #size= rf.feature_importances_,
        #color = np.random.randn(500), #set color equal to a variable
        color = gb.feature_importances_,
        colorscale='Portland',
        showscale=True
    ),
    text = accidents_model.columns.values
)
data = [trace]

layout= go.Layout(
    autosize= True,
    title= 'Gradient Boosting Model Feature Importance',
    hovermode= 'closest',
     xaxis= dict(
         ticklen= 5,
         showgrid=False,
        zeroline=False,
        showline=False
     ),
    yaxis=dict(
        title= 'Feature Importance',
        showgrid=False,
        zeroline=False,
        ticklen= 5,
        gridwidth= 2
    ),
    showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter')