In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Import statements required for Plotly 
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, log_loss, classification_report)
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit

from supervised.automl import AutoML

from sklearn import tree
from IPython.display import Image as PImage
from subprocess import check_call
from PIL import Image, ImageDraw, ImageFont
import re

# Import and suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the accident records from CSV and preview the first rows.
pd.set_option('display.max_columns', None)  # never truncate columns when displaying frames
csv_path = "../dataset/Road Safety Data - Accidents 2019.csv"
df = pd.read_csv(csv_path, skipinitialspace=True, index_col=False)
df.head()

Unnamed: 0,Accident_Index,Location_Easting_OSGR,Location_Northing_OSGR,Longitude,Latitude,Police_Force,Accident_Severity,Number_of_Vehicles,Number_of_Casualties,Date,Day_of_Week,Time,Local_Authority_(District),Local_Authority_(Highway),1st_Road_Class,1st_Road_Number,Road_Type,Speed_limit,Junction_Detail,Junction_Control,2nd_Road_Class,2nd_Road_Number,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Special_Conditions_at_Site,Carriageway_Hazards,Urban_or_Rural_Area,Did_Police_Officer_Attend_Scene_of_Accident,LSOA_of_Accident_Location
0,2019010128300,528218.0,180407.0,-0.153842,51.508057,1,3,2,3,18/02/2019,2,17:50,1,E09000033,3,4202,1,30,1,2,3,4202,0,5,1,1,1,0,0,1,3,E01004762
1,2019010152270,530219.0,172463.0,-0.127949,51.436208,1,3,2,1,15/01/2019,3,21:45,9,E09000022,3,23,2,30,0,-1,-1,0,-1,-1,4,1,1,0,0,1,3,E01003117
2,2019010155191,530222.0,182543.0,-0.124193,51.526795,1,3,2,1,01/01/2019,3,01:50,2,E09000007,4,504,6,30,3,4,6,0,0,0,4,1,1,0,0,1,1,E01000943
3,2019010155192,525531.0,184605.0,-0.191044,51.546387,1,2,1,1,01/01/2019,3,01:20,2,E09000007,4,510,6,20,3,4,4,510,0,0,4,1,1,0,0,1,1,E01000973
4,2019010155194,524920.0,184004.0,-0.200064,51.541121,1,3,2,2,01/01/2019,3,00:40,28,E09000005,3,4003,6,30,6,4,6,0,0,0,4,1,1,0,0,1,1,E01000546


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Location_Easting_OSGR,Location_Northing_OSGR,Longitude,Latitude,Police_Force,Accident_Severity,Number_of_Vehicles,Number_of_Casualties,Day_of_Week,Local_Authority_(District),1st_Road_Class,1st_Road_Number,Road_Type,Speed_limit,Junction_Detail,Junction_Control,2nd_Road_Class,2nd_Road_Number,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Special_Conditions_at_Site,Carriageway_Hazards,Urban_or_Rural_Area,Did_Police_Officer_Attend_Scene_of_Accident
count,117508.0,117508.0,117508.0,117508.0,117536.0,117536.0,117536.0,117536.0,117536.0,117536.0,117536.0,117536.0,117536.0,117536.0,117536.0,117536.0,117536.0,117536.0,117536.0,117536.0,117536.0,117536.0,117536.0,117536.0,117536.0,117536.0,117536.0
mean,452403.224717,278685.7,-1.249439,52.395427,28.22816,2.772512,1.840976,1.303073,4.111149,326.686522,4.192052,864.461365,5.213364,36.773414,2.299704,1.532245,2.618134,306.326121,-0.005717,0.86234,2.046888,1.653102,1.291945,0.074939,0.044318,1.323569,1.392807
std,95033.024831,150935.6,1.392356,1.359522,24.622519,0.45161,0.708412,0.754369,1.920117,256.494631,1.462743,2223.470576,1.676701,14.066137,2.684351,2.35231,3.210643,1160.683698,0.277764,1.996284,1.731131,1.809202,0.598592,0.663085,0.528075,0.468349,0.638504
min,64084.0,10814.0,-7.525273,49.91776,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,1.0
25%,388018.25,175526.0,-2.180605,51.465934,5.0,3.0,1.0,1.0,2.0,84.0,3.0,0.0,6.0,30.0,0.0,-1.0,-1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
50%,459396.0,218465.0,-1.123331,51.852523,23.0,3.0,2.0,1.0,4.0,305.0,4.0,45.0,6.0,30.0,1.0,2.0,3.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
75%,529373.25,384804.0,-0.135499,53.357497,45.0,3.0,2.0,1.0,6.0,514.0,6.0,603.0,6.0,40.0,3.0,4.0,6.0,0.0,0.0,0.0,4.0,1.0,2.0,0.0,0.0,2.0,2.0
max,655244.0,1167366.0,1.757476,60.388637,98.0,3.0,17.0,52.0,7.0,941.0,6.0,498878.0,9.0,70.0,9.0,4.0,6.0,9899.0,2.0,8.0,7.0,9.0,5.0,7.0,7.0,3.0,3.0


In [4]:
# List the column names of the loaded frame.
df.columns

Index(['Accident_Index', 'Location_Easting_OSGR', 'Location_Northing_OSGR',
       'Longitude', 'Latitude', 'Police_Force', 'Accident_Severity',
       'Number_of_Vehicles', 'Number_of_Casualties', 'Date', 'Day_of_Week',
       'Time', 'Local_Authority_(District)', 'Local_Authority_(Highway)',
       '1st_Road_Class', '1st_Road_Number', 'Road_Type', 'Speed_limit',
       'Junction_Detail', 'Junction_Control', '2nd_Road_Class',
       '2nd_Road_Number', 'Pedestrian_Crossing-Human_Control',
       'Pedestrian_Crossing-Physical_Facilities', 'Light_Conditions',
       'Weather_Conditions', 'Road_Surface_Conditions',
       'Special_Conditions_at_Site', 'Carriageway_Hazards',
       'Urban_or_Rural_Area', 'Did_Police_Officer_Attend_Scene_of_Accident',
       'LSOA_of_Accident_Location'],
      dtype='object')

In [5]:
# Check for missing values in the dataset.
# Per-column count of missing (NaN) entries; 0 means the column has no missing values.
df.isnull().sum()

Accident_Index                                    0
Location_Easting_OSGR                            28
Location_Northing_OSGR                           28
Longitude                                        28
Latitude                                         28
Police_Force                                      0
Accident_Severity                                 0
Number_of_Vehicles                                0
Number_of_Casualties                              0
Date                                              0
Day_of_Week                                       0
Time                                             63
Local_Authority_(District)                        0
Local_Authority_(Highway)                         0
1st_Road_Class                                    0
1st_Road_Number                                   0
Road_Type                                         0
Speed_limit                                       0
Junction_Detail                                   0
Junction_Con

In [6]:
# Linearly interpolate missing values in the numeric columns only (e.g. the
# coordinate fields). Object-dtype columns such as "Time" cannot be
# interpolated, so their NaN values are left untouched; restricting the call
# to numeric columns also avoids the object-dtype deprecation in newer pandas.
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = df[numeric_cols].interpolate()

In [7]:
# Dropping the records with missing column in data dataframe.
#df = df[~df['LSOA_of_Accident_Location'].isnull()].copy()

# Checking the missing values in the dataset.
#df.isnull().sum()

In [8]:
# Re-count missing values per column after the interpolation step.
df.isnull().sum()

Accident_Index                                    0
Location_Easting_OSGR                             0
Location_Northing_OSGR                            0
Longitude                                         0
Latitude                                          0
Police_Force                                      0
Accident_Severity                                 0
Number_of_Vehicles                                0
Number_of_Casualties                              0
Date                                              0
Day_of_Week                                       0
Time                                             63
Local_Authority_(District)                        0
Local_Authority_(Highway)                         0
1st_Road_Class                                    0
1st_Road_Number                                   0
Road_Type                                         0
Speed_limit                                       0
Junction_Detail                                   0
Junction_Con

In [9]:
# Collect the integer and floating-point columns and show their names.
numerics = [
    f'{kind}{bits}'
    for kind in ('int', 'float')
    for bits in (16, 32, 64)
]  # -> int16, int32, int64, float16, float32, float64
df_num = df.select_dtypes(include=numerics)
df_num.columns

Index(['Location_Easting_OSGR', 'Location_Northing_OSGR', 'Longitude',
       'Latitude', 'Police_Force', 'Accident_Severity', 'Number_of_Vehicles',
       'Number_of_Casualties', 'Day_of_Week', 'Local_Authority_(District)',
       '1st_Road_Class', '1st_Road_Number', 'Road_Type', 'Speed_limit',
       'Junction_Detail', 'Junction_Control', '2nd_Road_Class',
       '2nd_Road_Number', 'Pedestrian_Crossing-Human_Control',
       'Pedestrian_Crossing-Physical_Facilities', 'Light_Conditions',
       'Weather_Conditions', 'Road_Surface_Conditions',
       'Special_Conditions_at_Site', 'Carriageway_Hazards',
       'Urban_or_Rural_Area', 'Did_Police_Officer_Attend_Scene_of_Accident'],
      dtype='object')

In [10]:
## Plotting the KDEplots
#f, axes = plt.subplots(3, 3, figsize=(10, 8), 
#                       sharex=False, sharey=False)

## Defining our colormap scheme
#s = np.linspace(0, 3, 10)
#cmap = sns.cubehelix_palette(start=0.0, light=1, as_cmap=True)

## Generate and plot
#x = df['Local_Authority_(District)'].values
#y = df['Urban_or_Rural_Area'].values
#sns.kdeplot(x, y, cmap=cmap, shade=True, cut=5, ax=axes[0,0])
#axes[0,0].set( title = 'Local Authority vs. Urban/Rural Area')

#f.tight_layout()

In [11]:
# Keep a copy of the target column under a separate name. The target is
# already integer-coded in the raw data, so no mapping is applied. The values
# are assigned as a plain 1-D array (a column needs no (n, 1) reshape).
df['Did_Police_Officer_Attend_Scene_of_Accident_numerical'] = (
    df['Did_Police_Officer_Attend_Scene_of_Accident'].to_numpy()
)

In [12]:
# Inspect the copied target column.
df['Did_Police_Officer_Attend_Scene_of_Accident_numerical']

0         3
1         3
2         1
3         1
4         1
         ..
117531    1
117532    1
117533    2
117534    2
117535    1
Name: Did_Police_Officer_Attend_Scene_of_Accident_numerical, Length: 117536, dtype: int64

In [13]:
# Numeric columns to include in the correlation heatmap
# (Longitude and Latitude are excluded).
numerical = [
    'Location_Easting_OSGR',
    'Location_Northing_OSGR',
    'Police_Force',
    'Accident_Severity',
    'Number_of_Vehicles',
    'Number_of_Casualties',
    'Day_of_Week',
    'Local_Authority_(District)',
    '1st_Road_Class',
    '1st_Road_Number',
    'Road_Type',
    'Speed_limit',
    'Junction_Detail',
    'Junction_Control',
    '2nd_Road_Class',
    '2nd_Road_Number',
    'Pedestrian_Crossing-Human_Control',
    'Pedestrian_Crossing-Physical_Facilities',
    'Light_Conditions',
    'Weather_Conditions',
    'Road_Surface_Conditions',
    'Special_Conditions_at_Site',
    'Carriageway_Hazards',
    'Urban_or_Rural_Area',
    'Did_Police_Officer_Attend_Scene_of_Accident',
]

# Compute the Pearson correlation matrix once and reuse it for both the
# cell values and the axis labels.
corr_matrix = df[numerical].astype(float).corr()
axis_labels = corr_matrix.columns.values

heatmap = go.Heatmap(
    z=corr_matrix.values,
    x=axis_labels,
    y=axis_labels,
    colorscale='Viridis',
    reversescale=False,
    opacity=1.0,
)
data = [heatmap]

layout = go.Layout(
    title='Pearson Correlation of numerical features',
    xaxis=dict(ticks='', nticks=36),
    yaxis=dict(ticks=''),
    width=900,
    height=700,
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='labelled-heatmap')

In [14]:
# Final numeric columns kept for model learning. Columns judged highly
# correlated are left out: the location/coordinate fields, Police_Force,
# Local_Authority_(District), Speed_limit, Junction_Detail, Junction_Control,
# 2nd_Road_Class and Urban_or_Rural_Area.
# The last entry is the target column; it is separated from the features later.
numerical = [
    'Accident_Severity',
    'Number_of_Vehicles',
    'Number_of_Casualties',
    'Day_of_Week',
    '1st_Road_Class',
    '1st_Road_Number',
    'Road_Type',
    '2nd_Road_Number',
    'Pedestrian_Crossing-Human_Control',
    'Pedestrian_Crossing-Physical_Facilities',
    'Light_Conditions',
    'Weather_Conditions',
    'Road_Surface_Conditions',
    'Special_Conditions_at_Site',
    'Carriageway_Hazards',
    'Did_Police_Officer_Attend_Scene_of_Accident',
]

#g = sns.pairplot(df[numerical], hue='Did_Police_Officer_Attend_Scene_of_Accident', palette='seismic', diag_kind = 'kde',diag_kws=dict(shade=True))
#g.set(xticklabels=[])

In [15]:
# Remove the temporary '..._numerical' copy of the target column; the original
# target column stays in df. errors='ignore' keeps this cell re-runnable once
# the column has already been dropped.
df = df.drop(
    columns=['Did_Police_Officer_Attend_Scene_of_Accident_numerical'],
    errors='ignore',
)

# Names of the columns holding non-numeric (object dtype) data.
# DataFrame.items() is used because iteritems() was removed in pandas 2.0.
categorical = [col for col, values in df.items() if values.dtype == 'object']

In [16]:
# Show the names of the object-dtype columns collected above.
categorical

['Accident_Index',
 'Date',
 'Time',
 'Local_Authority_(Highway)',
 'LSOA_of_Accident_Location']

In [17]:
# Show the numeric column names selected for modelling (target column included).
numerical

['Accident_Severity',
 'Number_of_Vehicles',
 'Number_of_Casualties',
 'Day_of_Week',
 '1st_Road_Class',
 '1st_Road_Number',
 'Road_Type',
 '2nd_Road_Number',
 'Pedestrian_Crossing-Human_Control',
 'Pedestrian_Crossing-Physical_Facilities',
 'Light_Conditions',
 'Weather_Conditions',
 'Road_Surface_Conditions',
 'Special_Conditions_at_Site',
 'Carriageway_Hazards',
 'Did_Police_Officer_Attend_Scene_of_Accident']

In [18]:
# Store the object-dtype (categorical) columns in a dataframe called df_cat.
df_cat = df[categorical]
# The target column is numeric, so it is not part of df_cat and needs no drop here:
# df_cat = df_cat.drop(['Did_Police_Officer_Attend_Scene_of_Accident'], axis=1) # Dropping the target column

In [19]:
# Convert the categorical columns to one-hot (dummy) indicator variables.
# NOTE(review): df_cat contains identifier-like columns (e.g. Accident_Index),
# so this can produce a very wide frame; the result is not used by the
# df_final built in this notebook's visible cells -- confirm it is still needed.
df_cat = pd.get_dummies(df_cat)
#df_cat.head(5)

In [20]:
# Store the selected numeric columns (which include the target column) in df_num.
df_num = df[numerical]

In [21]:
# Assemble the modelling frame column-wise. Only the numeric block is used at
# present; append df_cat to the list to include the one-hot encoded columns.
model_frames = [df_num]
df_final = pd.concat(model_frames, axis=1)

In [22]:
# Renumber the rows 0..n-1 and discard the previous index.
df_final = df_final.reset_index(drop=True)

In [23]:
# Display the assembled modelling frame.
df_final

Unnamed: 0,Accident_Severity,Number_of_Vehicles,Number_of_Casualties,Day_of_Week,1st_Road_Class,1st_Road_Number,Road_Type,2nd_Road_Number,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Special_Conditions_at_Site,Carriageway_Hazards,Did_Police_Officer_Attend_Scene_of_Accident
0,3,2,3,2,3,4202,1,4202,0,5,1,1,1,0,0,3
1,3,2,1,3,3,23,2,0,-1,-1,4,1,1,0,0,3
2,3,2,1,3,4,504,6,0,0,0,4,1,1,0,0,1
3,2,1,1,3,4,510,6,510,0,0,4,1,1,0,0,1
4,3,2,2,3,3,4003,6,0,0,0,4,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117531,3,1,1,7,4,725,6,0,0,0,1,1,2,0,0,1
117532,3,4,1,5,3,7,6,0,0,0,1,1,2,0,0,1
117533,3,2,1,6,4,723,6,721,0,0,1,1,1,0,0,2
117534,3,1,1,7,6,710,6,723,0,0,1,1,1,0,0,2


In [24]:
# Pull the target column out of the full frame.
# Note: `target` is reassigned later from df_final when features and target are split.
target = df['Did_Police_Officer_Attend_Scene_of_Accident']

In [25]:
# Bar chart of the number of records per target class, to reveal any imbalance.
class_counts = df['Did_Police_Officer_Attend_Scene_of_Accident'].value_counts()
data = [go.Bar(x=class_counts.index.values, y=class_counts.values)]

py.iplot(data, filename='basic-bar')

In [26]:
# Display the modelling frame again before splitting features and target.
df_final

Unnamed: 0,Accident_Severity,Number_of_Vehicles,Number_of_Casualties,Day_of_Week,1st_Road_Class,1st_Road_Number,Road_Type,2nd_Road_Number,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Special_Conditions_at_Site,Carriageway_Hazards,Did_Police_Officer_Attend_Scene_of_Accident
0,3,2,3,2,3,4202,1,4202,0,5,1,1,1,0,0,3
1,3,2,1,3,3,23,2,0,-1,-1,4,1,1,0,0,3
2,3,2,1,3,4,504,6,0,0,0,4,1,1,0,0,1
3,2,1,1,3,4,510,6,510,0,0,4,1,1,0,0,1
4,3,2,2,3,3,4003,6,0,0,0,4,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117531,3,1,1,7,4,725,6,0,0,0,1,1,2,0,0,1
117532,3,4,1,5,3,7,6,0,0,0,1,1,2,0,0,1
117533,3,2,1,6,4,723,6,721,0,0,1,1,1,0,0,2
117534,3,1,1,7,6,710,6,723,0,0,1,1,1,0,0,2


In [27]:
# Separate the modelling frame into predictors and label.
target_column = 'Did_Police_Officer_Attend_Scene_of_Accident'

# Predictors: every column except the label.
features = df_final.drop(columns=[target_column])

# Label, kept as a single-column DataFrame (note the double brackets).
target = df_final[[target_column]]

In [28]:
# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
# Returned order: train features, held-out features, train labels, held-out labels.
split_kwargs = dict(train_size=0.80, random_state=0)
train, test, target_train, target_val = train_test_split(features, target, **split_kwargs)

In [29]:
# Preview the first rows of the training features.
train.head()

Unnamed: 0,Accident_Severity,Number_of_Vehicles,Number_of_Casualties,Day_of_Week,1st_Road_Class,1st_Road_Number,Road_Type,2nd_Road_Number,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Special_Conditions_at_Site,Carriageway_Hazards
65077,3,2,1,6,4,1188,6,0,0,0,4,1,1,0,0
41534,3,2,1,5,3,629,6,6116,0,0,1,1,1,0,0
49050,3,2,1,2,3,165,3,0,0,0,1,1,2,0,0
66084,2,4,1,6,3,563,6,0,0,0,4,1,1,0,0
107010,3,4,4,5,4,3369,6,0,0,0,1,1,1,0,0


In [30]:
# Column dtypes and non-null counts of the training features.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 94028 entries, 65077 to 68268
Data columns (total 15 columns):
 #   Column                                   Non-Null Count  Dtype
---  ------                                   --------------  -----
 0   Accident_Severity                        94028 non-null  int64
 1   Number_of_Vehicles                       94028 non-null  int64
 2   Number_of_Casualties                     94028 non-null  int64
 3   Day_of_Week                              94028 non-null  int64
 4   1st_Road_Class                           94028 non-null  int64
 5   1st_Road_Number                          94028 non-null  int64
 6   Road_Type                                94028 non-null  int64
 7   2nd_Road_Number                          94028 non-null  int64
 8   Pedestrian_Crossing-Human_Control        94028 non-null  int64
 9   Pedestrian_Crossing-Physical_Facilities  94028 non-null  int64
 10  Light_Conditions                         94028 non-null  int64
 11

In [31]:
# Oversample the training split with SMOTE to counter class imbalance.
# The seed is fixed so the synthetic samples are repeatable.
oversampler = SMOTE(random_state=0)
resampled = oversampler.fit_resample(train, target_train)
smote_train, smote_target = resampled

In [32]:
# Separate 75/25 split used for the AutoML run. random_state is fixed so the
# split (and therefore the reported scores) is reproducible across runs,
# consistent with the other seeded steps in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.25, random_state=0
)

In [33]:
# Create an AutoML instance with its default settings and fit it on the
# training split; fit() prints a progress log of the models it tries.
automl = AutoML()
automl.fit(X_train, y_train)

Linear algorithm was disabled.
AutoML directory: AutoML_7
The task is multiclass_classification with evaluation metric logloss
AutoML will use algorithms: ['Baseline', 'Decision Tree', 'Random Forest', 'Xgboost', 'Neural Network']
AutoML will ensemble availabe models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'ensemble']
* Step simple_algorithms will try to check up to 2 models
1_Baseline logloss 0.799602 trained in 1.22 seconds
2_DecisionTree logloss 0.692741 trained in 23.93 seconds
* Step default_algorithms will try to check up to 3 models
3_Default_Xgboost logloss 0.572938 trained in 50.0 seconds
4_Default_NeuralNetwork logloss 0.643351 trained in 55.05 seconds
5_Default_RandomForest logloss 0.673092 trained in 12.29 seconds
* Step ensemble will try to check up to 1 model
Ensemble logloss 0.572938 trained in 0.71 seconds
AutoML fit time: 149.77 seconds
AutoML best model: 3_Default_Xgboost


AutoML()

In [34]:
# Generate predictions for the held-out X_test split.
predictions = automl.predict(X_test)