# Import Packages

In [95]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn as sk
from scipy import stats
from sklearn import preprocessing
import random
import scipy as sp
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from scipy import stats


%matplotlib inline

# Import data into Dataframe

In [None]:
coldata = pd.read_csv("https://s3.us.cloud-object-storage.appdomain.cloud/cf-courses-data/CognitiveClass/DP0701EN/version-2/Data-Collisions.csv")

# Dataframe Overview

In [52]:
coldata.columns
coldata.head(5)

Index(['SEVERITYCODE', 'X', 'Y', 'OBJECTID', 'INCKEY', 'COLDETKEY', 'REPORTNO',
       'STATUS', 'ADDRTYPE', 'INTKEY', 'LOCATION', 'EXCEPTRSNCODE',
       'EXCEPTRSNDESC', 'SEVERITYCODE.1', 'SEVERITYDESC', 'COLLISIONTYPE',
       'PERSONCOUNT', 'PEDCOUNT', 'PEDCYLCOUNT', 'VEHCOUNT', 'INCDATE',
       'INCDTTM', 'JUNCTIONTYPE', 'SDOT_COLCODE', 'SDOT_COLDESC',
       'INATTENTIONIND', 'UNDERINFL', 'WEATHER', 'ROADCOND', 'LIGHTCOND',
       'PEDROWNOTGRNT', 'SDOTCOLNUM', 'SPEEDING', 'ST_COLCODE', 'ST_COLDESC',
       'SEGLANEKEY', 'CROSSWALKKEY', 'HITPARKEDCAR'],
      dtype='object')

In [53]:
coldata.shape

(194673, 38)

This dataset breaks down severity into two categories: 1 meaning that there was no injury sustained in the accident, just property damage; 2 meaning that there was an injury sustained. This dataset also contains data regarding the location of the accident, description of the severity, amount of persons involved in the accident, amount of vehicles involved in the accident, the type of junction in which the accident occurred, whether the driver was under the influence, weather, road condition, light condition, and whether the driver was speeding.

This model is mainly concerned with the data surrounding accident severity, weather conditions at the time of the accident, road conditions at the time of the accident, and light conditions.

# Data Preprocessing and Cleansing

I will first adjust the severity code in the dataset to 0/1 from 1/2.

In [54]:
severity = coldata['SEVERITYCODE'].values

sevlabel = preprocessing.LabelEncoder()
sevlabel.fit([1,2])
severity = sevlabel.transform(severity)
coldata["SEVERITYCODE"] = severity

Next, I will change the weather conditions to numeric values and condense similar conditions.
 - 0 = Clear
 - 1 = Cloudy
 - 2 = Windy
 - 3 = Foggy
 - 4 = Precipitation

In [55]:
coldata["WEATHER"].replace("Clear", 0, inplace=True)
coldata["WEATHER"].replace("Partly Cloudy", 1, inplace=True)
coldata["WEATHER"].replace("Overcast", 1, inplace=True)
coldata["WEATHER"].replace("Severe Crosswind", 2, inplace=True)
coldata["WEATHER"].replace("Blowing Sand/Dirt", 2, inplace=True)
coldata["WEATHER"].replace("Fog/Smog/Smoke", 3, inplace=True)
coldata["WEATHER"].replace("Sleet/Hail/Freezing Rain", 4, inplace=True)
coldata["WEATHER"].replace("Raining", 4, inplace=True)
coldata["WEATHER"].replace("Snowing", 4, inplace=True)
coldata["WEATHER"].replace("Other", "Unknown", inplace=True)

Next, I will change the light conditions to numeric values and condense similar conditions.
 - 0 = Bright
 - 1 = Dim
 - 2 = Dark

In [56]:
coldata["LIGHTCOND"].replace("Daylight", 0, inplace=True)
coldata["LIGHTCOND"].replace("Dawn", 1, inplace=True)
coldata["LIGHTCOND"].replace("Dusk", 1, inplace=True)
coldata["LIGHTCOND"].replace("Dark - Street Lights On", 1, inplace=True)
coldata["LIGHTCOND"].replace("Dark - Street Lights Off", 2, inplace=True)
coldata["LIGHTCOND"].replace("Dark - No Street Lights", 2, inplace=True)
coldata["LIGHTCOND"].replace("Dark - Unknown Lighting", 2, inplace=True)
coldata["LIGHTCOND"].replace("Other", "Unknown", inplace=True)

Next, I will change the road conditions to numeric values and condense similar conditions.
 - 0 = Dry
 - 1 = Coarse
 - 2 = Slick

In [57]:
coldata["ROADCOND"].replace("Dry", 0, inplace=True)
coldata["ROADCOND"].replace("Snow/Slush", 1, inplace=True)
coldata["ROADCOND"].replace("Sand/Mud/Dirt", 1, inplace=True)
coldata["ROADCOND"].replace("Wet", 2, inplace=True)
coldata["ROADCOND"].replace("Standing Water", 2, inplace=True)
coldata["ROADCOND"].replace("Oil", 2, inplace=True)
coldata["ROADCOND"].replace("Ice", 2, inplace=True)
coldata["ROADCOND"].replace("Other", "Unknown", inplace=True)

Next, I will create a dataframe containing only the considered variables as well as the unique identifier for the incidents.

In [60]:
specificdata=coldata[["INCKEY", "SEVERITYCODE", "WEATHER", "LIGHTCOND", "ROADCOND"]]

In [94]:
specificdata.head(5)

Unnamed: 0,INCKEY,SEVERITYCODE,WEATHER,LIGHTCOND,ROADCOND
0,1307,1,1,0,2
1,52200,0,4,1,2
2,26700,0,1,0,0
3,1144,0,0,0,0
4,17700,1,4,0,2


Finally, I will drop rows with null values.

In [62]:
specificdata.dropna(axis=0, how='any', inplace=True)

unknownweatherdata = specificdata[specificdata['WEATHER'] == 'Unknown'].index
specificdata.drop(unknownweatherdata, inplace = True)

unknownlightdata = specificdata[specificdata['LIGHTCOND'] == 'Unknown'].index
specificdata.drop(unknownlightdata, inplace = True)

unknownroaddata = specificdata[specificdata['ROADCOND'] == 'Unknown'].index
specificdata.drop(unknownroaddata, inplace = True)

specificdata.head
specificdata.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


(169957, 5)

In [63]:
specificdata.head

<bound method NDFrame.head of         INCKEY  SEVERITYCODE WEATHER LIGHTCOND ROADCOND
0         1307             1       1         0        2
1        52200             0       4         1        2
2        26700             0       1         0        0
3         1144             0       0         0        0
4        17700             1       4         0        2
5       320840             0       0         0        0
6        83300             0       4         0        2
7       330897             1       0         0        0
8        63400             0       0         0        0
9        58600             1       0         0        0
10       48900             0       1         0        0
11       38800             0       0         0        0
12        2771             0       4         1        2
13       32800             0       4         2        2
14        1212             1       0         1        0
16       46300             1       1         0        0
17       23000    

# Building Model

First, I will build the feature sets by setting my variables to X and the severity code to Y.

In [67]:
x = specificdata[["WEATHER", "LIGHTCOND", "ROADCOND"]].values
y = specificdata[["SEVERITYCODE"]].values

Next, I will create a train set and a test set, with 30% of the data being set aside for testing.

In [68]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=4)
print('Train set:', x_train.shape, y_train.shape)
print('Test set:', x_test.shape, y_test.shape)

Train set: (118969, 3) (118969, 1)
Test set: (50988, 3) (50988, 1)


Next, I will create a decision tree classifier using the "entropy" criterion with a max depth of 5. Then, I will complete a prediction using the model and verify accuracy.

In [85]:
from sklearn.metrics import accuracy_score, f1_score, log_loss

decisiontree = DecisionTreeClassifier(criterion="entropy", max_depth=5)
decisiontree.fit(x_train, y_train)

yhdecisiontree = decisiontree.predict(x_test)

print('Decision Tree Classifier accuracy score: ', accuracy_score(yhdecisiontree, y_test))

Decision Tree Classifier accuracy score:  0.6728642033419628


Next, I will visualize the model using a classification report and confusion matrix.

In [89]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(yhdecisiontree, y_test))

              precision    recall  f1-score   support

           0       1.00      0.67      0.80     50988
           1       0.00      0.00      0.00         0

   micro avg       0.67      0.67      0.67     50988
   macro avg       0.50      0.34      0.40     50988
weighted avg       1.00      0.67      0.80     50988



  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
