# This notebook will be used for IBM data science course capstone project

In [1]:
import pandas as pd
import numpy as np
print('Hello Capstone Project Course!')

Hello Capstone Project Course!


# Load Data set

In [2]:
df_raw = pd.read_csv("Data-Collisions.csv")
df_raw.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,SEVERITYCODE,X,Y,OBJECTID,INCKEY,COLDETKEY,REPORTNO,STATUS,ADDRTYPE,INTKEY,...,ROADCOND,LIGHTCOND,PEDROWNOTGRNT,SDOTCOLNUM,SPEEDING,ST_COLCODE,ST_COLDESC,SEGLANEKEY,CROSSWALKKEY,HITPARKEDCAR
0,2,-122.323148,47.70314,1,1307,1307,3502005,Matched,Intersection,37475.0,...,Wet,Daylight,,,,10,Entering at angle,0,0,N
1,1,-122.347294,47.647172,2,52200,52200,2607959,Matched,Block,,...,Wet,Dark - Street Lights On,,6354039.0,,11,From same direction - both going straight - bo...,0,0,N
2,1,-122.33454,47.607871,3,26700,26700,1482393,Matched,Block,,...,Dry,Daylight,,4323031.0,,32,One parked--one moving,0,0,N
3,1,-122.334803,47.604803,4,1144,1144,3503937,Matched,Block,,...,Dry,Daylight,,,,23,From same direction - all others,0,0,N
4,2,-122.306426,47.545739,5,17700,17700,1807429,Matched,Intersection,34387.0,...,Wet,Daylight,,4028032.0,,10,Entering at angle,0,0,N


# Drop labels and rename

In [3]:
#dropping non-useful columns from the data set - keys/identifiers, state assigned codes, casual codes, descriptions, Geometry fields
df = df_raw.drop(labels = ['X','Y','REPORTNO','SEVERITYCODE.1','STATUS','SEVERITYDESC','OBJECTID','INCKEY','COLDETKEY','INTKEY','LOCATION','EXCEPTRSNCODE','EXCEPTRSNDESC','INCDATE','SDOT_COLCODE','SDOT_COLDESC','PEDROWNOTGRNT','SDOTCOLNUM','SPEEDING','ST_COLDESC','SEGLANEKEY','CROSSWALKKEY','HITPARKEDCAR'],axis = 1)

#renaming columns for better understanding
df.rename(columns = {"INATTENTIONIND":"InAttention","UNDERINFL":"Under Influence","ST_COLCODE":"State Collision Code","INCDTTM":"Incident Date Time"},inplace=True)

# Understanding the Dataset

In [4]:
#data understanding
df.describe()

#missing values
df.isnull().sum()

#dropping InAttention column due to high missing values
df.drop(labels = ['InAttention'],axis = 1, inplace = True)
df.shape

(194673, 14)

In [5]:
#convert to DateTime object
df['Incident Date Time']=pd.to_datetime(df['Incident Date Time'])

#drop all the missing values in the dataset
df.dropna(subset=['ADDRTYPE', 'COLLISIONTYPE','JUNCTIONTYPE','WEATHER','ROADCOND','LIGHTCOND','State Collision Code'], axis=0,inplace=True)
df.isnull().sum()

df.shape


(182895, 14)

# Data Pre-processing

In [6]:
df["SEVERITYCODE"].value_counts()

1    126270
2     56625
Name: SEVERITYCODE, dtype: int64

In [7]:
#Convert ADDRTYPE categorical variable
df.groupby(['ADDRTYPE'])['SEVERITYCODE'].value_counts()
df['ADDRTYPE'].replace(to_replace=['Alley','Block','Intersection'], value=[0,1,2],inplace=True)

In [8]:
#Convert COLLISIONTYPE categorical variable
df.groupby(['COLLISIONTYPE'])['SEVERITYCODE'].value_counts()
df['COLLISIONTYPE'].replace(to_replace=['Angles','Cycles','Head On','Left Turn','Other','Parked Car','Pedestrian','Rear Ended','Right Turn','Sideswipe'], value=[0,1,2,3,4,5,6,7,8,9],inplace=True)

In [9]:
#Convert JunctionTYPE categorical variable
df.groupby(['JUNCTIONTYPE'])['SEVERITYCODE'].value_counts()

#Drop the fields where junctiontype is unknown 
df.drop(df[df['JUNCTIONTYPE'] == 'Unknown'].index, inplace = True) 
df['JUNCTIONTYPE'].replace(to_replace=['At Intersection (but not related to intersection)','At Intersection (intersection related)','Driveway Junction','Mid-Block (but intersection related)','Mid-Block (not related to intersection)','Ramp Junction'], value=[0,1,2,3,4,5],inplace=True)

In [10]:
#Clean the Under influnce column with hreplacing N with 0 and Y with 1
df['Under Influence'].replace({'N':0,'Y':1,'0':0,'1':1},inplace=True)

In [11]:
#Convert WEATHER categorical variable
df.groupby(['WEATHER'])['SEVERITYCODE'].value_counts()
df.drop(df[df['WEATHER'] == 'Unknown'].index, inplace = True) 
df['WEATHER'].replace(to_replace=['Blowing Sand/Dirt','Clear','Fog/Smog/Smoke','Other','Overcast','Partly Cloudy','Raining','Severe Crosswind','Sleet/Hail/Freezing Rain','Snowing'], value=[0,1,2,3,4,5,6,7,8,9],inplace=True)

In [12]:
#Convert ROADCOND categorical variable
df.groupby(['ROADCOND'])['SEVERITYCODE'].value_counts()
df.drop(df[df['ROADCOND'] == 'Unknown'].index, inplace = True) 
df['ROADCOND'].replace(to_replace=['Dry','Ice','Oil','Other','Sand/Mud/Dirt','Snow/Slush','Standing Water','Wet'], value=[0,1,2,3,4,5,6,7],inplace=True)

In [13]:
#Convert LIGHTCOND categorical variable
df.groupby(['LIGHTCOND'])['SEVERITYCODE'].value_counts()
df.drop(df[df['LIGHTCOND'] == 'Unknown'].index, inplace = True) 
df['LIGHTCOND'].replace(to_replace=['Dark - No Street Lights','Dark - Street Lights Off','Dark - Street Lights On','Dark - Unknown Lighting','Dawn','Daylight','Dusk','Other'], value=[0,1,2,3,4,5,6,7],inplace=True)


In [None]:
#Type cast state collision code as int
df['State Collision Code'] = df['State Collision Code'].astype('int')
df.head()

#extract day of the week variable from the date time column
df['dayofweek'] = df['Incident Date Time'].dt.dayofweek
df=df.drop(labels=['Incident Date Time'],axis=1)
df.describe()

# Modelling the Classifier

In [15]:
import numpy as np
y = df['SEVERITYCODE'].values
X = df[['ADDRTYPE', 'COLLISIONTYPE', 'PERSONCOUNT', 'PEDCOUNT',
       'PEDCYLCOUNT', 'VEHCOUNT', 'JUNCTIONTYPE', 'Under Influence', 'WEATHER',
       'ROADCOND', 'LIGHTCOND', 'State Collision Code', 'dayofweek']]

#Normalize data
from sklearn import preprocessing
X= preprocessing.StandardScaler().fit(X).transform(X)

  return self.partial_fit(X, y)
  if __name__ == '__main__':


## Test Train Split

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)

## KNN Classifier

In [25]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

#find the optimum K for best model accuracy  
Ks = 10
mean_acc = np.zeros((Ks-1))
std_acc = np.zeros((Ks-1))
for n in range(1,Ks):
    
    #Train Model and Predict  
    neigh = KNeighborsClassifier(n_neighbors = n).fit(X_train,y_train)
    yhat=neigh.predict(X_test)
    mean_acc[n-1] = metrics.accuracy_score(y_test, yhat)

    
    std_acc[n-1]=np.std(yhat==y_test)/np.sqrt(yhat.shape[0])

mean_acc
print( "The best accuracy was with", mean_acc.max(), "with k=", mean_acc.argmax()+1) 

#training the data with the best model
bestneigh = KNeighborsClassifier(n_neighbors = mean_acc.argmax()+1).fit(X_train,y_train)

The best accuracy was with 0.7181244041944709 with k= 8


## Decision Tree Classifier

In [20]:
from sklearn.tree import DecisionTreeClassifier
#Train Model
accidenttree = DecisionTreeClassifier(criterion="entropy", max_depth = 4)
accidenttree.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

## Support Vector Machine

In [None]:
from sklearn import svm
#Train Model
clf = svm.SVC(kernel='rbf')
clf.fit(X_train,y_train)

## Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression

#Train LR Model
LR = LogisticRegression(C=0.01, solver='liblinear').fit(X_train,y_train)

# Model Evaluation and choosing the most accurate model

In [18]:
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss

In [26]:
#KNN Scores
yhat_KNN = bestneigh.predict(X_test)
print('Jaccard score of KNN is:', jaccard_similarity_score(y_test, yhat_KNN))
print('f1 score of KNN is:', f1_score(y_test, yhat_KNN, average='weighted'))

Jaccard score of KNN is: 0.7181244041944709
f1 score of KNN is: 0.6892815784021054


In [22]:
#Decision Tree Scores
yhat_DT = accidenttree.predict(X_test)
print('Jaccard score of DT is:', jaccard_similarity_score(y_test, yhat_DT))
print('f1 score of DT is:', f1_score(y_test, yhat_DT, average='weighted'))

Jaccard score of DT is: 0.7213119637750238
f1 score of DT is: 0.65167318191351


In [None]:
#SVM scores
yhat_svm = clf.predict(X_test)
print('Jaccard score of SVM is:', jaccard_similarity_score(y_test, yhat_svm))
print('f1 score of SVM is:', f1_score(y_test, yhat_svm, average='weighted'))

In [19]:
#LR scores
yhat_lr = LR.predict(X_test)
yhat_prob = LR.predict_proba(X_test)
print('Jaccard score of LR is:', jaccard_similarity_score(y_test, yhat_lr))
print('F1 score of LR is:', f1_score(y_test, yhat_lr, average='weighted') )
print('log loss score of LR is:', log_loss(y_test, yhat_prob))

Jaccard score of LR is: 0.7244399428026692
F1 score of LR is: 0.6702325556277776
log loss score of LR is: 0.5635299431665228


| Algorithm          | Jaccard | F1-score | LogLoss |
|--------------------|---------|----------|---------|
| KNN                | 0.718       | 0.689        | NA      |
| Decision Tree      | 0.721       | 0.652        | NA      |
| SVM                | NA      | NA        | NA      |
| LogisticRegression | 0.724      | 0.670        | 0.563       |