In [6]:
"""
Created on Sun Sep 11 22:40:56 2016

@author: Daniel Montiel
email: dmontiel242@gmail.com

This toy notebook takes NYC Open Data motor vehicle collision data and builds a model to test whether the features of a collision predict a fatality.
"""

import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from sklearn import cross_validation
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [7]:
# Load the collision records from the CSV file (path is relative to the
# working directory).
df = pd.read_csv('NYC_Open_Data_Motor_Vehicle_Collisions_small.csv')

# Number of rows in the raw table.
lendf = df.shape[0]

In [8]:
# Impute missing categorical values: every null contributing factor or
# vehicle type code is replaced with the placeholder category 'NOT LISTED'.
#
# Each column is reassigned from fillna() instead of being written through
# df[col].loc[mask] = value. That chained form may assign into a temporary
# copy of the column and leave df itself unchanged.
for ii in ['CONTRIBUTING FACTOR VEHICLE 1','CONTRIBUTING FACTOR VEHICLE 2','VEHICLE TYPE CODE 1','VEHICLE TYPE CODE 2']:
    print(ii)  # progress trace: name of the column being imputed
    df[ii] = df[ii].fillna('NOT LISTED')


CONTRIBUTING FACTOR VEHICLE 1
CONTRIBUTING FACTOR VEHICLE 2
VEHICLE TYPE CODE 1
VEHICLE TYPE CODE 2


In [9]:
# Missing ZIP codes are replaced with 0.
df['ZIP CODE'] = df['ZIP CODE'].fillna(0)


def _leading_time_field(time_text):
    """Return the part of time_text before its first ':' as an int."""
    return int(time_text.partition(':')[0])


# HOUR is the integer leading field of each TIME value.
df['HOUR'] = df['TIME'].apply(_leading_time_field)

In [10]:
# One-hot encode the four categorical columns, each under its own prefix.
# The prefixes already end in '_', so the generated column names contain a
# double underscore (e.g. 'v1cfv__Driver Inattention/Distraction').
cfv1, cfv2, vtc1, vtc2 = [
    pd.get_dummies(df[source_column], prefix=column_prefix)
    for source_column, column_prefix in [
        ('CONTRIBUTING FACTOR VEHICLE 1', 'v1cfv_'),
        ('CONTRIBUTING FACTOR VEHICLE 2', 'v2cfv_'),
        ('VEHICLE TYPE CODE 1', 'v1vtc_'),
        ('VEHICLE TYPE CODE 2', 'v2vtc_'),
    ]
]

# Place the four indicator frames side by side, joined on the row index.
cleandf = cfv1.join(cfv2).join(vtc1).join(vtc2)

In [11]:
# Bounds used to filter the dummy-variable columns.
p = 0.80
pn = 1 - p
varthresh = p * pn      # p * (1 - p)
varthreshpn = pn * pn   # (1 - p) * (1 - p)
# NOTE(review): varthreshpn is (1 - p) squared rather than a Bernoulli
# variance q * (1 - q) -- confirm this lower bound is what was intended.

In [12]:
# Keep only the dummy columns whose column sum divided by the row count lies
# strictly between varthreshpn and varthresh; all other columns are dropped.
#
# The divisor is converted to float so the ratio is a true fraction even when
# the dummy columns are integer-typed; under Python 2 an integer sum divided
# by an integer row count floors to 0 and would reject every column.
# NOTE(review): the bounds are named as variances but are compared against a
# frequency (sum / row count) -- confirm that this is the intended test.
clist = []
for column in cleandf:
    frequency = cleandf[column].sum() / float(lendf)  # computed once per column
    if varthreshpn < frequency < varthresh:
        clist.append(column)
cleandf = cleandf[clist]

In [13]:
# Feature table: HOUR and ZIP CODE from the main frame, followed by the
# retained dummy columns, joined on the row index.
base_columns = ['HOUR', 'ZIP CODE']
dfclean = df[base_columns].join(cleandf)


In [14]:
# Convert every feature column to a numeric dtype. Values that cannot be
# parsed become NaN (errors='coerce'), and all NaN are then set to 0.
#
# The result is assigned back to the column in one step. The previous
# dfclean[col].loc[mask] = 0 form is chained indexing, which may write into a
# temporary copy and leave NaN values in dfclean.
for ii in list(dfclean):
    dfclean[ii] = pd.to_numeric(dfclean[ii], errors='coerce').fillna(0)

In [15]:
# Preview the first five rows of the feature table.
dfclean.head(n=5)

Unnamed: 0,HOUR,ZIP CODE,v1cfv__Driver Inattention/Distraction,v1cfv__Failure to Yield Right-of-Way,v2cfv__Driver Inattention/Distraction,v2cfv__NOT LISTED,v2vtc__UNKNOWN
0,0,11201.0,0.0,0.0,0.0,0.0,0.0
1,0,11207.0,0.0,0.0,0.0,0.0,0.0
2,0,11215.0,0.0,0.0,0.0,0.0,0.0
3,0,11237.0,1.0,0.0,0.0,0.0,0.0
4,0,10033.0,0.0,0.0,0.0,1.0,0.0


In [16]:
# Target: the 'NUMBER OF PERSONS KILLED' column of the raw frame.
y = df['NUMBER OF PERSONS KILLED']

# Seeded split of features and target into training and test portions.
# NOTE(review): test_size=0.9 holds out 90% of the rows, so the model is fit
# on only 10% of the data -- confirm this ratio is intended.
split_parts = cross_validation.train_test_split(
    dfclean, y, test_size=0.9, random_state=20
)
X_train, X_test, y_train, y_test = split_parts

In [17]:
# Fit a random forest classifier (10 trees) on the training split.
# random_state is fixed so that re-running the cell rebuilds the same forest;
# without a seed every run trains a different model and the score below
# changes from run to run.
clf = RandomForestClassifier(n_estimators=10, random_state=20).fit(X_train, y_train)

# Other estimators, left disabled:
# clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
# clf = AdaBoostClassifier(n_estimators=200).fit(X_train, y_train)  # AdaBoostClassifier is not imported in the import cell

In [19]:
# Evaluate the fitted model on the held-out split.
y_pred = clf.predict(X_test)

# Report the classifier's score on the test features and labels.
accuracy = clf.score(X_test, y_test)
print(accuracy)
# NOTE(review): y_pred is not used after this point in the cells shown, and
# confusion_matrix is imported but never called -- consider reporting a
# confusion matrix alongside the single score.

0.998742209007
