# Title
#### June 29, 2019
* Flatiron School (nyc-mhtn-ds-0422019)

In [1]:
from datetime import datetime
import pickle

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

import xgboost as xgb

from private import api_keys
from weather import Weather
from modeler import Modeler

def get_day_time(dt):
    """Bucket a timestamp into a coarse time-of-day label.

    Buckets (upper bounds inclusive, same boundaries as the original
    strict ``>`` comparisons):

        [00:00, 06:00] -> 'night'
        (06:00, 12:00] -> 'morning'
        (12:00, 18:00] -> 'afternoon'
        (18:00, 24:00) -> 'evening'

    Parameters
    ----------
    dt : datetime-like
        Any object exposing ``hour``, ``minute``, ``second`` and
        ``microsecond`` attributes (e.g. ``datetime`` or ``pd.Timestamp``).

    Returns
    -------
    str
        One of 'night', 'morning', 'afternoon', 'evening'.
    """
    # Compare as a tuple so the boundaries behave exactly like comparing
    # ``dt.time()`` values, without re-parsing the bounds on every call.
    clock = (dt.hour, dt.minute, dt.second, dt.microsecond)
    if clock > (18, 0, 0, 0):
        return 'evening'
    if clock > (12, 0, 0, 0):
        return 'afternoon'
    if clock > (6, 0, 0, 0):
        return 'morning'
    # Includes exactly 00:00:00, which previously fell through every branch
    # and returned None.
    return 'night'

## Load Collision Data

In [13]:
# Location of the collision CSV, relative to the notebook's working directory.
data_file = '../../Datasets/NYPD_Motor_Vehicle_Collisions-June-2019.csv'
# Read the whole file into memory; dtypes are left to pandas' inference.
df = pd.read_csv(data_file)

In [16]:
# Disabled Weather lookup: it would build `results` from the unique collision
# dates at one fixed latitude/longitude.
# NOTE(review): the pickle cells that follow still use `results`; confirm how
# it is meant to be produced on a fresh kernel while this stays commented out.
# dates = df['DATE'].unique()
# lat = '40.717743'
# long = '-73.72986'

# rain = Weather(dates, lat, long)
# results = rain.is_rain()
# Only the last expression of a cell is displayed, so this count is computed
# and then discarded.
df['DATE'].nunique()
# Total of the persons-killed column across all rows (the value displayed).
df['NUMBER OF PERSONS KILLED'].sum()

1766.0

In [None]:
# Cache `results` on disk so the Weather lookup does not have to be re-run.
# NOTE(review): `results` is only assigned in commented-out code earlier in
# the notebook, so this cell raises NameError on a fresh kernel — confirm.
# The context manager guarantees the file is flushed and closed.
with open("rain_results.p", "wb") as rain_file:
    pickle.dump(results, rain_file)

In [None]:
# Restore the cached rain lookup written by the dump cell.
# NOTE: unpickling can execute arbitrary code; only load files this notebook
# produced itself.
with open("rain_results.p", "rb") as rain_file:
    results = pickle.load(rain_file)

In [None]:
# Sanity check: number of entries in the loaded rain lookup.
len(results)

#### df.info()
```
RangeIndex: 1520922 entries, 0 to 1520921
Data columns (total 34 columns):
DATE                             1520922 non-null object
TIME                             1520922 non-null object
BOROUGH                          1063875 non-null object
ZIP CODE                         1063695 non-null object
LATITUDE                         1331596 non-null float64
LONGITUDE                        1331596 non-null float64
LOCATION                         1331596 non-null object
ON STREET NAME                   1227003 non-null object
CROSS STREET NAME                1026963 non-null object
OFF STREET NAME                  201764 non-null object
NUMBER OF PERSONS INJURED        1520905 non-null float64
NUMBER OF PERSONS KILLED         1520891 non-null float64
NUMBER OF PEDESTRIANS INJURED    1520922 non-null int64
NUMBER OF PEDESTRIANS KILLED     1520922 non-null int64
NUMBER OF CYCLIST INJURED        1520922 non-null int64
NUMBER OF CYCLIST KILLED         1520922 non-null int64
NUMBER OF MOTORIST INJURED       1520922 non-null int64
NUMBER OF MOTORIST KILLED        1520922 non-null int64
CONTRIBUTING FACTOR VEHICLE 1    1516958 non-null object
CONTRIBUTING FACTOR VEHICLE 2    1318025 non-null object
CONTRIBUTING FACTOR VEHICLE 3    98014 non-null object
CONTRIBUTING FACTOR VEHICLE 4    20377 non-null object
CONTRIBUTING FACTOR VEHICLE 5    5192 non-null object
UNIQUE KEY                       1520922 non-null int64
VEHICLE TYPE CODE 1              1516039 non-null object
VEHICLE TYPE CODE 2              1273660 non-null object
VEHICLE TYPE CODE 3              127385 non-null object
VEHICLE TYPE CODE 4              48217 non-null object
VEHICLE TYPE CODE 5              10305 non-null object
Zip Codes                        1316873 non-null float64
Borough Boundaries               1322794 non-null float64
City Council Districts           1322787 non-null float64
Community Districts              1322792 non-null float64
Police Precincts                 1322783 non-null float64
```

## Data Cleaning
* Drop NaNs in target column
* Convert to ints
* Drop rows whose target value is suspicious (the total killed must equal the sum of the per-category death columns)
* Drop columns with sparse data
* Flatten time to one column and convert, then drop the redundants

In [None]:
# Keep only rows with a usable death count, store it as an integer, and
# discard rows whose total disagrees with the per-category death columns.
killed_col = 'NUMBER OF PERSONS KILLED'
df = df.dropna(subset=[killed_col])
df = df.assign(**{killed_col: df[killed_col].astype(int)})

category_deaths = (
    df['NUMBER OF CYCLIST KILLED']
    + df['NUMBER OF MOTORIST KILLED']
    + df['NUMBER OF PEDESTRIANS KILLED']
)
df = df.loc[df[killed_col] == category_deaths]

In [None]:
# Columns removed before modeling: the record id, the injury counts, the
# per-category death counts, the sparsely populated vehicle 3-5 columns,
# and the derived 'Zip Codes' column.
drop_columns = [
    'UNIQUE KEY',
    'NUMBER OF PERSONS INJURED',
    'VEHICLE TYPE CODE 3',
    'VEHICLE TYPE CODE 4',
    'VEHICLE TYPE CODE 5',
    'CONTRIBUTING FACTOR VEHICLE 3',
    'CONTRIBUTING FACTOR VEHICLE 4',
    'CONTRIBUTING FACTOR VEHICLE 5',
    'NUMBER OF PEDESTRIANS INJURED',
    'NUMBER OF PEDESTRIANS KILLED',
    'NUMBER OF CYCLIST INJURED',
    'NUMBER OF CYCLIST KILLED',
    'NUMBER OF MOTORIST INJURED',
    'NUMBER OF MOTORIST KILLED',
    'Zip Codes',
]

In [None]:
# Remove the columns listed in `drop_columns` from the working frame.
df = df.drop(columns=drop_columns)

In [None]:
# Combine the separate date and time strings into one parsed timestamp
# column, then discard the two source columns.
df['TIME_DAY'] = pd.to_datetime(df['DATE'] + ' ' + df['TIME'])
df = df.drop(columns=['DATE', 'TIME'])

##  Data Engineering
#### Create Binary Target (FATAL)

In [9]:
# Binary target: True when at least one person was killed in the collision.
df['FATAL'] = df['NUMBER OF PERSONS KILLED'].gt(0)
# The raw count is no longer needed once the flag exists.
df = df.drop(columns='NUMBER OF PERSONS KILLED')
# Show the class balance.
df['FATAL'].value_counts()

False    1519225
True        1697
Name: FATAL, dtype: int64

#### Create days-of-the-week dummies

In [None]:
# Day of week as an integer (pandas convention: Monday=0 ... Sunday=6),
# one-hot encoded with the first level dropped as the baseline.
df['weekday'] = df['TIME_DAY'].dt.dayofweek
day_of_week_dummies = pd.get_dummies(
    df['weekday'],
    prefix="day",
    drop_first=True,
)
df = pd.concat([df, day_of_week_dummies], axis=1)

#### Create time-of-day dummies

In [None]:
# Label each timestamp with its time-of-day bucket, then one-hot encode the
# labels with the first level dropped as the baseline.
df['time_of_day'] = df['TIME_DAY'].map(get_day_time)
time_of_day_dummies = pd.get_dummies(
    df['time_of_day'],
    prefix="tod_",
    drop_first=True,
)
df = pd.concat([df, time_of_day_dummies], axis=1)

In [None]:
# The encoded dummies replace these intermediate time columns.
df = df.drop(columns=['weekday','time_of_day', 'TIME_DAY'])

In [None]:
# Location and street-name columns that are not used as features.
drop_columns = [
    'Borough Boundaries',
    'City Council Districts',
    'Community Districts',
    'Police Precincts',
    'LATITUDE',
    'LONGITUDE',
    'LOCATION',
    'ON STREET NAME',
    'CROSS STREET NAME',
    'OFF STREET NAME',
]

df = df.drop(columns=drop_columns)

# Temporary Cleaning
* Removing features just to get the model going

In [None]:
# Drop every row that still has a missing value in any remaining column.
df = df.dropna()

In [None]:
# Categorical columns set aside for now; they are dropped rather than encoded.
drop_columns = [
    'ZIP CODE',
    'CONTRIBUTING FACTOR VEHICLE 1',
    'CONTRIBUTING FACTOR VEHICLE 2',
    'VEHICLE TYPE CODE 1',
    'VEHICLE TYPE CODE 2',
]
df = df.drop(columns=drop_columns)

In [None]:
# df['ZIP CODE'] = df['ZIP CODE'].astype(int)
# df['ZIP CODE'] = df['ZIP CODE'].astype(str)
# zip_dummies = pd.get_dummies(df['ZIP CODE'], prefix="zip", drop_first=True)
# df = pd.concat([df, zip_dummies], axis=1)
# df.drop('ZIP CODE', axis=1, inplace=True)

In [None]:
# One-hot encode the borough (first level dropped as the baseline) and
# replace the original text column with the dummies.
df['BOROUGH'] = df['BOROUGH'].astype(str)
boro_dummies = pd.get_dummies(
    df['BOROUGH'],
    prefix="boro",
    drop_first=True,
)
df = pd.concat([df, boro_dummies], axis=1)
df = df.drop(columns='BOROUGH')

In [None]:
# Checkpoint the engineered frame so the modeling cells can start from here.
# The context manager guarantees the file is flushed and closed.
with open("save.p", "wb") as checkpoint_file:
    pickle.dump(df, checkpoint_file)

## Quick save after converting time

In [None]:
# Reload the checkpointed frame.
# NOTE: unpickling can execute arbitrary code; only load files this notebook
# produced itself.
with open("save.p", "rb") as checkpoint_file:
    df = pickle.load(checkpoint_file)
df.info()

In [None]:
target = 'FATAL'

# Separate the two classes so each can be sampled and split on its own.
fatal_df = df.loc[df[target] == True]
fatal_y = fatal_df[target]
fatal_X = fatal_df.drop(target, axis=1)

non_fatal_df = df.loc[df[target] == False]

# Fixed cap on the number of non-fatal rows kept. This is NOT tied to the
# number of fatal rows, so the resulting classes are not necessarily balanced.
max_samples = 500
downsampled = resample(non_fatal_df,
                       replace=False,  # sample without replacement
                       # never ask for more rows than exist (would raise)
                       n_samples=min(max_samples, len(non_fatal_df)),
                       random_state=23)  # reproducible results

non_fatal_y = downsampled[target]
non_fatal_X = downsampled.drop(target, axis=1)

# Split each class separately so both appear in train and test at a 75/25 ratio.
fatal_X_train, fatal_X_test, fatal_y_train, fatal_y_test = train_test_split(
    fatal_X, fatal_y, test_size=0.25, random_state=23)
non_fatal_X_train, non_fatal_X_test, non_fatal_y_train, non_fatal_y_test = train_test_split(
    non_fatal_X, non_fatal_y, test_size=0.25, random_state=23)

X_train = pd.concat([fatal_X_train, non_fatal_X_train])
X_test = pd.concat([fatal_X_test, non_fatal_X_test])
y_train = pd.concat([fatal_y_train, non_fatal_y_train])
y_test = pd.concat([fatal_y_test, non_fatal_y_test])

# Feature names in column order (every column except the target).
features = [col for col in df.columns if col != target]

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
# Keep the original row index on the scaled frames so X and y stay aligned
# label-wise; rebuilding without `index=` would silently reset it to 0..n-1.
X_train = pd.DataFrame(data=scaler.transform(X_train), columns=features, index=X_train.index)
X_test = pd.DataFrame(data=scaler.transform(X_test), columns=features, index=X_test.index)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline  

plt.figure(figsize = (10,5))
sns.countplot(y_train, alpha =.80, palette= ['grey','lightgreen'])
plt.title('Fatalities')
plt.ylabel('Collisions')
plt.show()

## Modeling
##### Logistic Regression

In [None]:
# Collects one metrics dict per model. Note this rebinds `results`, the same
# name the earlier cells used for the pickled rain lookup.
results = []

In [None]:
'''LogisticRegression(C=1000000000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0)'''
# The string above is a pasted estimator repr with no runtime effect; note it
# shows C=1e9 while the model below is fitted with C=100.

logreg = LogisticRegression(C=100, solver='liblinear')
logreg.fit(X_train, y_train)
y_pred_class = logreg.predict(X_test)

# Score once and reuse the values for both the printout and the record.
logreg_accuracy = accuracy_score(y_test, y_pred_class)
logreg_f1 = f1_score(y_test, y_pred_class)
print(logreg_accuracy)

# Record 'f1' as well so this entry has the same keys as the other models.
result = {'type':'Logistic Regression','accuracy':logreg_accuracy, 'f1':logreg_f1,
          'coefficients':logreg.coef_}
results.append(result)

##### KNN

In [None]:
# Sweep odd neighbour counts and record the F1 score of each fit.
# NOTE(review): the sweep is scored on the test set, so the k chosen from it
# is tuned on the same data used for the final metrics.
k_scores = []
k_range = list(range(9, 22, 2))
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred_class = knn.predict(X_test)
    # scikit-learn metric signature is (y_true, y_pred).
    k_scores.append(f1_score(y_test, y_pred_class))

# The curve holds F1 scores, so label it as F1 (it was labelled "Accuracy").
plt.figure(figsize=(12, 6))
plt.plot(k_range, k_scores, color='red', linestyle='dashed', marker='o',
         markerfacecolor='blue', markersize=10)
plt.title('F1 score by K Value')
plt.xlabel('K Value')
plt.ylabel('F1 Score')
plt.show()

# Final model with the hand-picked k.
knn = KNeighborsClassifier(n_neighbors=11)
knn.fit(X_train, y_train)
y_pred_class = knn.predict(X_test)
knn_accuracy = accuracy_score(y_test, y_pred_class)
knn_f1 = f1_score(y_test, y_pred_class)
print('Accuracy:' + str(knn_accuracy))
print('F1: ' + str(knn_f1))

result = {'type':'KNN','accuracy':knn_accuracy, 'f1':knn_f1}
results.append(result)

 ##### Random Forest

In [None]:
'''RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=23, verbose=0, warm_start=False)'''
# The string above is a pasted estimator repr with no runtime effect.

# Sweep the number of trees and record each forest's F1 score.
f1_scores = []
n_range = list(range(10, 100, 10))
for n in n_range:
    rfc = RandomForestClassifier(random_state=23, n_estimators=n)
    rfc.fit(X_train, y_train)
    rfc_pred = rfc.predict(X_test)
    # Score this forest's own predictions. The previous code scored
    # `y_pred_class`, a leftover from an earlier cell, so the curve was flat.
    f1_scores.append(f1_score(y_test, rfc_pred))

plt.figure(figsize=(12, 6))
plt.plot(n_range, f1_scores, color='red', linestyle='dashed', marker='o',
         markerfacecolor='blue', markersize=10)
plt.title('F1 score by N Value')
plt.xlabel('N Value')
plt.ylabel('F1 Score')
plt.show()

# Final forest.
rfc = RandomForestClassifier(random_state=23, n_estimators=100)
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)

rfc_accuracy = accuracy_score(y_test, rfc_pred)
rfc_f1 = f1_score(y_test, rfc_pred)
print('Test Accuracy score: ', rfc_accuracy)
print('Test F1 score: ', rfc_f1)

# Record the forest's metrics (previously another model's predictions were
# scored here, so the stored numbers did not match the printed ones).
result = {'type':'Random Forest','accuracy':rfc_accuracy, 'f1':rfc_f1}
results.append(result)

##### Decision Tree

In [None]:
# Sweep the tree depth for both split criteria and plot the F1 score of each.
# A fixed random_state makes the sweep repeatable between runs.
n_range = list(range(1, 8))
for criterion in ("entropy", "gini"):
    f1_scores = []
    for n in n_range:
        clf = DecisionTreeClassifier(criterion=criterion, max_depth=n, random_state=23)
        clf.fit(X_train, y_train)
        depth_pred = clf.predict(X_test)
        # scikit-learn metric signature is (y_true, y_pred).
        f1_scores.append(f1_score(y_test, depth_pred))

    # The curve holds F1 scores, so title it as F1 (it was titled "Accuracy"),
    # and name the criterion so the two figures can be told apart.
    plt.figure(figsize=(12, 6))
    plt.plot(n_range, f1_scores, color='red', linestyle='dashed', marker='o',
             markerfacecolor='blue', markersize=10)
    plt.title('F1 score by N Value (' + criterion + ')')
    plt.xlabel('N Value')
    plt.ylabel('F1 Score')
    plt.show()

# Create Decision Tree classifier object
clf = DecisionTreeClassifier(criterion="entropy", max_depth=2, random_state=23)

# Train Decision Tree Classifier
clf = clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

# Model Accuracy, how often is the classifier correct?
tree_accuracy = accuracy_score(y_test, y_pred_test)
tree_f1 = f1_score(y_test, y_pred_test)
print('Test Accuracy score: ', tree_accuracy)
print('Test F1 score: ', tree_f1)

# Record the final tree's metrics. Previously `y_pred_class` was scored here,
# i.e. the last model of the depth sweep rather than the tree printed above.
result = {'type':'Desision Tree','accuracy':tree_accuracy, 'f1':tree_f1}
results.append(result)

##### Grid Search

In [None]:
'''GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=23, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [100, 200, 300, 400]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)'''
# The string above is a pasted repr with no runtime effect.

# Candidate forest sizes to evaluate.
param_grid = {
    'n_estimators': [100, 200, 300, 400]
}

# 5-fold cross-validated search over the number of trees.
# Only the non-default settings are passed. The previous call spelled out a
# pasted repr, including arguments that current scikit-learn rejects
# (error_score='raise-deprecating', min_impurity_split, max_features='auto').
# `n_estimators` on the base estimator is overridden by the grid anyway.
CV_rfc = GridSearchCV(
    estimator=RandomForestClassifier(random_state=23),
    param_grid=param_grid,
    cv=5,
)
CV_rfc.fit(X_train,y_train)

In [None]:
# Report the best cross-validated score and the winning estimator.
best_rfc = CV_rfc.best_estimator_
print(CV_rfc.best_score_)
print(best_rfc)

# Predict on the test set with the refitted best estimator.
y_pred_test = best_rfc.predict(X_test)

# Test-set metrics for the selected forest.
print('Test Accuracy score: ', accuracy_score(y_test, y_pred_test))
print('Test F1 score: ', f1_score(y_test, y_pred_test))

#### XG Boost

In [None]:
'''XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)'''
# The string above is a pasted estimator repr with no runtime effect.

# Shallow boosted trees with column subsampling and an `alpha` penalty of 1.
xg_clf = xgb.XGBClassifier(objective ='binary:logistic', 
                           colsample_bytree = 0.3, 
                           learning_rate = 0.1,
                           max_depth = 2, 
                           alpha = 1, 
                           n_estimators = 100)

xg_clf.fit(X_train,y_train)
y_pred_test = xg_clf.predict(X_test)

# Model Accuracy, how often is the classifier correct?
xgb_accuracy = accuracy_score(y_test, y_pred_test)
xgb_f1 = f1_score(y_test, y_pred_test)
print('Test Accuracy score: ', xgb_accuracy)
print('Test F1 score: ', xgb_f1)

# Record this model's metrics. Previously `y_pred_class` was scored here,
# which holds predictions left over from an earlier model's cell.
result = {'type':'XGBoost','accuracy':xgb_accuracy, 'f1':xgb_f1}
results.append(result)

In [None]:
# Display the per-model metric dicts accumulated by the modeling cells.
results