# Title
#### June 29, 2019
* Flatiron School (nyc-mhtn-ds-0422019)

In [25]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


from private import api_keys
from weather import Weather

In [37]:
class Modeler():
    def __init__(self, dataframe, target):
        self.target = target
        self.df = dataframe
        self.trials = []
        
    def random_forest(self, features, kwargs):
        '''bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=23, verbose=0, warm_start=False'''
        
        trial = {'params':kwargs,
                'features':features}
        
        try:
            X = self.df[features]
            y = self.df[self.target]
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=23)
            
            
            rfc = RandomForestClassifier(**kwargs)
            
            rfc.fit(X_train, y_train)
            rfc_pred = rfc.predict(X_test)
            trial['Test Accuracy Score'] = accuracy_score(y_test, rfc_pred)
            trial['Test F1 Score'] = f1_score(y_test, rfc_pred)
            
            # checking accuracy on the test data
            print('Test Accuracy score: ', str(accuracy_score(y_test, rfc_pred)))
            # checking accuracy on the test data
            print('Test F1 score: ', str(f1_score(y_test, rfc_pred)))
            
            self.trials.append(trial)
            
        except Exception as e:
            print(e)
    
    def grid_search(self,features, kwargs):
        '''GridSearchCV(cv=5, error_score='raise-deprecating',
        estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=23, verbose=0, warm_start=False),
        fit_params=None, iid='warn', n_jobs=-1,
        param_grid={'n_estimators': [100, 200, 300, 400]},
        pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
        scoring=None, verbose=0)'''
        
        trial = {'params':kwargs,
                'features':features}
        
        try:
            X = self.df[features]
            y = self.df[self.target]
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=23)
            
            CV_rfc = GridSearchCV(**kwargs)
            CV_rfc.fit(X_train, y_train)
            
            print(CV_rfc.best_params_)


            #Identify the best score during fitting with cross-validation
            print(CV_rfc.cv_results_)
            
            CV_rfc_pred = CV_rfc.best_estimator_.predict(X_test)

            trial['Test Accuracy Score'] = accuracy_score(y_test, CV_rfc_pred)
            trial['Test F1 Score'] = f1_score(y_test, CV_rfc_pred)
            
            self.trials.append(trial)
            
            # checking accuracy
            print('Test Accuracy score: ', str(accuracy_score(y_test, CV_rfc_pred)))
            # checking accuracy
            print('Test F1 score: ', str(f1_score(y_test, CV_rfc_pred)))
            
        except Exception as e:
            print(e)
    
    def knn(self, features, kwargs):
        '''KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=1, p=2,
           weights='uniform')'''
        
        trial = {'params':kwargs,
                'features':features}
        
        try:
            X = self.df[features]
            y = self.df[self.target]
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=23)
            
            knn = KNeighborsClassifier(**kwargs)
            knn.fit(X_train, y_train)
            knn_y_pred = knn.predict(X_test)
            
            trial['Test Accuracy Score'] = accuracy_score(y_test, knn_y_pred)
            trial['Test F1 Score'] = f1_score(y_test, knn_y_pred)
            
            print('Accuracy:' + str(accuracy_score(y_test, knn_y_pred)))
            print('F1: ' + str(f1_score(y_test, knn_y_pred)))
            
            self.trials.append(trial)
            
        except Exception as e:
            print(e)





#### Collision Data

In [3]:
# This file is saved locally from the Navy site because it is > 300 MB. 
# The Navy site is here: https://data.cityofnewyork.us/resource/qiz3-axqb.json

data_file = '../../Datasets/NYPD_Motor_Vehicle_Collisions.csv'
df = pd.read_csv(data_file)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:

df.isna().sum()


DATE                                   0
TIME                                   0
BOROUGH                           452491
ZIP CODE                          452670
LATITUDE                          188214
LONGITUDE                         188214
LOCATION                          188214
ON STREET NAME                    290761
CROSS STREET NAME                 487294
OFF STREET NAME                  1309317
NUMBER OF PERSONS INJURED             17
NUMBER OF PERSONS KILLED              31
NUMBER OF PEDESTRIANS INJURED          0
NUMBER OF PEDESTRIANS KILLED           0
NUMBER OF CYCLIST INJURED              0
NUMBER OF CYCLIST KILLED               0
NUMBER OF MOTORIST INJURED             0
NUMBER OF MOTORIST KILLED              0
CONTRIBUTING FACTOR VEHICLE 1       3922
CONTRIBUTING FACTOR VEHICLE 2     201012
CONTRIBUTING FACTOR VEHICLE 3    1410759
CONTRIBUTING FACTOR VEHICLE 4    1487724
CONTRIBUTING FACTOR VEHICLE 5    1502781
UNIQUE KEY                             0
VEHICLE TYPE COD

### Convert Data Types
#### Dates
(ATTENTION: the code below will take a few minutes)

In [5]:
df['DATE'] = pd.to_datetime(df['DATE'])
df['TIME'] = pd.to_datetime(df['TIME'])

####  Floats

In [10]:
df['NUMBER OF PERSONS KILLED'].dropna(inplace=True)



In [11]:
df['NUMBER OF PERSONS KILLED'] = df['NUMBER OF PERSONS KILLED'].astype(int)



In [13]:
df['FATAL'] = False
df.loc[df['NUMBER OF PERSONS KILLED'] > 0, 'FATAL'] = True

In [21]:
features = ['ZIP CODE',
       'LOCATION', 'ON STREET NAME', 'CROSS STREET NAME', 'OFF STREET NAME',
       'NUMBER OF PERSONS INJURED', 
       'NUMBER OF PEDESTRIANS INJURED', 
       'NUMBER OF CYCLIST INJURED', 
       'NUMBER OF MOTORIST INJURED', 
       'CONTRIBUTING FACTOR VEHICLE 1', 'VEHICLE TYPE CODE 1',
       'VEHICLE TYPE CODE 2' ]

### Saving data to file locally
The section below is commented out on purpose.  But if desired, can be used a breakpoint for saving cleaned data to save time in development.

In [31]:
 
arg_list = {'random_state':23, 'n_estimators':100}

target = 'FATAL'

Traffic = Modeler(dataframe=df, target=target)   

In [28]:
Traffic.random_forest(features=features, kwargs=arg_list)

could not convert string to float: 


In [33]:
arg_list = {'n_neighbors':1}
Traffic.knn(features=features, kwargs=arg_list)

could not convert string to float: 


ValueError: could not convert string to float: 

## References
### Project Write-Up Template
Credit to dair.ai with ❤️
* https://github.com/dair-ai/writing_data_scientists/blob/master/Writing_Primer_for_Data_Scientists.ipynb

## Data Cleaning
### Summarize the shape of collision data from NYC
`df.shape`
* (1507923, 29)
* 29 features (columns)
* 1,507,923 collisions (rows)



`df.info()`
* RangeIndex: 1507923 entries, 0 to 1507922
* Data columns (total 29 columns)
* dtypes: float64(4), int64(7), object(18)
* memory usage: 333.6+ MB

`DATE                             1507923 non-null object
TIME                             1507923 non-null object
BOROUGH                          1055432 non-null object
ZIP CODE                         1055253 non-null object
LATITUDE                         1319709 non-null float64
LONGITUDE                        1319709 non-null float64
LOCATION                         1319709 non-null object
ON STREET NAME                   1217162 non-null object
CROSS STREET NAME                1020629 non-null object
OFF STREET NAME                  198606 non-null object
NUMBER OF PERSONS INJURED        1507906 non-null float64
NUMBER OF PERSONS KILLED         1507892 non-null float64
NUMBER OF PEDESTRIANS INJURED    1507923 non-null int64
NUMBER OF PEDESTRIANS KILLED     1507923 non-null int64
NUMBER OF CYCLIST INJURED        1507923 non-null int64
NUMBER OF CYCLIST KILLED         1507923 non-null int64
NUMBER OF MOTORIST INJURED       1507923 non-null int64
NUMBER OF MOTORIST KILLED        1507923 non-null int64
CONTRIBUTING FACTOR VEHICLE 1    1504001 non-null object
CONTRIBUTING FACTOR VEHICLE 2    1306911 non-null object
CONTRIBUTING FACTOR VEHICLE 3    97164 non-null object
CONTRIBUTING FACTOR VEHICLE 4    20199 non-null object
CONTRIBUTING FACTOR VEHICLE 5    5142 non-null object
UNIQUE KEY                       1507923 non-null int64
VEHICLE TYPE CODE 1              1503118 non-null object
VEHICLE TYPE CODE 2              1263185 non-null object
VEHICLE TYPE CODE 3              126564 non-null object
VEHICLE TYPE CODE 4              48051 non-null object
VEHICLE TYPE CODE 5              10256 non-null object`



### Drop unnecessary columns
* 30334 rows × 29 columns do not have ANY location data