In [None]:
#Imports

import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,mean_squared_error
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler


import warnings
warnings.filterwarnings('ignore')

# Load the Terry stops export. skipinitialspace=True tells the parser to ignore
# spaces that directly follow a delimiter.
data = pd.read_csv('Terry_Stops.csv', skipinitialspace=True)

# First, examine the data to understand type of data, what elements to remove, how to deal with NaN values, organize, and run baseline model

In [None]:
# Show column dtypes and non-null counts for the raw frame.
data.info()

In [None]:
# Drop the columns that are not used anywhere later in the notebook:
# record identifiers, officer identifiers/assignment, beat, and report date/time.
data = data.drop(
    columns=[
        'Subject ID',
        'GO / SC Num',  # meaning not established by the author
        'Terry Stop ID',
        'Officer Squad',
        'Officer ID',
        'Beat',
        'Reported Date',
        'Reported Time',
    ]
)

In [None]:
# Preview the first rows after dropping the unused columns.
data.head()

In [None]:
# Print the frequency of every distinct value, one column at a time, to spot
# placeholder or non-numeric entries.
for column, values in data.items():
    print("\n" + column)
    print(values.value_counts())

In [None]:
# Cells whose whole value is '-' are placeholders; relabel them as 'Unknown'.
data = data.replace(to_replace=['-'], value='Unknown')

In [None]:
# Re-print the per-column value frequencies to confirm no '-' entries remain.
for column, values in data.items():
    print("\n" + column)
    print(values.value_counts())

In [None]:
# Remove every space from the column labels so they can be used as attribute-style names.
data.columns = [label.replace(' ', '') for label in data.columns]

In [None]:
# List each column with its number of distinct values (missing values count as one)
# to help decide which columns are categorical.
for col in data.columns:
    distinct_count = data[col].nunique(dropna=False)
    print(col,'               ', distinct_count)

In [None]:
# Bin officer year of birth into six labelled cohorts, one-hot encode the cohort,
# and append the indicator columns to the frame.
# NOTE(review): pd.cut uses right-closed bins by default, so years outside
# (1900, 1998] get no cohort and therefore all-zero indicators — confirm intended.
OfficerYOB = pd.cut(
    data['OfficerYOB'],
    bins=[1900, 1949, 1959, 1969, 1979, 1989, 1998],
    labels=[
        'Oldest',
        'SecondOldest',
        'ThirdOldest',
        'FourthOldest',
        'FifthOldest',
        'Youngest',
    ],
)
OfficerYOB_dummies = pd.get_dummies(OfficerYOB)
data = pd.concat(objs=[data, OfficerYOB_dummies], axis=1)

In [None]:
# One-hot encode each categorical column, giving every set of indicator
# columns a short prefix. The (source column, prefix) pairs below are unpacked
# in order into the twelve *_dummies frames.
(
    SubAgeGrp_dummies,
    ArrFlg_dummies,
    FrkFlg_dummies,
    StpRes_dummies,
    WpType_dummies,
    OffGend_dummies,
    OffRace_dummies,
    SubPerRace_dummies,
    SubPerGend_dummies,
    CallTyp_dummies,
    Prect_dummies,
    Sect_dummies,
) = (
    pd.get_dummies(data[source_column], prefix=short_prefix)
    for source_column, short_prefix in [
        ('SubjectAgeGroup', 'SubAgeGrp'),
        ('ArrestFlag', 'ArrFlg'),
        ('FriskFlag', 'FrkFlg'),
        ('StopResolution', 'StpRes'),
        ('WeaponType', 'WpType'),
        ('OfficerGender', 'OffGend'),
        ('OfficerRace', 'OffRace'),
        ('SubjectPerceivedRace', 'SubPerRace'),
        ('SubjectPerceivedGender', 'SubPerGend'),
        ('CallType', 'CallTyp'),
        ('Precinct', 'Prect'),
        ('Sector', 'Sect'),
    ]
)

In [None]:
# Drop the raw categorical columns now that indicator frames exist for them.
# InitialCallType and FinalCallType are dropped as well: they are outside the project objective.
data = data.drop(
    columns=[
        'SubjectAgeGroup',
        'OfficerYOB',
        'StopResolution',
        'WeaponType',
        'OfficerGender',
        'OfficerRace',
        'InitialCallType',
        'FinalCallType',
        'SubjectPerceivedRace',
        'SubjectPerceivedGender',
        'CallType',
        'Precinct',
        'Sector',
        'ArrestFlag',
        'FriskFlag',
    ]
)

In [None]:
# Append the indicator frames column-wise, then preview the widened frame.
# NOTE(review): SubAgeGrp_dummies is not part of this list — confirm that is intended.
data = pd.concat(
    objs=[
        data,
        StpRes_dummies,
        WpType_dummies,
        OffGend_dummies,
        OffRace_dummies,
        SubPerRace_dummies,
        SubPerGend_dummies,
        CallTyp_dummies,
        Prect_dummies,
        Sect_dummies,
        ArrFlg_dummies,
        FrkFlg_dummies,
    ],
    axis=1,
)
data.head()

In [None]:
# Imports for the decision-tree baseline.
# StringIO comes from the standard library: the vendored copy at
# sklearn.externals.six is no longer shipped by current scikit-learn releases,
# so importing it from there fails with ImportError.
import math
from io import StringIO

import pydotplus
from IPython.display import Image
from sklearn import tree
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz

In [None]:
# Print the per-column value frequencies of the encoded frame.
for column, values in data.items():
    print("\n" + column)
    print(values.value_counts())

In [None]:
# 'SouthWest' and 'Southwest' are two spellings of the same precinct; add the
# two indicator columns together into a single Prect_SWest column.
data['Prect_SWest'] = data['Prect_SouthWest'].add(data['Prect_Southwest'])

In [None]:
# The two spelling variants are now represented by Prect_SWest; drop them.
data = data.drop(columns=['Prect_SouthWest', 'Prect_Southwest'])

In [None]:
# Print the per-column value frequencies once more after merging the precinct columns.
for column, values in data.items():
    print("\n" + column)
    print(values.value_counts())

#Baseline Model Selection: Decision Tree

In [None]:
# Predictors for the baseline: officer gender, officer race, perceived subject
# race and gender, officer birth cohort, and precinct. The target is the
# ArrFlg_Y indicator column.
X = data.loc[
    :,
    [
        'OffGend_F',
        'OffGend_M',
        'OffGend_N',
        'OffRace_American Indian/Alaska Native',
        'OffRace_Asian',
        'OffRace_Black or African American',
        'OffRace_Hispanic or Latino',
        'OffRace_Nat Hawaiian/Oth Pac Islander',
        'OffRace_Not Specified',
        'OffRace_Two or More Races',
        'OffRace_Unknown',
        'OffRace_White',
        'SubPerRace_American Indian or Alaska Native',
        'SubPerRace_Asian',
        'SubPerRace_Black or African American',
        'SubPerRace_Hispanic',
        'SubPerRace_Multi-Racial',
        'SubPerRace_Native Hawaiian or Other Pacific Islander',
        'SubPerRace_Other',
        'SubPerRace_Unknown',
        'SubPerRace_White',
        'SubPerGend_Female',
        'SubPerGend_Gender Diverse (gender non-conforming and/or transgender)',
        'SubPerGend_Male',
        'SubPerGend_Unable to Determine',
        'SubPerGend_Unknown',
        'Oldest',
        'SecondOldest',
        'ThirdOldest',
        'FourthOldest',
        'FifthOldest',
        'Youngest',
        'Prect_East',
        'Prect_FK ERROR',
        'Prect_North',
        'Prect_OOJ',
        'Prect_South',
        'Prect_SWest',
        'Prect_Unknown',
        'Prect_West',
    ],
]
y = data['ArrFlg_Y']

In [None]:
# Hold out 30% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [None]:
# Regression tree limited to depth 5, fitted on the training split.
regTree = DecisionTreeRegressor(max_depth=5)
regTree.fit(X=X_train, y=y_train)

In [None]:
# Export the fitted regression tree as Graphviz DOT text into an in-memory
# buffer, build a graph from that text, and display it as a PNG.
dot_data = StringIO()
export_graphviz(regTree, special_characters=True, out_file=dot_data)

dot_source = dot_data.getvalue()
graph = pydotplus.graph_from_dot_data(dot_source)
Image(data=graph.create_png())

In [None]:
# Predict on the held-out test rows; the predictions feed the MSE computation that follows.
regPred = regTree.predict(X_test)

In [None]:
# Mean squared error of the regression tree on the test split, followed by its square root.
mse = mean_squared_error(y_true=y_test, y_pred=regPred)
rmse = math.sqrt(mse)
print(mse)
print(rmse)

In [None]:
# Unrestricted decision tree classifier using the entropy criterion, fitted on the training split.
clf = DecisionTreeClassifier(criterion='entropy')
clf.fit(X=X_train, y=y_train)

In [None]:
# Draw the fitted entropy tree.
# Node labels must come from the columns the classifier was trained on
# (X_train). Passing every column of `data` attaches wrong names, because
# plot_tree looks names up by feature index. Plain lists are passed because
# some scikit-learn releases reject a pandas Index / NumPy array here.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(3, 3), dpi=300)
tree.plot_tree(
    clf,
    feature_names=list(X_train.columns),
    # clf.classes_ is the class order the fitted tree itself uses.
    class_names=[str(label) for label in clf.classes_],
    filled=True,
    ax=axes,
)
plt.show()

In [None]:
# 3-fold cross-validation of a default decision tree on the training split;
# report the mean of the fold scores.
dt_clf = DecisionTreeClassifier()
dt_cv_score = cross_val_score(estimator=dt_clf, X=X_train, y=y_train, cv=3)
mean_dt_cv_score = dt_cv_score.mean()

print(f"Mean Cross Validation Score: {mean_dt_cv_score :.2%}")

In [None]:
#Tree Ensembles and Random Forests
#3 predictors are Precinct, Subject Race, Officer Race
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

In [None]:
# Predictors for the ensemble models: officer race, perceived subject race,
# and precinct indicators. The target is still ArrFlg_Y.
X = data.loc[
    :,
    [
        'OffRace_American Indian/Alaska Native',
        'OffRace_Asian',
        'OffRace_Black or African American',
        'OffRace_Hispanic or Latino',
        'OffRace_Nat Hawaiian/Oth Pac Islander',
        'OffRace_Not Specified',
        'OffRace_Two or More Races',
        'OffRace_Unknown',
        'OffRace_White',
        'SubPerRace_American Indian or Alaska Native',
        'SubPerRace_Asian',
        'SubPerRace_Black or African American',
        'SubPerRace_Hispanic',
        'SubPerRace_Multi-Racial',
        'SubPerRace_Native Hawaiian or Other Pacific Islander',
        'SubPerRace_Other',
        'SubPerRace_Unknown',
        'SubPerRace_White',
        'Prect_East',
        'Prect_FK ERROR',
        'Prect_North',
        'Prect_OOJ',
        'Prect_South',
        'Prect_SWest',
        'Prect_Unknown',
        'Prect_West',
    ],
]
y = data['ArrFlg_Y']

In [None]:
# Re-split with the reduced predictor set: 25% held out, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=123,
    test_size=0.25,
)

In [None]:
# Depth-5 decision tree using the Gini criterion, fitted on the training split.
tree_clf = DecisionTreeClassifier(max_depth=5, criterion='gini')
tree_clf.fit(X=X_train, y=y_train)

In [None]:
# Drop the officer-gender, subject-gender and officer-birth-cohort indicators,
# which the ensemble models do not use.
data = data.drop(
    columns=[
        'OffGend_F',
        'OffGend_M',
        'OffGend_N',
        'SubPerGend_Female',
        'SubPerGend_Gender Diverse (gender non-conforming and/or transgender)',
        'SubPerGend_Male',
        'SubPerGend_Unable to Determine',
        'SubPerGend_Unknown',
        'Oldest',
        'SecondOldest',
        'ThirdOldest',
        'FourthOldest',
        'FifthOldest',
        'Youngest',
    ]
)

In [None]:
# Keep the ArrFlg_Y column in its own variable and remove it from the frame.
target = data['ArrFlg_Y']
data = data.drop(columns="ArrFlg_Y")

In [None]:
# Show the full configuration of the fitted depth-5 tree.
# This cell previously held the pasted text of an estimator repr. Run as code,
# that builds and discards a new unfitted estimator, and it passes keyword
# arguments (presort, min_impurity_split) that current scikit-learn releases
# no longer accept.
tree_clf.get_params()

In [None]:
# Raw feature-importance array of the fitted depth-5 tree (one value per training column).
tree_clf.feature_importances_

In [None]:
def plot_feature_importances(model, feature_names=None):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Parameters
    ----------
    model
        Fitted estimator exposing a ``feature_importances_`` attribute.
    feature_names : sequence of str, optional
        Bar labels, in the same order as the columns the model was fitted on.
        When omitted, the columns of the notebook-level ``X_train`` are used,
        which is what this function always did before the parameter existed.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of importances, which
        would mean the labels do not belong to this model.
    """
    importances = np.asarray(model.feature_importances_)
    if feature_names is None:
        # Backward-compatible default: label with the current training columns.
        feature_names = X_train.columns
    labels = list(feature_names)
    if len(labels) != len(importances):
        raise ValueError(
            f"got {len(labels)} feature names for {len(importances)} importances"
        )

    positions = np.arange(len(importances))
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.barh(positions, importances, align='center')
    ax.set_yticks(positions)
    ax.set_yticklabels(labels)
    ax.set_xlabel('Feature importance')
    ax.set_ylabel('Feature')
    ax.set_title(type(model).__name__)

plot_feature_importances(tree_clf)

In [None]:
#Model Performance

In [None]:
# Predict the test split with the depth-5 tree, then print the confusion
# matrix and the per-class precision/recall/F1 report.
pred = tree_clf.predict(X_test)

print(confusion_matrix(y_true=y_test, y_pred=pred))
print(classification_report(y_true=y_test, y_pred=pred))

In [None]:
# Test accuracy of the depth-5 tree, expressed as a percentage.
dt_test_accuracy_pct = accuracy_score(y_test, pred) * 100
print("Testing Accuracy for Decision Tree Classifier: {:.4}%".format(dt_test_accuracy_pct))

In [None]:
#Bagged Trees

In [None]:
# Bagging ensemble of 20 depth-5 Gini trees.
# The base tree is passed positionally as the first argument of BaggingClassifier.
bagged_tree =  BaggingClassifier(DecisionTreeClassifier(criterion='gini', max_depth=5), 
                                 n_estimators=20)

In [None]:
# Fit the bagging ensemble on the training split.
bagged_tree.fit(X_train, y_train)

In [None]:
# Show the full configuration of the fitted bagging ensemble.
# This cell previously held the pasted text of an estimator repr. Run as code,
# that builds and discards a new unfitted ensemble, and it passes keyword
# arguments (base_estimator, presort, min_impurity_split) that current
# scikit-learn releases no longer accept.
bagged_tree.get_params()

In [None]:
# Bagging ensemble: score on the training split.
bagged_tree.score(X_train, y_train)

In [None]:
# Bagging ensemble: score on the held-out test split.
bagged_tree.score(X_test, y_test)

In [None]:
# Random forest of 100 trees, each limited to depth 5, fitted on the training split.
forest = RandomForestClassifier(max_depth=5, n_estimators=100)
forest.fit(X=X_train, y=y_train)

In [None]:
# Show the full configuration of the fitted 100-tree forest.
# This cell previously held the pasted text of an estimator repr. Run as code,
# that builds and discards a new unfitted forest, and it passes values
# (max_features='auto', min_impurity_split) that current scikit-learn releases
# no longer accept.
forest.get_params()

In [None]:
# Random forest: score on the training split.
forest.score(X_train, y_train)

In [None]:
# Random forest: score on the held-out test split.
forest.score(X_test, y_test)

In [None]:
# Bar chart of the 100-tree forest's feature importances.
plot_feature_importances(forest)

In [None]:
# A deliberately small forest: 5 trees of depth 2, each split choosing among
# 10 candidate features. Fitted on the training split.
forest_2 = RandomForestClassifier(max_depth=2, max_features=10, n_estimators=5)
forest_2.fit(X=X_train, y=y_train)

In [None]:
# Show the full configuration of the fitted 5-tree forest.
# This cell previously held the pasted text of an estimator repr. Run as code,
# that builds and discards a new unfitted forest, and it passes a keyword
# argument (min_impurity_split) that current scikit-learn releases no longer accept.
forest_2.get_params()

In [None]:
# Take the first fitted tree (index 0) out of forest_2's estimators_ list.
rf_tree_1 = forest_2.estimators_[0]

In [None]:
# Feature importances of the first tree in forest_2.
plot_feature_importances(rf_tree_1)

In [None]:
# Take the second fitted tree (index 1) out of forest_2's estimators_ list.
rf_tree_2 = forest_2.estimators_[1]

In [None]:
# Feature importances of the second tree in forest_2, for comparison with the first.
plot_feature_importances(rf_tree_2)

In [None]:
#Grid Search

In [None]:
# Default-parameter decision tree; used as the estimator for the cross-validation
# and grid-search cells that follow.
dt_clf = DecisionTreeClassifier()

In [None]:
# 3-fold cross-validation of the default tree on the current training split,
# as a reference point before tuning.
dt_cv_score = cross_val_score(estimator=dt_clf, X=X_train, y=y_train, cv=3)
mean_dt_cv_score = dt_cv_score.mean()
print(f"Mean Cross Validation Score: {mean_dt_cv_score: .2%}")

In [None]:
# Hyperparameter values to try for the decision tree.
dt_param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 2, 3, 4, 5, 6],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 3, 4, 5, 6],
)


In [None]:
# Number of trees the grid search fits: one per hyperparameter combination per
# cross-validation fold. The combination count is derived from dt_param_grid so
# it stays correct when the grid is edited (currently 2*6*3*6 = 216).
# The leading 3 is the fold count and must match the cv value given to GridSearchCV.
num_decision_trees = 3 * int(np.prod([len(candidates) for candidates in dt_param_grid.values()]))
print(f"Grid Search will have to search through {num_decision_trees} different permutations")

In [None]:
#Import library needed for Grid Search
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

# Grid search over dt_param_grid with 3-fold cross-validation; training-fold
# scores are kept so they can be summarised afterwards.
dt_grid_search = GridSearchCV(
    estimator=dt_clf,
    param_grid=dt_param_grid,
    cv=3,
    return_train_score=True,
)

In [None]:
# Run the grid search on the training split.
dt_grid_search.fit(X_train, y_train)

In [None]:
# Show the top-level configuration of the fitted grid search.
# This cell previously held the pasted text of a GridSearchCV repr. Run as code,
# that builds and discards a new unfitted search, and it passes arguments
# (iid, error_score='raise-deprecating', presort, min_impurity_split) that
# current scikit-learn releases no longer accept.
dt_grid_search.get_params(deep=False)

In [None]:
# Average of the 'mean_train_score' entries in cv_results_, i.e. averaged over
# every hyperparameter combination that was tried, not just the best one.
dt_gs_training_score = np.mean(dt_grid_search.cv_results_['mean_train_score'])

In [None]:
# Score of the fitted grid search on the held-out test split.
# NOTE(review): this is one .score() value, not a mean.
dt_gs_testing_score = dt_grid_search.score(X_test, y_test)

In [None]:
# Report both grid-search scores and the best hyperparameter combination.
# NOTE(review): the "Mean Test Score" label is printed for dt_gs_testing_score,
# which holds a single .score(X_test, y_test) value — consider relabelling.
print(f"Mean Training Score: {dt_gs_training_score :.2%}")
print(f"Mean Test Score: {dt_gs_testing_score :.2%}")
print("Best Parameter Combination Found During Grid Search:")
dt_grid_search.best_params_