In [1]:
# import necessary libraries
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, precision_score, accuracy_score, recall_score, f1_score, roc_curve, auc
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings('ignore')

In [2]:
terry_df = pd.read_csv('../data/terry_clean.csv')

In [3]:
terry_df.head()

Unnamed: 0,date,subject_id,subject_age,subject_race,subject_gender,stop_resolution,weapon_type,officer_id,officer_age,officer_race,officer_gender,officer_squad,initial_call_type,call_type,arrest,frisk,precinct,sector,beat
0,2015-10-16,-1,26 - 35,Asian,Male,Arrest,,7500,31,Black or African American,M,SOUTH PCT 1ST W - ROBERT,,,0,0.0,South,O,O2
1,2015-03-19,-1,26 - 35,Other,Male,Field Contact,,5670,50,White,M,,,,0,0.0,,,
2,2015-04-01,-1,26 - 35,Other,Male,Field Contact,,7539,52,White,M,,,,0,0.0,,,
3,2015-04-03,-1,26 - 35,Black or African American,Male,Field Contact,,6973,38,White,M,,,,0,0.0,,,
4,2015-04-05,-1,26 - 35,Black or African American,Male,Field Contact,,7402,42,White,M,,,,0,0.0,,,


In [4]:
terry_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44331 entries, 0 to 44330
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   date               44331 non-null  object 
 1   subject_id         44331 non-null  int64  
 2   subject_age        44331 non-null  object 
 3   subject_race       44331 non-null  object 
 4   subject_gender     44331 non-null  object 
 5   stop_resolution    44331 non-null  object 
 6   weapon_type        34687 non-null  object 
 7   officer_id         44331 non-null  int64  
 8   officer_age        44331 non-null  int64  
 9   officer_race       44331 non-null  object 
 10  officer_gender     44331 non-null  object 
 11  officer_squad      43796 non-null  object 
 12  initial_call_type  31518 non-null  object 
 13  call_type          31518 non-null  object 
 14  arrest             44331 non-null  int64  
 15  frisk              44331 non-null  float64
 16  precinct           347

In [5]:
# changing columns into categories and datetime
terry_df = terry_df.astype({'subject_age': 'category', 'officer_gender':'category', 
                'officer_race': 'category', 'subject_race':'category', 
                'subject_gender':'category', 'date':'datetime64','arrest':'category', 
                'frisk':'category','precinct':'category', 'sector':'category',
                'beat': 'category', 'stop_resolution' : 'category', 'weapon_type' : 'category' })

In [6]:
terry_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44331 entries, 0 to 44330
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date               44331 non-null  datetime64[ns]
 1   subject_id         44331 non-null  int64         
 2   subject_age        44331 non-null  category      
 3   subject_race       44331 non-null  category      
 4   subject_gender     44331 non-null  category      
 5   stop_resolution    44331 non-null  category      
 6   weapon_type        34687 non-null  category      
 7   officer_id         44331 non-null  int64         
 8   officer_age        44331 non-null  int64         
 9   officer_race       44331 non-null  category      
 10  officer_gender     44331 non-null  category      
 11  officer_squad      43796 non-null  object        
 12  initial_call_type  31518 non-null  object        
 13  call_type          31518 non-null  object        
 14  arrest

In [7]:
terry_df.isna().sum()

date                     0
subject_id               0
subject_age              0
subject_race             0
subject_gender           0
stop_resolution          0
weapon_type           9644
officer_id               0
officer_age              0
officer_race             0
officer_gender           0
officer_squad          535
initial_call_type    12813
call_type            12813
arrest                   0
frisk                    0
precinct              9590
sector                9784
beat                  9731
dtype: int64

In [8]:
terry_df.describe()

Unnamed: 0,subject_id,officer_id,officer_age
count,44331.0,44331.0,44331.0
mean,2069479000.0,7775.958607,34.633687
std,3947564000.0,810.13628,8.452223
min,-1.0,2768.0,21.0
25%,-1.0,7522.0,28.0
50%,-1.0,7765.0,33.0
75%,-1.0,8442.0,39.0
max,17556820000.0,8804.0,70.0


In [9]:
terry_df.call_type.value_counts()

911                              19711
ONVIEW                            8412
TELEPHONE OTHER, NOT 911          3098
ALARM CALL (NOT POLICE ALARM)      290
TEXT MESSAGE                         6
SCHEDULED EVENT (RECURRING)          1
Name: call_type, dtype: int64

***
## Preprocessing the Data

In [10]:
# dropping unnecessary columns
df = terry_df.drop(['date', 'subject_id', 'officer_id', 'initial_call_type', 'call_type', 'precinct', 'sector', 'beat', 'officer_squad', 'officer_age' ], axis=1, inplace=False)

In [11]:
df.isna().sum()

subject_age           0
subject_race          0
subject_gender        0
stop_resolution       0
weapon_type        9644
officer_race          0
officer_gender        0
arrest                0
frisk                 0
dtype: int64

In [12]:
df.mode()

Unnamed: 0,subject_age,subject_race,subject_gender,stop_resolution,weapon_type,officer_race,officer_gender,arrest,frisk
0,26 - 35,White,Male,Field Contact,,White,M,0,0.0


In [13]:
# fill in weapon_type with mode 
df['weapon_type'] = df['weapon_type'].fillna('None')
df.isna().sum()

subject_age        0
subject_race       0
subject_gender     0
stop_resolution    0
weapon_type        0
officer_race       0
officer_gender     0
arrest             0
frisk              0
dtype: int64

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44331 entries, 0 to 44330
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   subject_age      44331 non-null  category
 1   subject_race     44331 non-null  category
 2   subject_gender   44331 non-null  category
 3   stop_resolution  44331 non-null  category
 4   weapon_type      44331 non-null  category
 5   officer_race     44331 non-null  category
 6   officer_gender   44331 non-null  category
 7   arrest           44331 non-null  category
 8   frisk            44331 non-null  category
dtypes: category(9)
memory usage: 392.1 KB


In [15]:
# one hot encode categorical columns
df = pd.get_dummies(df, columns = ['subject_age', 'subject_race', 'subject_gender', 'stop_resolution', 'weapon_type', 'officer_race', 'officer_gender'])
df.head()

Unnamed: 0,arrest,frisk,subject_age_1 - 17,subject_age_18 - 25,subject_age_26 - 35,subject_age_36 - 45,subject_age_46 - 55,subject_age_56 and Above,subject_race_American Indian or Alaska Native,subject_race_Asian,...,officer_race_American Indian/Alaska Native,officer_race_Asian,officer_race_Black or African American,officer_race_Hispanic or Latino,officer_race_Nat Hawaiian/Oth Pac Islander,officer_race_Not Specified,officer_race_Two or More Races,officer_race_White,officer_gender_F,officer_gender_M
0,0,0.0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
1,0,0.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
2,0,0.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
3,0,0.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
4,0,0.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1


In [16]:
df.isna().sum()

arrest                                                                      0
frisk                                                                       0
subject_age_1 - 17                                                          0
subject_age_18 - 25                                                         0
subject_age_26 - 35                                                         0
subject_age_36 - 45                                                         0
subject_age_46 - 55                                                         0
subject_age_56 and Above                                                    0
subject_race_American Indian or Alaska Native                               0
subject_race_Asian                                                          0
subject_race_Black or African American                                      0
subject_race_Hispanic                                                       0
subject_race_Multi-Racial                                       

***
## Define X and y

In [17]:
X = df.drop(['arrest'], axis=1)
y = df['arrest']

***
## Train-test Split

In [18]:
# splitting data into training-testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [19]:
# normalizing the data 
scaler = StandardScaler()
scaled_train = scaler.fit_transform(X_train)
scaled_test = scaler.transform(X_test)

# converting train data into 
scaled_df_train = pd.DataFrame(scaled_train, columns=X.columns)

scaled_df_train.head()

Unnamed: 0,frisk,subject_age_1 - 17,subject_age_18 - 25,subject_age_26 - 35,subject_age_36 - 45,subject_age_46 - 55,subject_age_56 and Above,subject_race_American Indian or Alaska Native,subject_race_Asian,subject_race_Black or African American,...,officer_race_American Indian/Alaska Native,officer_race_Asian,officer_race_Black or African American,officer_race_Hispanic or Latino,officer_race_Nat Hawaiian/Oth Pac Islander,officer_race_Not Specified,officer_race_Two or More Races,officer_race_White,officer_gender_F,officer_gender_M
0,-0.538593,-0.21021,-0.504714,1.324354,-0.518566,-0.385051,-0.230619,-0.173905,-0.17889,-0.654588,...,-0.082912,-0.208567,-0.204051,4.028268,-0.098425,-0.168507,-0.246053,-1.775491,-0.358422,0.358422
1,-0.538593,-0.21021,-0.504714,-0.755085,1.928395,-0.385051,-0.230619,-0.173905,-0.17889,-0.654588,...,-0.082912,-0.208567,-0.204051,-0.248246,-0.098425,-0.168507,-0.246053,0.563225,-0.358422,0.358422
2,-0.538593,-0.21021,-0.504714,1.324354,-0.518566,-0.385051,-0.230619,-0.173905,-0.17889,-0.654588,...,-0.082912,-0.208567,-0.204051,-0.248246,-0.098425,-0.168507,-0.246053,0.563225,-0.358422,0.358422
3,-0.538593,-0.21021,1.98132,-0.755085,-0.518566,-0.385051,-0.230619,-0.173905,-0.17889,-0.654588,...,-0.082912,-0.208567,-0.204051,-0.248246,-0.098425,-0.168507,-0.246053,0.563225,-0.358422,0.358422
4,-0.538593,-0.21021,-0.504714,-0.755085,-0.518566,-0.385051,4.336158,-0.173905,-0.17889,1.527678,...,-0.082912,-0.208567,-0.204051,-0.248246,-0.098425,-0.168507,-0.246053,0.563225,-0.358422,0.358422


In [20]:
X_train.isna().sum()

frisk                                                                       0
subject_age_1 - 17                                                          0
subject_age_18 - 25                                                         0
subject_age_26 - 35                                                         0
subject_age_36 - 45                                                         0
subject_age_46 - 55                                                         0
subject_age_56 and Above                                                    0
subject_race_American Indian or Alaska Native                               0
subject_race_Asian                                                          0
subject_race_Black or African American                                      0
subject_race_Hispanic                                                       0
subject_race_Multi-Racial                                                   0
subject_race_Native Hawaiian or Other Pacific Islander          

In [21]:
# Using SMOTE to address class imbalance

# check for class imbalance
print(y_train.value_counts()) 

# Fit SMOTE to training data
smote = SMOTE()
X_train, y_train = smote.fit_sample(X_train, y_train) 

# Preview synthetic sample class distribution
print('\n')
print(pd.Series(y_train).value_counts()) 

0    31250
1     1998
Name: arrest, dtype: int64


1    31250
0    31250
Name: arrest, dtype: int64


In [22]:
X_train.isna().sum()

frisk                                                                       2074
subject_age_1 - 17                                                             0
subject_age_18 - 25                                                            0
subject_age_26 - 35                                                            0
subject_age_36 - 45                                                            0
subject_age_46 - 55                                                            0
subject_age_56 and Above                                                       0
subject_race_American Indian or Alaska Native                                  0
subject_race_Asian                                                             0
subject_race_Black or African American                                         0
subject_race_Hispanic                                                          0
subject_race_Multi-Racial                                                      0
subject_race_Native Hawaiian

In [23]:
# why did the smote method create nan values???
# idk im confused. im

*** 
## Modeling Using Ensemble Methods
I will use four different ensemble methods for modeling which include KNeighborsClassifier(), RandomForestClassifier(), AdaBoostClassifier(), and GradientBoostingClassifier()

#### KNN Method

In [24]:
# instantiate classifer
knn_clf = KNeighborsClassifier()

# fit the classifier
knn_clf.fit(X_train, y_train)

# predict on test set
knn_test_preds = knn_clf.predict(X_test)


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').