# General cleaning of Terry_Stops.csv

In [1]:
# import necessary libraries
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including any deprecation warnings raised by the cleaning steps below.
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import scipy.stats as scs
import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.stats.api as sms

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, make_scorer, r2_score
from sklearn.feature_selection import RFE
import seaborn as sns
# seaborn theme for any plots drawn in this notebook
sns.set_style('darkgrid')

import matplotlib.pyplot as plt
from matplotlib import style
# matplotlib style sheet; applied after the seaborn call above
style.use('ggplot')
# render figures inline in the cell output
%matplotlib inline

### Information about the dataset
> In Terry v. Ohio, a landmark Supreme Court case in 1967-8, the court found that a police officer was not in violation of the "unreasonable search and seizure" clause of the Fourth Amendment, even though he stopped and frisked a couple of suspects only because their behavior was suspicious. Thus was born the notion of "reasonable suspicion", according to which an agent of the police may e.g. temporarily detain a person, even in the absence of clearer evidence that would be required for full-blown arrests etc. Terry Stops are stops made of suspicious drivers.

Build a classifier to predict whether an arrest was made after a Terry Stop, given information about the presence of weapons, the time of day of the call, etc. Note that this is a binary classification problem.

***
## Import dataframe from csv file using pd.read_csv function

In [2]:
# Load the raw Terry Stops CSV file into a DataFrame
terry_df = pd.read_csv(filepath_or_buffer='../data/Terry_Stops.csv')

***
## Review the Terry Stops data

In [3]:
# Preview the first five rows of the raw frame
terry_df.head(n=5)

Unnamed: 0,Subject Age Group,Subject ID,GO / SC Num,Terry Stop ID,Stop Resolution,Weapon Type,Officer ID,Officer YOB,Officer Gender,Officer Race,...,Reported Time,Initial Call Type,Final Call Type,Call Type,Officer Squad,Arrest Flag,Frisk Flag,Precinct,Sector,Beat
0,-,-1,20140000120677,92317,Arrest,,7500,1984,M,Black or African American,...,11:32:00,-,-,-,SOUTH PCT 1ST W - ROBERT,N,N,South,O,O2
1,-,-1,20150000001463,28806,Field Contact,,5670,1965,M,White,...,07:59:00,-,-,-,,N,N,-,-,-
2,-,-1,20150000001516,29599,Field Contact,,4844,1961,M,White,...,19:12:00,-,-,-,,N,-,-,-,-
3,-,-1,20150000001670,32260,Field Contact,,7539,1963,M,White,...,04:55:00,-,-,-,,N,N,-,-,-
4,-,-1,20150000001739,33155,Field Contact,,6973,1977,M,White,...,00:41:00,-,-,-,,N,N,-,-,-


In [4]:
# Column names, non-null counts and dtypes of the raw frame
terry_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44869 entries, 0 to 44868
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Subject Age Group         44869 non-null  object
 1   Subject ID                44869 non-null  int64 
 2   GO / SC Num               44869 non-null  int64 
 3   Terry Stop ID             44869 non-null  int64 
 4   Stop Resolution           44869 non-null  object
 5   Weapon Type               44869 non-null  object
 6   Officer ID                44869 non-null  object
 7   Officer YOB               44869 non-null  int64 
 8   Officer Gender            44869 non-null  object
 9   Officer Race              44869 non-null  object
 10  Subject Perceived Race    44869 non-null  object
 11  Subject Perceived Gender  44869 non-null  object
 12  Reported Date             44869 non-null  object
 13  Reported Time             44869 non-null  object
 14  Initial Call Type     

In [5]:
# (rows, columns) of the raw frame
terry_df.shape

(44869, 23)

In [6]:
# Full list of raw column labels, used to choose the subset kept below
terry_df.columns

Index(['Subject Age Group', 'Subject ID', 'GO / SC Num', 'Terry Stop ID',
       'Stop Resolution', 'Weapon Type', 'Officer ID', 'Officer YOB',
       'Officer Gender', 'Officer Race', 'Subject Perceived Race',
       'Subject Perceived Gender', 'Reported Date', 'Reported Time',
       'Initial Call Type', 'Final Call Type', 'Call Type', 'Officer Squad',
       'Arrest Flag', 'Frisk Flag', 'Precinct', 'Sector', 'Beat'],
      dtype='object')

***
##  Creating subset dataframe with columns to be used

In [7]:
# Keep only the columns used in the rest of the cleaning; the order here
# matters because the next cell renames the columns by position.
terry_df = terry_df.loc[:, [
    'Subject Age Group',
    'Subject ID',
    'Stop Resolution',
    'Weapon Type',
    'Officer ID',
    'Officer YOB',
    'Officer Gender',
    'Officer Race',
    'Subject Perceived Race',
    'Subject Perceived Gender',
    'Reported Date',
    'Initial Call Type',
    'Call Type',
    'Officer Squad',
    'Arrest Flag',
    'Frisk Flag',
    'Precinct',
    'Sector',
    'Beat',
]]

In [8]:
# Rename the retained columns to short snake_case labels.
# Labels are assigned by position, so their order must match the column
# selection made in the previous cell.
# NOTE: 'Officer YOB' is labelled officer_age even though it still holds the
# birth year at this point; a later cell converts it to an age in place.
terry_df = terry_df.set_axis(
    [
        'subject_age',        # Subject Age Group
        'subject_id',         # Subject ID
        'stop_resolution',    # Stop Resolution
        'weapon_type',        # Weapon Type
        'officer_id',         # Officer ID
        'officer_age',        # Officer YOB
        'officer_gender',     # Officer Gender
        'officer_race',       # Officer Race
        'subject_race',       # Subject Perceived Race
        'subject_gender',     # Subject Perceived Gender
        'date',               # Reported Date
        'initial_call_type',  # Initial Call Type
        'call_type',          # Call Type
        'officer_squad',      # Officer Squad
        'arrest',             # Arrest Flag
        'frisk',              # Frisk Flag
        'precinct',           # Precinct
        'sector',             # Sector
        'beat',               # Beat
    ],
    axis='columns',
)

In [9]:
# Confirm the renamed columns and their current dtypes
terry_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44869 entries, 0 to 44868
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   subject_age        44869 non-null  object
 1   subject_id         44869 non-null  int64 
 2   stop_resolution    44869 non-null  object
 3   weapon_type        44869 non-null  object
 4   officer_id         44869 non-null  object
 5   officer_age        44869 non-null  int64 
 6   officer_gender     44869 non-null  object
 7   officer_race       44869 non-null  object
 8   subject_race       44869 non-null  object
 9   subject_gender     44869 non-null  object
 10  date               44869 non-null  object
 11  initial_call_type  44869 non-null  object
 12  call_type          44869 non-null  object
 13  officer_squad      44288 non-null  object
 14  arrest             44869 non-null  object
 15  frisk              44869 non-null  object
 16  precinct           44869 non-null  objec

***
## Converting column datatypes

In [10]:
# Cast the low-cardinality text columns to 'category' and parse the report
# date into a datetime column.
# The datetime unit is spelled out ('datetime64[ns]') because pandas >= 2.0
# rejects the unit-less 'datetime64' alias in astype with a TypeError; the
# explicit form gives the same datetime64[ns] column on older versions too.
terry_df = terry_df.astype({
    'subject_age': 'category',
    'officer_gender': 'category',
    'officer_race': 'category',
    'subject_race': 'category',
    'subject_gender': 'category',
    'date': 'datetime64[ns]',
    'arrest': 'category',
    'frisk': 'category',
    'precinct': 'category',
    'sector': 'category',
    'beat': 'category',
})

In [11]:
# Verify that the category / datetime conversions took effect
terry_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44869 entries, 0 to 44868
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   subject_age        44869 non-null  category      
 1   subject_id         44869 non-null  int64         
 2   stop_resolution    44869 non-null  object        
 3   weapon_type        44869 non-null  object        
 4   officer_id         44869 non-null  object        
 5   officer_age        44869 non-null  int64         
 6   officer_gender     44869 non-null  category      
 7   officer_race       44869 non-null  category      
 8   subject_race       44869 non-null  category      
 9   subject_gender     44869 non-null  category      
 10  date               44869 non-null  datetime64[ns]
 11  initial_call_type  44869 non-null  object        
 12  call_type          44869 non-null  object        
 13  officer_squad      44288 non-null  object        
 14  arrest

***
## Exploring column value counts and nan values

In [12]:
# The dataset uses '-' as a placeholder for missing values.
# Swap every '-' for NaN so that pandas treats those cells as missing;
# the result is rebound rather than mutated in place.
terry_df = terry_df.replace(to_replace='-', value=np.nan)

In [13]:
# Missing values per column, now that '-' placeholders count as NaN
terry_df.isna().sum()

subject_age           1420
subject_id               0
stop_resolution          0
weapon_type           9700
officer_id              24
officer_age              0
officer_gender           0
officer_race             0
subject_race          1764
subject_gender         266
date                     0
initial_call_type    13039
call_type            13039
officer_squad          581
arrest                   0
frisk                  478
precinct              9733
sector                9931
beat                  9878
dtype: int64

In [14]:
# Distribution of subject age groups, with missing values counted
terry_df['subject_age'].value_counts(dropna=False)

26 - 35         14917
36 - 45          9464
18 - 25          9084
46 - 55          5771
56 and Above     2278
1 - 17           1935
NaN              1420
Name: subject_age, dtype: int64

In [15]:
# Distribution of stop outcomes, with missing values counted
terry_df['stop_resolution'].value_counts(dropna=False)

Field Contact               17987
Offense Report              15126
Arrest                      10853
Referred for Prosecution      728
Citation / Infraction         175
Name: stop_resolution, dtype: int64

In [16]:
# Distribution of recorded weapon types, with missing values counted
terry_df['weapon_type'].value_counts(dropna=False)

None                                 32565
NaN                                   9700
Lethal Cutting Instrument             1482
Knife/Cutting/Stabbing Instrument      498
Handgun                                281
Firearm Other                          100
Blunt Object/Striking Implement         66
Club, Blackjack, Brass Knuckles         49
Firearm                                 34
Mace/Pepper Spray                       20
Other Firearm                           18
Firearm (unk type)                      15
Club                                     9
None/Not Applicable                      7
Rifle                                    7
Taser/Stun Gun                           7
Fire/Incendiary Device                   4
Shotgun                                  3
Automatic Handgun                        2
Brass Knuckles                           1
Blackjack                                1
Name: weapon_type, dtype: int64

In [17]:
# Officer age = year of the report date minus the officer's year of birth.
# officer_age holds the year of birth on entry and is overwritten with the
# age, so this cell must run exactly once per load (re-running it would
# subtract the age from the report year again).
report_year = terry_df['date'].dt.year
terry_df['officer_age'] = report_year.sub(terry_df['officer_age'])

In [18]:
# Inspect a sample of rows whose computed officer age exceeds 70
terry_df.loc[terry_df['officer_age'] > 70].head()

Unnamed: 0,subject_age,subject_id,stop_resolution,weapon_type,officer_id,officer_age,officer_gender,officer_race,subject_race,subject_gender,date,initial_call_type,call_type,officer_squad,arrest,frisk,precinct,sector,beat
1068,,-1,Field Contact,,,119,N,Unknown,,,2019-07-27,,,,N,Y,West,K,K3
1069,,-1,Field Contact,,,119,N,Unknown,,,2019-07-27,,,,N,Y,West,K,K3
1070,,-1,Field Contact,,,119,N,Unknown,,,2019-07-27,,,,N,Y,West,K,K3
1091,,-1,Field Contact,,,119,N,Unknown,White,Male,2019-09-10,,,,N,N,East,E,E3
1201,,-1,Field Contact,,8742.0,120,M,Unknown,,Male,2020-10-17,DOWN - CHECK FOR PERSON DOWN,911.0,TRAINING - FIELD TRAINING SQUAD,N,N,West,K,K3


In [21]:
# Drop rows where the computed officer age is above 70.
# The rows previewed above show ages of about 119-120 and mostly NaN or
# 'Unknown' values in the other columns.
is_over_70 = terry_df['officer_age'] > 70
terry_df = terry_df.loc[~is_over_70]

In [22]:
# Summary statistics of officer age after removing the rows above 70
terry_df['officer_age'].describe()

count    44809.000000
mean        34.658685
std          8.462094
min         21.000000
25%         28.000000
50%         33.000000
75%         39.000000
max         70.000000
Name: officer_age, dtype: float64

In [25]:
# Officer gender distribution; categories with no remaining rows show as 0
terry_df['officer_gender'].value_counts(dropna=False)

M    39703
F     5106
N        0
Name: officer_gender, dtype: int64

In [26]:
# Officer gender values actually present in the data
terry_df['officer_gender'].unique()

['M', 'F']
Categories (2, object): ['M', 'F']

In [28]:
# Officer race distribution, with missing values counted
terry_df['officer_race'].value_counts(dropna=False)

White                            34132
Hispanic or Latino                2549
Two or More Races                 2489
Asian                             1854
Black or African American         1793
Not Specified                     1244
Nat Hawaiian/Oth Pac Islander      437
American Indian/Alaska Native      311
Unknown                              0
Name: officer_race, dtype: int64

In [None]:
# 'Not Specified' and 'Unknown' mean the same thing,
# so I will just combine them into a single value