# Mod_5_Project

## Classify whether a traffic stop will end in an arrest or not

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
from datetime import datetime

In [None]:
# Display all expressions in a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "last_expr"

In [2]:
df = pd.read_csv('terry-stops.csv')

In [None]:
# Adjust jupyter's display limits
pd.set_option('display.max_columns', 25)
pd.set_option('display.max_rows', 200)

In [3]:
df.shape

(26042, 23)

In [4]:
# Call Types, Squad, Precinct, Sector, Beat to be dropped
df.isna().sum()

Subject Age Group              0
Subject ID                   892
GO / SC Num                    0
Terry Stop ID                  0
Stop Resolution                0
Weapon Type                    0
Officer ID                     0
Officer YOB                   17
Officer Gender                 0
Officer Race                   0
Subject Perceived Race         0
Subject Perceived Gender       0
Reported Date                  0
Reported Time                  0
Initial Call Type           9854
Final Call Type             9854
Call Type                   9854
Officer Squad                407
Arrest Flag                    0
Frisk Flag                     0
Precinct                    9854
Sector                      9854
Beat                        9854
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26042 entries, 0 to 26041
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Subject Age Group         26042 non-null  object 
 1   Subject ID                25150 non-null  float64
 2   GO / SC Num               26042 non-null  int64  
 3   Terry Stop ID             26042 non-null  int64  
 4   Stop Resolution           26042 non-null  object 
 5   Weapon Type               26042 non-null  object 
 6   Officer ID                26042 non-null  int64  
 7   Officer YOB               26025 non-null  float64
 8   Officer Gender            26042 non-null  object 
 9   Officer Race              26042 non-null  object 
 10  Subject Perceived Race    26042 non-null  object 
 11  Subject Perceived Gender  26042 non-null  object 
 12  Reported Date             26042 non-null  object 
 13  Reported Time             26042 non-null  object 
 14  Initia

In [6]:
df.head()

Unnamed: 0,Subject Age Group,Subject ID,GO / SC Num,Terry Stop ID,Stop Resolution,Weapon Type,Officer ID,Officer YOB,Officer Gender,Officer Race,...,Reported Time,Initial Call Type,Final Call Type,Call Type,Officer Squad,Arrest Flag,Frisk Flag,Precinct,Sector,Beat
0,36 - 45,,20150000088359,10012,-,,1735,1977.0,M,White,...,09:58:00.0000000,WARRANT - FELONY PICKUP,--WARRANT SERVICES - FELONY,ONVIEW,,N,N,East,E,E2
1,26 - 35,,20150000087858,10068,GO Report,,1561,1984.0,M,White,...,19:08:00.0000000,THREATS (INCLS IN-PERSON/BY PHONE/IN WRITING),--DISTURBANCE - OTHER,911,,N,N,East,G,G3
2,18 - 25,,20150000001461,10103,Street Check,,1539,1973.0,M,White,...,01:32:00.0000000,,,,,N,N,,,
3,18 - 25,,20150000001461,10104,Street Check,,1539,1973.0,M,White,...,01:35:00.0000000,,,,,N,N,,,
4,26 - 35,,20150000103696,10207,Street Check,,1539,1973.0,M,White,...,21:08:00.0000000,PROPERTY - DAMAGE,--PROPERTY DEST (DAMG),911,,N,N,East,E,E2


## Cleaning

#### 1. Subject Age Group

In [7]:
df['Subject Age Group'].value_counts()

26 - 35         8473
18 - 25         5810
36 - 45         5131
46 - 55         3244
1 - 17          1390
56 and Above    1158
-                836
Name: Subject Age Group, dtype: int64

#### 2. Subject ID (repeat offenders)

In [8]:
repeat_offenders = df['Subject ID'].value_counts() > 2
medium_offenders = df['Subject ID'].value_counts() > 4
career_offenders = df['Subject ID'].value_counts() > 6

In [9]:
career_offenders.value_counts()


False    19738
True        84
Name: Subject ID, dtype: int64

#### 3. GO / SC Num (identifier for type of stop)

In [10]:
print(df['GO / SC Num'].value_counts())

# With 19,620 unique codes, this column seems like it can be dropped
df.drop('GO / SC Num', axis=1, inplace=True)

20150000190790    16
20160000378750    16
20180000134604    14
20170000132836    13
20160000001326    12
                  ..
20170000298261     1
20170000212255     1
20170000000403     1
20170000001286     1
20170000294189     1
Name: GO / SC Num, Length: 19620, dtype: int64


#### 4. Terry Stop ID (unique identifier for each of stop)

In [11]:
df['Terry Stop ID'].value_counts()

12282    1
38232    1
21824    1
23873    1
17730    1
        ..
37607    1
10992    1
15090    1
13043    1
10245    1
Name: Terry Stop ID, Length: 26042, dtype: int64

#### 5. Stop Resolution (final outcome) - target variable

In [12]:
df['Stop Resolution'].value_counts()
len(df['Stop Resolution'].unique())

6

#### 6. Weapon Type

In [13]:
df['Weapon Type'].value_counts()
len(df['Weapon Type'].unique())

12

In [14]:
weapon_map = {
    'None': 'None',
    '-': 'Other',
    'Lethal Cutting Instrument':'Lethal Cutting Instrument',
    'Handgun': 'Firearm',
    'Firearm Other': 'Firearm',
    'Firearm (unk type)': 'Firearm',
    'Shotgun': 'Firearm',
    'Rifle': 'Firearm',
    'Club': 'Club, Blackjack, Brass Knuckles',
    'Blackjack': 'Club, Blackjack, Brass Knuckles',
    'Brass Knuckles': 'Club, Blackjack, Brass Knuckles'
}

In [15]:
df['Weapon Type'] = df['Weapon Type'].map(weapon_map)

In [16]:
df['Weapon Type'].value_counts()

# 34 total "NaN" values
df['Weapon Type'].isna().sum()
df['Weapon Type'].unique()

array(['None', 'Lethal Cutting Instrument', 'Firearm', nan, 'Other',
       'Club, Blackjack, Brass Knuckles'], dtype=object)

#### 7. Officer ID (unique identifier)

In [17]:
df['Officer ID'].value_counts()

# 986 reporting officers
len(df['Officer ID'].unique())

986

#### 8. Officer YOB

In [18]:
df['Officer YOB'].isna().sum()
# Does the age of the officer have any effect on the result of the terry stop

# Oldest officer born in 1948 (must drop 5 stops where Officer YOB==1900)
sorted(df['Officer YOB'])

too_old = df['Officer YOB']==1900
df.drop(df[too_old].index, inplace=True)

In [19]:
df.shape

(26037, 22)

In [20]:
currentYear = datetime.now().year

In [21]:
df['Officer_Age'] = (currentYear - df['Officer YOB'])
df['Officer_Age'].isna().sum()

17

In [64]:
df.head()

Unnamed: 0,Subject_Age_Group,Subject_ID,Terry_Stop_ID,Stop_Resolution,Weapon_Type,Officer_ID,Officer_YOB,Officer_Gender,Officer_Race,Subject_Perceived_Race,Subject_Perceived_Gender,Reported_Date,Arrest_Flag,Frisk_Flag,Officer_Age,Reported_TOD,Reported_Hour
892,18 - 25,4.0,13435,GO Report,,1559,1985.0,M,Caucasian,African American,Female,2015-08-07,N,N,35.0,21:45:00,21
893,-,5.0,32980,Street Check,,2153,1992.0,M,Caucasian,Unknown,Male,2017-11-12,N,N,28.0,01:45:00,1
894,46 - 55,6.0,39041,GO Report,,653,1961.0,M,Caucasian,African American,Male,2018-03-18,N,Y,59.0,21:49:00,21
895,46 - 55,13.0,40227,Arrest with GO or Supplemental,,1701,1978.0,M,Caucasian,Caucasian,Female,2018-05-15,Y,N,42.0,07:53:00,7
896,26 - 35,14.0,15894,GO Report,,1465,1983.0,M,Native American,Caucasian,Male,2015-10-30,N,N,37.0,11:31:00,11


#### 9. Officer Gender

In [23]:
df['Officer Gender'].value_counts()

M    23151
F     2886
Name: Officer Gender, dtype: int64

#### 10. Officer Race

In [24]:
df['Officer Race'].value_counts()

White                            20399
Hispanic or Latino                1229
Two or More Races                 1158
Black or African American         1077
Asian                              956
Not Specified                      640
Nat Hawaiian/Oth Pac Islander      308
American Indian/Alaska Native      253
Unknown                             17
Name: Officer Race, dtype: int64

In [25]:
race_map = {
    'White': 'Caucasian',
    'Hispanic or Latino': 'Hispanic',
    'Two or More Races': 'Unknown',
    'Black or African American': 'African American',
    'Asian': 'Asian',
    'Not Specified': 'Unknown',
    'Nat Hawaiian/Oth Pac Islander': 'Native Islander',
    'American Indian/Alaska Native': 'Native American',
    'Unknown': 'Unknown'
}

In [26]:
df['Officer Race'] = df['Officer Race'].map(race_map)

#### 11. Subject Perceived Race 

In [27]:
df['Subject Perceived Race'].value_counts()

White                               12834
Black                                7998
Hispanic                             1225
Unknown                              1193
American Indian / Alaskan Native      843
Asian                                 793
Multi-Racial                          669
-                                     376
Other                                 106
Name: Subject Perceived Race, dtype: int64

In [28]:
p_race_map = {
    'White': 'Caucasian',
    'Black': 'African American',
    'Hispanic': 'Hispanic',
    'Unknown': 'Unknown',
    'American Indian / Alaskan Native': 'Native American',
    'Asian': 'Asian',
    'Multi-Racial': 'Unknown',
    '-': 'Unknown',
    'Other': 'Unknown'
}

In [29]:
df['Subject Perceived Race'] = df['Subject Perceived Race'].map(p_race_map)

In [30]:
df['Subject Perceived Race'].value_counts()

Caucasian           12834
African American     7998
Unknown              2344
Hispanic             1225
Native American       843
Asian                 793
Name: Subject Perceived Race, dtype: int64

#### 12. Subject Perceived Gender

In [31]:
df['Subject Perceived Gender'].value_counts()

Male                   20207
Female                  5383
Unable to Determine      262
-                        185
Name: Subject Perceived Gender, dtype: int64

In [32]:
p_gender_map = {
    'Male': 'Male',
    'Female': 'Female',
    'Unable to Determine': 'Unable to Determine',
    '-': 'Unable to Determine'
}

In [33]:
df['Subject Perceived Gender'] = df['Subject Perceived Gender'].map(p_gender_map)

In [34]:
df['Subject Perceived Gender'].value_counts()

Male                   20207
Female                  5383
Unable to Determine      447
Name: Subject Perceived Gender, dtype: int64

#### 13. Reported Date (scrape weather forecast)

In [35]:
# To use for weather historic weather forecast
df['Reported Date'] = pd.to_datetime(df['Reported Date'], errors='coerce', format= '%Y-%m-%d', )

#### 14. Time-of-Day

In [36]:
# Converted arrest 'Reported Time' into Time-of-Day ('Reported TOD')
df['Reported TOD'] = pd.to_datetime(df['Reported Time'], format= '%H:%M:%S.%f').dt.time
df.drop('Reported Time', axis=1, inplace=True)

In [37]:
# np.logical_or(df['Reported TOD'] < 6:00 or df['Reported TOD'] > 20:00)
df['Reported Hour'] = df['Reported TOD'].apply(lambda x: x.hour)

In [38]:
early_morning = df['Reported Hour'] < 6
evening = df['Reported Hour'] > 18
night = df['Reported Hour'] > 20

In [39]:
df[early_morning].head()

Unnamed: 0,Subject Age Group,Subject ID,Terry Stop ID,Stop Resolution,Weapon Type,Officer ID,Officer YOB,Officer Gender,Officer Race,Subject Perceived Race,...,Call Type,Officer Squad,Arrest Flag,Frisk Flag,Precinct,Sector,Beat,Officer_Age,Reported TOD,Reported Hour
2,18 - 25,,10103,Street Check,,1539,1973.0,M,Caucasian,Native American,...,,,N,N,,,,47.0,01:32:00,1
3,18 - 25,,10104,Street Check,,1539,1973.0,M,Caucasian,Native American,...,,,N,N,,,,47.0,01:35:00,1
5,-,,10209,Street Check,,1651,1963.0,M,Caucasian,Unknown,...,,,N,N,,,,57.0,04:55:00,4
6,26 - 35,,10289,Street Check,,1848,1986.0,M,Unknown,African American,...,,,N,N,,,,34.0,02:33:00,2
7,26 - 35,,10290,Street Check,,1848,1986.0,M,Unknown,Caucasian,...,,,N,N,,,,34.0,02:36:00,2


#### 15. Initial Call Type, 16. Final Call Type, 17. Call Type
#### Drop all

In [None]:
df.drop(['Initial Call Type', 'Final Call Type', 'Call Type'], axis=1, inplace=True)

In [44]:
df.head()

Unnamed: 0,Subject Age Group,Subject ID,Terry Stop ID,Stop Resolution,Weapon Type,Officer ID,Officer YOB,Officer Gender,Officer Race,Subject Perceived Race,Subject Perceived Gender,Reported Date,Arrest Flag,Frisk Flag,Officer_Age,Reported TOD,Reported Hour
0,36 - 45,,10012,-,,1735,1977.0,M,Caucasian,African American,Male,2015-03-18,N,N,43.0,09:58:00,9
1,26 - 35,,10068,GO Report,,1561,1984.0,M,Caucasian,African American,Male,2015-03-17,N,N,36.0,19:08:00,19
2,18 - 25,,10103,Street Check,,1539,1973.0,M,Caucasian,Native American,Male,2015-03-19,N,N,47.0,01:32:00,1
3,18 - 25,,10104,Street Check,,1539,1973.0,M,Caucasian,Native American,Female,2015-03-19,N,N,47.0,01:35:00,1
4,26 - 35,,10207,Street Check,,1539,1973.0,M,Caucasian,Caucasian,Male,2015-03-30,N,N,47.0,21:08:00,21


#### 18. Officer Squad, 21. Precinct, 22. Sector, 23. Beat
#### All dropped

In [42]:
df.drop(['Officer Squad', 'Precinct', 'Sector', 'Beat'], axis=1, inplace=True)

In [43]:
df.head()

Unnamed: 0,Subject Age Group,Subject ID,Terry Stop ID,Stop Resolution,Weapon Type,Officer ID,Officer YOB,Officer Gender,Officer Race,Subject Perceived Race,Subject Perceived Gender,Reported Date,Arrest Flag,Frisk Flag,Officer_Age,Reported TOD,Reported Hour
0,36 - 45,,10012,-,,1735,1977.0,M,Caucasian,African American,Male,2015-03-18,N,N,43.0,09:58:00,9
1,26 - 35,,10068,GO Report,,1561,1984.0,M,Caucasian,African American,Male,2015-03-17,N,N,36.0,19:08:00,19
2,18 - 25,,10103,Street Check,,1539,1973.0,M,Caucasian,Native American,Male,2015-03-19,N,N,47.0,01:32:00,1
3,18 - 25,,10104,Street Check,,1539,1973.0,M,Caucasian,Native American,Female,2015-03-19,N,N,47.0,01:35:00,1
4,26 - 35,,10207,Street Check,,1539,1973.0,M,Caucasian,Caucasian,Male,2015-03-30,N,N,47.0,21:08:00,21


#### 20. Frisk Flag

In [45]:
df['Frisk Flag'].value_counts()

N    20097
Y     5571
-      369
Name: Frisk Flag, dtype: int64

In [46]:
df.head()

Unnamed: 0,Subject Age Group,Subject ID,Terry Stop ID,Stop Resolution,Weapon Type,Officer ID,Officer YOB,Officer Gender,Officer Race,Subject Perceived Race,Subject Perceived Gender,Reported Date,Arrest Flag,Frisk Flag,Officer_Age,Reported TOD,Reported Hour
0,36 - 45,,10012,-,,1735,1977.0,M,Caucasian,African American,Male,2015-03-18,N,N,43.0,09:58:00,9
1,26 - 35,,10068,GO Report,,1561,1984.0,M,Caucasian,African American,Male,2015-03-17,N,N,36.0,19:08:00,19
2,18 - 25,,10103,Street Check,,1539,1973.0,M,Caucasian,Native American,Male,2015-03-19,N,N,47.0,01:32:00,1
3,18 - 25,,10104,Street Check,,1539,1973.0,M,Caucasian,Native American,Female,2015-03-19,N,N,47.0,01:35:00,1
4,26 - 35,,10207,Street Check,,1539,1973.0,M,Caucasian,Caucasian,Male,2015-03-30,N,N,47.0,21:08:00,21


## Relabel Column Headers

In [47]:
df.columns

Index(['Subject Age Group', 'Subject ID', 'Terry Stop ID', 'Stop Resolution',
       'Weapon Type', 'Officer ID', 'Officer YOB', 'Officer Gender',
       'Officer Race', 'Subject Perceived Race', 'Subject Perceived Gender',
       'Reported Date', 'Arrest Flag', 'Frisk Flag', 'Officer_Age',
       'Reported TOD', 'Reported Hour'],
      dtype='object')

In [50]:
columns_nospace = ['Subject_Age_Group', 'Subject_ID', 'Terry_Stop_ID', 'Stop_Resolution',
       'Weapon_Type', 'Officer_ID', 'Officer_YOB', 'Officer_Gender',
       'Officer_Race', 'Subject_Perceived_Race', 'Subject_Perceived_Gender',
       'Reported_Date', 'Arrest_Flag', 'Frisk_Flag', 'Officer_Age', 'Reported_TOD',
       'Reported_Hour']

In [51]:
df.columns = columns_nospace

In [52]:
df.head()

Unnamed: 0,Subject_Age_Group,Subject_ID,Terry_Stop_ID,Stop_Resolution,Weapon_Type,Officer_ID,Officer_YOB,Officer_Gender,Officer_Race,Subject_Perceived_Race,Subject_Perceived_Gender,Reported_Date,Arrest_Flag,Frisk_Flag,Officer_Age,Reported_TOD,Reported_Hour
0,36 - 45,,10012,-,,1735,1977.0,M,Caucasian,African American,Male,2015-03-18,N,N,43.0,09:58:00,9
1,26 - 35,,10068,GO Report,,1561,1984.0,M,Caucasian,African American,Male,2015-03-17,N,N,36.0,19:08:00,19
2,18 - 25,,10103,Street Check,,1539,1973.0,M,Caucasian,Native American,Male,2015-03-19,N,N,47.0,01:32:00,1
3,18 - 25,,10104,Street Check,,1539,1973.0,M,Caucasian,Native American,Female,2015-03-19,N,N,47.0,01:35:00,1
4,26 - 35,,10207,Street Check,,1539,1973.0,M,Caucasian,Caucasian,Male,2015-03-30,N,N,47.0,21:08:00,21


## Drop Null values (mostly Subject ID)

In [53]:
df.shape

(26037, 17)

In [60]:
print(df.isna().sum().sum())

df.isna().sum()

960


Subject_Age_Group             0
Subject_ID                  892
Terry_Stop_ID                 0
Stop_Resolution               0
Weapon_Type                  34
Officer_ID                    0
Officer_YOB                  17
Officer_Gender                0
Officer_Race                  0
Subject_Perceived_Race        0
Subject_Perceived_Gender      0
Reported_Date                 0
Arrest_Flag                   0
Frisk_Flag                    0
Officer_Age                  17
Reported_TOD                  0
Reported_Hour                 0
dtype: int64

In [61]:
# Drop 960 records with missing data (mainly Subject_ID)
df.dropna(inplace=True)

In [62]:
df.shape

(25095, 17)

## Save to .csv for importing to other notebooks

In [63]:
df.to_csv('terry-stops-cleaned')