# Mod_5_Project

## Classify whether a traffic stop will end in an arrest or not

Left off at p_race_map

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn

In [2]:
# Display all expressions in a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "last_expr"

In [3]:
df = pd.read_csv('terry-stops.csv')

In [4]:
# Adjust jupyter's display limits
pd.set_option('display.max_columns', 25)
pd.set_option('display.max_rows', 200)

In [5]:
df.shape

(26042, 23)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26042 entries, 0 to 26041
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Subject Age Group         26042 non-null  object 
 1   Subject ID                25150 non-null  float64
 2   GO / SC Num               26042 non-null  int64  
 3   Terry Stop ID             26042 non-null  int64  
 4   Stop Resolution           26042 non-null  object 
 5   Weapon Type               26042 non-null  object 
 6   Officer ID                26042 non-null  int64  
 7   Officer YOB               26025 non-null  float64
 8   Officer Gender            26042 non-null  object 
 9   Officer Race              26042 non-null  object 
 10  Subject Perceived Race    26042 non-null  object 
 11  Subject Perceived Gender  26042 non-null  object 
 12  Reported Date             26042 non-null  object 
 13  Reported Time             26042 non-null  object 
 14  Initia

In [7]:
df.head()

Unnamed: 0,Subject Age Group,Subject ID,GO / SC Num,Terry Stop ID,Stop Resolution,Weapon Type,Officer ID,Officer YOB,Officer Gender,Officer Race,Subject Perceived Race,Subject Perceived Gender,Reported Date,Reported Time,Initial Call Type,Final Call Type,Call Type,Officer Squad,Arrest Flag,Frisk Flag,Precinct,Sector,Beat
0,36 - 45,,20150000088359,10012,-,,1735,1977.0,M,White,Black,Male,2015-03-18,09:58:00.0000000,WARRANT - FELONY PICKUP,--WARRANT SERVICES - FELONY,ONVIEW,,N,N,East,E,E2
1,26 - 35,,20150000087858,10068,GO Report,,1561,1984.0,M,White,Black,Male,2015-03-17,19:08:00.0000000,THREATS (INCLS IN-PERSON/BY PHONE/IN WRITING),--DISTURBANCE - OTHER,911,,N,N,East,G,G3
2,18 - 25,,20150000001461,10103,Street Check,,1539,1973.0,M,White,American Indian / Alaskan Native,Male,2015-03-19,01:32:00.0000000,,,,,N,N,,,
3,18 - 25,,20150000001461,10104,Street Check,,1539,1973.0,M,White,American Indian / Alaskan Native,Female,2015-03-19,01:35:00.0000000,,,,,N,N,,,
4,26 - 35,,20150000103696,10207,Street Check,,1539,1973.0,M,White,White,Male,2015-03-30,21:08:00.0000000,PROPERTY - DAMAGE,--PROPERTY DEST (DAMG),911,,N,N,East,E,E2


## Cleaning

#### 1. Subject Age Group

In [8]:
df['Subject Age Group'].value_counts()

26 - 35         8473
18 - 25         5810
36 - 45         5131
46 - 55         3244
1 - 17          1390
56 and Above    1158
-                836
Name: Subject Age Group, dtype: int64

#### 2. Subject ID (repeat offenders)

In [9]:
repeat_offenders = df['Subject ID'].value_counts() > 2
medium_offenders = df['Subject ID'].value_counts() > 4
career_offenders = df['Subject ID'].value_counts() > 6

In [10]:
career_offenders.value_counts()


False    19738
True        84
Name: Subject ID, dtype: int64

#### 3. GO / SC Num (identifier for type of stop)

In [11]:
print(df['GO / SC Num'].value_counts())

# With 19,620 unique codes, this column seems like it can be dropped
df.drop('GO / SC Num', axis=1, inplace=True)

20150000190790    16
20160000378750    16
20180000134604    14
20170000132836    13
20160000001326    12
                  ..
20170000298261     1
20170000212255     1
20170000000403     1
20170000001286     1
20170000294189     1
Name: GO / SC Num, Length: 19620, dtype: int64


#### 4. Terry Stop ID (unique identifier for each of stop)

In [12]:
df['Terry Stop ID'].value_counts()

12282    1
38232    1
21824    1
23873    1
17730    1
        ..
37607    1
10992    1
15090    1
13043    1
10245    1
Name: Terry Stop ID, Length: 26042, dtype: int64

#### 5. Stop Resolution (final outcome) - target variable

In [13]:
df['Stop Resolution'].value_counts()
len(df['Stop Resolution'].unique())

6

#### 6. Weapon Type

In [14]:
df['Weapon Type'].value_counts()
len(df['Weapon Type'].unique())

12

In [15]:
weapon_map = {
    'None': 'None',
    '-': 'Other',
    'Lethal Cutting Instrument':'Lethal Cutting Instrument',
    'Handgun': 'Firearm',
    'Firearm Other': 'Firearm',
    'Firearm (unk type)': 'Firearm',
    'Shotgun': 'Firearm',
    'Rifle': 'Firearm',
    'Club': 'Club, Blackjack, Brass Knuckles',
    'Blackjack': 'Club, Blackjack, Brass Knuckles',
    'Brass Knuckles': 'Club, Blackjack, Brass Knuckles'
}

In [16]:
df['Weapon Type'] = df['Weapon Type'].map(weapon_map)

In [17]:
df['Weapon Type'].value_counts()

# 34 total "NaN" values
df['Weapon Type'].isna().sum()
df['Weapon Type'].unique()

array(['None', 'Lethal Cutting Instrument', 'Firearm', nan, 'Other',
       'Club, Blackjack, Brass Knuckles'], dtype=object)

#### 7. Officer ID (unique identifier)

In [18]:
df['Officer ID'].value_counts()

# 986 reporting officers
len(df['Officer ID'].unique())

986

#### 8. Officer YOB

In [19]:
df['Officer YOB'].isna().sum()

# Does the age of the officer have any effect on the result of the terry stop
# df['Officer YOB'].value_counts()

# Oldest officer born in 1948 (must drop 5 arrests where YOB==1900)
# sorted(df['Officer YOB'])

17

#### 9. Officer Gender

In [20]:
df['Officer Gender'].value_counts()

M    23151
F     2886
N        5
Name: Officer Gender, dtype: int64

#### 10. Officer Race

In [21]:
df['Officer Race'].value_counts()

White                            20399
Hispanic or Latino                1229
Two or More Races                 1158
Black or African American         1077
Asian                              956
Not Specified                      640
Nat Hawaiian/Oth Pac Islander      308
American Indian/Alaska Native      253
Unknown                             22
Name: Officer Race, dtype: int64

In [22]:
race_map = {
    'White': 'Caucasian',
    'Hispanic or Latino': 'Hispanic',
    'Two or More Races': 'Unknown',
    'Black or African American': 'African American',
    'Asian': 'Asian',
    'Not Specified': 'Unknown',
    'Nat Hawaiian/Oth Pac Islander': 'Native Islander',
    'American Indian/Alaska Native': 'Native American',
    'Unknown': 'Unknown'
}

In [23]:
df['Officer Race'] = df['Officer Race'].map(race_map)

#### 11. Subject Perceived Race 

In [24]:
df['Subject Perceived Race'].value_counts()

White                               12837
Black                                7999
Hispanic                             1225
Unknown                              1194
American Indian / Alaskan Native      843
Asian                                 793
Multi-Racial                          669
-                                     376
Other                                 106
Name: Subject Perceived Race, dtype: int64

In [25]:
p_race_map = {
    'White': 'Caucasian',
    'Black': 'African American',
    'Hispanic': 'Hispanic',
    'Unknown': 'Unknown',
    'American Indian / Alaskan Native': 'Native American',
    'Asian': 'Asian',
    'Multi-Racial': 'Unknown',
    '-': 'Unknown',
    'Other': 'Unknown'
}

In [26]:
df['Subject Perceived Race'] = df['Subject Perceived Race'].map(p_race_map)

In [27]:
df['Subject Perceived Race'].value_counts()

Caucasian           12837
African American     7999
Unknown              2345
Hispanic             1225
Native American       843
Asian                 793
Name: Subject Perceived Race, dtype: int64

#### 12. Subject Perceived Gender

In [28]:
df['Subject Perceived Gender'].value_counts()

Male                   20210
Female                  5385
Unable to Determine      262
-                        185
Name: Subject Perceived Gender, dtype: int64

In [29]:
p_gender_map = {
    'Male': 'Male',
    'Female': 'Female',
    'Unable to Determine': 'Unable to Determine',
    '-': 'Unable to Determine'
}

In [30]:
df['Subject Perceived Gender'] = df['Subject Perceived Gender'].map(p_gender_map)

In [31]:
df['Subject Perceived Gender'].value_counts()

Male                   20210
Female                  5385
Unable to Determine      447
Name: Subject Perceived Gender, dtype: int64

#### 13. Reported Date (scrape weather forecast)

In [32]:
# To use for weather historic weather forecast
df['Reported Date'] = pd.to_datetime(df['Reported Date'], errors='coerce', format= '%Y-%m-%d', )

#### 14. Time-of-Day

In [33]:
# Converted arrest 'Reported Time' into Time-of-Day ('Reported TOD')
df['Reported TOD'] = pd.to_datetime(df['Reported Time'], format= '%H:%M:%S.%f').dt.time
df.drop('Reported Time', axis=1, inplace=True)

In [34]:
# np.logical_or(df['Reported TOD'] < 6:00 or df['Reported TOD'] > 20:00)
df['Reported Hour'] = df['Reported TOD'].apply(lambda x: x.hour)

In [35]:
early_morning = df['Reported Hour'] < 6
evening = df['Reported Hour'] > 18
night = df['Reported Hour'] > 20

In [36]:
df[early_morning].head()

Unnamed: 0,Subject Age Group,Subject ID,Terry Stop ID,Stop Resolution,Weapon Type,Officer ID,Officer YOB,Officer Gender,Officer Race,Subject Perceived Race,Subject Perceived Gender,Reported Date,Initial Call Type,Final Call Type,Call Type,Officer Squad,Arrest Flag,Frisk Flag,Precinct,Sector,Beat,Reported TOD,Reported Hour
2,18 - 25,,10103,Street Check,,1539,1973.0,M,Caucasian,Native American,Male,2015-03-19,,,,,N,N,,,,01:32:00,1
3,18 - 25,,10104,Street Check,,1539,1973.0,M,Caucasian,Native American,Female,2015-03-19,,,,,N,N,,,,01:35:00,1
5,-,,10209,Street Check,,1651,1963.0,M,Caucasian,Unknown,Unable to Determine,2015-04-01,,,,,N,N,,,,04:55:00,4
6,26 - 35,,10289,Street Check,,1848,1986.0,M,Unknown,African American,Male,2015-04-10,,,,,N,N,,,,02:33:00,2
7,26 - 35,,10290,Street Check,,1848,1986.0,M,Unknown,Caucasian,Female,2015-04-10,,,,,N,N,,,,02:36:00,2


#### 15. Initial Call Type, 16. Final Call Type, 17. Call Type
#### Drop all

In [37]:
df.drop(['Initial Call Type', 'Final Call Type', 'Call Type'], axis=1, inplace=True)

#### 18. Officer Squad, 21. Precinct, 22. Sector, 23. Beat
#### All dropped

In [38]:
df2 = df.drop(['Officer Squad', 'Precinct', 'Sector', 'Beat'], axis=1)

#### 20. Frisk Flag

In [39]:
df2['Frisk Flag'].value_counts()

N    20102
Y     5571
-      369
Name: Frisk Flag, dtype: int64

In [40]:
df2.head()

Unnamed: 0,Subject Age Group,Subject ID,Terry Stop ID,Stop Resolution,Weapon Type,Officer ID,Officer YOB,Officer Gender,Officer Race,Subject Perceived Race,Subject Perceived Gender,Reported Date,Arrest Flag,Frisk Flag,Reported TOD,Reported Hour
0,36 - 45,,10012,-,,1735,1977.0,M,Caucasian,African American,Male,2015-03-18,N,N,09:58:00,9
1,26 - 35,,10068,GO Report,,1561,1984.0,M,Caucasian,African American,Male,2015-03-17,N,N,19:08:00,19
2,18 - 25,,10103,Street Check,,1539,1973.0,M,Caucasian,Native American,Male,2015-03-19,N,N,01:32:00,1
3,18 - 25,,10104,Street Check,,1539,1973.0,M,Caucasian,Native American,Female,2015-03-19,N,N,01:35:00,1
4,26 - 35,,10207,Street Check,,1539,1973.0,M,Caucasian,Caucasian,Male,2015-03-30,N,N,21:08:00,21


## Relabel Column Headers

In [44]:
df2.columns

Index(['Subject Age Group', 'Subject ID', 'Terry Stop ID', 'Stop Resolution',
       'Weapon Type', 'Officer ID', 'Officer YOB', 'Officer Gender',
       'Officer Race', 'Subject Perceived Race', 'Subject Perceived Gender',
       'Reported Date', 'Arrest Flag', 'Frisk Flag', 'Reported TOD',
       'Reported Hour'],
      dtype='object')

In [45]:
columns_nospace = ['Subject_Age_Group', 'Subject_ID', 'Terry_Stop_ID', 'Stop_Resolution',
       'Weapon_Type', 'Officer_ID', 'Officer_YOB', 'Officer_Gender',
       'Officer_Race', 'Subject_Perceived_Race', 'Subject_Perceived_Gender',
       'Reported_Date', 'Arrest_Flag', 'Frisk_Flag', 'Reported_TOD',
       'Reported_Hour']

In [47]:
df2.columns = columns_nospace

In [48]:
df2.head()

Unnamed: 0,Subject_Age_Group,Subject_ID,Terry_Stop_ID,Stop_Resolution,Weapon_Type,Officer_ID,Officer_YOB,Officer_Gender,Officer_Race,Subject_Perceived_Race,Subject_Perceived_Gender,Reported_Date,Arrest_Flag,Frisk_Flag,Reported_TOD,Reported_Hour
0,36 - 45,,10012,-,,1735,1977.0,M,Caucasian,African American,Male,2015-03-18,N,N,09:58:00,9
1,26 - 35,,10068,GO Report,,1561,1984.0,M,Caucasian,African American,Male,2015-03-17,N,N,19:08:00,19
2,18 - 25,,10103,Street Check,,1539,1973.0,M,Caucasian,Native American,Male,2015-03-19,N,N,01:32:00,1
3,18 - 25,,10104,Street Check,,1539,1973.0,M,Caucasian,Native American,Female,2015-03-19,N,N,01:35:00,1
4,26 - 35,,10207,Street Check,,1539,1973.0,M,Caucasian,Caucasian,Male,2015-03-30,N,N,21:08:00,21


## Save to .csv for importing to other notebooks

In [49]:
df2.to_csv('terry-stops-cleaned')