<a href="https://colab.research.google.com/github/gsheara/SRCC-Project/blob/main/SRCC_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data import and prep

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import altair as alt

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder

In [5]:
# code for plotting results from in-class Model Selection notebook
def plot_results(y_train, y_test, y_pred_train, y_pred_test,
                 quantity="ACT Score", units="points"):
  """Plot actual against predicted values for the training and test sets.

  Draws two side-by-side panels (training, test). Each panel shows the
  points (predicted, actual), a y = x reference line spanning the range of
  the actual values, and the root-mean-square error in the legend.

  Parameters
  ----------
  y_train, y_test : array-like
      Actual target values for the training and test sets.
  y_pred_train, y_pred_test : array-like
      Model predictions, in the same row order as the matching actual values.
  quantity : str, optional
      Name of the predicted quantity, used in the axis labels. The default
      keeps the labels of the notebook this helper was copied from.
  units : str, optional
      Unit text appended to the RMSE in the legend.

  Returns
  -------
  None
      The figure is drawn on the current matplotlib backend.
  """
  # Flatten everything to 1-D float arrays so residuals are computed position
  # by position. Without this, pandas inputs are aligned on their index and an
  # (n, 1) column subtracted from an (n,) vector does not give n residuals.
  y_train, y_test, y_pred_train, y_pred_test = (
      np.asarray(values, dtype=float).ravel()
      for values in (y_train, y_test, y_pred_train, y_pred_test)
  )

  fig, axes = plt.subplots(1, 2)
  panels = (
      ("Training Set", y_train, y_pred_train),
      ("Test Set", y_test, y_pred_test),
  )
  for ax, (title, actual, predicted) in zip(axes, panels):
    rmse = np.sqrt(np.mean((predicted - actual)**2))
    # Reference line where prediction equals the actual value.
    diagonal = np.linspace(actual.min(), actual.max(), 100)
    ax.plot(predicted, actual, 'o')
    ax.plot(diagonal, diagonal, '-', label=f"RMSE = {rmse:.2f} {units}")
    ax.set_title(title)
    ax.set_xlabel(f"Predicted {quantity}")
    ax.set_ylabel(f"Actual {quantity}")
    ax.legend()
  fig.set_size_inches(12, 5)

### Chicago

In [6]:
# Load the Chicago use-of-force CSV from the project's GitHub repository.
df_chi_raw = pd.read_csv('https://raw.githubusercontent.com/gsheara/SRCC-Project/main/Chicago_UOF.csv')
# Show the last five rows of the raw frame.
df_chi_raw.tail()

Unnamed: 0,REPORT NO,INCIDENTDATETIME,STREET NO,DIR,STREET NAME,CITY,STATE,ZIP CODE,LOCATION CODE,BEAT,...,SUBJECT_DNA,ATTACK_SHOT,ATTACK_SHOT_AT,ATTACK_STABBED_CUT,ATTACK_STRUCK_BLUNT_FORCE,ATTACK_OTHER,NOTIFICATION_CPIC_I,NOTIFICATION_DISTOFOCCURR_I,NOTIFICATION_IMMEDSUPER_I,NOTIFICATION_OEMC_I
3483,2024-00002,31-DEC-2023 21:12,9XX,E,104TH ST,CHICAGO,IL,60628,APARTMENT,512,...,,,,,,,,Y,Y,Y
3484,2024-00000,31-DEC-2023 21:12,9XX,E,104TH ST,CHICAGO,IL,60628,APARTMENT,512,...,,,,,,,,Y,Y,Y
3485,2024-00006,31-DEC-2023 21:45,XX,W,ELM ST,CHICAGO,IL,60610,STREET,1824,...,,,,,,,,Y,Y,Y
3486,2024-00005,31-DEC-2023 21:45,XX,W,ELM ST,CHICAGO,IL,60610,STREET,1824,...,,,,,,,,Y,Y,Y
3487,2024-00003,31-DEC-2023 22:50,14XX,N,LOCKWOOD AVE,CHICAGO,IL,60651,STREET,2532,...,,,,,,,,,Y,Y


In [7]:
# Raw Chicago column -> short snake_case name, for the columns this analysis keeps.
name_dict = {
    'ZIP CODE': 'zip_code',
    'LOCATION CODE': 'loc_code',
    'WEATHER_CONDITION': 'weather',
    'PATROL_TYPE': 'patrol',
    'MEMBERSEX': 'mem_sex',
    'MEMBERRACE': 'mem_race',
    'SUBJSEX': 'sub_sex',
    'SUB_RACE': 'sub_race',
    'SUBJECT_HOSPITALIZED': 'hospitalized',
}
# The columns to keep are exactly the keys of the rename map, in the same order.
all_cols = list(name_dict)

# rename() without inplace returns a new frame, so nothing is written into a
# slice of df_chi_raw (the in-place rename raised SettingWithCopyWarning).
df_chi = df_chi_raw[all_cols].rename(columns=name_dict)

df_chi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3488 entries, 0 to 3487
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   zip_code      3488 non-null   int64 
 1   loc_code      3488 non-null   object
 2   weather       3385 non-null   object
 3   patrol        3488 non-null   object
 4   mem_sex       3488 non-null   object
 5   mem_race      3488 non-null   object
 6   sub_sex       3377 non-null   object
 7   sub_race      3437 non-null   object
 8   hospitalized  3488 non-null   object
dtypes: int64(1), object(8)
memory usage: 245.4+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_chi.rename(columns=name_dict,


In [8]:
# Encode the outcome as integers: 1 = hospitalized, 0 = not hospitalized.
hospitalized_codes = {'Yes': 1, 'No': 0}
df_chi = df_chi.assign(
    # Values already encoded pass through unchanged, so the cell can be re-run.
    # The explicit astype(int) fixes the dtype instead of relying on replace()
    # to downcast the object column, and fails loudly on an unexpected label.
    hospitalized=df_chi['hospitalized']
    .map(lambda value: hospitalized_codes.get(value, value))
    .astype(int)
)
df_chi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3488 entries, 0 to 3487
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   zip_code      3488 non-null   int64 
 1   loc_code      3488 non-null   object
 2   weather       3385 non-null   object
 3   patrol        3488 non-null   object
 4   mem_sex       3488 non-null   object
 5   mem_race      3488 non-null   object
 6   sub_sex       3377 non-null   object
 7   sub_race      3437 non-null   object
 8   hospitalized  3488 non-null   int64 
dtypes: int64(2), object(7)
memory usage: 245.4+ KB


In [9]:
# population data
# Population columns that are converted to float after loading.
cols = ['Population - Total', 'Population - Age 0-17', 'Population - Age 18-29', 'Population - Age 30-39',
        'Population - Age 40-49', 'Population - Age 50-59', 'Population - Age 60-69', 'Population - Age 70-79',
        'Population - Age 80+', 'Population - Age 0-4', 'Population - Age 5-11', 'Population - Age 12-17', 'Population - Age 5+',
        'Population - Age 18+', 'Population - Age 65+', 'Population - Female', 'Population - Male', 'Population - Latinx',
        'Population - Asian Non-Latinx', 'Population - Black Non-Latinx', 'Population - White Non-Latinx', 'Population - Other Race Non-Latinx'
]

chi_pop = (
    pd.read_csv('https://raw.githubusercontent.com/gsheara/SRCC-Project/main/Chicago_Population.csv')
    # Rows labelled 0 and 52 are removed before the casts below.
    # NOTE(review): presumably these rows are not per-ZIP records - confirm against the CSV.
    .drop(index=[0, 52])
    # Population columns become floats; 'Geography' becomes an integer so it can
    # be compared with integer ZIP codes.
    .astype({**dict.fromkeys(cols, float), 'Geography': int})
)
chi_pop.info()

<class 'pandas.core.frame.DataFrame'>
Index: 58 entries, 1 to 59
Data columns (total 23 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Geography                           58 non-null     int64  
 1   Population - Total                  58 non-null     float64
 2   Population - Age 0-17               58 non-null     float64
 3   Population - Age 18-29              58 non-null     float64
 4   Population - Age 30-39              58 non-null     float64
 5   Population - Age 40-49              58 non-null     float64
 6   Population - Age 50-59              58 non-null     float64
 7   Population - Age 60-69              58 non-null     float64
 8   Population - Age 70-79              58 non-null     float64
 9   Population - Age 80+                58 non-null     float64
 10  Population - Age 0-4                58 non-null     float64
 11  Population - Age 5-11               58 non-null     

In [10]:
# Drop every incident with a missing value in any kept column. The original
# row labels are kept, so the index has gaps afterwards.
df_chi = df_chi.dropna()
df_chi.head()

Unnamed: 0,zip_code,loc_code,weather,patrol,mem_sex,mem_race,sub_sex,sub_race,hospitalized
0,60645,SIDEWALK,CLEAR,CAR,M,WHITE,MALE,BLACK,0
1,60638,APARTMENT,CLEAR,OTH,F,WHITE HISPANIC,MALE,WHITE HISPANIC,1
2,60610,STREET,CLEAR,FOOT,M,WHITE,FEMALE,WHITE,0
5,60611,HOTEL / MOTEL,CLEAR,CAR,M,WHITE,MALE,BLACK,0
6,60611,HOTEL / MOTEL,RAIN,CAR,M,WHITE,MALE,BLACK,0


In [11]:
# Short labels for the age-bracket population columns, in the order they are renamed.
_age_brackets = ['0-17', '18-29', '30-39', '40-49', '50-59', '60-69', '70-79',
                 '80+', '0-4', '5-11', '12-17', '5+', '18+', '65+']

# Long population column name -> short feature name.
name_dict1 = {'Population - Total': 'total_population'}
name_dict1.update({f'Population - Age {bracket}': bracket for bracket in _age_brackets})
name_dict1.update({
    'Population - Female': 'Female',
    'Population - Male': 'Male',
    'Population - Latinx': 'Latinx',
    'Population - Asian Non-Latinx': 'Asian',
    'Population - Black Non-Latinx': 'Black',
    'Population - White Non-Latinx': 'White',
    'Population - Other Race Non-Latinx': 'Other_race',
})

# Attach the population figures of each incident's ZIP code, discard incidents
# left with any missing value (including ZIP codes that have no population row),
# then shorten the column names.
df_chicago = (
    df_chi
    .merge(chi_pop, how='left', left_on='zip_code', right_on='Geography')
    .dropna()
    .rename(columns=name_dict1)
)

df_chicago.info() # clean data frame

<class 'pandas.core.frame.DataFrame'>
Index: 3255 entries, 0 to 3277
Data columns (total 32 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   zip_code          3255 non-null   int64  
 1   loc_code          3255 non-null   object 
 2   weather           3255 non-null   object 
 3   patrol            3255 non-null   object 
 4   mem_sex           3255 non-null   object 
 5   mem_race          3255 non-null   object 
 6   sub_sex           3255 non-null   object 
 7   sub_race          3255 non-null   object 
 8   hospitalized      3255 non-null   int64  
 9   Geography         3255 non-null   float64
 10  total_population  3255 non-null   float64
 11  0-17              3255 non-null   float64
 12  18-29             3255 non-null   float64
 13  30-39             3255 non-null   float64
 14  40-49             3255 non-null   float64
 15  50-59             3255 non-null   float64
 16  60-69             3255 non-null   float64
 17  

### Seattle

In [12]:
# Load the Seattle use-of-force CSV from the project's GitHub repository.
df_sea_raw = pd.read_csv('https://raw.githubusercontent.com/gsheara/SRCC-Project/main/Seattle_UOF.csv')
# Show the first five rows of the raw frame.
df_sea_raw.head()

Unnamed: 0,ID,Incident_Num,Incident_Type,Occured_date_time,Precinct,Sector,Beat,Officer_ID,Subject_ID,Subject_Race,Subject_Gender
0,2023UOF-0027-3023-29922,70339,Level 1 - Use of Force,01/01/2023 12:00:00 AM,North,,-,5962,30795,White,Male
1,2023UOF-0115-3052-29925,70341,Level 1 - Use of Force,01/01/2023 12:56:00 AM,East,EDWARD,E2,6049,30798,White,Male
2,2023UOF-0026-1966-29941,70340,Level 1 - Use of Force,01/01/2023 01:20:00 AM,North,,-,1873,30814,Not Specified,Male
3,2023UOF-0028-1886-29915,70338,Level 1 - Use of Force,01/01/2023 01:40:00 AM,-,,-,1833,30788,Not Specified,Male
4,2023UOF-0029-2957-30034,70337,Level 1 - Use of Force,01/01/2023 01:45:00 AM,-,,-,5818,30906,Not Specified,Male


In [13]:
# Parse timestamps such as '01/01/2023 12:56:00 AM'. An explicit format avoids
# pandas guessing the layout (and warning about it) for every element.
df_sea_raw['Occured_date_time'] = pd.to_datetime(
    df_sea_raw['Occured_date_time'], format='%m/%d/%Y %I:%M:%S %p'
)

# Collapse the sparse gender labels into 'Unknown' and 'Other'.
gender_groups = {
    'U': 'Unknown',
    '-': 'Unknown',
    'Transgender Female': 'Other',
    'Non-binary': 'Other',
}
df_sea_raw['Subject_Gender'] = df_sea_raw['Subject_Gender'].replace(gender_groups)

# Turn the incident type into a numeric force level; an officer-involved
# shooting ('Level 3 - OIS') counts as level 3.
incident_levels = {
    'Level 1 - Use of Force': 1,
    'Level 2 - Use of Force': 2,
    'Level 3 - Use of Force': 3,
    'Level 3 - OIS': 3,
}
df_sea_raw['Incident_Type'] = (
    df_sea_raw['Incident_Type']
    # Values that are already numeric pass through, so the cell can be re-run.
    .map(lambda value: incident_levels.get(value, value))
    .astype(int)
)

# Identifier and location columns that are not used further in this notebook.
bad_cols_sea = ['Incident_Num', 'Sector', 'Officer_ID', 'Subject_ID', 'Beat']
df_sea_dropped = df_sea_raw.drop(columns=bad_cols_sea)
df_sea_dropped.head()

  df_sea_raw['Occured_date_time'] = pd.to_datetime(df_sea_raw['Occured_date_time'])


Unnamed: 0,ID,Incident_Type,Occured_date_time,Precinct,Subject_Race,Subject_Gender
0,2023UOF-0027-3023-29922,1,2023-01-01 00:00:00,North,White,Male
1,2023UOF-0115-3052-29925,1,2023-01-01 00:56:00,East,White,Male
2,2023UOF-0026-1966-29941,1,2023-01-01 01:20:00,North,Not Specified,Male
3,2023UOF-0028-1886-29915,1,2023-01-01 01:40:00,-,Not Specified,Male
4,2023UOF-0029-2957-30034,1,2023-01-01 01:45:00,-,Not Specified,Male


In [72]:
# Treat the '-' placeholder precinct as an explicit 'Unknown' category.
precinct_missing = df_sea_dropped['Precinct'].eq('-')
df_sea_dropped.loc[precinct_missing, 'Precinct'] = 'Unknown'
# df_sea is a second name for the same frame, not a copy.
df_sea = df_sea_dropped
df_sea.dtypes

ID                           object
Incident_Type                 int64
Occured_date_time    datetime64[ns]
Precinct                     object
Subject_Race                 object
Subject_Gender               object
dtype: object

In [73]:
df_sea.shape

(1487, 6)

In [74]:
# One-hot encode the three categorical columns as 0/1 integer indicator columns
# named '<column>_<category>'; ID, Incident_Type and the timestamp are kept as is.
encoded_sea = pd.get_dummies(df_sea, columns = ['Precinct', 'Subject_Race', 'Subject_Gender'], dtype=int)
encoded_sea

Unnamed: 0,ID,Incident_Type,Occured_date_time,Precinct_East,Precinct_North,Precinct_OOJ,Precinct_South,Precinct_Southwest,Precinct_Unknown,Precinct_West,...,Subject_Race_Black or African American,Subject_Race_Hispanic or Latino,Subject_Race_Nat Hawaiian/Oth Pac Islander,Subject_Race_Not Specified,Subject_Race_Two or More Races,Subject_Race_White,Subject_Gender_Female,Subject_Gender_Male,Subject_Gender_Other,Subject_Gender_Unknown
0,2023UOF-0027-3023-29922,1,2023-01-01 00:00:00,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1,2023UOF-0115-3052-29925,1,2023-01-01 00:56:00,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
2,2023UOF-0026-1966-29941,1,2023-01-01 01:20:00,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
3,2023UOF-0028-1886-29915,1,2023-01-01 01:40:00,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0
4,2023UOF-0029-2957-30034,1,2023-01-01 01:45:00,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1482,2023UOF-1521-3191-32782,1,2023-12-31 04:52:00,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,1,0,0
1483,2023UOF-1522-2442-32782,1,2023-12-31 04:52:00,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,1,0,0
1484,2024UOF-0003-3073-32783,1,2023-12-31 23:04:00,0,0,0,0,0,1,0,...,0,0,0,1,0,0,1,0,0,0
1485,2024UOF-0005-3314-32833,1,2023-12-31 23:05:00,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0


In [16]:
encoded_sea.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1487 entries, 0 to 1486
Data columns (total 22 columns):
 #   Column                                      Non-Null Count  Dtype         
---  ------                                      --------------  -----         
 0   ID                                          1487 non-null   object        
 1   Incident_Type                               1487 non-null   int64         
 2   Occured_date_time                           1487 non-null   datetime64[ns]
 3   Precinct_East                               1487 non-null   int64         
 4   Precinct_North                              1487 non-null   int64         
 5   Precinct_OOJ                                1487 non-null   int64         
 6   Precinct_South                              1487 non-null   int64         
 7   Precinct_Southwest                          1487 non-null   int64         
 8   Precinct_Unknown                            1487 non-null   int64         
 9   Precinct

### NOLA

In [17]:
# Columns of the NOLA export that this analysis does not use.
bad_cols = ['PIB File Number', 'Originating Bureau', 'Division level', 'Division', 'Unit', 'Working Status', 'Shift',
            'Investigation status', 'Disposition', 'Service Type', 'Subject Arrested', 'Subject Build', 'Subject Height', 'Use of Force Type',
            'Light Condition', 'Weather Condition', 'Subject Influencing Factors', 'Use of Force Effective', 'Distance Between',
            'Subject Arrest Charges', 'Use of Force Reason', 'Officer Injured']

# Load the CSV, drop the unused columns, then drop every row that has a missing
# value in any remaining column.
df_nola = (
    pd.read_csv('https://raw.githubusercontent.com/gsheara/SRCC-Project/main/Nola_UOF.csv')
    .drop(columns=bad_cols)
    .dropna()
)

In [113]:
df_sea.head()

Unnamed: 0,ID,Incident_Type,Occured_date_time,Precinct,Subject_Race,Subject_Gender
0,2023UOF-0027-3023-29922,1,2023-01-01 00:00:00,North,White,Male
1,2023UOF-0115-3052-29925,1,2023-01-01 00:56:00,East,White,Male
2,2023UOF-0026-1966-29941,1,2023-01-01 01:20:00,North,Not Specified,Male
3,2023UOF-0028-1886-29915,1,2023-01-01 01:40:00,Unknown,Not Specified,Male
4,2023UOF-0029-2957-30034,1,2023-01-01 01:45:00,Unknown,Not Specified,Male


## Graphs

In [77]:
# (rows, columns) of the cleaned Chicago, Seattle and NOLA frames, in that order.
for city_frame in (df_chicago, df_sea, df_nola):
    print(city_frame.shape)

(3255, 32)
(1487, 6)
(452, 11)


In [109]:
# Bar chart: number of Chicago incidents per hospitalization outcome (0 / 1).
base = (
    alt.Chart(df_chicago)
    .mark_bar()
    .encode(
        x=alt.X('hospitalized:N', title='Subject Hospitalization'),
        y=alt.Y('count():Q'),
    )
    .properties(title='Chicago Distribution of Force Level')
)

# Same encoding drawn as text, shifted 5 px up so the count sits above its bar.
text = base.mark_text(dy=-5).encode(text='count():Q')

chart2 = alt.layer(base, text)
chart2

In [105]:
# Bar chart: number of Seattle incidents per force level (1-3).
base = (
    alt.Chart(df_sea)
    .mark_bar()
    .encode(
        x=alt.X('Incident_Type:N', title='Use of Force Level'),
        y=alt.Y('count():Q'),
    )
    .properties(title='Seattle Distribution of Force Level')
)

# Same encoding drawn as text, shifted 5 px up so the count sits above its bar.
text = base.mark_text(dy=-5).encode(text='count():Q')

chart2 = alt.layer(base, text)
chart2

In [100]:
# Bar chart: number of NOLA incidents per value of 'Use of Force Level'.
# NOTE(review): this cell sits above the "Models > NOLA" cell that collapses
# values such as 'L1 | L2' into a single level. On a top-to-bottom run the
# column still holds the raw pipe-separated strings here, so the bars will not
# be the four levels L1-L4 unless that cell has already been executed.
base = alt.Chart(df_nola).mark_bar().encode(
    alt.X('Use of Force Level'),
    alt.Y('count():Q'),
).properties(title='NOLA Distribution of Force Level')

# Count labels, shifted 10 px up so they sit above the bars.
text = base.mark_text(dy=-10).encode(
    text='count():Q'
)

chart2 = base + text
chart2

In [128]:
# Horizontal bar chart: subject ethnicity counts for NOLA incidents whose
# 'Use of Force Level' is exactly 'L3' or 'L4'.
# NOTE(review): the filter only matches the plain strings 'L3' / 'L4'. The
# pipe-separated raw values (e.g. 'L1 | L3') are collapsed in the later
# "Models > NOLA" cell, so on a top-to-bottom run those incidents are not
# counted here.
base = alt.Chart(df_nola).mark_bar().encode(
    alt.Y('Subject Ethnicity'),
    alt.X('count():Q'),
).properties(title='NOLA Distribution of Race in High Force Level Incidents'
).transform_filter(alt.FieldOneOfPredicate(field='Use of Force Level', oneOf=['L3',  'L4']))

# Count labels, shifted 10 px right so they sit past the end of each bar.
text = base.mark_text(dx=10).encode(
    text='count():Q'
)

chart2 = base + text
chart2

In [127]:
# Horizontal bar chart: subject race counts for Seattle incidents of force
# level 2 or 3.
# Incident_Type is an integer column and the one-of filter compares values
# without converting types, so the levels must be numbers; the string values
# '2' / '3' match no rows.
base = alt.Chart(df_sea).mark_bar().encode(
    alt.Y('Subject_Race'),
    alt.X('count():Q'),
).properties(title='Seattle Distribution of Race in High Force Level Incidents'
).transform_filter(alt.FieldOneOfPredicate(field='Incident_Type', oneOf=[2, 3]))

# Count labels, shifted 10 px right so they sit past the end of each bar.
text = base.mark_text(dx=10).encode(
    text='count():Q'
)

chart2 = base + text
chart2

In [116]:
df_chicago.head()

Unnamed: 0,zip_code,loc_code,weather,patrol,mem_sex,mem_race,sub_sex,sub_race,hospitalized,Geography,...,5+,18+,65+,Female,Male,Latinx,Asian,Black,White,Other_race
0,60645,SIDEWALK,CLEAR,CAR,M,WHITE,MALE,BLACK,0,60645.0,...,0.913327,0.731255,0.138808,0.494762,0.505238,0.206854,0.148523,0.149491,0.443285,0.051847
1,60638,APARTMENT,CLEAR,OTH,F,WHITE HISPANIC,MALE,WHITE HISPANIC,1,60638.0,...,0.938042,0.757611,0.136654,0.490968,0.509032,0.553011,0.006513,0.026849,0.406995,0.006632
2,60610,STREET,CLEAR,FOOT,M,WHITE,FEMALE,WHITE,0,60610.0,...,0.969662,0.910023,0.150629,0.520815,0.479185,0.049715,0.094739,0.117085,0.697068,0.041394
3,60611,HOTEL / MOTEL,CLEAR,CAR,M,WHITE,MALE,BLACK,0,60611.0,...,0.967681,0.939442,0.212343,0.526677,0.473323,0.06458,0.204734,0.031161,0.67282,0.026706
4,60611,HOTEL / MOTEL,RAIN,CAR,M,WHITE,MALE,BLACK,0,60611.0,...,0.967681,0.939442,0.212343,0.526677,0.473323,0.06458,0.204734,0.031161,0.67282,0.026706


In [125]:
# Horizontal bar chart: subject race counts for Chicago incidents in which the
# subject was hospitalized.
# 'hospitalized' is an integer 0/1 column and the one-of filter compares values
# without converting types, so the value must be the number 1; the string '1'
# matches no rows.
base = alt.Chart(df_chicago).mark_bar().encode(
    alt.Y('sub_race'),
    alt.X('count():Q'),
).properties(title='Chicago Distribution of Race in High Force Level Incidents'
).transform_filter(alt.FieldOneOfPredicate(field='hospitalized', oneOf=[1]))

# Count labels, shifted 10 px right so they sit past the end of each bar.
text = base.mark_text(dx=10).encode(
    text='count():Q'
)

chart2 = base + text
chart2

## Models

### Chicago

Here I fit a logistic regression model on the ZIP-code demographic data because the outcome is binary (hospitalized / not hospitalized). On the held-out test set, the model's predictions are about 59% accurate.

In [18]:
df_chicago.head()

Unnamed: 0,zip_code,loc_code,weather,patrol,mem_sex,mem_race,sub_sex,sub_race,hospitalized,Geography,...,5+,18+,65+,Female,Male,Latinx,Asian,Black,White,Other_race
0,60645,SIDEWALK,CLEAR,CAR,M,WHITE,MALE,BLACK,0,60645.0,...,0.913327,0.731255,0.138808,0.494762,0.505238,0.206854,0.148523,0.149491,0.443285,0.051847
1,60638,APARTMENT,CLEAR,OTH,F,WHITE HISPANIC,MALE,WHITE HISPANIC,1,60638.0,...,0.938042,0.757611,0.136654,0.490968,0.509032,0.553011,0.006513,0.026849,0.406995,0.006632
2,60610,STREET,CLEAR,FOOT,M,WHITE,FEMALE,WHITE,0,60610.0,...,0.969662,0.910023,0.150629,0.520815,0.479185,0.049715,0.094739,0.117085,0.697068,0.041394
3,60611,HOTEL / MOTEL,CLEAR,CAR,M,WHITE,MALE,BLACK,0,60611.0,...,0.967681,0.939442,0.212343,0.526677,0.473323,0.06458,0.204734,0.031161,0.67282,0.026706
4,60611,HOTEL / MOTEL,RAIN,CAR,M,WHITE,MALE,BLACK,0,60611.0,...,0.967681,0.939442,0.212343,0.526677,0.473323,0.06458,0.204734,0.031161,0.67282,0.026706


In [19]:
# Outcome column and the groups of ZIP-code population-share columns used as predictors.
target = ['hospitalized']
age_features = ['0-17', '18-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80+', '0-4']
race_features = ['Latinx', 'Asian', 'Black', 'White', 'Other_race']
gender_features = ['Female', 'Male']
# Full predictor list: age brackets, then race shares, then gender shares.
features = age_features + race_features + gender_features
# Incident-level categorical columns (not part of `features`).
cat_features = ['loc_code', 'weather', 'patrol', 'mem_sex', 'mem_race', 'sub_sex', 'sub_race']

# y is flattened to a 1-D array; X keeps the demographic columns only.
y = df_chicago[target].to_numpy().ravel()
X = df_chicago[features]

In [20]:
# One-hot encode the subject's race into 0/1 'sub_race_<value>' columns.
# NOTE(review): X for the Chicago model was already built from `features`
# above, so neither encoded_train nor race_encoded is used by the Chicago
# model cells in this section.
encoded_train = pd.get_dummies(df_chicago, columns = ['sub_race'], dtype=int)
# Names of the indicator columns expected from the encoding above.
race_encoded = ['sub_race_AMER INDIAN / ALASKAN NATIVE',	'sub_race_ASIAN / PACIFIC ISLANDER',	'sub_race_BLACK',	'sub_race_BLACK HISPANIC',	'sub_race_HISPANIC',	'sub_race_UNKNOWN / REFUSED',	'sub_race_WHITE',	'sub_race_WHITE HISPANIC']

In [21]:
# Hold out 33% of the incidents for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [22]:
# Column-wise mean and standard deviation of the training features (NaNs ignored).
X_norm_mean = np.nanmean(X_train, axis=0)
X_norm_std = np.nanstd(X_train, axis=0)
# Standardize the training features with those statistics.
X_norm = (X_train - X_norm_mean) / X_norm_std
print('Norm mean:', X_norm_mean)
print('Norm std:', X_norm_std)
X_norm.shape

Norm mean: [0.20779019 0.20040402 0.16218332 0.12068044 0.11822174 0.100794
 0.05871027 0.03121602 0.05888142 0.22569789 0.05512974 0.47578948
 0.22008774 0.02329515 0.52070805 0.47929195]
Norm std: [0.0654271  0.06156353 0.05901989 0.0164445  0.01856116 0.02122613
 0.01886393 0.01290401 0.0144268  0.23415553 0.0821582  0.35671457
 0.24571556 0.01473068 0.02774642 0.02774642]


(2180, 16)

In [23]:
from sklearn.linear_model import LogisticRegression

# NOTE(review): the model is fitted on the raw X_train; the standardized X_norm
# computed in the previous cell is not used here.
logreg = LogisticRegression(random_state=16)
logreg.fit(X_train, y_train)
# Predicted class (0 / 1) for every held-out incident.
y_pred = logreg.predict(X_test)



In [24]:
from sklearn import metrics

# Confusion matrix of the test-set predictions: rows are the actual classes,
# columns the predicted classes.
metrics.confusion_matrix(y_test, y_pred)

array([[622,   9],
       [433,  11]])

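Overall, the model predicts "not hospitalized" for almost every incident: only 11 of the 444 actual hospitalizations in the test set are identified. Accuracy is about 58.9% (633 of 1,075), essentially the 58.7% obtained by always predicting the majority class.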

### Seattle

In [80]:
# Make sure the target is an integer column.
# NOTE(review): Incident_Type was already cast to int during Seattle data prep,
# so this looks like a no-op - confirm before removing.
encoded_sea['Incident_Type'] = encoded_sea['Incident_Type'].astype(int)

In [88]:
target = ['Incident_Type']

# One-hot indicator columns of encoded_sea, grouped by the column they came from.
sea_precinct_cols = ['Precinct_East', 'Precinct_North', 'Precinct_OOJ', 'Precinct_South',
                     'Precinct_Southwest', 'Precinct_Unknown', 'Precinct_West']
race_features = ['Subject_Race_American Indian/Alaska Native', 'Subject_Race_Asian',
                 'Subject_Race_Black or African American', 'Subject_Race_Hispanic or Latino',
                 'Subject_Race_Nat Hawaiian/Oth Pac Islander', 'Subject_Race_Not Specified',
                 'Subject_Race_Two or More Races', 'Subject_Race_White']
sea_gender_cols = ['Subject_Gender_Female', 'Subject_Gender_Male',
                   'Subject_Gender_Other', 'Subject_Gender_Unknown']
# All indicator columns: precinct, then race, then gender.
features = sea_precinct_cols + race_features + sea_gender_cols

# Only the race indicators are used as predictors; y is flattened to 1-D.
y = encoded_sea[target].to_numpy().ravel()
X = encoded_sea[race_features]

In [89]:
# Hold out 33% of the Seattle incidents for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [90]:
# Multiclass logistic regression on the race indicators.
# LogisticRegression and metrics are imported in the Chicago model cells above.
logreg = LogisticRegression(random_state=16).fit(X_train, y_train)
y_pred = logreg.predict(X_test)
# In the recorded output below, only the first column is non-zero: every test
# incident is predicted as the first class.
metrics.confusion_matrix(y_test, y_pred)

array([[380,   0,   0],
       [108,   0,   0],
       [  3,   0,   0]])

### NOLA

In [25]:
df_nola.head()

Unnamed: 0,Date Occurred,Use of Force Level,Officer Race/Ethnicity,Officer Gender,Officer Age,Officer Years of Service,Subject Gender,Subject Ethnicity,Subject Age,Subject Injured,Subject Hospitalized
0,05/03/2023,L1 | L2,Hispanic | White,Male | Male,31 | 41,6 | 5,Male,Black,33,No,No
1,01/08/2023,L1 | L1,Black | Black,Male | Male,25 | 24,5 | 4,Male,Black,29,No,No
2,01/13/2023,L2 | L2 | L2,White | Black | White,Female | Male | Male,33 | 27 | 30,5 | 4 | 3,Male,Black,25,Yes,Yes
3,01/26/2023,L1,Hispanic,Male,58,16,Male,Black,24,No,No
5,06/29/2023,L1,Black,Male,33,5,Male,Black,39,No,No


In [26]:
def _highest_force_level(levels):
    """Return the most severe level in a ' | '-separated string.

    'L1 | L3 | L1' -> 'L3'; a single level such as 'L2' is returned unchanged.
    Levels are compared as text, which orders 'L1' < 'L2' < 'L3' < 'L4'
    correctly while the level number has a single digit.
    """
    return max(part.strip() for part in levels.split('|'))


# Each incident lists one level per use of force; keep only the highest one.
# This replaces a hand-written list of every observed combination, which had
# mapped 'L1 | L1 | L1 | L1 | L1 | L1' to 'L2' and covered no other combinations.
df_nola['Use of Force Level'] = df_nola['Use of Force Level'].map(_highest_force_level)

df_nola['Use of Force Level'].unique()

array(['L2', 'L1', 'L4', 'L3'], dtype=object)

In [27]:
# One-hot encode 'Subject Ethnicity' into 0/1 indicator columns. Some rows hold
# several ' | '-joined values, so combined columns such as
# 'Subject Ethnicity_Black | White' are created as well.
encoded_nola = pd.get_dummies(df_nola, columns = ['Subject Ethnicity'], dtype=int)
# Subset of the generated indicator columns that is used as model input.
race_encoded = ['Subject Ethnicity_Black', 'Subject Ethnicity_White', 'Subject Ethnicity_Race-Unknown', 'Subject Ethnicity_Hispanic', 'Subject Ethnicity_Black | Black']
encoded_nola.head()

Unnamed: 0,Date Occurred,Use of Force Level,Officer Race/Ethnicity,Officer Gender,Officer Age,Officer Years of Service,Subject Gender,Subject Age,Subject Injured,Subject Hospitalized,...,Subject Ethnicity_Black | Black | Black | Black,Subject Ethnicity_Black | Black | Black | Black | Black | Black,Subject Ethnicity_Black | White,Subject Ethnicity_Black | White | Black,Subject Ethnicity_Hispanic,Subject Ethnicity_Race-Unknown,Subject Ethnicity_Race-Unknown | Black,Subject Ethnicity_White,Subject Ethnicity_White | Black,Subject Ethnicity_White | White | White
0,05/03/2023,L2,Hispanic | White,Male | Male,31 | 41,6 | 5,Male,33,No,No,...,0,0,0,0,0,0,0,0,0,0
1,01/08/2023,L1,Black | Black,Male | Male,25 | 24,5 | 4,Male,29,No,No,...,0,0,0,0,0,0,0,0,0,0
2,01/13/2023,L2,White | Black | White,Female | Male | Male,33 | 27 | 30,5 | 4 | 3,Male,25,Yes,Yes,...,0,0,0,0,0,0,0,0,0,0
3,01/26/2023,L1,Hispanic,Male,58,16,Male,24,No,No,...,0,0,0,0,0,0,0,0,0,0
5,06/29/2023,L1,Black,Male,33,5,Male,39,No,No,...,0,0,0,0,0,0,0,0,0,0


In [93]:
# Predict the force level from the selected subject-ethnicity indicators.
target = ['Use of Force Level']

y = encoded_nola[target].to_numpy().ravel()
X = encoded_nola[race_encoded]

# 67/33 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# LogisticRegression and metrics are imported in the Chicago model cells above.
logreg = LogisticRegression(random_state=16)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

# In the recorded output below, only the first column is non-zero: every test
# incident is predicted as the first class.
metrics.confusion_matrix(y_test, y_pred)

array([[104,   0,   0,   0],
       [ 36,   0,   0,   0],
       [  7,   0,   0,   0],
       [  3,   0,   0,   0]])