# Bias score
## Goal
The goal here is to determine *how* the characteristics of an officer influence their bias towards minorities.

## How
- An officer: arrests over an N-year period
- Raw bias score for officer o towards minority m $ B(o,m) = \frac{N_{stops~of~m}}{N_{total~stops~over~the~period}} $
- Strong assumption: $median(\{B(o,m) \mid o \in S\})$ is actually the proportion of the population of $S$ which is from minority $m$
- Bias score for minority $m$ for an officer in a set of officers of region $S$: $ B_m = \frac{N_{stops~of~m}}{N_{total~stops~over~the~period}} - median(\{B(o,m) \mid o \in S\})$

The study is conducted as follows:
- $S$ is statewide: look at the regression coefficients statewide
- $S$ is county-wide, provided there is enough data


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from copy import copy

from tqdm import tqdm
tqdm.pandas()

import statsmodels.api as sm
import statsmodels.formula.api as smf

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly
import plotly.express as px
import plotly.graph_objs as go
init_notebook_mode(connected=True)


In [None]:
# Directory holding the input archives.
folder = 'data/'
# Alternative (statewide) dataset, kept to switch sources easily:
# state = folder + 'fl_statewide.csv.zip'
# state_reduced = folder + 'fl_statewide_reduced.csv.zip'
state = f'{folder}long_beach.zip'
state_reduced = f'{folder}long_beach.csv.zip'

# Race values retained for the analysis.
minorities = ['white', 'hispanic', 'black']

# Stop-outcome flags that type_booleans() tries to cast to bool.
boolean_columns = [
    'arrest_made',
    'citation_issued',
    'warning_issued',
    'frisk_performed',
    'search_conducted',
]

# Rows with a missing value in any of these columns are dropped.
mandatory_columns = [
    'date',
    'subject_age',
    'subject_race',
    'subject_sex',
    'officer_id_hash',
    'officer_age',
    'officer_race',
    'officer_sex',
    'officer_years_of_service',
]

# Columns kept after loading; every other column is dropped.
keep_columns = [
    'date',
    'county_name',
    'subject_age',
    'subject_race',
    'subject_sex',
    'officer_id_hash',
    'officer_age',
    'officer_race',
    'officer_sex',
    'officer_years_of_service',
    *boolean_columns,
    'type',
]

# Helper functions

In [None]:
def type_booleans(df, columns=None):
    """Cast flag columns of ``df`` to ``bool`` in place.

    A column is converted only when it exists in ``df`` and contains no
    missing value. Columns absent from ``df`` are silently skipped; columns
    containing NaN are left untouched and a message is printed.

    Args:
        df: DataFrame to modify in place.
        columns: Names of the columns to convert. Defaults to the
            module-level ``boolean_columns`` list.

    Returns:
        None. ``df`` is mutated.
    """
    if columns is None:
        # Reading a module-level name needs no ``global`` declaration.
        columns = boolean_columns
    for col in columns:
        if col not in df.columns:
            continue
        if df[col].isna().any():
            print(f"Cannot convert {col} columns to boolean")
        else:
            df[col] = df[col].astype(bool)

def print_info_df(df):
    """Print the row count of ``df`` and the distinct values of its columns.

    Columns of dtype float64 are skipped. A column with more than 20
    distinct values only gets a short notice instead of the value list.
    """
    n_stops = df.shape[0]
    print(f"Dataset is composed of {n_stops} stops. Columns are: \n")
    for col in df.columns:
        if df.dtypes[col] == np.float64:
            continue
        distinct = df[col].unique()
        if len(distinct) > 20:
            print(f'{col} \t\t: too much different values')
            continue
        # Names of 15 characters or fewer get two extra tabs appended.
        label = col if len(col) > 15 else col + "\t\t"
        print(f'{label} \t\t: values are: {distinct}')

def generate_smaller_data(df, keep_ratio, path, random_state=None):
    """Sample a fraction of ``df``, write it to ``path`` as CSV and return it.

    Args:
        df: DataFrame to sample rows from.
        keep_ratio: Fraction of rows to keep. The number of sampled rows is
            ``int(len(df) * keep_ratio)`` (truncated, not rounded).
        path: Destination of the CSV file (written without the index).
        random_state: Seed forwarded to ``DataFrame.sample``. With the
            default ``None`` every call draws a different sample; pass an
            integer to make the reduced dataset reproducible.

    Returns:
        The sampled DataFrame.
    """
    n_keep = int(df.shape[0] * keep_ratio)
    df_red = df.sample(n=n_keep, random_state=random_state)
    df_red.to_csv(path, index=False)
    return df_red

# Loading and cleaning dataset

In [None]:
# Read the whole dataset from the archive referenced by `state`,
# then show its columns and its number of rows.
df_full = pd.read_csv(state)
print(df_full.columns)
print(df_full.shape[0])

In [None]:
# Display the freshly loaded frame as the cell output.
df_full

In [None]:
# Drop every column that is not listed in `keep_columns`.
unused_columns = df_full.columns.difference(keep_columns)
df_full.drop(columns=unused_columns, inplace=True)
# Drop rows that miss a value in any mandatory column.
df_full.dropna(subset=mandatory_columns, how='any', inplace=True)

# Parse the dates and derive the yearly period used for grouping later on.
df_full['date'] = pd.to_datetime(df_full['date'])
df_full['year'] = df_full['date'].dt.to_period('y')
df_full = df_full.rename(columns={'officer_years_of_service': 'officer_yos'})

print(df_full.shape[0])
df_full.head()

In [None]:
# Write a 1 % random sample of the cleaned data to `state_reduced`.
df_small = generate_smaller_data(df_full, keep_ratio=0.01, path=state_reduced)
print(df_small.shape[0])

# Explore the data

In [None]:
# Get a feel for the data: are there spelling mistakes in the gender or race
# values, NaN values, ...?
print_info_df(df_full)

In [None]:
# keep same minorities as papers
# df_full = df_full[df_full['subject_race'].isin(minorities)]
# Keep only the stops made by officers whose race is in `minorities`.
# .copy() detaches the filtered rows from the original frame so that the
# in-place column assignments done by type_booleans() do not operate on a
# slice (which raises SettingWithCopyWarning).
df_full = df_full[df_full['officer_race'].isin(minorities)].copy()
type_booleans(df_full)

## Compute dataframe officers

In [None]:
# One row per (year, officer): race, sex and the minimum years of service
# and age observed for that officer during the year.
# Several columns must be selected from a groupby with a list; selecting
# them with a bare tuple (['a', 'b'] without the inner brackets) raises on
# pandas >= 2.0.
df_officers = df_full.groupby(['year', 'officer_id_hash', 'officer_race', 'officer_sex'])[['officer_yos', 'officer_age']]
df_officers = df_officers.min().reset_index()
# verify_integrity=True raises if an officer appears with several
# race/sex combinations within the same year.
df_officers.set_index(['year', 'officer_id_hash'], inplace=True, verify_integrity=True)
df_officers.head()

# Compute bias score


In [None]:
# Count, for every (year, officer), the stops made for each subject race.
stop_counts = (
    df_full
    .groupby(['year', 'officer_id_hash', 'subject_race'])['date']
    .count()
    .reset_index(name='stops')
)
# One row per (year, officer), one column per subject race; combinations
# that never occur are filled with 0.
df_yearly_all = stop_counts.pivot_table(
    index=['year', 'officer_id_hash'],
    columns='subject_race',
    values='stops',
    fill_value=0,
)
# Total number of stops of the officer over the year, all races included.
df_yearly_all['total'] = df_yearly_all.sum(axis=1)
df_yearly_all.head()

In [None]:
# Drop the (year, officer) entries whose total number of stops does not
# exceed the threshold.
stop_threshold = 100
# .copy() makes df_yearly an independent frame, so that columns can be added
# to it afterwards without raising SettingWithCopyWarning.
df_yearly = df_yearly_all[df_yearly_all['total'] > stop_threshold].copy()
print(f'There are {len(df_yearly)} entries left')

In [None]:
# Raw bias: share of the stops of each (year, officer) involving race m.
for m in minorities:
    df_yearly[f'raw_bias_{m}'] = df_yearly[m].div(df_yearly['total'])

# Median of the raw bias over all retained (year, officer) rows, per race.
raw_bias_medians = {}
for m in minorities:
    raw_bias_medians[m] = df_yearly[f'raw_bias_{m}'].median()

# Bias: deviation of the raw bias from that median.
for m in minorities:
    df_yearly[f'bias_{m}'] = df_yearly[f'raw_bias_{m}'].sub(raw_bias_medians[m])

df_yearly.head()

In [None]:
# Attach the officer attributes to the bias scores; both frames are indexed
# by (year, officer_id_hash) and the match must be one-to-one.
bias_df = pd.merge(
    df_yearly,
    df_officers,
    how='left',
    left_index=True,
    right_index=True,
    validate='one_to_one',
)
bias_df.head(1)

In [None]:
# Fit one OLS model per race: bias score explained by the officer's race,
# sex and age.
for m in minorities:
    formula = f'bias_{m} ~ C(officer_race) + C(officer_sex) + officer_age'
    print()
    print(f'--------------{m.upper()}--------------')
    model = smf.ols(formula=formula, data=bias_df)
    res = model.fit()
    print(res.summary())

### Analysis
- The results are the opposite of what was expected: the bias score towards a minority m increases when the officer is of the same race
- The bias score seems to decrease as the officer's age increases

**Conclusion**
- The assumption "median of raw biases toward m = proportion of m in the local population" does not work. In localities with lots of hispanics, 


## Study county-wide 

In [None]:
# Number of distinct counties in which each officer appears, and its
# distribution as a histogram.
stops_by_officer = df_full.groupby('officer_id_hash')
county_per_officer = stops_by_officer['county_name'].nunique()
sns.histplot(county_per_officer)

In [None]:
# Histogram of `county_per_officer` drawn with plotly express.
fig = px.histogram(data_frame=county_per_officer, x='county_name')
fig.show()

### Observations
All officers have more than 1 county

# Study of pedestrian stops (if any)

In [None]:
# Show the stop types present in the data, then keep the pedestrian stops.
stop_types = df_full['type'].unique()
print(stop_types)
is_pedestrian = df_full['type'] == 'pedestrian'
df_ped = df_full[is_pedestrian]
print(df_ped.shape[0])

In [None]:
# Display the pedestrian-stop frame as the cell output.
df_ped

In [None]:
# Same pipeline as for all stops, restricted to the pedestrian stops.

# One row per (year, officer): race, sex and the minimum years of service
# and age observed over the officer's pedestrian stops of that year.
# Several columns must be selected from a groupby with a list; a bare tuple
# raises on pandas >= 2.0.
df_officers_ped = df_ped.groupby(['year', 'officer_id_hash', 'officer_race', 'officer_sex'])[['officer_yos', 'officer_age']]
df_officers_ped = df_officers_ped.min().reset_index()
df_officers_ped.set_index(['year', 'officer_id_hash'], inplace=True, verify_integrity=True)
df_officers_ped.head()

# build dataframe linking (year, officer) to their number of stops per subject race
df_yearly_ped_all = df_ped.groupby(['year', 'officer_id_hash', 'subject_race'])['date'].count().to_frame().reset_index()
df_yearly_ped_all.rename(columns={'date':'stops'}, inplace=True)
df_yearly_ped_all = df_yearly_ped_all.pivot_table(columns='subject_race', values='stops', index=['year', 'officer_id_hash'], fill_value=0)
df_yearly_ped_all['total'] = df_yearly_ped_all.sum(axis=1)
df_yearly_ped_all.head()

# drop the (year, officer) entries whose total does not exceed the threshold
stop_threshold = 10
# .copy() so that the columns added below are not written to a slice of
# df_yearly_ped_all (avoids SettingWithCopyWarning).
df_yearly_ped = df_yearly_ped_all[df_yearly_ped_all['total'] > stop_threshold].copy()
print(f'There are {len(df_yearly_ped)} entries left')

# add raw bias: share of the stops involving race m
for m in minorities:
    df_yearly_ped[f'raw_bias_{m}'] = df_yearly_ped[m] / df_yearly_ped['total']

# compute medians
raw_bias_medians_ped = { m : df_yearly_ped[f'raw_bias_{m}'].median() for m in minorities}

# add bias: deviation of the raw bias from the median
for m in minorities:
    df_yearly_ped[f'bias_{m}'] = df_yearly_ped[f'raw_bias_{m}'] - raw_bias_medians_ped[m]

bias_df_ped = df_yearly_ped.merge(df_officers_ped, how='left', left_index=True, right_index=True, validate='one_to_one')

# fit models on the pedestrian data (bias_df_ped); fitting on bias_df would
# just repeat the all-stops regression
for m in minorities:
    print()
    print(f'--------------{m.upper()}--------------')
    res = smf.ols(formula=f'bias_{m} ~ C(officer_race) + C(officer_sex) + officer_age', data=bias_df_ped).fit()
    print(res.summary())