# Data analysis :  Gender disparities in police stops across the United States


In [29]:
# Libraries needed 
import pandas as pd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import math
from pathlib import Path

import collections 
from collections import Counter

## Data wrangling
- Datasets choice and loading
- Filtering of incomplete rows and unnecessary features
- Formatting

The datasets handled are : 
- Set 1 : NC (Charlotte, Greensboro, Raleigh), Austin (TX), CA (San Diego), Maryland state (MD)
- Set 2 : NC (Charlotte, Greensboro, Raleigh), CA (San Diego and San Francisco), Nashville (TN), New Orleans (Louisiana), Maryland state (MD)
- Case study : Florida state (FL)

Also available : Washington state (WA) with officers' gender, Louisville (Kentucky), and Pittsburgh (Pennsylvania) for female police officer fraction computation. 

For the different analyses of our study, there are mandatory features that must appear in each dataset for the analysis to be possible. These are listed before each section in this notebook. The remaining interesting features are left in the filtered datasets, for potential data description. 
Some extra effort was moreover made to find various states and cities across the USA that differ geographically and in their potential sexist bias. Indeed, sexism could differ from state to state (which is one of the research questions of this work), hence each dataset is annotated with an indication of whether the state is a priori sexist or not, following https://www.chicagotribune.com/nation-world/ct-america-most-sexist-places-20180821-story.html. 

### For set 1 : 
    - age, gender, race
    - reason for search + search_person and search_vehicle
    - contraband_found
    
    
North Carolina : Charlotte, Greensboro and Raleigh - very sexist

In [None]:
# North Carolina: load the three city datasets, then filter each of them in place
df_charlotte = pd.read_csv('data_P4/yg821jf8611_nc_charlotte_2020_04_01.csv.zip', compression='zip')
df_raleigh = pd.read_csv('data_P4/yg821jf8611_nc_raleigh_2020_04_01.csv.zip', compression='zip')
df_greensboro = pd.read_csv('data_P4/yg821jf8611_nc_greensboro_2020_04_01.csv.zip', compression='zip')
name = ['Charlotte','Raleigh','Greensboro']

# Mandatory features for set 1: rows missing any of them are discarded
required_cols = ['subject_age', 'subject_race', 'subject_sex', 'contraband_found',
                 'search_person', 'search_vehicle', 'reason_for_search']
# Every column kept after filtering (mandatory + optional descriptive features)
kept_cols = required_cols + ['citation_issued', 'warning_issued', 'arrest_made',
                             'frisk_performed', 'reason_for_frisk', 'reason_for_stop']

for city, df in zip(name, [df_charlotte, df_raleigh, df_greensboro]):
    print('\n For : '+str(city))
    print(set(df.columns))
    print('Dimensions of dataframe before filtering :'+str(df.shape))
    
    # filtering (in place, so that df_charlotte / df_raleigh / df_greensboro are updated)
    df.dropna(subset=required_cols, inplace=True)
    # `columns=` instead of a positional axis: the positional form was removed in pandas 2.0
    df.drop(columns=df.columns.difference(kept_cols), inplace=True)
    
    print('Dimensions of dataframe after :'+str(df.shape))
    print(df.head())

San Diego (California) - not so sexist
- without frisk information

In [None]:
# San Diego (CA): load, report the raw shape, then keep only usable rows/columns
df_sandiego = pd.read_csv('data_P4/without_frisk_performed/yg821jf8611_ca_san_diego_2020_04_01.csv.zip', compression='zip')

print('\n For : San Diego')
print(set(df_sandiego.columns))
print('Dimensions of dataframe before filtering :'+str(df_sandiego.shape))

# Mandatory features for set 1: rows missing any of them are discarded
required_cols = ['subject_age', 'subject_race', 'subject_sex', 'contraband_found',
                 'search_person', 'search_vehicle', 'reason_for_search']
# Every column kept after filtering (mandatory + optional descriptive features)
kept_cols = required_cols + ['citation_issued', 'warning_issued', 'arrest_made',
                             'frisk_performed', 'reason_for_frisk', 'reason_for_stop']

#filtering
df_sandiego.dropna(subset=required_cols, inplace=True)
# `columns=` instead of a positional axis: the positional form was removed in pandas 2.0
df_sandiego.drop(columns=df_sandiego.columns.difference(kept_cols), inplace=True)
print('Dimensions of dataframe after :'+str(df_sandiego.shape))
print(df_sandiego.head())

Austin (TX) - very sexist

In [30]:
# Austin (TX): load, report the raw shape, then keep only usable rows/columns
df_austin = pd.read_csv('data_P4/yg821jf8611_tx_austin_2020_04_01.csv.zip', compression='zip')

# The banner previously said "San Diego" (copy-paste slip) although this is the Austin dataset
print('\n For : Austin')
print(set(df_austin.columns))
print('Dimensions of dataframe before filtering :'+str(df_austin.shape))

# Mandatory features for set 1; in this dataset the search reason is stored in
# 'raw_person_search_search_based_on' (renamed to 'reason_for_search' below)
required_cols = ['subject_age', 'subject_race', 'subject_sex', 'contraband_found',
                 'search_person', 'search_vehicle', 'raw_person_search_search_based_on']
# Every column kept after filtering (mandatory + optional descriptive features)
kept_cols = required_cols + ['citation_issued', 'warning_issued', 'arrest_made',
                             'frisk_performed', 'reason_for_frisk', 'reason_for_stop']

#filtering
df_austin.dropna(subset=required_cols, inplace=True)
# `columns=` instead of a positional axis: the positional form was removed in pandas 2.0
df_austin.drop(columns=df_austin.columns.difference(kept_cols), inplace=True)
print('Dimensions of dataframe after :'+str(df_austin.shape))
#formatting: align the column name with the other datasets
df_austin = df_austin.rename(columns={'raw_person_search_search_based_on': 'reason_for_search'})
print(df_austin.head())


 For : San Diego
{'raw_vehicle_search_search_based_on', 'raw_vehicle_searched', 'raw_street_check_description', 'subject_race', 'vehicle_registration_state', 'vehicle_model', 'reason_for_stop', 'raw_person_search_search_discovered', 'subject_sex', 'contraband_drugs', 'search_conducted', 'search_vehicle', 'raw_person_search_search_based_on', 'officer_id_hash', 'raw_race_description', 'frisk_performed', 'raw_person_searched', 'raw_row_number', 'date', 'type', 'contraband_weapons', 'vehicle_year', 'vehicle_make', 'subject_age', 'raw_vehicle_search_search_discovered', 'search_basis', 'search_person', 'contraband_found', 'raw_ethnicity'}
Dimensions of dataframe before filtering :(483255, 29)
Dimensions of dataframe after :(15039, 9)
     subject_age subject_race subject_sex contraband_found  frisk_performed  \
406         19.0        black        male             True            False   
416         27.0        black        male            False             True   
420         56.0     his

Maryland - least sexist state

In [None]:
# Maryland (statewide): load, report the raw shape, then keep only usable rows/columns
df_MD = pd.read_csv('data_P4/without_frisk_performed/yg821jf8611_md_statewide_2020_04_01.csv.zip', compression='zip')

print('\n For : Maryland')
print(set(df_MD.columns))
print('Dimensions of dataframe before filtering :'+str(df_MD.shape))

# Mandatory features for set 1: rows missing any of them are discarded
required_cols = ['subject_age', 'subject_race', 'subject_sex', 'contraband_found',
                 'search_person', 'search_vehicle', 'reason_for_search']
# Every column kept after filtering (mandatory + optional descriptive features)
kept_cols = required_cols + ['citation_issued', 'warning_issued', 'arrest_made',
                             'frisk_performed', 'reason_for_frisk', 'reason_for_stop']

#filtering
df_MD.dropna(subset=required_cols, inplace=True)
# `columns=` instead of a positional axis: the positional form was removed in pandas 2.0
df_MD.drop(columns=df_MD.columns.difference(kept_cols), inplace=True)
print('Dimensions of dataframe after :'+str(df_MD.shape))
print(df_MD.head())

### For set 2 :
    - age, gender, race
    - reason for stop + contraband found
    - citation OR warning OR arrest

San Francisco (California) - not so sexist

- without frisk information

In [None]:
# San Francisco (CA): load, report the raw shape, then keep only usable rows/columns
df_sanfrancisco = pd.read_csv('data_P4/without_frisk_performed/yg821jf8611_ca_san_francisco_2020_04_01.csv.zip', compression='zip')

print('\n For San Francisco')
print(set(df_sanfrancisco.columns))
print('Dimensions of dataframe before filtering :'+str(df_sanfrancisco.shape))

# Mandatory features for set 2: rows missing any of them are discarded
required_cols = ['subject_age', 'subject_race', 'subject_sex', 'contraband_found', 'reason_for_stop']
# Every column kept after filtering (mandatory + optional descriptive features)
kept_cols = ['subject_age', 'subject_race', 'subject_sex', 'contraband_found',
             'search_person', 'search_vehicle', 'reason_for_search',
             'citation_issued', 'warning_issued', 'arrest_made',
             'frisk_performed', 'reason_for_frisk', 'reason_for_stop']

#filtering 
df_sanfrancisco.dropna(subset=required_cols, inplace=True)
# `columns=` instead of a positional axis: the positional form was removed in pandas 2.0
df_sanfrancisco.drop(columns=df_sanfrancisco.columns.difference(kept_cols), inplace=True)
print('Dimensions of dataframe after :'+str(df_sanfrancisco.shape))
print(df_sanfrancisco.head())

Nashville (Tennessee) - very sexist

In [None]:
# Nashville (TN): load, report the raw shape, then keep only usable rows/columns
df_nashville = pd.read_csv('data_P4/yg821jf8611_tn_nashville_2020_04_01.csv.zip', compression='zip')

print('\n For Nashville (TN)')
print(set(df_nashville.columns))
print('Dimensions of dataframe before filtering :'+str(df_nashville.shape))

# Mandatory features for set 2: rows missing any of them are discarded
required_cols = ['subject_age', 'subject_race', 'subject_sex', 'contraband_found', 'reason_for_stop']
# Every column kept after filtering (mandatory + optional descriptive features)
kept_cols = ['subject_age', 'subject_race', 'subject_sex', 'contraband_found',
             'search_person', 'search_vehicle', 'reason_for_search',
             'citation_issued', 'warning_issued', 'arrest_made',
             'frisk_performed', 'reason_for_frisk', 'reason_for_stop']

#filtering 
df_nashville.dropna(subset=required_cols, inplace=True)
# `columns=` instead of a positional axis: the positional form was removed in pandas 2.0
df_nashville.drop(columns=df_nashville.columns.difference(kept_cols), inplace=True)
print('Dimensions of dataframe after :'+str(df_nashville.shape))
print(df_nashville.head())

New Orleans (Louisiana) - sexist

In [None]:
# New Orleans (LA): load, report the raw shape, then keep only usable rows/columns
df_neworleans = pd.read_csv('data_P4/yg821jf8611_la_new_orleans_2020_04_01.csv.zip', compression='zip')

print('\n For New Orleans (Louisiana)')
print(set(df_neworleans.columns))
print('Dimensions of dataframe before filtering :'+str(df_neworleans.shape))

# Mandatory features for set 2: rows missing any of them are discarded
required_cols = ['subject_age', 'subject_race', 'subject_sex', 'contraband_found', 'reason_for_stop']
# Every column kept after filtering (mandatory + optional descriptive features)
kept_cols = ['subject_age', 'subject_race', 'subject_sex', 'contraband_found',
             'search_person', 'search_vehicle', 'reason_for_search',
             'citation_issued', 'warning_issued', 'arrest_made',
             'frisk_performed', 'reason_for_frisk', 'reason_for_stop']

#filtering 
df_neworleans.dropna(subset=required_cols, inplace=True)
# `columns=` instead of a positional axis: the positional form was removed in pandas 2.0
df_neworleans.drop(columns=df_neworleans.columns.difference(kept_cols), inplace=True)
print('Dimensions of dataframe after :'+str(df_neworleans.shape))
print(df_neworleans.head())

### For the case study : 

Florida state - sexist
- no information on contraband
- officer's gender

In [None]:
# Florida (statewide): load, report officer gender counts, then filter rows/columns
df_FL = pd.read_csv('data_P4/yg821jf8611_fl_statewide_2020_04_01.csv.zip', compression='zip')

print(set(df_FL.columns))
print('Dimensions of dataframe :'+str(df_FL.shape))
                            
print('Male officers :'+format(len(df_FL[df_FL['officer_sex']=='male'])))
print('Female officers :'+format(len(df_FL[df_FL['officer_sex']=='female'])))
# NOTE(review): the two numbers below are hard-coded, presumably copied from a
# previous run of the counts printed above - confirm they still match the data.
print('Fraction of women officers stop record before filtering : '+str(326525/4352689))

# Mandatory features for the case study: rows missing any of them are discarded
required_cols = ['subject_age', 'subject_race', 'subject_sex', 'officer_sex', 'violation', 'reason_for_stop']
# Every column kept after filtering (mandatory + optional descriptive features)
kept_cols = ['subject_age', 'subject_race', 'subject_sex', 'contraband_found', 'violation', 'officer_sex',
             'search_person', 'search_vehicle', 'reason_for_search',
             'citation_issued', 'warning_issued', 'arrest_made',
             'frisk_performed', 'reason_for_frisk', 'reason_for_stop']

# filtering
df_FL.dropna(subset=required_cols, inplace=True)
# `columns=` instead of a positional axis: the positional form was removed in pandas 2.0
df_FL.drop(columns=df_FL.columns.difference(kept_cols), inplace=True)

### Other available datasets : 

Washington state - middle sexist
 - contains officer's gender 
 - does not contain reason for search nor stop

In [None]:
# Washington state - middle
df_WA = pd.read_csv('data_P4/yg821jf8611_wa_statewide_2020_04_01.csv.zip', compression='zip')
# contains officers' gender

# Officer gender counts before filtering
print(set(df_WA.columns))
print('Male officers :'+format(len(df_WA[df_WA['officer_sex']=='male'])))
print('Female officers :'+format(len(df_WA[df_WA['officer_sex']=='female'])))
print('Dimensions of dataframe :'+str(df_WA.shape))

# Rows missing any of these features are discarded
required_cols = ['subject_age', 'subject_race', 'subject_sex', 'contraband_found', 'search_conducted']
# Every column kept after filtering ('search_conducted' is only used for the row filter)
kept_cols = ['subject_age', 'subject_race', 'subject_sex', 'contraband_found',
             'search_person', 'search_vehicle', 'reason_for_search',
             'citation_issued', 'warning_issued', 'arrest_made',
             'frisk_performed', 'reason_for_frisk', 'reason_for_stop', 'officer_sex']

df_WA.dropna(subset=required_cols, inplace=True)
# `columns=` instead of a positional axis: the positional form was removed in pandas 2.0
df_WA.drop(columns=df_WA.columns.difference(kept_cols), inplace=True)
                                     
# Officer gender counts after filtering
print('Male officers :'+format(len(df_WA[df_WA['officer_sex']=='male'])))
print('Female officers :'+format(len(df_WA[df_WA['officer_sex']=='female'])))
print('Dimensions of dataframe :'+str(df_WA.shape))
# NOTE(review): the numbers below are hard-coded, presumably copied from a
# previous run of the counts printed above - confirm they still match the data.
print('Fraction of women officers stop record before filtering : '+str(721258/10612167)+' fraction after : '+str(13757/225840))
df_WA.head()

The following datasets are loaded only to show how few female police officers there are: between 4 and 9% of the stops were made by women, based on calculations for Louisville (KY), Pittsburgh (Pennsylvania) and Washington (state).

In [None]:
# Louisville (KY) - sexist. Loaded only to report the officers' gender split.
df_louisville = pd.read_csv('data_P4/yg821jf8611_ky_louisville_2020_04_01.csv.zip', compression='zip')
print(set(df_louisville.columns))
print('Dimensions of dataframe :'+str(df_louisville.shape))

# Number of stop records per officer gender (0 when a gender never appears)
officer_counts = df_louisville['officer_sex'].value_counts()
print('Male officers :'+format(officer_counts.get('male', 0)))
print('Female officers :'+format(officer_counts.get('female', 0)))

# NOTE(review): hard-coded figures, presumably copied from a previous run of the counts above
print('Fraction of women officers stop record before filtering : '+str(4870/105934))

In [None]:
# Pittsburgh (Pennsylvania) - middle. Loaded only to report the officers' gender split.
df_PA_P = pd.read_csv('data_P4/yg821jf8611_pa_pittsburgh_2020_04_01.csv.zip', compression='zip')
print(set(df_PA_P.columns))
print('Dimensions of dataframe :'+str(df_PA_P.shape))

# Number of stop records per officer gender (0 when a gender never appears)
officer_counts = df_PA_P['officer_sex'].value_counts()
print('Male officers :'+format(officer_counts.get('male', 0)))
print('Female officers :'+format(officer_counts.get('female', 0)))

# NOTE(review): hard-coded figures, presumably copied from a previous run of the counts above
print('Fraction of women officers stop record before filtering : '+str(16503/197809))

To save the filtered datasets : 

In [None]:
# Filtered dataframes and, in the same order, the file stem each one is saved under
DF = [
    df_charlotte, df_raleigh, df_greensboro, df_austin, df_neworleans, df_sandiego,
    df_sanfrancisco, df_nashville, df_WA, df_MD, df_FL,
]
name = [
    'df_charlotte', 'df_raleigh', 'df_greensboro', 'df_austin', 'df_neworleans', 'df_sandiego',
    'df_sanfrancisco', 'df_nashville', 'df_WA', 'df_MD', 'df_FL',
]

# Write every dataframe to data_P4/<stem>.csv (the index is written as the first column)
for file_stem, frame in zip(name, DF):
    frame.to_csv('data_P4/'+file_stem+'.csv', sep=',', encoding='utf-8')

## Set 1 : Gender disparities in search decisions

The aim of this first part is to compare for W-M pairs the difference in search decision. The following method is applied :
-  compare only individuals presenting similar subjective signals (except gender) to the police officer. For this, the following features are taken into account : 
     - match only person of same race
     - match only person of different gender
     - match only person with the same age range : this one is subjectively determined by assuming that only women aged 45 or younger (`subject_age < 46` in the code) could be attractive enough to generate a significant difference in decision outcome  
     - match only person with the same behaviour during the traffic stop (feature : 'reason_for_search') 
- Compute difference in search (potentially also frisk) decision + confidence intervals
- Fisher exact test to assess if the hit rates are significantly different between W and M. If the difference is not significant, it is not warranted to search one gender more than the other, so the difference in search decision is not justified and we conclude that there is gender discrimination.

Citation and arrest are considered as severe outcome decisions (1), whereas warning is considered as less severe (0). There is a difference between a person's search and a frisk : the former is more intrusive than the latter (see https://www.carrolltrobermanlaw.com/blog/2018/september/whats-the-difference-between-a-search-and-a-fris/). 


If available, the following information are provided in each dataframe :

    'subject_age'
    'subject_race'
    'subject_sex'
    'contraband_found'
    'search_person'
    'search_vehicle'
    'reason_for_search'
    'citation_issued'
    'warning_issued'
    'arrest_made'
    'frisk_performed'
    'reason_for_frisk'
    'reason_for_stop'
    'officer_sex' (only for WA, FL and Louisville)



Generic functions : 

In [34]:
def DF_handling(df, age_cutoff=46):
    """Add the helper columns used by the matching step.

    The dataframe is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'subject_age' column.
    age_cutoff : int or float, default 46
        Subjects strictly younger than this value are flagged as "attractive"
        (age is only used as a proxy).

    Returns
    -------
    pandas.DataFrame
        The same object as `df`, with two extra columns:
        'id' (a copy of the index) and 'attractiveness' (boolean).
    """
    # create a index column
    df['id'] = df.index

    # Age category (proxy for attractiveness). The vectorized comparison gives
    # the same result as a row-wise apply (missing ages compare as False), but
    # is much faster and also works on an empty dataframe.
    df['attractiveness'] = df['subject_age'] < age_cutoff

    return df

def match_pairs(df, reason_searches):
    """Pair each woman with a man sharing the same observable attributes.

    Records are grouped by search reason, race, age category
    ('attractiveness'), contraband outcome and whether the vehicle was
    searched. Inside each group the i-th man is paired with the i-th woman,
    until the smaller of the two genders is exhausted.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'id', 'reason_for_search', 'subject_race',
        'attractiveness', 'contraband_found', 'search_vehicle' and
        'subject_sex'.
    reason_searches : iterable of str
        Search reasons to consider; records with any other reason are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per matched pair, with columns 'men' and 'women' holding the
        'id' of the matched man and woman.
    """
    pairs = []
    # Match 1 woman and 1 man with same attributes
    for reason in reason_searches:
        # Boolean masks instead of a string-built query: reasons containing
        # quotes no longer break the expression.
        by_reason = df[df['reason_for_search'] == reason]
        for race in ('white', 'black', 'hispanic'):
            by_race = by_reason[by_reason['subject_race'] == race]
            if by_race.empty:
                continue
            for attractiveness in (True, False):
                for contraband in (True, False):
                    for s_vehicle in (True, False):
                        stratum = by_race[
                            (by_race['search_vehicle'] == s_vehicle)
                            & (by_race['attractiveness'] == attractiveness)
                            & (by_race['contraband_found'] == contraband)
                        ]
                        id_M = stratum.loc[stratum['subject_sex'] == 'male', 'id'].to_numpy()
                        id_W = stratum.loc[stratum['subject_sex'] == 'female', 'id'].to_numpy()
                        # min() also covers groups with as many men as women,
                        # which previously produced no pair at all.
                        n_pairs = min(len(id_M), len(id_W))
                        pairs.extend(zip(id_M[:n_pairs], id_W[:n_pairs]))
    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # growing a frame row by row is quadratic.
    return pd.DataFrame(pairs, columns=['men', 'women'])

def AOD_s(df, matching):
    """Averaged outcome difference (AOD) in the person-search decision.

    For every matched pair the difference `search_person(man) -
    search_person(woman)` is computed (each value being 0 or 1); the AOD is
    the mean of these differences.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'id' (unique) and 'search_person'.
    matching : pandas.DataFrame
        One row per pair, with columns 'men' and 'women' holding 'id' values.

    Returns
    -------
    AOD : float
        Mean of the per-pair differences; NaN when there is no pair.
    diff : numpy.ndarray
        Per-pair differences (float), in the order of `matching`.

    Raises
    ------
    ValueError
        If 'id' values are not unique in `df`.
    KeyError
        If a matched id is absent from `df`.
    """
    N_matches = len(matching)
    if N_matches == 0:
        # Nothing to average: avoid the 0/0 division and its runtime warning
        return float('nan'), np.zeros(0)

    # One id-indexed lookup table instead of a full scan of df per pair
    search_by_id = df.set_index('id')['search_person'].astype(int)
    if not search_by_id.index.is_unique:
        raise ValueError("'id' values must be unique to look up matched pairs")

    # matching may hold the ids as floats/objects; align them with df's id dtype
    id_dtype = df['id'].dtype
    s_man = search_by_id.loc[matching['men'].astype(id_dtype)].to_numpy()
    s_woman = search_by_id.loc[matching['women'].astype(id_dtype)].to_numpy()

    diff = (s_man - s_woman).astype(float)
    AOD = diff.sum()/N_matches

    return AOD, diff

def bootstrap_CI(data, nbr_draws, confidence=0.95, seed=None):
    """Percentile bootstrap confidence interval of the mean.

    Bootstrap CI function from exercise session 2. `data` is resampled with
    replacement `nbr_draws` times; the interval is taken from the percentiles
    of the resampled means (NaN values are ignored).

    Parameters
    ----------
    data : array-like
        One-dimensional sample.
    nbr_draws : int
        Number of bootstrap resamples.
    confidence : float, default 0.95
        Two-sided confidence level, strictly between 0 and 1. The default
        gives the 2.5th/97.5th percentiles, i.e. the 95% interval reported by
        the caller (the 5th/95th percentiles used before are a 90% interval).
    seed : int or None, default None
        Seed for reproducible resampling. When None, numpy's global random
        state is used.

    Returns
    -------
    list of float
        [lower bound, upper bound]; [nan, nan] when `data` is empty.

    Raises
    ------
    ValueError
        If `confidence` is not strictly between 0 and 1.
    """
    if not 0 < confidence < 1:
        raise ValueError('confidence must be strictly between 0 and 1')

    data = np.array(data)
    if data.size == 0:
        # randint(0, 0, ...) would raise; an empty sample has no interval
        return [float('nan'), float('nan')]

    # np.random (global state) and RandomState both expose randint
    rng = np.random if seed is None else np.random.RandomState(seed)

    means = np.zeros(nbr_draws)
    for n in range(nbr_draws):
        indices = rng.randint(0, len(data), len(data))
        means[n] = np.nanmean(data[indices])

    lower_pct = 100 * (1 - confidence) / 2
    return [np.nanpercentile(means, lower_pct), np.nanpercentile(means, 100 - lower_pct)]

In [32]:
def state(df, n):
    """Run the set 1 analysis (search decision) for one state or city.

    - plot and print the distribution of the search reasons, which is what
      `n` (the number of reasons kept for the analysis) is chosen from;
    - match woman-man pairs on the `n` most frequent reasons;
    - print the averaged outcome difference (AOD) and its bootstrap
      confidence interval.

    Parameters
    ----------
    df : pandas.DataFrame
        Filtered dataset; it receives the extra columns added by DF_handling.
    n : int
        Number of most frequent search reasons kept for the matching.

    Returns
    -------
    None
        Results are plotted and printed.
    """
    df = DF_handling(df)

    # Feature analysis : reason for search.
    # Counted once, without missing values, so that the plot, the printed
    # count and the selected reasons all rely on the same distribution.
    reason_counts = Counter(df['reason_for_search'].dropna())
    ranked_reasons = sorted(reason_counts.items(), key=lambda x: x[1], reverse=True)

    # Plot of the possible combination of the reason for search
    plt.figure(figsize=[18,8])
    plt.bar(list(reason_counts.keys()), list(reason_counts.values()))
    plt.xlabel('reason_for_search')
    plt.ylabel('Number of records')

    print('There are %d combinations of reasons for the policeman to decide to operate a search, among which, the most represented combinations are (in descending order of importance) :' %len(reason_counts))
    print(ranked_reasons[:13])

    # Possible reasons for search : n combinations chosen
    reason_searches = [reason for reason, _ in ranked_reasons[:n]]

    matching = match_pairs(df, reason_searches)
    if len(matching) == 0:
        # No pair means no difference to average and nothing to resample
        print('No woman-man pair could be matched : the averaged outcome difference cannot be computed')
        return

    [AOD, diff] = AOD_s(df, matching)

    CI = bootstrap_CI(diff, 1000)

    print('Averaged outcome difference %.4f with [%.4f, %.4f] 95%% confidence intervals' %(AOD, CI[0], CI[1]))

In [None]:
# Compute the AOD and interval of confidence for each desired state or city.
# Each entry pairs a dataset with n, the number of search reasons kept for it.
set1_runs = [
    (df_charlotte, 3),
    (df_greensboro, 4),
    (df_raleigh, 3),
    (df_MD, 3),
    (df_sandiego, 4),
    (df_austin, 2),
]
for dataset, n_reasons in set1_runs:
    state(dataset, n=n_reasons)

## Set 2 : Gender disparities in stop outcome decisions


## Case study : Gender disparities in Florida state policing 

In [None]:
# Formatting
df_FL['id']=df_FL.index