# Data analysis 2: Gender disparities stops outcome

In [1]:
# Libraries

import numpy as np
import pandas as pd
import collections
from collections import Counter
import matplotlib.pyplot as plt


In [2]:
# Import data
df_charlotte = pd.read_csv('data/df_charlotte.csv')
df_greensboro = pd.read_csv('data/df_greensboro.csv')
df_nashville = pd.read_csv('data/df_nashville.csv')
df_neworleans = pd.read_csv('data/df_neworleans.csv')
df_raleigh = pd.read_csv('data/df_raleigh.csv')
df_sandiego = pd.read_csv('data/df_sandiego.csv')
df_sanfrancisco = pd.read_csv('data/df_sanfrancisco.csv')
df_WA = pd.read_csv('data/df_WA.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


## NC Charlotte

In [4]:
df_charlotte = df_charlotte.rename(columns={'Unnamed: 0':'id'})

In [5]:
# Age categorie (proxy for attractiveness)
df_charlotte['attractiveness'] = df_charlotte.apply(lambda x: True if (x.subject_age<46) else False, axis=1)

In [6]:
# Outcome value: arrest, citation = 1 and warning = 0
df_charlotte['outcome'] = df_charlotte.apply(lambda x: 1 if (x.arrest_made or x.citation_issued) else 0, axis=1)
df_charlotte.head(2)

Unnamed: 0,id,subject_age,subject_race,subject_sex,arrest_made,citation_issued,warning_issued,contraband_found,frisk_performed,search_person,search_vehicle,reason_for_frisk,reason_for_search,reason_for_stop,attractiveness,outcome
0,81,21.0,black,male,True,False,False,True,False,True,True,,Other Official Information,Vehicle Equipment Violation,True,1
1,82,21.0,black,male,True,False,False,False,False,True,True,,Other Official Information,Investigation,True,1


In [7]:
# Possible reasons for stop
reason_stops = list(Counter(df_charlotte['reason_for_stop']).keys())
Counter(df_charlotte['reason_for_stop'])

Counter({'Vehicle Equipment Violation': 14280,
         'Investigation': 12125,
         'Other Motor Vehicle Violation': 7564,
         'Driving While Impaired': 3912,
         'Vehicle Regulatory Violation': 27893,
         'Speed Limit Violation': 10800,
         'Safe Movement Violation': 6156,
         'Stop Light/Sign Violation': 4806,
         'Seat Belt Violation': 5028,
         'Checkpoint': 314})

We find 10 different reason for stops that cannot combine. We keep them all.

In [8]:
matching=pd.DataFrame({'men':[],'women':[]})
AOD = 0
# Match 1 woman and 1 man with same attributes
for reason in reason_stops:
    for race in ('white','black','hispanic'):
        for attractiveness in ('True', 'False'):
            for contraband in ('True', 'False'):
                query = "reason_for_stop=='"+reason+"' & subject_race=='"+race+"' & attractiveness=="+attractiveness+" & contraband_found=="+contraband
                data = df_charlotte.query(query).copy()
                id_M = np.array(data[data['subject_sex']=='male'].id)
                id_W = np.array(data[data['subject_sex']=='female'].id)
                if ((len(id_M)>0) & (len(id_W)>0)):
                    if (len(id_M)>len(id_W)):
                        for i in np.arange(0,len(id_W)):
                            matching = matching.append({'men':id_M[i],'women':id_W[i]},ignore_index=True)
                          
                    if (len(id_M)<len(id_W)):
                        for i in np.arange(0,len(id_M)):
                            matching = matching.append({'men':id_M[i],'women':id_W[i]},ignore_index=True)
                       
matching.head(5)                       

Unnamed: 0,men,women
0,969.0,10634.0
1,1941.0,19204.0
2,2327.0,19579.0
3,5978.0,21292.0
4,17379.0,36322.0


In [9]:
# Averaged outcome difference
N_matches = len(matching)
diff = np.zeros(N_matches)
for i in np.arange(0,N_matches):
    outcome_man = np.array(df_charlotte[df_charlotte['id']==matching.loc[i,'men']].outcome)
    outcome_woman = np.array(df_charlotte[df_charlotte['id']==matching.loc[i,'women']].outcome)
    diff[i] = outcome_man-outcome_woman                    
AOD = diff.sum()/N_matches    


In [10]:
# Confidence interval

# Bootstrap CI function from exercise session 2
def bootstrap_CI(data, nbr_draws):
    means = np.zeros(nbr_draws)
    data = np.array(data)

    for n in range(nbr_draws):
        indices = np.random.randint(0, len(data), len(data))
        data_tmp = data[indices] 
        means[n] = np.nanmean(data_tmp)

    return [np.nanpercentile(means, 5),np.nanpercentile(means, 95)]

CI = bootstrap_CI(diff, 1000)

print('Averaged outcome difference %.4f with [%.4f, %.4f] 95%% confidence intervals' %(AOD, CI[0], CI[1]))



Averaged outcome difference 0.0245 with [0.0164, 0.0326] 95% confidence intervals


## TN Nashville

In [11]:
df_nashville = df_nashville.rename(columns={'Unnamed: 0':'id'})
# Age categorie (proxy for attractiveness)
df_nashville['attractiveness'] = df_nashville.apply(lambda x: True if (x.subject_age<46) else False, axis=1)
# Outcome value: arrest, citation = 1 and warning = 0
df_nashville['outcome'] = df_nashville.apply(lambda x: 1 if (x.arrest_made or x.citation_issued) else 0, axis=1)
df_nashville.head(2)

Unnamed: 0,id,subject_age,subject_race,subject_sex,arrest_made,citation_issued,warning_issued,contraband_found,reason_for_stop,attractiveness,outcome
0,4,21.0,black,male,False,False,True,False,vehicle equipment violation,True,0
1,26,49.0,black,male,True,True,False,True,vehicle equipment violation,False,1


In [12]:
# Possible reasons for stop
reason_stops = list(Counter(df_nashville['reason_for_stop']).keys())
Counter(df_nashville['reason_for_stop'])

Counter({'vehicle equipment violation': 42376,
         'investigative stop': 8602,
         'moving traffic violation': 54808,
         'seatbelt violation': 5232,
         'registration': 7945,
         'safety violation': 7098,
         'parking violation': 301,
         'child restraint': 49})

We find 8 different reason for stops that cannot combine. We keep them all.

In [13]:
matching=pd.DataFrame({'men':[],'women':[]})
AOD = 0
# Match 1 woman and 1 man with same attributes
for reason in reason_stops:
    for race in ('white','black','hispanic'):
        for attractiveness in ('True', 'False'):
            for contraband in ('True', 'False'):
                query = "reason_for_stop=='"+reason+"' & subject_race=='"+race+"' & attractiveness=="+attractiveness+" & contraband_found=="+contraband
                data = df_nashville.query(query).copy()
                id_M = np.array(data[data['subject_sex']=='male'].id)
                id_W = np.array(data[data['subject_sex']=='female'].id)
                if ((len(id_M)>0) & (len(id_W)>0)):
                    if (len(id_M)>len(id_W)):
                        for i in np.arange(0,len(id_W)):
                            matching = matching.append({'men':id_M[i],'women':id_W[i]},ignore_index=True)
                          
                    if (len(id_M)<len(id_W)):
                        for i in np.arange(0,len(id_M)):
                            matching = matching.append({'men':id_M[i],'women':id_W[i]},ignore_index=True)
matching.head(5)                       

Unnamed: 0,men,women
0,197.0,696.0
1,1561.0,6801.0
2,3701.0,8070.0
3,4922.0,9394.0
4,5814.0,9890.0


In [20]:
# Averaged outcome difference
N_matches = len(matching)
diff = np.zeros(N_matches)
for i in np.arange(0,N_matches):
    outcome_man = np.array(df_nashville[df_nashville['id']==matching.loc[i,'men']].outcome)
    outcome_woman = np.array(df_nashville[df_nashville['id']==matching.loc[i,'women']].outcome)
    diff[i] = outcome_man-outcome_woman                    
AOD = diff.sum()/N_matches    


In [21]:
# Confidence interval

CI = bootstrap_CI(diff, 1000)

print('Averaged outcome difference %.4f with [%.4f, %.4f] 95%% confidence intervals' %(AOD, CI[0], CI[1]))


Averaged outcome difference 0.0227 with [0.0164, 0.0285] 95% confidence intervals


## LA New Orleans

In [22]:
df_neworleans = df_neworleans.rename(columns={'Unnamed: 0':'id'})
# Age categorie (proxy for attractiveness)
df_neworleans['attractiveness'] = df_neworleans.apply(lambda x: True if (x.subject_age<46) else False, axis=1)
# Outcome value: arrest, citation = 1 and warning = 0
df_neworleans['outcome'] = df_neworleans.apply(lambda x: 1 if (x.arrest_made or x.citation_issued) else 0, axis=1)
df_neworleans.head(2)

Unnamed: 0,id,subject_age,subject_race,subject_sex,arrest_made,citation_issued,warning_issued,contraband_found,reason_for_stop,attractiveness,outcome
0,380,27.0,black,male,False,False,True,False,TRAFFIC VIOLATION,True,0
1,414,18.0,black,male,True,False,False,True,CRIMINAL VIOLATION,True,1


In [23]:
# Possible reasons for stop
reason_stops = list(Counter(df_neworleans['reason_for_stop']).keys())
Counter(df_neworleans['reason_for_stop'])

Counter({'TRAFFIC VIOLATION': 25977,
         'CRIMINAL VIOLATION': 10549,
         'CALL FOR SERVICE': 18271,
         'SUSPECT PERSON': 12146,
         'SUSPECT VEHICLE': 1001,
         'JUVENILE VIOLATION': 436,
         'OTHER': 3706,
         'FLAGGED DOWN': 1102,
         'CITIZEN CONTACT': 1698,
         'PRESENT AT CRIME SCENE': 240})

In [24]:
matching=pd.DataFrame({'men':[],'women':[]})
AOD = 0
# Match 1 woman and 1 man with same attributes
for reason in reason_stops:
    for race in ('white','black','hispanic'):
        for attractiveness in ('True', 'False'):
            for contraband in ('True', 'False'):
                query = "reason_for_stop=='"+reason+"' & subject_race=='"+race+"' & attractiveness=="+attractiveness+" & contraband_found=="+contraband
                data = df_neworleans.query(query).copy()
                id_M = np.array(data[data['subject_sex']=='male'].id)
                id_W = np.array(data[data['subject_sex']=='female'].id)
                if ((len(id_M)>0) & (len(id_W)>0)):
                    if (len(id_M)>len(id_W)):
                        for i in np.arange(0,len(id_W)):
                            matching = matching.append({'men':id_M[i],'women':id_W[i]},ignore_index=True)
                          
                    if (len(id_M)<len(id_W)):
                        for i in np.arange(0,len(id_M)):
                            matching = matching.append({'men':id_M[i],'women':id_W[i]},ignore_index=True)
matching.head(5) 

Unnamed: 0,men,women
0,680.0,2221.0
1,1725.0,8899.0
2,2222.0,15361.0
3,3761.0,21729.0
4,4956.0,24344.0


In [25]:
# Averaged outcome difference
N_matches = len(matching)
diff = np.zeros(N_matches)
for i in np.arange(0,N_matches):
    outcome_man = np.array(df_neworleans[df_neworleans['id']==matching.loc[i,'men']].outcome)
    outcome_woman = np.array(df_neworleans[df_neworleans['id']==matching.loc[i,'women']].outcome)
    diff[i] = outcome_man-outcome_woman                    
AOD = diff.sum()/N_matches 

In [26]:
# Confidence interval

CI = bootstrap_CI(diff, 1000)

print('Averaged outcome difference %.4f with [%.4f, %.4f] 95%% confidence intervals' %(AOD, CI[0], CI[1]))


Averaged outcome difference -0.0715 with [-0.0775, -0.0654] 95% confidence intervals


## CA San diego

In [27]:
df_sandiego = df_sandiego.rename(columns={'Unnamed: 0':'id'})
# Age categorie (proxy for attractiveness)
df_sandiego['attractiveness'] = df_sandiego.apply(lambda x: True if (x.subject_age<46) else False, axis=1)
# Outcome value: arrest, citation = 1 and warning = 0
df_sandiego['outcome'] = df_sandiego.apply(lambda x: 1 if (x.arrest_made or x.citation_issued) else 0, axis=1)
df_sandiego.head(2)

Unnamed: 0,id,raw_row_number,date,time,service_area,subject_age,subject_race,subject_sex,type,arrest_made,...,contraband_found,search_conducted,search_person,search_vehicle,search_basis,reason_for_search,reason_for_stop,raw_action_taken,raw_subject_race_description,attractiveness
0,80,82,2014-01-02,13:30:00,Unknown,25.0,hispanic,male,vehicular,False,...,True,True,False,True,probable cause,Odor of Contraband,Moving Violation,Verbal Warning|FI,HISPANIC,True
1,127,131,2014-01-03,06:50:00,310,28.0,hispanic,male,vehicular,False,...,False,True,False,True,other,Inventory Search,Moving Violation,Citation,HISPANIC,True


In [29]:
# Possible reasons for stop
reason_stops = list(Counter(df_sandiego['reason_for_stop']).keys())
Counter(df_sandiego['reason_for_stop'])

Counter({'Moving Violation': 5910,
         'Equipment Violation': 6736,
         'Radio Call/Citizen Contact': 310,
         'Personal Knowledge/Informant': 197,
         'Muni, County, H&S Code': 120,
         'Suspect Info (I.S., Bulletin, Log)': 130,
         'Radio Call/Citizen Contact|Moving Violation': 1,
         'Equipment Violation|Equipment Violation|Moving Violation': 1,
         '&Moving Violation': 5,
         'Suspect Info': 7,
         'UNI, &County, H&&S Code': 3,
         'Equipment Violation|Moving Violation': 5,
         'Personal Observ/Knowledge': 7,
         nan: 4,
         'Moving Violation|Equipment Violation': 2,
         'Other': 2,
         '&Equipment Violation': 1,
         'Moving Violation|Suspect Info (I.S., Bulletin, Log)': 1,
         'MUNI, County, H&S Code': 1,
         'No Cause Specified on a Card': 2,
         'Muni, County, H&S Code|Moving Violation': 1})

we keep only stop reasons habing at least 100 instances

In [35]:
df_sandiego = df_sandiego.query("reason_for_stop=='Moving Violation' or reason_for_stop=='Equipment Violation' or reason_for_stop=='Radio Call/Citizen Contact' or reason_for_stop=='Personal Knowledge/Informant' or reason_for_stop=='Suspect Info (I.S., Bulletin, Log)' ")
reason_stops = list(Counter(df_sandiego['reason_for_stop']).keys())
Counter(df_sandiego['reason_for_stop'])


Counter({'Moving Violation': 5910,
         'Equipment Violation': 6736,
         'Radio Call/Citizen Contact': 310,
         'Personal Knowledge/Informant': 197,
         'Suspect Info (I.S., Bulletin, Log)': 130})

In [36]:
matching=pd.DataFrame({'men':[],'women':[]})
AOD = 0
# Match 1 woman and 1 man with same attributes
for reason in reason_stops:
    for race in ('white','black','hispanic'):
        for attractiveness in ('True', 'False'):
            for contraband in ('True', 'False'):
                query = "reason_for_stop=='"+reason+"' & subject_race=='"+race+"' & attractiveness=="+attractiveness+" & contraband_found=="+contraband
                data = df_sandiego.query(query).copy()
                id_M = np.array(data[data['subject_sex']=='male'].id)
                id_W = np.array(data[data['subject_sex']=='female'].id)
                if ((len(id_M)>0) & (len(id_W)>0)):
                    if (len(id_M)>len(id_W)):
                        for i in np.arange(0,len(id_W)):
                            matching = matching.append({'men':id_M[i],'women':id_W[i]},ignore_index=True)
                          
                    if (len(id_M)<len(id_W)):
                        for i in np.arange(0,len(id_M)):
                            matching = matching.append({'men':id_M[i],'women':id_W[i]},ignore_index=True)
matching.head(5) 

Unnamed: 0,men,women
0,7407.0,8738.0
1,7429.0,10152.0
2,8893.0,16066.0
3,9929.0,16181.0
4,13063.0,21597.0


In [37]:
# Averaged outcome difference
N_matches = len(matching)
diff = np.zeros(N_matches)
for i in np.arange(0,N_matches):
    outcome_man = np.array(df_sandiego[df_sandiego['id']==matching.loc[i,'men']].outcome)
    outcome_woman = np.array(df_sandiego[df_sandiego['id']==matching.loc[i,'women']].outcome)
    diff[i] = outcome_man-outcome_woman                    
AOD = diff.sum()/N_matches 

In [38]:
# Confidence interval

CI = bootstrap_CI(diff, 1000)

print('Averaged outcome difference %.4f with [%.4f, %.4f] 95%% confidence intervals' %(AOD, CI[0], CI[1]))


Averaged outcome difference -0.1543 with [-0.1752, -0.1330] 95% confidence intervals


## CA San Francisco

In [39]:
df_sanfrancisco = df_sanfrancisco.rename(columns={'Unnamed: 0':'id'})
# Age categorie (proxy for attractiveness)
df_sanfrancisco['attractiveness'] = df_sanfrancisco.apply(lambda x: True if (x.subject_age<46) else False, axis=1)
# Outcome value: arrest, citation = 1 and warning = 0
df_sanfrancisco['outcome'] = df_sanfrancisco.apply(lambda x: 1 if (x.arrest_made or x.citation_issued) else 0, axis=1)
df_sanfrancisco.head(2)

Unnamed: 0,id,subject_age,subject_race,subject_sex,arrest_made,citation_issued,warning_issued,contraband_found,reason_for_stop,attractiveness,outcome
0,33182,39.0,hispanic,male,True,False,False,False,Moving Violation,True,1
1,33185,24.0,hispanic,female,True,False,False,True,Mechanical or Non-Moving Violation (V.C.),True,1


In [40]:
# Possible reasons for stop
reason_stops = list(Counter(df_sandiego['reason_for_stop']).keys())
Counter(df_sandiego['reason_for_stop'])

Counter({'Moving Violation': 5910,
         'Equipment Violation': 6736,
         'Radio Call/Citizen Contact': 310,
         'Personal Knowledge/Informant': 197,
         'Suspect Info (I.S., Bulletin, Log)': 130})

In [41]:
matching=pd.DataFrame({'men':[],'women':[]})
AOD = 0
# Match 1 woman and 1 man with same attributes
for reason in reason_stops:
    for race in ('white','black','hispanic'):
        for attractiveness in ('True', 'False'):
            for contraband in ('True', 'False'):
                query = "reason_for_stop=='"+reason+"' & subject_race=='"+race+"' & attractiveness=="+attractiveness+" & contraband_found=="+contraband
                data = df_sanfrancisco.query(query).copy()
                id_M = np.array(data[data['subject_sex']=='male'].id)
                id_W = np.array(data[data['subject_sex']=='female'].id)
                if ((len(id_M)>0) & (len(id_W)>0)):
                    if (len(id_M)>len(id_W)):
                        for i in np.arange(0,len(id_W)):
                            matching = matching.append({'men':id_M[i],'women':id_W[i]},ignore_index=True)
                          
                    if (len(id_M)<len(id_W)):
                        for i in np.arange(0,len(id_M)):
                            matching = matching.append({'men':id_M[i],'women':id_W[i]},ignore_index=True)
matching.head(5) 

Unnamed: 0,men,women
0,33434.0,33806.0
1,33523.0,34526.0
2,34336.0,35148.0
3,34825.0,36784.0
4,36443.0,41237.0


In [43]:
# Averaged outcome difference
N_matches = len(matching)
diff = np.zeros(N_matches)
for i in np.arange(0,N_matches):
    outcome_man = np.array(df_sanfrancisco[df_sanfrancisco['id']==matching.loc[i,'men']].outcome)
    outcome_woman = np.array(df_sanfrancisco[df_sanfrancisco['id']==matching.loc[i,'women']].outcome)
    diff[i] = outcome_man-outcome_woman                    
AOD = diff.sum()/N_matches 

In [44]:
# Confidence interval

CI = bootstrap_CI(diff, 1000)

print('Averaged outcome difference %.4f with [%.4f, %.4f] 95%% confidence intervals' %(AOD, CI[0], CI[1]))


Averaged outcome difference 0.0207 with [0.0034, 0.0400] 95% confidence intervals


## CN Greensboro

In [46]:
df_greensboro = df_greensboro.rename(columns={'Unnamed: 0':'id'})
# Age categorie (proxy for attractiveness)
df_greensboro['attractiveness'] = df_greensboro.apply(lambda x: True if (x.subject_age<46) else False, axis=1)
# Outcome value: arrest, citation = 1 and warning = 0
df_greensboro['outcome'] = df_greensboro.apply(lambda x: 1 if (x.arrest_made or x.citation_issued) else 0, axis=1)
df_greensboro.head(2)

Unnamed: 0,id,subject_age,subject_race,subject_sex,arrest_made,citation_issued,warning_issued,contraband_found,frisk_performed,search_person,search_vehicle,reason_for_frisk,reason_for_search,reason_for_stop,attractiveness,outcome
0,21,23.0,white,female,False,False,False,False,False,True,True,,Observation of Suspected Contraband,Speed Limit Violation,True,0
1,26,22.0,white,female,True,False,False,False,False,True,True,,Erratic/Suspicious Behavior,Speed Limit Violation,True,1


In [48]:
# Possible reasons for stop
reason_stops = list(Counter(df_greensboro['reason_for_stop']).keys())
Counter(df_greensboro['reason_for_stop'])

Counter({'Speed Limit Violation': 4661,
         'Driving While Impaired': 1197,
         'Stop Light/Sign Violation': 1741,
         'Safe Movement Violation': 3753,
         'Vehicle Regulatory Violation': 6929,
         'Investigation': 4805,
         'Other Motor Vehicle Violation': 1769,
         'Vehicle Equipment Violation': 4633,
         'Seat Belt Violation': 2354,
         'Checkpoint': 93})

In [49]:
matching=pd.DataFrame({'men':[],'women':[]})
AOD = 0
# Match 1 woman and 1 man with same attributes
for reason in reason_stops:
    for race in ('white','black','hispanic'):
        for attractiveness in ('True', 'False'):
            for contraband in ('True', 'False'):
                query = "reason_for_stop=='"+reason+"' & subject_race=='"+race+"' & attractiveness=="+attractiveness+" & contraband_found=="+contraband
                data = df_greensboro.query(query).copy()
                id_M = np.array(data[data['subject_sex']=='male'].id)
                id_W = np.array(data[data['subject_sex']=='female'].id)
                if ((len(id_M)>0) & (len(id_W)>0)):
                    if (len(id_M)>len(id_W)):
                        for i in np.arange(0,len(id_W)):
                            matching = matching.append({'men':id_M[i],'women':id_W[i]},ignore_index=True)
                          
                    if (len(id_M)<len(id_W)):
                        for i in np.arange(0,len(id_M)):
                            matching = matching.append({'men':id_M[i],'women':id_W[i]},ignore_index=True)
matching.head(5)

Unnamed: 0,men,women
0,105.0,372.0
1,599.0,6598.0
2,926.0,7871.0
3,1566.0,21307.0
4,4045.0,21968.0


In [50]:
# Averaged outcome difference
N_matches = len(matching)
diff = np.zeros(N_matches)
for i in np.arange(0,N_matches):
    outcome_man = np.array(df_greensboro[df_greensboro['id']==matching.loc[i,'men']].outcome)
    outcome_woman = np.array(df_greensboro[df_greensboro['id']==matching.loc[i,'women']].outcome)
    diff[i] = outcome_man-outcome_woman                    
AOD = diff.sum()/N_matches 

In [51]:
# Confidence interval

CI = bootstrap_CI(diff, 1000)

print('Averaged outcome difference %.4f with [%.4f, %.4f] 95%% confidence intervals' %(AOD, CI[0], CI[1]))


Averaged outcome difference 0.0433 with [0.0317, 0.0546] 95% confidence intervals


## CN Raleigh

In [52]:
df_raleigh = df_raleigh.rename(columns={'Unnamed: 0':'id'})
# Age categorie (proxy for attractiveness)
df_raleigh['attractiveness'] = df_raleigh.apply(lambda x: True if (x.subject_age<46) else False, axis=1)
# Outcome value: arrest, citation = 1 and warning = 0
df_raleigh['outcome'] = df_raleigh.apply(lambda x: 1 if (x.arrest_made or x.citation_issued) else 0, axis=1)
df_raleigh.head(2)

Unnamed: 0,id,subject_age,subject_race,subject_sex,arrest_made,citation_issued,warning_issued,contraband_found,frisk_performed,search_person,search_vehicle,reason_for_frisk,reason_for_search,reason_for_stop,attractiveness,outcome
0,17,21,hispanic,male,True,False,False,False,False,True,True,,Erratic/Suspicious Behavior,Driving While Impaired,True,1
1,40,34,hispanic,male,True,False,False,False,False,True,True,,Observation of Suspected Contraband,Driving While Impaired,True,1


In [53]:
# Possible reasons for stop
reason_stops = list(Counter(df_greensboro['reason_for_stop']).keys())
Counter(df_greensboro['reason_for_stop'])

Counter({'Speed Limit Violation': 4661,
         'Driving While Impaired': 1197,
         'Stop Light/Sign Violation': 1741,
         'Safe Movement Violation': 3753,
         'Vehicle Regulatory Violation': 6929,
         'Investigation': 4805,
         'Other Motor Vehicle Violation': 1769,
         'Vehicle Equipment Violation': 4633,
         'Seat Belt Violation': 2354,
         'Checkpoint': 93})

In [54]:
matching=pd.DataFrame({'men':[],'women':[]})
AOD = 0
# Match 1 woman and 1 man with same attributes
for reason in reason_stops:
    for race in ('white','black','hispanic'):
        for attractiveness in ('True', 'False'):
            for contraband in ('True', 'False'):
                query = "reason_for_stop=='"+reason+"' & subject_race=='"+race+"' & attractiveness=="+attractiveness+" & contraband_found=="+contraband
                data = df_raleigh.query(query).copy()
                id_M = np.array(data[data['subject_sex']=='male'].id)
                id_W = np.array(data[data['subject_sex']=='female'].id)
                if ((len(id_M)>0) & (len(id_W)>0)):
                    if (len(id_M)>len(id_W)):
                        for i in np.arange(0,len(id_W)):
                            matching = matching.append({'men':id_M[i],'women':id_W[i]},ignore_index=True)
                          
                    if (len(id_M)<len(id_W)):
                        for i in np.arange(0,len(id_M)):
                            matching = matching.append({'men':id_M[i],'women':id_W[i]},ignore_index=True)
matching.head(5)

Unnamed: 0,men,women
0,1195.0,13898.0
1,1489.0,19444.0
2,2213.0,106859.0
3,2237.0,107943.0
4,2654.0,144535.0


In [58]:
# Averaged outcome difference
N_matches = len(matching)
diff = np.zeros(N_matches)
for i in np.arange(0,N_matches):
    outcome_man = np.array(df_raleigh[df_raleigh['id']==matching.loc[i,'men']].outcome)
    outcome_woman = np.array(df_raleigh[df_raleigh['id']==matching.loc[i,'women']].outcome)
    diff[i] = outcome_man-outcome_woman                    
AOD = diff.sum()/N_matches 

In [59]:
# Confidence interval

CI = bootstrap_CI(diff, 1000)

print('Averaged outcome difference %.4f with [%.4f, %.4f] 95%% confidence intervals' %(AOD, CI[0], CI[1]))


Averaged outcome difference 0.0775 with [0.0685, 0.0872] 95% confidence intervals
