In [55]:
import pandas as pd
import json
import pickle
import joblib
import numpy as np
from scipy.stats import t
from sklearn.base import BaseEstimator, TransformerMixin


# Functions

In [None]:
def get_significance_unfair(df,original_full_df):
    curr_depart = original_full_df.iloc[df.index].index.get_level_values(0)
    #print(curr_depart)
    group = original_full_df.loc[curr_depart].sort_values(by="Mean")
    #print(group)
    #print(f"the max mean is {group.iloc[-1].ContrabandIndicator} and the curr mean is {df.ContrabandIndicator}")
    n1 = df.Count.item()
    n2 = group.iloc[-1].Count.item()
    degrees= n1+n2-2
    if (degrees <=0 or n1<15 or n2<15): #if the sample size is not big enough, the significance will be zero
        return 0
    else:
        diff_means = group.iloc[-1].Mean  - df.Mean 
        sigma = np.sqrt(df.Var/n1 + group.iloc[-1].Var/n2)
        if(sigma.item() == 0):
            return 0
        t_statistic = diff_means/sigma
        probability_of_discrimination = 1-float(1-t.cdf(t_statistic,df=degrees))*2
        if(probability_of_discrimination>0.1):
            return diff_means.item()
        else:
            return 0

In [81]:
class control_discrimination(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.t_score_race_train = pd.read_csv('t_score_race_train.csv').set_index(['Department Name','Race_Ethnicity'])
        self.t_score_sex_train = pd.read_csv('t_score_sex_train.csv').set_index(['Department Name','SubjectSexCode'])
    def fit(self):

        return self
    def transform(self,df,y=None):
        df_ = df.copy()
        df_['Race_Ethnicity'] = df_.apply(self.four_races,axis=1)
        df_ = df_.apply(self.get_tscore,axis=1)
        df_['AdjustedContrabandIndicator'] = df_.apply(self.invert_if_discriminated,axis=1,args=['diff_means_race'])
        df_['AdjustedContrabandIndicator'] *= df_.apply(self.invert_if_discriminated,axis=1,args=['diff_means_sex'])
        
        return df_


    def get_tscore(self,df):
        t_score_race_train=self.t_score_race_train
        t_score_sex_train=self.t_score_sex_train
        try:       
            t_score_race = t_score_race_train.loc[df['Department Name'],df['Race_Ethnicity']].item()
        except:
            t_score_race = 0
        try:       
            t_score_sex = t_score_sex_train.loc[df['Department Name'],df['SubjectSexCode']].item()
        except:
            t_score_sex = 0
        df['diff_means_race']=t_score_race
        df['diff_means_sex']=t_score_sex

        return df
    def four_races(self,df):
        if(df.SubjectEthnicityCode=='H'):
            return 'H'
        elif(df.SubjectRaceCode=='W'):
            return 'W'
        elif(df.SubjectRaceCode=='B'):
            return('B')

        else:
            return 'O'
    def invert_if_discriminated(self,df,column_diff_means):
        diff = df[column_diff_means]
        if(df.ContrabandPredicted==0):
            return 0
        try:
            if(df.AdjustedContrabandIndicator==0):
                return 0
        except:
            pass
        
        result = (df['PredictedProbas']>k+(diff*1/0.7))*1
        if(result == 0):
            print("removed a ",df['Department Name'])
        return result

           

### In order to find the discriminated departments, we made a t-student test on each class in each department, according to the technical details in the report. The following function makes a t-student test on each class in each department in comparison to the less discriminated class in the same department.

In [7]:
with open('columns.json') as fh:
    columns = json.load(fh)

model = joblib.load('modelv2.pickle')

with open('dtypes.pickle', 'rb') as fh:
    dtypes = pickle.load(fh)

In [None]:
def four_races(df):
    if(df.SubjectEthnicityCode=='H'):
        return 'H'
    elif(df.SubjectRaceCode=='W'):
        return 'W'
    elif(df.SubjectRaceCode=='B'):
        return('B')

    else:
        return 'O'

In [8]:
data = pd.read_csv("data/train.csv")
data = data[data.VehicleSearchedIndicator]

data['InterventionDateTime']= pd.to_datetime(data['InterventionDateTime'])
data_train = data[(data['InterventionDateTime']<'2018') ]
data_test = data[data['InterventionDateTime']>'2018']

X_train,y_train = data_train.drop('ContrabandIndicator',axis=1),data_train.ContrabandIndicator*1
X_test,y_test = data_test.drop('ContrabandIndicator',axis=1),data_test.ContrabandIndicator*1
data.head()

Unnamed: 0,VehicleSearchedIndicator,ContrabandIndicator,Department Name,InterventionDateTime,InterventionLocationName,InterventionReasonCode,ReportingOfficerIdentificationID,ResidentIndicator,SearchAuthorizationCode,StatuteReason,SubjectAge,SubjectEthnicityCode,SubjectRaceCode,SubjectSexCode,TownResidentIndicator
71,True,False,Bridgeport,2013-10-01 00:46:00,Bridgeport,V,1207,True,I,Speed Related,37.0,H,W,M,True
143,True,True,Milford,2013-10-01 01:50:00,MILFORD,E,2325,True,I,Defective Lights,30.0,N,W,M,True
184,True,False,Torrington,2013-10-01 03:49:00,Torrington,V,DACYR048,True,C,Registration,43.0,N,W,M,True
203,True,False,State Police,2013-10-01 05:30:00,TOLLAND,V,1000002715,False,C,Other,19.0,N,B,M,False
212,True,False,Greenwich,2013-10-01 05:47:00,RIVERSIDE,V,110993,False,N,Speed Related,34.0,M,B,M,False


In [None]:
data['Race_Ethnicity'] = data.apply(four_races,axis=1)

# On the original dataset

In [100]:
#class_to_study = 'Race_Ethnicity'
class_to_study = 'SubjectSexCode'

In [101]:
depart_race_mean_stats = data[['Department Name',class_to_study,'ContrabandIndicator']].groupby(['Department Name',class_to_study]).agg(Count=('ContrabandIndicator', 'count'), Mean=('ContrabandIndicator', 'mean'),Var=('ContrabandIndicator','var'),Nr_Caught=('ContrabandIndicator','sum'))
depart_race_mean_stats = depart_race_mean_stats.fillna(0)
#calculate t-score
depart_race_t_test = depart_race_mean_stats.reset_index().groupby(['Department Name',class_to_study]).apply(get_significance_unfair,depart_race_mean_stats)
#add to previous stats
depart_race_t_test = pd.DataFrame(depart_race_t_test).rename(columns={0:"t_score"}) 
depart_race_t_test = depart_race_mean_stats.join(depart_race_t_test)
problematic_departs=depart_race_t_test[depart_race_t_test.t_score>0.1].index.get_level_values(0)
print(len(problematic_departs))
for c in problematic_departs:
    print(c,end=', ')

pd.options.display.max_rows = 999
depart_race_t_test.loc[problematic_departs].t_score.unstack(1).fillna(0)

  import sys
  from ipykernel import kernelapp as app


16
Bristol, CSP Troop F, Canton, East Hampton, East Windsor, Groton City, Hartford, Madison, Monroe, Plymouth, Putnam, Ridgefield, Thomaston, Windsor, Winsted, Yale, 

SubjectSexCode,F,M
Department Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Bristol,0.119329,0.0
CSP Troop F,0.0,0.101009
Canton,0.140827,0.0
East Hampton,0.14016,0.0
East Windsor,0.204261,0.0
Groton City,0.0,0.135618
Hartford,0.0,0.102086
Madison,0.0,0.310102
Monroe,0.105988,0.0
Plymouth,0.17705,0.0


# Learning the patterns of discrimination of the model in the training set

In [102]:
train_preds = model.predict_proba(X_train)[:,1]
k=0.3939

In [103]:
data_train['Race_Ethnicity'] = data_train.apply(four_races,axis=1)    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [104]:
results = data_train.join(pd.DataFrame(zip(train_preds,(train_preds>k)*1),columns=['PredictedProbas','ContrabandPredicted'],index= X_train.index )) 
results = results[results.ContrabandPredicted==1]
results.head(3)

Unnamed: 0,VehicleSearchedIndicator,ContrabandIndicator,Department Name,InterventionDateTime,InterventionLocationName,InterventionReasonCode,ReportingOfficerIdentificationID,ResidentIndicator,SearchAuthorizationCode,StatuteReason,SubjectAge,SubjectEthnicityCode,SubjectRaceCode,SubjectSexCode,TownResidentIndicator,Race_Ethnicity,PredictedProbas,ContrabandPredicted
203,True,False,State Police,2013-10-01 05:30:00,TOLLAND,V,1000002715,False,C,Other,19.0,N,B,M,False,B,0.405411,1
252,True,True,State Police,2013-10-01 06:36:00,HAMPTON,V,1000002605,True,O,Other,22.0,N,B,M,False,B,0.729149,1
708,True,False,Naugatuck,2013-10-01 12:40:00,Naugatuck,V,PL22,True,O,Display of Plates,29.0,N,B,M,False,B,0.614147,1


In [105]:
class_to_study = 'Race_Ethnicity'
#class_to_study = 'SubjectSexCode'

In [106]:
depart_race_mean_stats = results[['Department Name',class_to_study,'ContrabandIndicator']].groupby(['Department Name',class_to_study]).agg(Count=('ContrabandIndicator', 'count'), Mean=('ContrabandIndicator', 'mean'),Var=('ContrabandIndicator','var'),Nr_Caught=('ContrabandIndicator','sum'))
depart_race_mean_stats = depart_race_mean_stats.fillna(0)

### Calculate t-score and save a file with the learnt differences

In [73]:
#calculate t-score
depart_race_t_test = depart_race_mean_stats.reset_index().groupby(['Department Name',class_to_study]).apply(get_significance_unfair,depart_race_mean_stats)
#add to previous stats
depart_race_t_test = pd.DataFrame(depart_race_t_test).rename(columns={0:"t_score"}) 
depart_race_t_test = depart_race_mean_stats.join(depart_race_t_test)
#depart_race_t_test.loc[depart_race_t_test.index.get_level_values(0)]
depart_race_t_test.loc[depart_race_t_test[depart_race_t_test.t_score>0.1].index.get_level_values(0)][['t_score']].to_csv('t_score_'+['race' if class_to_study == 'Race_Ethnicity' else 'sex'][0]+'_train.csv')


  import sys
  from ipykernel import kernelapp as app


# On the results of the model applied to the test set

In [86]:
preds = model.predict_proba(X_test)[:,1]
y_pred = (preds>k)*1

In [87]:
results = data_test.join(pd.DataFrame(zip(preds,y_pred),columns=['PredictedProbas','ContrabandPredicted'],index= X_test.index )) 
results['Race_Ethnicity'] = results.apply(four_races,axis=1)
results = results[results.ContrabandPredicted==1]

In [88]:
depart_race_mean_stats = results[['Department Name',class_to_study,'ContrabandIndicator']].groupby(['Department Name',class_to_study]).agg(Count=('ContrabandIndicator', 'count'), Mean=('ContrabandIndicator', 'mean'),Var=('ContrabandIndicator','var'),Nr_Caught=('ContrabandIndicator','sum'))
depart_race_mean_stats = depart_race_mean_stats.fillna(0)

In [89]:
#calculate t-score
depart_race_t_test = depart_race_mean_stats.reset_index().groupby(['Department Name',class_to_study]).apply(get_significance_unfair,depart_race_mean_stats)
#add to previous stats
depart_race_t_test = pd.DataFrame(depart_race_t_test).rename(columns={0:"t_score"}) 
depart_race_t_test = depart_race_mean_stats.join(depart_race_t_test)
problematic_deps = depart_race_t_test[depart_race_t_test.t_score>0.1].index.get_level_values(0)
print(problematic_deps)
print(len(problematic_deps))
depart_race_t_test.loc[problematic_deps]

  import sys
  from ipykernel import kernelapp as app


Index(['CSP Troop E', 'CSP Troop G', 'CSP Troop K', 'Enfield', 'Fairfield',
       'Glastonbury', 'Groton Town', 'Middletown', 'New Britain', 'Norwich',
       'Putnam', 'Southington', 'Stratford', 'Trumbull', 'Wallingford',
       'West Hartford', 'Westport', 'Yale'],
      dtype='object', name='Department Name')
18


Unnamed: 0_level_0,Unnamed: 1_level_0,Count,Mean,Var,Nr_Caught,t_score
Department Name,SubjectSexCode,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CSP Troop E,F,16,0.375,0.25,6.0,0.125
CSP Troop E,M,72,0.5,0.253521,36.0,0.0
CSP Troop G,F,18,0.222222,0.183007,4.0,0.240741
CSP Troop G,M,108,0.462963,0.250952,50.0,0.0
CSP Troop K,F,18,0.555556,0.261438,10.0,0.0
CSP Troop K,M,66,0.454545,0.251748,30.0,0.10101
Enfield,F,48,0.541667,0.253546,26.0,0.163251
Enfield,M,122,0.704918,0.209728,86.0,0.0
Fairfield,F,20,0.5,0.263158,10.0,0.155172
Fairfield,M,116,0.655172,0.227886,76.0,0.0


# On the adjusted results of the model applied to the test set

In [90]:
ad = control_discrimination()
results = ad.transform(results)



removed a  Groton City
removed a  Groton City
removed a  Naugatuck
removed a  Naugatuck
removed a  Westport
removed a  Westport
removed a  Greenwich
removed a  Greenwich
removed a  Wallingford
removed a  Wallingford
removed a  Westport
removed a  Westport
removed a  Wethersfield
removed a  Wethersfield
removed a  Hartford
removed a  Hartford
removed a  Wethersfield
removed a  Wethersfield
removed a  Rocky Hill
removed a  Rocky Hill
removed a  Glastonbury
removed a  Glastonbury
removed a  East Hartford
removed a  East Hartford
removed a  Glastonbury
removed a  Glastonbury
removed a  Westport
removed a  Westport
removed a  East Hartford
removed a  East Hartford
removed a  East Hartford
removed a  East Hartford
removed a  East Hartford
removed a  East Hartford
removed a  East Hartford
removed a  East Hartford
removed a  Westport
removed a  Westport
removed a  Groton City
removed a  Groton City
removed a  Groton City
removed a  Groton City
removed a  Greenwich
removed a  Greenwich
removed 

removed a  Willimantic
removed a  Willimantic
removed a  CSP Troop E
removed a  CSP Troop E
removed a  Westport
removed a  Westport
removed a  CSP Troop F
removed a  CSP Troop F
removed a  Westport
removed a  Westport
removed a  Greenwich
removed a  Greenwich
removed a  Wethersfield
removed a  Wethersfield
removed a  Wethersfield
removed a  Wethersfield
removed a  CSP Troop D
removed a  CSP Troop D
removed a  Old Saybrook
removed a  Old Saybrook
removed a  Wethersfield
removed a  Wethersfield
removed a  Groton City
removed a  Groton City
removed a  East Hartford
removed a  East Hartford
removed a  CSP Troop F
removed a  CSP Troop F
removed a  Old Saybrook
removed a  Old Saybrook
removed a  Westport
removed a  Westport
removed a  Wallingford
removed a  Wallingford
removed a  Hartford
removed a  Hartford
removed a  Wallingford
removed a  Wallingford
removed a  CSP Troop E
removed a  CSP Troop E
removed a  Plainville
removed a  Plainville
removed a  Wallingford
removed a  Wallingford
remo

In [91]:
adjusted_results = results[results.AdjustedContrabandIndicator==1]
len(adjusted_results)

4746

In [92]:
depart_race_mean_stats_adjusted = adjusted_results[['Department Name',class_to_study,'ContrabandIndicator']].groupby(['Department Name',class_to_study]).agg(Count=('ContrabandIndicator', 'count'), Mean=('ContrabandIndicator', 'mean'),Var=('ContrabandIndicator','var'),Nr_Caught=('ContrabandIndicator','sum'))
depart_race_mean_stats_adjusted = depart_race_mean_stats_adjusted.fillna(0)

#calculate t-score
depart_race_t_test_adjusted = depart_race_mean_stats_adjusted.reset_index().groupby(['Department Name',class_to_study]).apply(get_significance_unfair,depart_race_mean_stats_adjusted)
#add to previous stats
depart_race_t_test_adjusted = pd.DataFrame(depart_race_t_test_adjusted).rename(columns={0:"t_score"}) 
depart_race_t_test_adjusted = depart_race_mean_stats_adjusted.join(depart_race_t_test_adjusted)
problematic_deps = depart_race_t_test_adjusted[depart_race_t_test_adjusted.t_score>0.1].index.get_level_values(0)
print(problematic_deps)
print(len(problematic_deps))
depart_race_t_test_adjusted.loc[depart_race_t_test_adjusted[depart_race_t_test_adjusted.t_score>0.1].index.get_level_values(0)]

  import sys
  from ipykernel import kernelapp as app


Index(['CSP Troop G', 'CSP Troop K', 'Enfield', 'Fairfield', 'Groton Town',
       'Middletown', 'New Britain', 'Norwich', 'Putnam', 'Southington',
       'Stratford', 'Trumbull', 'Wallingford', 'West Hartford'],
      dtype='object', name='Department Name')
14


Unnamed: 0_level_0,Unnamed: 1_level_0,Count,Mean,Var,Nr_Caught,t_score
Department Name,SubjectSexCode,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CSP Troop G,F,18,0.222222,0.183007,4.0,0.240741
CSP Troop G,M,108,0.462963,0.250952,50.0,0.0
CSP Troop K,F,18,0.555556,0.261438,10.0,0.0
CSP Troop K,M,66,0.454545,0.251748,30.0,0.10101
Enfield,F,48,0.541667,0.253546,26.0,0.163251
Enfield,M,122,0.704918,0.209728,86.0,0.0
Fairfield,F,20,0.5,0.263158,10.0,0.155172
Fairfield,M,116,0.655172,0.227886,76.0,0.0
Groton Town,F,34,0.411765,0.249554,14.0,0.117647
Groton Town,M,68,0.529412,0.252853,36.0,0.0
