In [61]:
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sbn

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot as plt

from balancers import PredictionBalancer
import tools

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline

from fairlearn.reductions import ExponentiatedGradient, BoundedGroupLoss, ZeroOneLoss, DemographicParity

from utils import *
from demv import DEMV
import warnings
warnings.filterwarnings('ignore')

sns.set_style('whitegrid')

In [62]:
def prepare_data():
  '''Load the data and apply some pre-processing steps'''
  
  data = pd.read_excel('data2/crime_data_normalized.xlsx', na_values='?')
  data.drop(['state', 'county', 'community', 'communityname',
            'fold', 'OtherPerCap'], axis=1, inplace=True)
  na_cols = data.isna().any()[data.isna().any() == True].index
  data.drop(na_cols, axis=1, inplace=True)
  data = (data - data.mean())/data.std()
  y_classes = np.quantile(data['ViolentCrimesPerPop'].values, [
                          0, 0.2, 0.4, 0.6, 0.8, 1])
  i = 0
  data['ViolentCrimesClass'] = data['ViolentCrimesPerPop']
  for cl in y_classes:
    data.loc[data['ViolentCrimesClass'] <= cl, 'ViolentCrimesClass'] = i*100
    i += 1
  data.drop('ViolentCrimesPerPop', axis=1, inplace=True)
  data['black_people'] = data['racepctblack'] > -0.45
  data['hisp_people'] = data['racePctHisp'] > -0.4
  data['black_people'] = data['black_people'].astype(int)
  data['hisp_people'] = data['hisp_people'].astype(int)
  data.drop('racepctblack', axis=1, inplace=True)
  data.drop('racePctHisp', axis=1, inplace=True)
  return data


In [63]:
data = prepare_data()

In [64]:
data

Unnamed: 0,population,householdsize,racePctWhite,racePctAsian,agePct12t21,agePct12t29,agePct16t24,agePct65up,numbUrban,pctUrban,...,PctSameHouse85,PctSameCity85,PctSameState85,LandArea,PopDens,PctUsePubTrans,LemasPctOfficDrugUn,ViolentCrimesClass,black_people,hisp_people
0,1.043350,-0.814793,0.599427,-0.161248,-0.542654,-0.166244,-0.277853,-0.575741,1.059815,0.682833,...,-0.193271,-0.580608,-0.058165,0.500363,0.133666,0.167274,0.940163,300.0,0,1
1,-0.453823,-1.853172,-0.056205,1.418626,-1.058133,0.669623,0.082497,-0.854782,-0.343627,0.682833,...,-1.075534,-0.131778,-0.663550,-0.413220,-0.555677,1.258716,-0.391349,500.0,1,1
2,-0.453823,-0.265063,-0.793790,0.078127,-0.220481,-0.166244,-0.337911,-0.575741,-0.499564,-1.565315,...,0.027294,0.217312,-0.461755,-0.504578,-0.112528,-0.618564,-0.391349,500.0,1,0
3,-0.138629,1.872776,-2.760686,-0.161248,0.552737,0.042723,0.022439,-1.189630,-0.031751,0.682833,...,1.074981,0.067702,-0.007717,-0.413220,0.773770,0.516536,-0.391349,300.0,1,1
4,-0.375024,0.528992,0.804312,-0.304873,-0.284915,-0.793144,-0.638204,-0.352509,-0.343627,0.458018,...,0.578708,-0.081908,-0.613101,-0.230503,-0.703394,-0.618564,-0.391349,100.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1989,-0.375024,-0.387225,0.476496,-0.161248,0.037258,0.112378,0.082497,-0.687357,-0.265658,0.682833,...,-1.075534,-0.730218,-1.319383,-0.504578,0.232143,-0.487591,-0.391349,200.0,1,1
1990,-0.059830,3.033317,-1.941146,3.237874,1.712562,2.550324,2.364717,-1.580287,-0.031751,0.682833,...,-2.012937,-2.625277,-3.286882,-0.413220,0.675293,0.167274,-0.391349,500.0,1,1
1991,0.806954,-0.570468,-0.261090,-0.544248,-0.478220,0.042723,-0.157736,0.652037,0.825908,0.682833,...,0.413284,0.815752,0.547219,0.134930,0.429099,0.079959,3.395139,400.0,1,1
1992,0.176566,0.284667,0.476496,0.317502,1.003780,1.714457,1.764133,-0.073469,0.202156,0.682833,...,0.027294,-0.181648,-0.663550,-0.321862,0.724532,0.734824,0.524066,300.0,0,1


The label to predict is ViolentCrimesClass, and the protected features are black_people and hisp_people.

In [65]:
label = 'ViolentCrimesClass'
groups_condition = {'black_people': 1, 'hisp_people': 1}
sensitive_features = ['black_people', 'hisp_people']
positive_label = 100

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(n_jobs=-1))
])

In [66]:
model, lr_bias, pred = cross_val(pipeline, data, label, groups_condition, sensitive_features, positive_label=positive_label)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [67]:
pred

[[100.0,
  200.0,
  300.0,
  100.0,
  100.0,
  100.0,
  100.0,
  300.0,
  300.0,
  500.0,
  500.0,
  500.0,
  400.0,
  500.0,
  500.0,
  100.0,
  400.0,
  400.0,
  400.0,
  300.0,
  100.0,
  100.0,
  200.0,
  100.0,
  100.0,
  200.0,
  200.0,
  500.0,
  400.0,
  100.0,
  100.0,
  100.0,
  300.0,
  200.0,
  500.0,
  100.0,
  400.0,
  100.0,
  500.0,
  100.0,
  200.0,
  100.0,
  500.0,
  300.0,
  300.0,
  300.0,
  400.0,
  300.0,
  100.0,
  400.0,
  300.0,
  100.0,
  200.0,
  100.0,
  100.0,
  500.0,
  100.0,
  100.0,
  400.0,
  300.0,
  500.0,
  300.0,
  200.0,
  500.0,
  500.0,
  500.0,
  500.0,
  200.0,
  400.0,
  100.0,
  500.0,
  200.0,
  400.0,
  100.0,
  300.0,
  200.0,
  100.0,
  400.0,
  500.0,
  400.0,
  200.0,
  400.0,
  400.0,
  100.0,
  200.0,
  100.0,
  300.0,
  100.0,
  300.0,
  100.0,
  400.0,
  500.0,
  500.0,
  100.0,
  100.0,
  100.0,
  400.0,
  300.0,
  400.0,
  300.0,
  400.0,
  100.0,
  500.0,
  100.0,
  100.0,
  100.0,
  300.0,
  500.0,
  400.0,
  500.0,
  500.0,
 

In [68]:
y = np.array(data.ViolentCrimesClass == 500.0, dtype=np.uint8)

In [69]:
y_ = []
for list in pred:
    for j in list:
        y_.append(int(j))

y_ = np.array(y_)        


In [70]:
y_ = np.array(y_ == 500.0 , dtype=np.uint8)

In [71]:
a = data['black_people']

In [72]:
stats =  tools.clf_metrics(y, y_)
stats

Unnamed: 0,tp,fp,tn,fn,sens,spec,ppv,npv,j,f1,mcc,brier,auc,ap,true_prev,pred_prev,prev_diff,rel_prev_diff
0,84.0,301.0,1307.0,302.0,0.2176,0.8128,0.2182,0.8123,0.0304,0.2179,0.030458,0.3024,0,0,386.0,385.0,-1.0,-0.0026


In [73]:
pb = PredictionBalancer(y=y, y_=y_, a=a)


Pre-adjustment group rates are 

 group    fpr    tpr
   0.0 0.1946 0.1698
   1.0 0.1758 0.2252

And loss is 0.3024



In [74]:
y_pred = pb.adjust(goal='odds', summary=True)


Post-adjustment group rates are 

 group  fpr  tpr
   0.0  0.0  0.0
   1.0  0.0  0.0

And loss is 0.1936



In [75]:
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

In [76]:
a = data['hisp_people']

In [77]:
pb = PredictionBalancer(y=y, y_=y_, a=a)


Pre-adjustment group rates are 

 group    fpr    tpr
   0.0 0.1899 0.2096
   1.0 0.1831 0.2237

And loss is 0.3024



In [78]:
pb.adjust(goal = 'odds', summary = False)

In [79]:
pb.summary(org = False)


Post-adjustment group rates are 

 group    fpr    tpr
   0.0 0.1899 0.2096
   1.0 0.1941 0.1918

And loss is 0.3094



In [80]:
a = data[['black_people','hisp_people']]

pb = PredictionBalancer(y=y, y_=y_, a=a)


Pre-adjustment group rates are 

 group    fpr    tpr
   0.0 0.1923 0.2000
   1.0 0.1795 0.2246

And loss is 0.3024



In [81]:
pb.adjust(goal = 'odds', summary = True)

ValueError: Unable to coerce to Series, length must be 2: given 1994