In [1]:
!pip install aequitas

import ast
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import yaml
from sklearn.ensemble import RandomForestClassifier

import aequitas.plot as ap
from aequitas.bias import Bias
from aequitas.fairness import Fairness
from aequitas.group import Group
# Base URL that is prefixed to each data file name passed to pd.read_csv below.
DATAPATH = 'https://github.com/dssg/fairness_tutorial/raw/master/data/'

# Can resampling approaches help improve the fairness of our models?

1. Load data
2. Look at training data distributions
3. Try Resampling in a few different ways
4. Rebuild model(s) on resampled training data
5. Predict on the test set
6. Audit for Bias and Compare

## Load data

In [2]:
# Read the four gzipped CSVs in a fixed order: train features, test features,
# then the protected-attribute tables for train and test.
traindf, testdf, train_attrdf, test_attrdf = (
    pd.read_csv(DATAPATH + file_name, compression='gzip')
    for file_name in (
        'train_20120501_20120801.csv.gz',
        'test_20121201_20130201.csv.gz',
        'train_20120501_20120801_protected.csv.gz',
        'test_20121201_20130201_protected.csv.gz',
    )
)


In [3]:
# (rows, columns) of the training matrix.
traindf.shape

(16790, 113)

In [4]:
# Number of training rows in each poverty_level group of the protected-attribute table.
train_attrdf['poverty_level'].value_counts()

highest    9448
lower      7342
Name: poverty_level, dtype: int64

## Load pre-built models and predictions

In [5]:
# Table of pre-built model evaluations (one row per model/prediction set).
evals_df = pd.read_csv(f'{DATAPATH}split2_evals.csv.gz', compression='gzip')

## Take a look at the "Best" performing model

In [6]:
# Display the evaluation row(s) of the model selected as "best".
evals_df.loc[evals_df['model_uuid'].eq('a04e2eedd9c5ff18bcf77e84ae9db561')]

Unnamed: 0,model_precision,model_classpath,hyperparameters,model_uuid,predictions_uuid,target_pp,matrix_type,matrix_start_date,matrix_end_date
0,0.552,sklearn.ensemble.RandomForestClassifier,"{""n_jobs"": -1, ""criterion"": ""gini"", ""max_depth...",a04e2eedd9c5ff18bcf77e84ae9db561,c598fbe93f4c218ac7d325fb478598f1,1000,test,2012-12-01,2013-01-31


In [7]:
# Parse the stored hyperparameter string of the "best" model into a dict.
# The row is selected by model_uuid instead of by index label 0, so the result
# does not silently depend on the row order of evals_df.
BEST_MODEL_UUID = 'a04e2eedd9c5ff18bcf77e84ae9db561'
best_model_hyperparameters = evals_df.loc[
    evals_df['model_uuid'] == BEST_MODEL_UUID, 'hyperparameters'
]
if best_model_hyperparameters.empty:
    raise ValueError(f'model_uuid {BEST_MODEL_UUID} not found in evals_df')
# ast.literal_eval only evaluates Python literals; it does not execute code.
hyperparameters = ast.literal_eval(best_model_hyperparameters.iloc[0])

In [8]:
# Build an unfitted random forest from the `hyperparameters` dict (passed as keyword arguments).
rf = RandomForestClassifier(**hyperparameters)

In [9]:
# Show every attribute currently set on the classifier instance.
vars(rf)

{'base_estimator': DecisionTreeClassifier(),
 'n_estimators': 87,
 'estimator_params': ('criterion',
  'max_depth',
  'min_samples_split',
  'min_samples_leaf',
  'min_weight_fraction_leaf',
  'max_features',
  'max_leaf_nodes',
  'min_impurity_decrease',
  'min_impurity_split',
  'random_state',
  'ccp_alpha'),
 'bootstrap': True,
 'oob_score': False,
 'n_jobs': -1,
 'random_state': 213500298,
 'verbose': 0,
 'warm_start': False,
 'class_weight': None,
 'max_samples': None,
 'criterion': 'gini',
 'max_depth': 30,
 'min_samples_split': 3,
 'min_samples_leaf': 44,
 'min_weight_fraction_leaf': 0.0,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'ccp_alpha': 0.0}

## Look at training data distributions

### Poverty_level=Highest
label_pos_poverty_highest = training rows with poverty_level = highest and a positive label (not funded)

label_neg_poverty_highest = training rows with poverty_level = highest and a negative label (funded)


### Poverty_level=Lower
label_pos_poverty_lower = training rows with poverty_level = lower and a positive label (not funded)

label_neg_poverty_lower = training rows with poverty_level = lower and a negative label (funded)


In [10]:
# Split the highest-poverty training rows by label (positive: > 0, negative: < 1.0).
# The mask combines a train_attrdf column with a traindf column, so it relies
# on both frames sharing the same row labels.
label_pos_poverty_highest = traindf.loc[
    train_attrdf['poverty_level'].eq('highest') & traindf['quickstart_label'].gt(0)
]
label_neg_poverty_highest = traindf.loc[
    train_attrdf['poverty_level'].eq('highest') & traindf['quickstart_label'].lt(1.0)
]

In [11]:
# Split the lower-poverty training rows by label (positive: > 0, negative: < 1.0).
# The mask combines a train_attrdf column with a traindf column, so it relies
# on both frames sharing the same row labels.
label_pos_poverty_lower = traindf.loc[
    train_attrdf['poverty_level'].eq('lower') & traindf['quickstart_label'].gt(0)
]
label_neg_poverty_lower = traindf.loc[
    train_attrdf['poverty_level'].eq('lower') & traindf['quickstart_label'].lt(1.0)
]

In [12]:
# (rows, columns) of the highest-poverty rows with quickstart_label > 0.
label_pos_poverty_highest.shape

(3196, 113)

In [13]:
# (rows, columns) of the highest-poverty rows with quickstart_label < 1.0.
label_neg_poverty_highest.shape

(6252, 113)

In [14]:
# (rows, columns) of the lower-poverty rows with quickstart_label > 0.
label_pos_poverty_lower.shape

(3130, 113)

In [15]:
# (rows, columns) of the lower-poverty rows with quickstart_label < 1.0.
label_neg_poverty_lower.shape

(4212, 113)

In [16]:
# Share of highest-poverty training rows that carry a positive label.
print(
    'Highest Poverty training set prevalence:',
    len(label_pos_poverty_highest) / int(train_attrdf['poverty_level'].eq('highest').sum()),
)

Highest Poverty training set prevalence: 0.338272650296359


In [17]:
# Share of lower-poverty training rows that carry a positive label.
print(
    'Lower Poverty training set prevalence:',
    len(label_pos_poverty_lower) / int(train_attrdf['poverty_level'].eq('lower').sum()),
)

Lower Poverty training set prevalence: 0.42631435576137294


## What type of disparities do we see in the data distribution here?


1.

2.

3.

4.


















...







## Let's now try resampling

We can perform three types of resampling:

1. Change the training data such that the different poverty levels are represented more uniformly, while keeping the distribution of labels within each poverty level the same: P(poverty_level = highest) = P(poverty_level = lower)


2. Change the training data such that the different poverty levels have more uniform label distributions: P(not funded | poverty_level = highest) = P(not funded | poverty_level = lower)


3. Change both


### Approach 2: Change the training data such that different poverty levels have more uniform label distributions 

In [37]:
# Target number of positive / negative rows to draw from the highest-poverty group.
n_pos_highest, n_neg_highest = 3000, 5000
print('Highest new training prevalence:', n_pos_highest / (n_pos_highest + n_neg_highest))

Highest new training prevalence: 0.375


In [39]:
# Target number of positive / negative rows to draw from the lower-poverty group.
n_pos_lower, n_neg_lower = 2500, 4000
print('Lower new training prevalence:', n_pos_lower / (n_pos_lower + n_neg_lower))

Lower new training prevalence: 0.38461538461538464


In [40]:
# Draw the target number of rows from each (poverty level, label) subset
# without replacement. A fixed seed makes the resampled training set, and
# therefore the refit model and the audit below, reproducible across runs.
SAMPLE_SEED = 42

sample_pos_poverty_highest = label_pos_poverty_highest.sample(n=n_pos_highest, replace=False, random_state=SAMPLE_SEED)
sample_neg_poverty_highest = label_neg_poverty_highest.sample(n=n_neg_highest, replace=False, random_state=SAMPLE_SEED)

sample_pos_poverty_lower = label_pos_poverty_lower.sample(n=n_pos_lower, replace=False, random_state=SAMPLE_SEED)
sample_neg_poverty_lower = label_neg_poverty_lower.sample(n=n_neg_lower, replace=False, random_state=SAMPLE_SEED)

#### Rebuild model on resampled training data

In [41]:
# Stack the four resampled subsets into the new training set.
new_traindf = pd.concat(
    [
        sample_pos_poverty_highest,
        sample_neg_poverty_highest,
        sample_pos_poverty_lower,
        sample_neg_poverty_lower,
    ]
)
y_train = new_traindf['quickstart_label'].to_numpy()
# Fit on every column except the identifier, date and label columns.
rf.fit(new_traindf.drop(columns=['entity_id', 'as_of_date', 'quickstart_label']), y_train)

RandomForestClassifier(max_depth=30, max_features='sqrt', min_samples_leaf=44,
                       min_samples_split=3, n_estimators=87, n_jobs=-1,
                       random_state=213500298)

#### Predict on the test set and calculate precision at 1000

In [42]:
# Size of the ranked list that is flagged (score = 1.0) and evaluated.
TOP_K = 1000

# Second column of predict_proba is taken as the positive-class probability.
y_pred = rf.predict_proba(testdf.drop(columns=['entity_id', 'as_of_date', 'quickstart_label']))[:, 1]
new_preds = testdf[['entity_id', 'as_of_date', 'quickstart_label']].copy()
new_preds['predict_proba'] = y_pred
# After sorting and resetting the index, the index value is the rank (0 = highest probability).
new_preds = new_preds.sort_values('predict_proba', ascending=False).reset_index(drop=True)
# Vectorized top-k flag; replaces a row-by-row apply over the index.
new_preds['score'] = (new_preds.index < TOP_K).astype(float)
# Divide by the number of rows actually flagged so the value stays a
# precision even if the test set has fewer than TOP_K rows.
n_flagged = min(TOP_K, len(new_preds))
print('Model Precision: ', new_preds.loc[new_preds['score'] > 0, 'quickstart_label'].sum() / n_flagged)

Model Precision:  0.567


#### Audit for Bias  (keeping the attributes, reference groups, bias metric, and tolerance the same as before)

In [43]:
# Attach the protected attributes to the predictions by joining on the key
# columns only. Passing `on=` together with `left_index=True` is rejected by
# current pandas (MergeError), so the index arguments are dropped.
df = pd.merge(new_preds, test_attrdf, how='left', on=['entity_id', 'as_of_date'], sort=True)
# The label column is renamed to 'label_value' before it is passed to get_crosstabs.
df = df.rename(columns={'quickstart_label': 'label_value'})
metrics = ['tpr']
g = Group()
xtab, _ = g.get_crosstabs(df[['score', 'label_value', 'poverty_level', 'metro_type', 'teacher_sex']].copy())
b = Bias()
# Disparities are computed against one fixed reference group per attribute.
bdf = b.get_disparity_predefined_groups(xtab, original_df=df,
                                        ref_groups_dict={'poverty_level': 'lower', 'metro_type': 'suburban_rural', 'teacher_sex': 'male'})

model_id, score_thresholds 0 {'rank_abs': [1000]}


#### Look at disparities and compare to the original version

In [45]:
# Plot the `metrics` (['tpr']) disparities for the poverty_level attribute, passing 1.3 as fairness_threshold.
# NOTE(review): presumably measured against the reference group set in ref_groups_dict — confirm in the aequitas docs.
ap.disparities(bdf, metrics, 'poverty_level', fairness_threshold = 1.3)

get_disparity_predefined_group()


In [26]:
# Plot the `metrics` (['tpr']) disparities for the metro_type attribute, passing 1.3 as fairness_threshold.
ap.disparities(bdf, metrics, 'metro_type', fairness_threshold = 1.3)

In [27]:
# Plot the `metrics` (['tpr']) disparities for the teacher_sex attribute, passing 1.3 as fairness_threshold.
ap.disparities(bdf, metrics, 'teacher_sex', fairness_threshold = 1.3)

In [None]:
# Show the group identifiers next to the absolute metric columns reported by the Group object.
absolute_metrics = g.list_absolute_metrics(xtab)
xtab[['attribute_name', 'attribute_value', *absolute_metrics]]