In [None]:
!pip install aequitas

import yaml
import os
import pandas as pd
import numpy as np
import seaborn as sns
from aequitas.group import Group
from aequitas.bias import Bias
from aequitas.fairness import Fairness
import aequitas.plot as ap
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
# Base URL that the read_csv calls below prepend to each data file name.
DATAPATH = 'https://github.com/dssg/fairness_tutorial/raw/master/data/'

In [None]:
# Read the four gzip-compressed CSVs found under DATAPATH, in this order:
# train matrix, test matrix, then the matching "*_protected" file for each.
traindf, testdf, train_attrdf, test_attrdf = (
    pd.read_csv(DATAPATH + csv_name, compression='gzip')
    for csv_name in (
        'train_20120501_20120801.csv.gz',
        'test_20121201_20130201.csv.gz',
        'train_20120501_20120801_protected.csv.gz',
        'test_20121201_20130201_protected.csv.gz',
    )
)


In [None]:
# Display the (rows, columns) shape of the training frame.
traindf.shape

In [None]:
# Count how many training rows fall under each poverty_level value.
train_attrdf['poverty_level'].value_counts()

In [None]:
# Model-selection results are read from a directory under the current working
# directory.
# NOTE(review): unlike the train/test frames, this file is not fetched from
# DATAPATH -- confirm that it exists locally before running.
datadir = os.path.join(os.getcwd(), 'data/donors-choose/model_selection')
evals_csv_path = os.path.join(datadir, 'split2_evals.csv')
evals_df = pd.read_csv(evals_csv_path)

In [None]:
# Display the evaluation rows whose model_uuid matches this one specific model.
evals_df[evals_df['model_uuid']=='a04e2eedd9c5ff18bcf77e84ae9db561']

In [None]:
import ast

# The 'hyperparameters' column stores a Python literal as text; literal_eval
# parses it without executing arbitrary code.
# NOTE(review): this takes the row labelled 0, not the row for a particular
# model_uuid -- confirm that row 0 is the model intended here.
hyperparameters_text = evals_df.loc[0, 'hyperparameters']
hyperparameters = ast.literal_eval(hyperparameters_text)

In [None]:
# Build the random forest, passing the parsed hyperparameters as keyword
# arguments. The estimator is only constructed here; fitting happens later.
rf = RandomForestClassifier(**hyperparameters)

In [None]:
# Display the estimator's instance attributes (e.g. to check which
# hyperparameter values it was constructed with).
rf.__dict__

In [None]:
# Row masks for the 'highest' poverty group and for each label side.
# NOTE(review): the poverty mask comes from train_attrdf while the label masks
# and the selection use traindf -- this assumes both frames share the same row
# index; confirm.
in_highest_poverty = train_attrdf['poverty_level'] == 'highest'
highest_is_positive = traindf['quickstart_label'] > 0
highest_is_negative = traindf['quickstart_label'] < 1.0

label_pos_poverty_highest = traindf.loc[in_highest_poverty & highest_is_positive]
label_neg_poverty_highest = traindf.loc[in_highest_poverty & highest_is_negative]

In [None]:
# Row masks for the 'lower' poverty group and for each label side.
# NOTE(review): the poverty mask comes from train_attrdf while the label masks
# and the selection use traindf -- this assumes both frames share the same row
# index; confirm.
in_lower_poverty = train_attrdf['poverty_level'] == 'lower'
lower_is_positive = traindf['quickstart_label'] > 0
lower_is_negative = traindf['quickstart_label'] < 1.0

label_pos_poverty_lower = traindf.loc[in_lower_poverty & lower_is_positive]
label_neg_poverty_lower = traindf.loc[in_lower_poverty & lower_is_negative]

In [None]:
# Shape of the highest-poverty subset with quickstart_label > 0.
label_pos_poverty_highest.shape

In [None]:
# Shape of the highest-poverty subset with quickstart_label < 1.0.
label_neg_poverty_highest.shape

In [None]:
# Shape of the lower-poverty subset with quickstart_label > 0.
label_pos_poverty_lower.shape

In [None]:
# Shape of the lower-poverty subset with quickstart_label < 1.0.
label_neg_poverty_lower.shape

In [None]:
# Prevalence = positive-label rows / all rows tagged with the 'highest'
# poverty level in the training attribute frame.
highest_group_rows = train_attrdf[train_attrdf['poverty_level'] == 'highest']
highest_default_prevalence = len(label_pos_poverty_highest) / len(highest_group_rows)
print('Highest default training prevalence:', highest_default_prevalence)

In [None]:
# Prevalence = positive-label rows / all rows tagged with the 'lower'
# poverty level in the training attribute frame.
lower_group_rows = train_attrdf[train_attrdf['poverty_level'] == 'lower']
lower_default_prevalence = len(label_pos_poverty_lower) / len(lower_group_rows)
print('Lower default training prevalence:', lower_default_prevalence)

In [None]:
# Target number of positive / negative rows to draw for the highest-poverty
# group, and the label prevalence those counts imply.
n_pos_highest, n_neg_highest = 3000, 4000
n_total_highest = n_pos_highest + n_neg_highest
print('Highest new training prevalence:', n_pos_highest / n_total_highest)

In [None]:
# Target number of positive / negative rows to draw for the lower-poverty
# group, and the label prevalence those counts imply.
n_pos_lower, n_neg_lower = 3000, 4000
n_total_lower = n_pos_lower + n_neg_lower
print('Lower new training prevalence:', n_pos_lower / n_total_lower)

In [None]:
# Fixed seed so the resampled training subsets are identical on every
# Restart & Run All; without it each run draws different rows.
SAMPLE_SEED = 42

# Draw the requested number of rows, without replacement, from each
# (poverty level, label) subset. pandas raises ValueError if a subset holds
# fewer rows than the requested n.
sample_pos_poverty_highest = label_pos_poverty_highest.sample(n=n_pos_highest, replace=False, random_state=SAMPLE_SEED)
sample_neg_poverty_highest = label_neg_poverty_highest.sample(n=n_neg_highest, replace=False, random_state=SAMPLE_SEED)

sample_pos_poverty_lower = label_pos_poverty_lower.sample(n=n_pos_lower, replace=False, random_state=SAMPLE_SEED)
sample_neg_poverty_lower = label_neg_poverty_lower.sample(n=n_neg_lower, replace=False, random_state=SAMPLE_SEED)

In [None]:
# Stack the four sampled subsets into a single training frame.
sampled_subsets = [
    sample_pos_poverty_highest,
    sample_neg_poverty_highest,
    sample_pos_poverty_lower,
    sample_neg_poverty_lower,
]
new_traindf = pd.concat(sampled_subsets, axis=0)

# Target vector, then the feature matrix with the identifier and label
# columns removed.
y_train = new_traindf['quickstart_label'].values
train_features = new_traindf.drop(['entity_id','as_of_date','quickstart_label'], axis = 1)
rf.fit(train_features, y_train)

In [None]:
# Number of top-ranked test rows that are flagged with score 1.0.
TOP_K = 1000

# Probability of the second class (column 1 of predict_proba) for every test
# row, using the same non-feature columns dropped as at training time.
y_pred = rf.predict_proba(testdf.drop(['entity_id','as_of_date','quickstart_label'], axis = 1))[:,1]
new_preds = testdf[['entity_id','as_of_date','quickstart_label']].copy()
new_preds['predict_proba'] = y_pred

# Rank by predicted probability, highest first; after reset_index the index
# value of each row is its rank position (0 = highest probability).
new_preds = new_preds.sort_values('predict_proba', ascending = False).reset_index(drop=True)

# Vectorised top-k flag: 1.0 for the first TOP_K ranked rows, 0.0 otherwise.
new_preds['score'] = (new_preds.index < TOP_K).astype(float)

# Precision among the flagged rows. Dividing by the number of rows actually
# flagged keeps the value correct when the test set has fewer than TOP_K rows.
n_flagged = int(new_preds['score'].sum())
print('Model Precision: ', new_preds.loc[new_preds['score'] > 0, 'quickstart_label'].sum() / n_flagged)

In [None]:
# Attach the protected attributes to each prediction by joining on the
# (entity_id, as_of_date) key columns only. `on=` must not be combined with
# left_index/right_index: current pandas rejects that combination with a
# MergeError, so the index flags are dropped here.
df = pd.merge(new_preds, test_attrdf, how='left', on=['entity_id','as_of_date'], sort=True)

# Rename the label column to 'label_value' before handing the frame to aequitas.
df = df.rename(columns = {'quickstart_label':'label_value'})

# Compute the group crosstabs over score, label and the three attribute
# columns; the second return value is discarded.
g = Group()
xtab, _ = g.get_crosstabs(df[['score','label_value','poverty_level','metro_type', 'teacher_sex']].copy())

In [None]:
# Ask the Group object which absolute-metric columns are present in xtab, then
# display those columns next to the attribute name/value identifying each group.
absolute_metrics = g.list_absolute_metrics(xtab)
xtab[['attribute_name', 'attribute_value'] + absolute_metrics]

In [None]:
# Reference value for each attribute; passed to aequitas as ref_groups_dict.
reference_groups = {'poverty_level':'lower', 'metro_type':'suburban_rural', 'teacher_sex':'male'}

b = Bias()
bdf = b.get_disparity_predefined_groups(xtab, original_df=df, ref_groups_dict=reference_groups)

# Metrics to plot; the plotting cells that follow reuse this list.
metrics = ['tpr']
ap.disparities(bdf, metrics, 'poverty_level', fairness_threshold = 1.3)

In [None]:
# Plot the disparities for the metrics in `metrics` across metro_type groups,
# using a fairness threshold of 1.3.
ap.disparities(bdf, metrics, 'metro_type', fairness_threshold = 1.3)

In [None]:
# Plot the disparities for the metrics in `metrics` across teacher_sex groups,
# using a fairness threshold of 1.3.
ap.disparities(bdf, metrics, 'teacher_sex', fairness_threshold = 1.3)