In [None]:
!pip install aequitas

import yaml
import os
import pandas as pd
import numpy as np
import seaborn as sns
from aequitas.group import Group
from aequitas.bias import Bias
from aequitas.fairness import Fairness
#import aequitas.plot as ap
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
# Base URL that every data file name below is appended to.
DATAPATH = 'https://github.com/dssg/fairness_tutorial/raw/master/data/'

# Switch on seaborn's default plot styling for the rest of the notebook.
sns.set()

In [None]:
# Split 1: the training matrix (fitted on below), the test matrix (scored
# below) and the protected-attribute table for the same test period.
split1_traindf = pd.read_csv(
    DATAPATH + 'train_20111101_20120201.csv.gz',
    compression='gzip',
)
split1_testdf = pd.read_csv(
    DATAPATH + 'test_20120601_20120801.csv.gz',
    compression='gzip',
)
split1_attrdf = pd.read_csv(
    DATAPATH + 'test_20120601_20120801_protected.csv.gz',
    compression='gzip',
)


In [None]:
split1_traindf.head()

In [None]:
evals_df = pd.read_csv(DATAPATH + 'split2_evals.csv.gz', compression='gzip')

In [None]:
evals_df[evals_df['model_uuid']=='a04e2eedd9c5ff18bcf77e84ae9db561']

In [None]:
import ast
hyperparameters= ast.literal_eval(evals_df['hyperparameters'][0])

In [None]:
hyperparameters

In [None]:
rf = RandomForestClassifier(**hyperparameters)

In [None]:
rf.__dict__

In [None]:
split1_traindf.shape

In [None]:
y_train = split1_traindf['quickstart_label'].values

In [None]:
y_train

In [None]:
rf.fit(split1_traindf.drop(['entity_id','as_of_date','quickstart_label'], axis = 1), y_train)

In [None]:
y_pred = rf.predict_proba(split1_testdf.drop(['entity_id','as_of_date','quickstart_label'], axis = 1))[:,1]

In [None]:
split1_preds = split1_testdf[['entity_id','as_of_date','quickstart_label']].copy()

In [None]:
split1_preds['predict_proba'] = y_pred

In [None]:
split1_preds.head()

In [None]:
split1_attrdf.head(10)

In [None]:
# Attach the protected attributes to the split-1 predictions via their shared
# key columns. Only `on` names the join keys: pandas treats `on` and
# left_index/right_index as mutually exclusive and raises MergeError when they
# are combined. The result gets a fresh, unique RangeIndex.
df = pd.merge(
    split1_preds,
    split1_attrdf,
    how='left',
    on=['entity_id', 'as_of_date'],
    sort=True,
)

In [None]:
df = df.sort_values('predict_proba', ascending=False)
poverty_highest_df = df[df['poverty_level']=='highest'].copy()
poverty_lower_df = df[df['poverty_level']=='lower'].copy()

In [None]:
poverty_highest_df.head()

In [None]:
poverty_lower_df.head()

In [None]:
# For each group, record the running share of that group's labels captured
# while walking down the frame in its current row order (running label sum
# divided by the group's label total).
for group_df in (poverty_highest_df, poverty_lower_df):
    group_labels = group_df['quickstart_label']
    group_df['cumsum_recall'] = group_labels.cumsum() / group_labels.sum()

# Stack both groups and order the rows by that within-group cumulative recall.
both_groups = pd.concat([poverty_highest_df, poverty_lower_df], axis=0)
recall_df = both_groups.sort_values('cumsum_recall', ascending=True)

In [None]:
new_pp = recall_df.head(1000).copy()
new_pp.tail()

In [None]:
new_pp['poverty_level'].value_counts()

Now that we have calculated the number of predicted positives for each group, we can apply these counts to the most recent split.

In [None]:
# Split 2: a stored predictions file and the protected-attribute table that
# is joined to it below.
split2_preds = pd.read_csv(
    DATAPATH + 'predictions_c598fbe93f4c218ac7d325fb478598f1.csv.gz',
    compression='gzip',
)
split2_attrdf = pd.read_csv(
    DATAPATH + 'test_20121201_20130201_protected.csv.gz',
    compression='gzip',
)


In [None]:
# Attach the protected attributes to the split-2 predictions via their shared
# key columns. Only `on` names the join keys: pandas treats `on` and
# left_index/right_index as mutually exclusive and raises MergeError when they
# are combined. The result gets a fresh, unique RangeIndex, which the later
# index-based selection of predicted positives relies on.
df2 = pd.merge(
    split2_preds,
    split2_attrdf,
    how='left',
    on=['entity_id', 'as_of_date'],
    sort=True,
)

In [None]:
df2.head()

In [None]:
df2 = df2.sort_values('predict_proba', ascending=False)
poverty_highest_df2 = df2[df2['poverty_level']=='highest'].copy()
poverty_lower_df2 = df2[df2['poverty_level']=='lower'].copy()

In [None]:
# Per-group list sizes are taken from the split-1 selection (`new_pp`) instead
# of the hard-coded 554 / 446, so they cannot drift out of sync with the
# counts computed above when the model or data changes.
group_pp_counts = new_pp['poverty_level'].value_counts()
n_highest_pp = int(group_pp_counts.get('highest', 0))
n_lower_pp = int(group_pp_counts.get('lower', 0))

# Each group frame is already ordered by descending score, so head(n) keeps
# that group's n highest-scored rows.
poverty_df_highest_pp = poverty_highest_df2.head(n_highest_pp)
poverty_df_lower_pp = poverty_lower_df2.head(n_lower_pp)
new_pp2 = pd.concat([poverty_df_highest_pp, poverty_df_lower_pp], axis=0).sort_values('predict_proba', ascending=True)
new_pp2.tail()

In [None]:
new_pp2['quickstart_label'].sum() / len(new_pp2)

In [None]:
split2_preds.sort_values('predict_proba', ascending = False).head(1000)['quickstart_label'].sum() / 1000

In [None]:
new_pp2.shape

In [None]:
fixed_df2 = df2.copy()
# Binary decision column: 1.0 for rows whose index label was selected into
# new_pp2, 0.0 for every other row. Index.isin does the membership test in one
# vectorized pass instead of rebuilding and scanning a Python list per row.
fixed_df2['score'] = fixed_df2.index.isin(new_pp2.index).astype(float)

In [None]:
fixed_df2= fixed_df2.rename(columns = {'quickstart_label':'label_value'})

In [None]:
fixed_df2

In [None]:
g = Group()
xtab, _ = g.get_crosstabs(fixed_df2[['score','label_value','poverty_level','metro_type', 'teacher_sex']].copy())

In [None]:
absolute_metrics = g.list_absolute_metrics(xtab)

In [None]:
xtab[[col for col in xtab.columns if col not in absolute_metrics]]

In [None]:
xtab[['attribute_name', 'attribute_value'] + absolute_metrics]

In [None]:
b = Bias()
# `original_df` must be the frame the crosstab was built from (fixed_df2,
# which carries the `score` / `label_value` columns). The split-1 frame `df`
# has neither column and describes a different set of rows.
# Disparities are measured relative to the reference group named per attribute.
bdf = b.get_disparity_predefined_groups(
    xtab,
    original_df=fixed_df2,
    ref_groups_dict={'poverty_level':'lower', 'metro_type':'suburban_rural', 'teacher_sex':'male'},
)

In [None]:
metrics = ['tpr']

In [None]:
ap.disparities(bdf, metrics, 'poverty_level', fairness_threshold = 1.3)

In [None]:
ap.disparities(bdf, metrics, 'metro_type', fairness_threshold = 1.3)

In [None]:
ap.disparities(bdf, metrics, 'teacher_sex', fairness_threshold = 1.3)