In [69]:
# Widen the notebook's main container so wide DataFrame outputs stay readable.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:75% !important; }</style>"))
import yaml
import os
import pandas as pd
import numpy as np
import seaborn as sns
from aequitas.group import Group
from aequitas.bias import Bias
from aequitas.fairness import Fairness
import aequitas.plot as ap
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier


In [323]:
# Load the train/test matrices and their protected-attribute companions.
# All four files are gzip-compressed CSVs read from the same splits directory,
# in the same order as the names on the left-hand side.
datadir = os.path.join(os.getcwd(),'data/donors-choose-splits/')
traindf, testdf, train_attrdf, test_attrdf = (
    pd.read_csv(os.path.join(datadir, split_file), compression='gzip')
    for split_file in (
        'train_20120501_20120801.csv.gz',
        'test_20121201_20130201.csv.gz',
        'train_20120501_20120801_protected.csv.gz',
        'test_20121201_20130201_protected.csv.gz',
    )
)


In [66]:
# (rows, columns) of the training frame loaded above.
traindf.shape

(16790, 113)

In [36]:
# Row counts per poverty_level value in the training protected-attribute table.
train_attrdf['poverty_level'].value_counts()

highest    9448
lower      7342
Name: poverty_level, dtype: int64

In [70]:
# Re-point datadir at the model-selection outputs and load the split2 evaluation CSV.
datadir = os.path.join(os.getcwd(),'data/donors-choose/model_selection')
evals_path = os.path.join(datadir, 'split2_evals.csv')
evals_df = pd.read_csv(evals_path)

In [71]:
# Show the evaluation row(s) recorded for the model of interest.
is_selected_model = evals_df['model_uuid'] == 'a04e2eedd9c5ff18bcf77e84ae9db561'
evals_df.loc[is_selected_model]

Unnamed: 0,model_precision,model_classpath,hyperparameters,model_uuid,predictions_uuid,target_pp,matrix_type,matrix_start_date,matrix_end_date
0,0.552,sklearn.ensemble.RandomForestClassifier,"{""n_jobs"": -1, ""criterion"": ""gini"", ""max_depth...",a04e2eedd9c5ff18bcf77e84ae9db561,c598fbe93f4c218ac7d325fb478598f1,1000,test,2012-12-01,2013-01-31


In [72]:
import ast

# Parse the hyperparameters of the model under study. The row is selected by
# its model_uuid instead of by index label 0, so the right model is still
# picked if evals_df is re-sorted, filtered, or gains rows for other models.
model_uuid = 'a04e2eedd9c5ff18bcf77e84ae9db561'
model_hyperparameters = evals_df.loc[evals_df['model_uuid'] == model_uuid, 'hyperparameters']
if model_hyperparameters.empty:
    raise KeyError('model_uuid {!r} not found in evals_df'.format(model_uuid))
# The column stores the parameter dict as text; literal_eval turns it back into
# a dict without evaluating arbitrary expressions.
hyperparameters = ast.literal_eval(model_hyperparameters.iloc[0])

In [73]:
# Build a (not yet fitted) random forest configured with the parsed hyperparameters.
rf = RandomForestClassifier(**hyperparameters)

In [74]:
# Inspect the estimator's instance attributes to check the configuration it received.
vars(rf)

{'base_estimator': DecisionTreeClassifier(),
 'n_estimators': 87,
 'estimator_params': ('criterion',
  'max_depth',
  'min_samples_split',
  'min_samples_leaf',
  'min_weight_fraction_leaf',
  'max_features',
  'max_leaf_nodes',
  'min_impurity_decrease',
  'min_impurity_split',
  'random_state',
  'ccp_alpha'),
 'bootstrap': True,
 'oob_score': False,
 'n_jobs': -1,
 'random_state': 213500298,
 'verbose': 0,
 'warm_start': False,
 'class_weight': None,
 'max_samples': None,
 'criterion': 'gini',
 'max_depth': 30,
 'min_samples_split': 3,
 'min_samples_leaf': 44,
 'min_weight_fraction_leaf': 0.0,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'ccp_alpha': 0.0}

In [333]:
# Split the 'highest' poverty training rows into positive and negative labels.
# The group mask comes from train_attrdf but is applied to traindf, so this
# relies on the two frames being row-aligned on their index.
in_highest = train_attrdf['poverty_level'] == 'highest'
train_labels = traindf['quickstart_label']
label_pos_poverty_highest = traindf.loc[in_highest & (train_labels > 0)]
label_neg_poverty_highest = traindf.loc[in_highest & (train_labels < 1.0)]

In [334]:
# Split the 'lower' poverty training rows into positive and negative labels.
# The group mask comes from train_attrdf but is applied to traindf, so this
# relies on the two frames being row-aligned on their index.
in_lower = train_attrdf['poverty_level'] == 'lower'
lower_labels = traindf['quickstart_label']
label_pos_poverty_lower = traindf.loc[in_lower & (lower_labels > 0)]
label_neg_poverty_lower = traindf.loc[in_lower & (lower_labels < 1.0)]

In [335]:
# (rows, columns) of the positive-label, 'highest' poverty subset.
label_pos_poverty_highest.shape

(3196, 113)

In [337]:
# (rows, columns) of the negative-label, 'highest' poverty subset.
label_neg_poverty_highest.shape

(6252, 113)

In [336]:
# (rows, columns) of the positive-label, 'lower' poverty subset.
label_pos_poverty_lower.shape

(3130, 113)

In [338]:
# (rows, columns) of the negative-label, 'lower' poverty subset.
label_neg_poverty_lower.shape

(4212, 113)

In [524]:
# Share of positive labels among 'highest' poverty rows in the unsampled training set.
n_highest_total = len(train_attrdf[train_attrdf['poverty_level']=='highest'])
highest_default_prevalence = len(label_pos_poverty_highest) / n_highest_total
print('Highest default training prevalence:', highest_default_prevalence)

Highest default training prevalence: 0.338272650296359


In [525]:
# Share of positive labels among 'lower' poverty rows in the unsampled training set.
n_lower_total = len(train_attrdf[train_attrdf['poverty_level']=='lower'])
lower_default_prevalence = len(label_pos_poverty_lower) / n_lower_total
print('Lower default training prevalence:', lower_default_prevalence)

Lower default training prevalence: 0.42631435576137294


In [550]:
# Target positive/negative sample sizes for the 'highest' poverty group and the
# label prevalence those sizes imply.
n_pos_highest, n_neg_highest = 3000, 4000
highest_new_prevalence = n_pos_highest / (n_pos_highest + n_neg_highest)
print('Highest new training prevalence:', highest_new_prevalence)

Highest new training prevalence: 0.42857142857142855


In [551]:
# Target positive/negative sample sizes for the 'lower' poverty group and the
# label prevalence those sizes imply.
n_pos_lower, n_neg_lower = 3000, 4000
lower_new_prevalence = n_pos_lower / (n_pos_lower + n_neg_lower)
print('Lower new training prevalence:', lower_new_prevalence)

Lower new training prevalence: 0.42857142857142855


In [552]:
# Down-sample each (poverty level, label) stratum to its target size, without
# replacement. A fixed seed makes the resampled training set -- and with it the
# fitted model and every metric computed afterwards -- repeatable across kernel
# restarts; without it each run draws a different sample.
SAMPLE_SEED = 0

sample_pos_poverty_highest = label_pos_poverty_highest.sample(n=n_pos_highest, replace=False, random_state=SAMPLE_SEED)
sample_neg_poverty_highest = label_neg_poverty_highest.sample(n=n_neg_highest, replace=False, random_state=SAMPLE_SEED)

sample_pos_poverty_lower = label_pos_poverty_lower.sample(n=n_pos_lower, replace=False, random_state=SAMPLE_SEED)
sample_neg_poverty_lower = label_neg_poverty_lower.sample(n=n_neg_lower, replace=False, random_state=SAMPLE_SEED)

In [553]:
# Stack the four resampled strata into one training frame and fit the forest on it.
resampled_parts = [
    sample_pos_poverty_highest,
    sample_neg_poverty_highest,
    sample_pos_poverty_lower,
    sample_neg_poverty_lower,
]
new_traindf = pd.concat(resampled_parts, axis=0)
y_train = new_traindf['quickstart_label'].values
# Identifier, date and label columns are dropped so only the remaining columns are used as features.
X_train = new_traindf.drop(columns=['entity_id','as_of_date','quickstart_label'])
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=30, max_features='sqrt', min_samples_leaf=44,
                       min_samples_split=3, n_estimators=87, n_jobs=-1,
                       random_state=213500298)

In [554]:
# Score the test set, flag the TOP_K highest-scoring rows as predicted positives
# (score = 1.0, everything else 0.0) and report precision among the flagged rows.
TOP_K = 1000  # size of the flagged list; same value as target_pp shown in the evals table

id_label_cols = ['entity_id','as_of_date','quickstart_label']
y_pred = rf.predict_proba(testdf.drop(columns=id_label_cols))[:,1]
new_preds = testdf[id_label_cols].copy()
new_preds['predict_proba'] = y_pred
# After sorting and resetting, the index is the 0-based rank of each row, so a
# vectorized index comparison replaces the former row-by-row apply.
new_preds = new_preds.sort_values('predict_proba', ascending = False).reset_index(drop=True)
new_preds['score'] = (new_preds.index < TOP_K).astype(float)
# Divide by the number of rows actually flagged so the value remains a true
# precision even when the test set holds fewer than TOP_K rows.
n_flagged = int(new_preds['score'].sum())
print('Model Precision: ', new_preds.loc[new_preds['score'] > 0, 'quickstart_label'].sum() / n_flagged)

Model Precision:  0.559


In [555]:
# Attach the protected attributes to every scored test row, then build the
# Aequitas group crosstab.
# The join is on the (entity_id, as_of_date) columns only. The previous call
# also passed left_index=True together with `on`, which is a contradictory key
# specification: current pandas rejects it with a MergeError, and older
# versions silently took the result index from the right-hand frame.
df = pd.merge(new_preds, test_attrdf, how='left', on=['entity_id','as_of_date'], sort=True)
# The label column is renamed to 'label_value' before being handed to Group.get_crosstabs.
df = df.rename(columns = {'quickstart_label':'label_value'})
g = Group()
xtab, _ = g.get_crosstabs(df[['score','label_value','poverty_level','metro_type', 'teacher_sex']].copy())

model_id, score_thresholds 0 {'rank_abs': [1000]}


In [556]:
# Per-group absolute metrics (tpr, fpr, precision, ...) from the crosstab.
# `absolute_metrics` is not assigned in any cell visible in this notebook, so a
# fresh kernel would hit a NameError here; derive the list from the Group
# instance that produced `xtab`.
absolute_metrics = g.list_absolute_metrics(xtab)
xtab[['attribute_name', 'attribute_value'] + absolute_metrics]

Unnamed: 0,attribute_name,attribute_value,tpr,tnr,for,fdr,fpr,fnr,npv,precision,ppr,pprev,prev
0,poverty_level,highest,0.086118,0.974494,0.298426,0.395034,0.025506,0.913882,0.701574,0.604966,0.443,0.04442,0.312043
1,poverty_level,lower,0.101819,0.945109,0.359172,0.477558,0.054891,0.898181,0.640828,0.522442,0.557,0.0723,0.370976
2,metro_type,suburban_rural,0.131216,0.92779,0.352672,0.486088,0.07221,0.868784,0.647328,0.513912,0.611,0.093913,0.367814
3,metro_type,unknown,0.126797,0.951975,0.352135,0.389937,0.048025,0.873203,0.647865,0.610063,0.159,0.077335,0.372082
4,metro_type,urban,0.052632,0.98699,0.299831,0.356522,0.01301,0.947368,0.700169,0.643478,0.23,0.025233,0.308502
5,teacher_sex,female,0.103555,0.957571,0.326355,0.441885,0.042429,0.896445,0.673645,0.558115,0.955,0.063274,0.341019
6,teacher_sex,male,0.031592,0.989211,0.313903,0.422222,0.010789,0.968408,0.686097,0.577778,0.045,0.017415,0.318498


In [557]:
# Compute disparities of every group against a chosen reference group per
# attribute, then plot the TPR disparity for poverty_level.
reference_groups = {'poverty_level':'lower', 'metro_type':'suburban_rural', 'teacher_sex':'male'}
b = Bias()
bdf = b.get_disparity_predefined_groups(
    xtab,
    original_df=df,
    ref_groups_dict=reference_groups,
)
metrics = ['tpr']
ap.disparities(bdf, metrics, 'poverty_level', fairness_threshold = 1.3)

get_disparity_predefined_group()


In [12]:
# Order rows so that larger quickstart_label values (positives) come first.
# NOTE(review): the merge cell earlier in this file renames 'quickstart_label'
# to 'label_value' on `df`, while the outputs recorded below show a frame that
# still has 'quickstart_label' -- `df` here appears to come from a cell that is
# not in view; confirm its origin before relying on a top-to-bottom run.
df = df.sort_values('quickstart_label', ascending=False)

In [13]:
# Independent copy of the rows whose teacher_sex is 'female'.
is_female_teacher = df['teacher_sex'] == 'female'
teacher_female_df = df.loc[is_female_teacher].copy()

In [None]:
# High-poverty subset, built the same way as the teacher_sex subsets.
# The previous expression, df[df['poverty']], used a 'poverty' column as a
# boolean mask; no such column appears in any frame displayed in this notebook,
# so it would fail with a KeyError.
# NOTE(review): 'high' is the poverty_level value visible in this section's
# recorded outputs; the Aequitas section's data uses 'highest' instead --
# confirm which encoding `df` carries at this point.
df_pov_high = df[df['poverty_level'] == 'high'].copy()

In [14]:
# Independent copy of the rows whose teacher_sex is 'male'.
is_male_teacher = df['teacher_sex'] == 'male'
teacher_male_df = df.loc[is_male_teacher].copy()

In [15]:
# Peek at the first rows of the female-teacher subset.
teacher_female_df.head()

Unnamed: 0,entity_id,as_of_date,quickstart_label,metro_type,grade_level,poverty_level,teacher_sex
98745,171,1343775600000,1.0,urban,PreK-2,high,female
90235,243783,1340146800000,1.0,suburban,PreK-2,low_moderate,female
90222,243770,1340146800000,1.0,rural,9-12,high,female
90223,243771,1340146800000,1.0,urban,PreK-2,high,female
90232,243780,1340146800000,1.0,rural,PreK-2,high,female


In [31]:
# (rows, columns) of the female-teacher subset.
teacher_female_df.shape


(14994, 8)

In [30]:
# (rows, columns) of the male-teacher subset.
teacher_male_df.shape

(1796, 8)

In [16]:
# Peek at the first rows of the male-teacher subset.
teacher_male_df.head()

Unnamed: 0,entity_id,as_of_date,quickstart_label,metro_type,grade_level,poverty_level,teacher_sex
90229,243777,1340146800000,1.0,rural,3-5,high,male
90233,243781,1340146800000,1.0,unknown,3-5,high,male
90260,243740,1340146800000,1.0,urban,PreK-2,high,male
90013,243856,1340060400000,1.0,urban,6-8,high,male
90011,243854,1340060400000,1.0,unknown,9-12,high,male


In [17]:
# (rows, columns) of the full frame the two teacher_sex subsets were cut from.
df.shape

(16790, 7)

In [18]:
# Running count of positive labels, expressed as a fraction of all male-teacher rows.
n_male_rows = len(teacher_male_df)
teacher_male_df['cumsum_prev'] = teacher_male_df['quickstart_label'].cumsum().div(n_male_rows)

In [19]:
# Running count of positive labels, expressed as a fraction of all female-teacher rows.
n_female_rows = len(teacher_female_df)
teacher_female_df['cumsum_prev'] = teacher_female_df['quickstart_label'].cumsum().div(n_female_rows)

In [20]:
# Check the newly added cumsum_prev column on the first female-teacher rows.
teacher_female_df.head()

Unnamed: 0,entity_id,as_of_date,quickstart_label,metro_type,grade_level,poverty_level,teacher_sex,cumsum_prev
98745,171,1343775600000,1.0,urban,PreK-2,high,female,6.7e-05
90235,243783,1340146800000,1.0,suburban,PreK-2,low_moderate,female,0.000133
90222,243770,1340146800000,1.0,rural,9-12,high,female,0.0002
90223,243771,1340146800000,1.0,urban,PreK-2,high,female,0.000267
90232,243780,1340146800000,1.0,rural,PreK-2,high,female,0.000333


In [21]:
# Recombine both teacher_sex subsets and order the rows by ascending cumsum_prev.
teacher_frames = [teacher_male_df, teacher_female_df]
prev_df = pd.concat(teacher_frames, axis=0).sort_values(by='cumsum_prev')

In [22]:
# First rows of the combined frame, i.e. the smallest cumsum_prev values.
prev_df.head()

Unnamed: 0,entity_id,as_of_date,quickstart_label,metro_type,grade_level,poverty_level,teacher_sex,cumsum_prev
98745,171,1343775600000,1.0,urban,PreK-2,high,female,6.7e-05
90235,243783,1340146800000,1.0,suburban,PreK-2,low_moderate,female,0.000133
90222,243770,1340146800000,1.0,rural,9-12,high,female,0.0002
90223,243771,1340146800000,1.0,urban,PreK-2,high,female,0.000267
90232,243780,1340146800000,1.0,rural,PreK-2,high,female,0.000333


In [25]:
# Keep an independent copy of the first 10000 rows of the cumsum_prev-ordered frame.
new_prev = prev_df.iloc[:10000].copy()

In [26]:
# Last ten rows of the truncated frame, i.e. the largest cumsum_prev values that were kept.
new_prev.tail(10)

Unnamed: 0,entity_id,as_of_date,quickstart_label,metro_type,grade_level,poverty_level,teacher_sex,cumsum_prev
90729,243015,1340492400000,0.0,unknown,3-5,high,female,0.384087
90730,243016,1340492400000,0.0,urban,PreK-2,high,female,0.384087
90731,243017,1340492400000,0.0,urban,PreK-2,high,female,0.384087
90732,243018,1340492400000,0.0,rural,9-12,high,female,0.384087
90733,243019,1340492400000,0.0,urban,3-5,high,female,0.384087
90748,243034,1340492400000,0.0,rural,6-8,high,female,0.384087
95567,238441,1342738800000,0.0,urban,9-12,high,female,0.384087
90694,243010,1340492400000,0.0,urban,PreK-2,high,female,0.384087
90632,243025,1340492400000,0.0,urban,9-12,high,female,0.384087
90740,243026,1340492400000,0.0,unknown,3-5,high,female,0.384087


In [27]:
# Number of rows per teacher_sex value remaining after the truncation.
new_prev['teacher_sex'].value_counts()

female    8204
male      1796
Name: teacher_sex, dtype: int64

In [None]:
# Label prevalence (mean of quickstart_label) per teacher_sex in the truncated sample.
# The previous bare `groupby()` call had no grouping key and raises a TypeError.
# NOTE(review): grouping by teacher_sex and averaging the label is inferred from
# the surrounding cells, which compare prevalence between the two teacher_sex
# groups -- confirm this is the intended summary.
new_prev.groupby('teacher_sex')['quickstart_label'].mean()

In [131]:
# Start a fresh predictions frame holding only the identifier, date and label columns of the test set.
new_preds = testdf[['entity_id','as_of_date','quickstart_label']].copy()