In [None]:
import pandas as pd
import seaborn as sns
from aequitas.group import Group
from aequitas.bias import Bias
from aequitas.fairness import Fairness
from aequitas.plotting import Plot

import warnings; warnings.simplefilter('ignore')
from triage.component.catwalk.db import connect

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Do not truncate wide DataFrames: show every column when a frame is displayed.
pd.options.display.max_columns = None

In [None]:
# Model whose predictions this section audits. It is interpolated into the
# query below (twice), so this is the only place that needs to change.
MODEL_ID = 303

# One row per prediction of MODEL_ID, left-joined to three per-entity
# attributes (ovg_bool, gender, rural). Each attribute is read from its feature
# table at event_date 2016-01-01 and reduced with max() per entity_id; rural is
# collapsed to 'rural' / 'urban' / 'unknown' by the CASE expression.
# NOTE(review): in SQL LIKE, '_' matches any single character, so 'urban_'
# matches 'urban' followed by exactly one more character -- confirm intended.
# The literal is split across lines for readability only; the pieces
# concatenate to the same single-line SQL text as before.
query = (
    "select entity_id::text, score::float4, label_value::int, model_id::int, "
    "gender::text, rural::text, ovg_bool::text "
    "from (select * from test_results.predictions where model_id = {model_id}) as pred_{model_id} "
    "left join (select max(ovg_bool) as ovg_bool, entity_id as ovg_id "
    "from features.overage_from_obj "
    "where event_date = to_date('20160101', 'YYYYMMDD') group by entity_id) as ovg "
    "on entity_id = ovg_id "
    "left join (select max(gender) as gender, entity_id as gen_id "
    "from features.gender_from_obj "
    "where event_date = to_date('20160101', 'YYYYMMDD') group by entity_id) as gen "
    "on entity_id = gen_id "
    "left join (select case when max(rural) = 'rural' then 'rural' "
    "when max(rural) LIKE 'urban_' then 'urban' else 'unknown' end as rural, "
    "entity_id as rur_id "
    "from features.rural_from_obj "
    "where event_date = to_date('20160101', 'YYYYMMDD') group by entity_id) as rur "
    "on entity_id = rur_id"
).format(model_id=int(MODEL_ID))  # int() keeps anything but a number out of the SQL

# Run the query and pull every row into memory; `data` feeds the DataFrame
# built in the next cell.
engine = connect()
model_info = engine.execute(query)
data = model_info.fetchall()
    

In [None]:
# Build a DataFrame from the fetched rows. No column names are supplied here;
# they are assigned by the rename in a later cell.
df = pd.DataFrame(data)
# Preview the first rows.
df.head()

### Model 303: prepare the data and get crosstabs

In [None]:
# Map the positional column labels (0..6) to the names selected by the query,
# in SELECT order. index=str additionally converts the row labels to strings.
column_names = dict(enumerate([
    'entity_id', 'score', 'label_value', 'model_id',
    'gender', 'rural', 'ovg_bool',
]))
df.rename(index=str, columns=column_names, inplace=True)

In [None]:
# Force the three attribute columns to string values.
# NOTE(review): entities without a match in the left joins presumably arrive
# as missing values and become literal strings here -- confirm that is the
# grouping you want.
for column in ('gender', 'rural', 'ovg_bool'):
    df[column] = df[column].astype(str)

In [None]:
# Write the prepared frame to aequitas_test.csv in the current working
# directory, without the index column.
# NOTE(review): the file holds entity-level rows (entity_id plus attributes);
# make sure it is not committed or shared by accident.
df.to_csv('aequitas_test.csv', index=False)

In [None]:
# Preview the first rows after renaming and casting.
df.head()

In [None]:
# Check the dtype of every column.
df.dtypes

In [None]:
# (rows, columns) of the prepared frame.
df.shape

In [None]:
# Compute the group crosstabs at a single rank_pct threshold of 0.1
# (presumably the top 10% of entities by score -- confirm against the aequitas
# docs). Only the first element of the returned pair is used.
score_thresholds = {'rank_pct': [0.1]}
g = Group()
xtab, _ = g.get_crosstabs(df, score_thresholds)

In [None]:
# Display the full crosstab table.
xtab

In [None]:
# Names of the absolute group metrics present in the crosstab.
absolute_metrics = g.list_absolute_metrics(xtab)

# Show every crosstab column that is not one of those metrics.
non_metric_columns = [name for name in xtab.columns if name not in absolute_metrics]
xtab[non_metric_columns]

In [None]:
# Group identifiers followed by the absolute metrics, rounded to 2 decimals.
id_columns = ['attribute_name', 'attribute_value']
xtab[id_columns + absolute_metrics].round(2)

In [None]:
# Plot helper from aequitas.plotting, reused by the plotting cells below.
aqp = Plot()


In [None]:
# Plot the 'fnr' metric (presumably false negative rate) for the groups in the
# crosstab; the returned object is kept in `fnr`.
fnr = aqp.plot_group_metric(xtab, 'fnr')


In [None]:
# Plot four group metrics in one figure, laid out with four columns. The
# returned object is kept in `p` so a later cell can save it.
metrics_to_plot = ['ppr', 'pprev', 'fnr', 'fpr']
p = aqp.plot_group_metric_all(xtab, metrics=metrics_to_plot, ncols=4)


In [None]:
# Save the multi-metric figure to metric_all.png in the current working
# directory.
p.savefig('metric_all.png')

In [None]:
# Reference group for each attribute; disparities are computed relative to
# these values.
reference_groups = {'rural': 'urban', 'gender': 'm', 'ovg_bool': '0'}

b = Bias()
bdf = b.get_disparity_predefined_groups(
    xtab,
    original_df=df,
    ref_groups_dict=reference_groups,
    alpha=0.05,  # presumably the significance level -- confirm in aequitas docs
    mask_significance=False,
)

# Column names of the computed disparities and of their significance results.
calculated_disparities = b.list_disparities(bdf)
disparity_significance = b.list_significance(bdf)

# Group identifiers, then disparities, then significance columns.
group_columns = ['attribute_name', 'attribute_value']
bdf[group_columns + calculated_disparities + disparity_significance]

In [None]:
# Show the list returned by b.list_significance(bdf) in the previous cell.
disparity_significance

### Model 292

In [None]:
# Model whose predictions this section audits. It is interpolated into the
# query below (twice), so this is the only place that needs to change.
MODEL_ID = 292

# Output columns of the query, in SELECT order, and the subset that is used as
# group attributes.
COLUMN_NAMES = ['entity_id', 'score', 'label_value', 'model_id',
                'gender', 'rural', 'ovg_bool']
ATTRIBUTE_COLUMNS = ['gender', 'rural', 'ovg_bool']

# One row per prediction of MODEL_ID, left-joined to three per-entity
# attributes (ovg_bool, gender, rural). Each attribute is read from its feature
# table at event_date 2016-01-01 and reduced with max() per entity_id; rural is
# collapsed to 'rural' / 'urban' / 'unknown' by the CASE expression.
# NOTE(review): in SQL LIKE, '_' matches any single character, so 'urban_'
# matches 'urban' followed by exactly one more character -- confirm intended.
# The literal is split across lines for readability only; the pieces
# concatenate to the same single-line SQL text as before.
query = (
    "select entity_id::text, score::float4, label_value::int, model_id::int, "
    "gender::text, rural::text, ovg_bool::text "
    "from (select * from test_results.predictions where model_id = {model_id}) as pred_{model_id} "
    "left join (select max(ovg_bool) as ovg_bool, entity_id as ovg_id "
    "from features.overage_from_obj "
    "where event_date = to_date('20160101', 'YYYYMMDD') group by entity_id) as ovg "
    "on entity_id = ovg_id "
    "left join (select max(gender) as gender, entity_id as gen_id "
    "from features.gender_from_obj "
    "where event_date = to_date('20160101', 'YYYYMMDD') group by entity_id) as gen "
    "on entity_id = gen_id "
    "left join (select case when max(rural) = 'rural' then 'rural' "
    "when max(rural) LIKE 'urban_' then 'urban' else 'unknown' end as rural, "
    "entity_id as rur_id "
    "from features.rural_from_obj "
    "where event_date = to_date('20160101', 'YYYYMMDD') group by entity_id) as rur "
    "on entity_id = rur_id"
).format(model_id=int(MODEL_ID))  # int() keeps anything but a number out of the SQL

# Run the query and pull every row into memory.
engine = connect()
model_info = engine.execute(query)
data = model_info.fetchall()

# Name the columns when the frame is built instead of renaming positional
# labels 0..6 afterwards: that rename silently does nothing when the labels are
# not 0..6 (for example when the query returns no rows), and the attribute cast
# then fails with a KeyError. rename(index=str) keeps the string row labels the
# previous version of this cell produced. The attribute columns are cast to
# strings in the same chain, so `df` is never mutated in place.
df = (
    pd.DataFrame(data, columns=COLUMN_NAMES)
    .rename(index=str)
    .astype({name: str for name in ATTRIBUTE_COLUMNS})
)

In [None]:
# Compute the group crosstabs at a single rank_pct threshold of 0.1
# (presumably the top 10% of entities by score -- confirm against the aequitas
# docs). Only the first element of the returned pair is used.
score_thresholds = {'rank_pct': [0.1]}
g = Group()
xtab, _ = g.get_crosstabs(df, score_thresholds)

# Display the full crosstab table.
xtab



In [None]:
# Names of the absolute group metrics present in the crosstab.
absolute_metrics = g.list_absolute_metrics(xtab)

# A notebook cell renders only its last expression, so the first table used to
# be computed and thrown away. display() (a builtin in IPython kernels) renders
# it explicitly: every crosstab column that is not an absolute metric.
non_metric_columns = [name for name in xtab.columns if name not in absolute_metrics]
display(xtab[non_metric_columns])

# Second table: group identifiers plus the absolute metrics, rounded to 2
# decimals. As the last expression it is rendered automatically.
xtab[['attribute_name', 'attribute_value'] + absolute_metrics].round(2)

In [None]:
# Reference group for each attribute; disparities are computed relative to
# these values.
reference_groups = {'rural': 'urban', 'gender': 'm', 'ovg_bool': '0'}

b = Bias()
bdf = b.get_disparity_predefined_groups(
    xtab,
    original_df=df,
    ref_groups_dict=reference_groups,
    alpha=0.05,  # presumably the significance level -- confirm in aequitas docs
    mask_significance=False,
)

# Column names of the computed disparities and of their significance results.
calculated_disparities = b.list_disparities(bdf)
disparity_significance = b.list_significance(bdf)

# Group identifiers, then disparities, then significance columns.
group_columns = ['attribute_name', 'attribute_value']
bdf[group_columns + calculated_disparities + disparity_significance]

### Model 160

In [None]:
# Model whose predictions this section audits. It is interpolated into the
# query below (twice), so this is the only place that needs to change.
MODEL_ID = 160

# Output columns of the query, in SELECT order, and the subset that is used as
# group attributes.
COLUMN_NAMES = ['entity_id', 'score', 'label_value', 'model_id',
                'gender', 'rural', 'ovg_bool']
ATTRIBUTE_COLUMNS = ['gender', 'rural', 'ovg_bool']

# One row per prediction of MODEL_ID, left-joined to three per-entity
# attributes (ovg_bool, gender, rural). Each attribute is read from its feature
# table at event_date 2016-01-01 and reduced with max() per entity_id; rural is
# collapsed to 'rural' / 'urban' / 'unknown' by the CASE expression.
# NOTE(review): in SQL LIKE, '_' matches any single character, so 'urban_'
# matches 'urban' followed by exactly one more character -- confirm intended.
# The literal is split across lines for readability only; the pieces
# concatenate to the same single-line SQL text as before.
query = (
    "select entity_id::text, score::float4, label_value::int, model_id::int, "
    "gender::text, rural::text, ovg_bool::text "
    "from (select * from test_results.predictions where model_id = {model_id}) as pred_{model_id} "
    "left join (select max(ovg_bool) as ovg_bool, entity_id as ovg_id "
    "from features.overage_from_obj "
    "where event_date = to_date('20160101', 'YYYYMMDD') group by entity_id) as ovg "
    "on entity_id = ovg_id "
    "left join (select max(gender) as gender, entity_id as gen_id "
    "from features.gender_from_obj "
    "where event_date = to_date('20160101', 'YYYYMMDD') group by entity_id) as gen "
    "on entity_id = gen_id "
    "left join (select case when max(rural) = 'rural' then 'rural' "
    "when max(rural) LIKE 'urban_' then 'urban' else 'unknown' end as rural, "
    "entity_id as rur_id "
    "from features.rural_from_obj "
    "where event_date = to_date('20160101', 'YYYYMMDD') group by entity_id) as rur "
    "on entity_id = rur_id"
).format(model_id=int(MODEL_ID))  # int() keeps anything but a number out of the SQL

# Run the query and pull every row into memory.
engine = connect()
model_info = engine.execute(query)
data = model_info.fetchall()

# Name the columns when the frame is built instead of renaming positional
# labels 0..6 afterwards: that rename silently does nothing when the labels are
# not 0..6 (for example when the query returns no rows), and the attribute cast
# then fails with a KeyError. rename(index=str) keeps the string row labels the
# previous version of this cell produced. The attribute columns are cast to
# strings in the same chain, so `df` is never mutated in place.
df = (
    pd.DataFrame(data, columns=COLUMN_NAMES)
    .rename(index=str)
    .astype({name: str for name in ATTRIBUTE_COLUMNS})
)

In [None]:
# Compute the group crosstabs at a single rank_pct threshold of 0.1
# (presumably the top 10% of entities by score -- confirm against the aequitas
# docs). Only the first element of the returned pair is used.
score_thresholds = {'rank_pct': [0.1]}
g = Group()
xtab, _ = g.get_crosstabs(df, score_thresholds)

# Display the full crosstab table.
xtab


In [None]:
# This cell previously passed rank_pct=[10], while every other section of the
# notebook uses 0.1. Because it overwrites `xtab`, the model 160 metric and
# disparity cells below were computed at a different threshold from the other
# models. rank_pct appears to be a fraction of the ranked population (see the
# aequitas docs), so 10 looks like a percent-vs-fraction slip; use 0.1 so all
# four models are compared at the same cut-off.
g = Group()
xtab, _ = g.get_crosstabs(df, {'rank_pct': [0.1]})

# Display the full crosstab table.
xtab


In [None]:
# Names of the absolute group metrics present in the crosstab.
absolute_metrics = g.list_absolute_metrics(xtab)

# A notebook cell renders only its last expression, so the first table used to
# be computed and thrown away. display() (a builtin in IPython kernels) renders
# it explicitly: every crosstab column that is not an absolute metric.
non_metric_columns = [name for name in xtab.columns if name not in absolute_metrics]
display(xtab[non_metric_columns])

# Second table: group identifiers plus the absolute metrics, rounded to 2
# decimals. As the last expression it is rendered automatically.
xtab[['attribute_name', 'attribute_value'] + absolute_metrics].round(2)

In [None]:
# Reference group for each attribute; disparities are computed relative to
# these values.
reference_groups = {'rural': 'urban', 'gender': 'm', 'ovg_bool': '0'}

b = Bias()
bdf = b.get_disparity_predefined_groups(
    xtab,
    original_df=df,
    ref_groups_dict=reference_groups,
    alpha=0.05,  # presumably the significance level -- confirm in aequitas docs
    mask_significance=False,
)

# Column names of the computed disparities and of their significance results.
calculated_disparities = b.list_disparities(bdf)
disparity_significance = b.list_significance(bdf)

# Group identifiers, then disparities, then significance columns.
group_columns = ['attribute_name', 'attribute_value']
bdf[group_columns + calculated_disparities + disparity_significance]

### Model 34 (baseline)

In [None]:
# Model whose predictions this section audits (the baseline). It is
# interpolated into the query below (twice), so this is the only place that
# needs to change.
MODEL_ID = 34

# Output columns of the query, in SELECT order, and the subset that is used as
# group attributes.
COLUMN_NAMES = ['entity_id', 'score', 'label_value', 'model_id',
                'gender', 'rural', 'ovg_bool']
ATTRIBUTE_COLUMNS = ['gender', 'rural', 'ovg_bool']

# One row per prediction of MODEL_ID, left-joined to three per-entity
# attributes (ovg_bool, gender, rural). Each attribute is read from its feature
# table at event_date 2016-01-01 and reduced with max() per entity_id; rural is
# collapsed to 'rural' / 'urban' / 'unknown' by the CASE expression.
# NOTE(review): in SQL LIKE, '_' matches any single character, so 'urban_'
# matches 'urban' followed by exactly one more character -- confirm intended.
# The literal is split across lines for readability only; the pieces
# concatenate to the same single-line SQL text as before.
query = (
    "select entity_id::text, score::float4, label_value::int, model_id::int, "
    "gender::text, rural::text, ovg_bool::text "
    "from (select * from test_results.predictions where model_id = {model_id}) as pred_{model_id} "
    "left join (select max(ovg_bool) as ovg_bool, entity_id as ovg_id "
    "from features.overage_from_obj "
    "where event_date = to_date('20160101', 'YYYYMMDD') group by entity_id) as ovg "
    "on entity_id = ovg_id "
    "left join (select max(gender) as gender, entity_id as gen_id "
    "from features.gender_from_obj "
    "where event_date = to_date('20160101', 'YYYYMMDD') group by entity_id) as gen "
    "on entity_id = gen_id "
    "left join (select case when max(rural) = 'rural' then 'rural' "
    "when max(rural) LIKE 'urban_' then 'urban' else 'unknown' end as rural, "
    "entity_id as rur_id "
    "from features.rural_from_obj "
    "where event_date = to_date('20160101', 'YYYYMMDD') group by entity_id) as rur "
    "on entity_id = rur_id"
).format(model_id=int(MODEL_ID))  # int() keeps anything but a number out of the SQL

# Run the query and pull every row into memory.
engine = connect()
model_info = engine.execute(query)
data = model_info.fetchall()

# Name the columns when the frame is built instead of renaming positional
# labels 0..6 afterwards: that rename silently does nothing when the labels are
# not 0..6 (for example when the query returns no rows), and the attribute cast
# then fails with a KeyError. rename(index=str) keeps the string row labels the
# previous version of this cell produced. The attribute columns are cast to
# strings in the same chain, so `df` is never mutated in place.
df = (
    pd.DataFrame(data, columns=COLUMN_NAMES)
    .rename(index=str)
    .astype({name: str for name in ATTRIBUTE_COLUMNS})
)

In [None]:
# Compute the group crosstabs at a single rank_pct threshold of 0.1
# (presumably the top 10% of entities by score -- confirm against the aequitas
# docs). Only the first element of the returned pair is used.
score_thresholds = {'rank_pct': [0.1]}
g = Group()
xtab, _ = g.get_crosstabs(df, score_thresholds)

# Display the full crosstab table.
xtab


In [None]:
# Names of the absolute group metrics present in the crosstab.
absolute_metrics = g.list_absolute_metrics(xtab)

# A notebook cell renders only its last expression, so the first table used to
# be computed and thrown away. display() (a builtin in IPython kernels) renders
# it explicitly: every crosstab column that is not an absolute metric.
non_metric_columns = [name for name in xtab.columns if name not in absolute_metrics]
display(xtab[non_metric_columns])

# Second table: group identifiers plus the absolute metrics, rounded to 2
# decimals. As the last expression it is rendered automatically.
xtab[['attribute_name', 'attribute_value'] + absolute_metrics].round(2)

In [None]:
# Reference group for each attribute; disparities are computed relative to
# these values.
reference_groups = {'rural': 'urban', 'gender': 'm', 'ovg_bool': '0'}

b = Bias()
bdf = b.get_disparity_predefined_groups(
    xtab,
    original_df=df,
    ref_groups_dict=reference_groups,
    alpha=0.05,  # presumably the significance level -- confirm in aequitas docs
    mask_significance=False,
)

# Column names of the computed disparities and of their significance results.
calculated_disparities = b.list_disparities(bdf)
disparity_significance = b.list_significance(bdf)

# Group identifiers, then disparities, then significance columns.
group_columns = ['attribute_name', 'attribute_value']
bdf[group_columns + calculated_disparities + disparity_significance]