In [80]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from aif360.metrics import BinaryLabelDatasetMetric
from aif360.datasets import BinaryLabelDataset
from aif360.algorithms.preprocessing.reweighing import Reweighing
from aif360.sklearn.metrics import disparate_impact_ratio, statistical_parity_difference
# from aif360.sklearn.preprocessing import Reweighing
# Explainers
from aif360.explainers import MetricTextExplainer, MetricJSONExplainer

from data_engineering import run_data_engineering


In [4]:
# Load the semicolon-delimited student records from the original data folder.
df = pd.read_csv('./data/original/student-por.csv', sep=';')
# Project helper; the preview below shows the derived *_group / *_binary columns
# present after this call.
df = run_data_engineering(df)
# Fixed seed so the preview shows the same rows on every Restart & Run All.
df.sample(10, random_state=0)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,G2,G3,age_group,age_group_binary,sex_binary,famsize_binary,health_group,health_group_binary,grade_group,grade_group_binary
358,GP,F,18,U,LE3,T,4,3,health,services,...,12,15,GT17,1,0,1,bad,0,high,1
553,MS,M,17,U,LE3,A,1,0,other,other,...,11,12,LE17,0,1,1,good,1,low,0
69,GP,F,15,R,LE3,T,3,1,other,other,...,15,15,LE17,0,0,1,bad,0,high,1
67,GP,F,16,U,GT3,T,3,1,services,other,...,9,10,LE17,0,0,0,good,1,low,0
500,MS,M,17,U,GT3,T,1,2,other,other,...,7,7,LE17,0,1,0,bad,0,low,0
568,MS,M,19,U,GT3,T,3,2,at_home,services,...,11,9,GT17,1,1,0,bad,0,low,0
586,MS,F,17,U,GT3,T,4,2,teacher,services,...,8,0,LE17,0,0,0,good,1,low,0
551,MS,M,17,U,GT3,T,3,2,other,other,...,14,13,LE17,0,1,0,bad,0,high,1
95,GP,F,15,R,GT3,T,1,1,at_home,other,...,13,13,LE17,0,0,0,bad,0,high,1
535,MS,M,15,R,GT3,T,1,2,other,services,...,11,10,LE17,0,1,0,good,1,low,0


## Step 3

### Part 1

Based on your dataset, identify the privileged/unprivileged groups associated with each of your protected
class variables (i.e. convert each one to a binary variable). This was done as part of data engineering; see the `*_binary` columns produced by `run_data_engineering`.

### Parts 2 - 4
For each protected class variable, select two fairness metrics and compute the fairness metrics associated with your privileged/unprivileged groups as a function of your two dependent variables.

Fairness metrics selected:
* Disparate Impact: Computed as the ratio of the rate of favorable outcomes for the unprivileged group to that of the privileged group. The ideal value of this metric is 1.0. A value < 1 implies higher benefit for the privileged group and a value > 1 implies higher benefit for the unprivileged group. Fairness for this metric is between 0.8 and 1.2.

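* Statistical Parity Difference: Computed as the rate of favorable outcomes received by the unprivileged group minus the rate received by the privileged group. The ideal value of this metric is 0. A value < 0 implies higher benefit for the privileged group and a value > 0 implies higher benefit for the unprivileged group. Fairness for this metric is between -0.1 and 0.1.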

In [110]:
# Functions needed for analysis
def _make_dataset(df, label_name):
    """Wrap the binary-encoded columns of ``df`` in a ``BinaryLabelDataset``.

    Args:
        df: DataFrame containing the five ``*_binary`` columns selected below.
        label_name: Column to use as the label; passed as the single entry of
            ``label_names``.

    Returns:
        The constructed ``BinaryLabelDataset``.
    """
    protected_columns = ['sex_binary', 'age_group_binary', 'famsize_binary']
    outcome_columns = ['health_group_binary', 'grade_group_binary']
    # Only these five columns are handed over; the outcome column that is not
    # chosen as the label stays in the frame as a non-label column.
    binary_frame = df[protected_columns + outcome_columns]
    return BinaryLabelDataset(
        df=binary_frame,
        label_names=[label_name],
        protected_attribute_names=protected_columns,
    )


def _make_groups(var):
    privileged_groups = [{var: 1}]
    unprivileged_groups = [{var: 0}]
    return privileged_groups, unprivileged_groups


def _transform_dataset(dataset, unprivileged_groups, privileged_groups):
    """Fit a ``Reweighing`` preprocessor on ``dataset`` and apply it to the same data.

    Args:
        dataset: Dataset object the reweigher is fitted on and then transforms.
        unprivileged_groups: Group specification forwarded to ``Reweighing``.
        privileged_groups: Group specification forwarded to ``Reweighing``.

    Returns:
        Whatever ``Reweighing.transform`` returns for ``dataset``.
    """
    reweigher = Reweighing(
        privileged_groups=privileged_groups,
        unprivileged_groups=unprivileged_groups,
    )
    reweigher.fit(dataset)
    return reweigher.transform(dataset)
    
    
def _get_metrics(df, var, label_name, transform=False):
    """Create a ``BinaryLabelDatasetMetric`` comparing the two groups of ``var``.

    Args:
        df: DataFrame passed on to ``_make_dataset``.
        var: Protected-attribute column whose 1/0 values define the
            privileged/unprivileged groups for the metric.
        label_name: Outcome column used as the dataset label.
        transform: When true, the dataset is passed through
            ``_transform_dataset`` before the metric is built.

    Returns:
        The ``BinaryLabelDatasetMetric`` for the (optionally reweighed) dataset.
    """
    data = _make_dataset(df, label_name)
    privileged, unprivileged = _make_groups(var)

    if transform:
        # The reweighing groups are fixed and combine all three protected
        # attributes (all 1 vs. all 0); they do not depend on ``var``.
        # NOTE(review): the metric below is still computed for the single
        # attribute ``var`` -- confirm reweighing on the joint groups is intended.
        protected = ('sex_binary', 'age_group_binary', 'famsize_binary')
        all_privileged = [{name: 1 for name in protected}]
        all_unprivileged = [{name: 0 for name in protected}]
        data = _transform_dataset(data, all_unprivileged, all_privileged)

    return BinaryLabelDatasetMetric(
        data,
        unprivileged_groups=unprivileged,
        privileged_groups=privileged,
    )

def _get_metric_df(df, var, label_name, transform=False):
    """Compute both fairness metrics for one (variable, label) pair.

    Args:
        df: DataFrame passed on to ``_get_metrics``.
        var: Protected-attribute column name.
        label_name: Outcome column name.
        transform: Forwarded to ``_get_metrics``.

    Returns:
        A one-row DataFrame with columns ``variable``, ``label``,
        ``stat_par_diff`` and ``disp_impact``.
    """
    metric = _get_metrics(df, var, label_name, transform)
    row = {
        'variable': var,
        'label': label_name,
        'stat_par_diff': metric.statistical_parity_difference(),
        'disp_impact': metric.disparate_impact(),
    }
    return pd.DataFrame([row])

def generate_metrics(df, transform=False):
    """Tabulate the fairness metrics for every protected variable / label pair.

    Args:
        df: DataFrame passed on to ``_get_metric_df``.
        transform: Forwarded to ``_get_metric_df``; when true the metrics are
            computed on the reweighed dataset.

    Returns:
        A DataFrame with one row per (variable, label) combination -- three
        protected variables times two labels -- indexed 0..5.
    """
    protected_vars = ['sex_binary', 'age_group_binary', 'famsize_binary']
    labels = ['health_group_binary', 'grade_group_binary']
    frames = [
        _get_metric_df(df, var, label, transform)
        for var in protected_vars
        for label in labels
    ]
    # Each per-pair frame carries index 0; renumber so the combined table has
    # unique row labels instead of six rows all labelled 0.
    return pd.concat(frames, ignore_index=True)
    

In [111]:
generate_metrics(df)

Unnamed: 0,variable,label,stat_par_diff,disp_impact
0,sex_binary,health_group_binary,-0.112625,0.817328
0,sex_binary,grade_group_binary,0.128183,1.36663
0,age_group_binary,health_group_binary,0.042629,1.082083
0,age_group_binary,grade_group_binary,0.045769,1.116679
0,famsize_binary,health_group_binary,0.004547,1.008315
0,famsize_binary,grade_group_binary,0.034407,1.085794


In [112]:
generate_metrics(df, transform=True)

Unnamed: 0,variable,label,stat_par_diff,disp_impact
0,sex_binary,health_group_binary,-0.109553,0.821661
0,sex_binary,grade_group_binary,0.06254,1.175284
0,age_group_binary,health_group_binary,0.046607,1.090317
0,age_group_binary,grade_group_binary,-0.012621,0.968666
0,famsize_binary,health_group_binary,0.008352,1.01536
0,famsize_binary,grade_group_binary,-0.024531,0.940309
