In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import warnings

# Silence every warning for the rest of the session to keep cell output compact.
# NOTE(review): this is a blanket filter, so it would also hide warnings raised
# while training the models below (e.g. non-convergence) - confirm that is intended.
warnings.filterwarnings('ignore')

In [3]:
from tqdm import tqdm
import os
import data_utils
import model_utils
from attack_utils import get_CSMIA_case_by_case_results, CSMIA_attack, LOMIA_attack
from data_utils import oneHotCatVars, filter_random_data_by_conf_score
from experiment_utils import MIAExperiment
from disparity_inference_utils import get_confidence_array, draw_confidence_array_scatter, get_indices_by_group_condition, get_corr_btn_sens_and_out_per_subgroup, get_slopes, get_angular_difference, calculate_stds, get_mutual_info_btn_sens_and_out_per_subgroup
from bcorr_utils import bcorr_sampling, evaluate, MLPClassifierFC
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network._base import ACTIVATIONS
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics import roc_curve, auc, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.decomposition import PCA
from sklearn.inspection import permutation_importance
from fairlearn.metrics import equalized_odds_difference, demographic_parity_difference
import matplotlib.pyplot as plt
import seaborn as sns
import tabulate
import pickle
import copy

import matplotlib as mpl

In [4]:
# Values passed as 'correlation_by_subgroup_values' for the two subgroups.
# Judging by the shortname they are the sensitive-attribute/output correlations
# for the male and female subgroups respectively - TODO confirm the ordering.
i = -0.4
j = -0.1

# First experiment: subgroups are taken from the 'SEX' column; dataset name and
# sensitive column are left at the MIAExperiment defaults.
experiment = MIAExperiment(
    sampling_condition_dict={
        'subgroup_col_name': 'SEX',
        'n': 25000,
        'correlation_by_subgroup_values': [i, j],
    },
    shortname=f"Corr_btn_sens_and_output_for_male_({i})_for_female_({j})",
    random_state=0,
)

# Second experiment: same sampling setup on the "Texas100" dataset, with
# subgroups from 'SEX_CODE' and 'ETHNICITY' as the sensitive column.
experiment_texas = MIAExperiment(
    sampling_condition_dict={
        'subgroup_col_name': 'SEX_CODE',
        'n': 25000,
        'correlation_by_subgroup_values': [i, j],
    },
    shortname=f"Corr_btn_sens_and_output_for_male_({i})_for_female_({j})",
    random_state=0,
    name="Texas100",
    sensitive_column='ETHNICITY',
)

# Registry of all experiments, keyed by experiment name; later cells iterate over it.
experiments = {
    experiment.name: experiment,
    experiment_texas.name: experiment_texas,
}

In [5]:
# Obtain the target classifier (`experiment.clf`) for every experiment:
# reuse the pickled model when it can be loaded, otherwise train a fresh one
# on (X_train, y_tr_onehot) and, if `save_model` is set, cache it to disk.
save_model=True
for experiment_key, experiment in experiments.items():
    # Single source of truth for the cache location (used for both load and save).
    model_path = f'<PATH_TO_MODEL>/{experiment.ds.ds.filenameroot}_target_model.pkl'
    print(f"Training classifier for experiment: {experiment}")
    try:
        experiment.clf = model_utils.load_model(model_path)
    except Exception as load_error:
        # `except Exception` instead of a bare `except`, so that KeyboardInterrupt /
        # SystemExit still abort the cell; the reason for the fallback is reported
        # rather than silently swallowed.
        print(f"Could not load cached classifier ({type(load_error).__name__}: {load_error}); training from scratch")
    else:
        print(f"Loaded classifier for experiment from file: {experiment}")
        continue

    # Training happens outside the exception handler so that a training failure
    # is not reported as a consequence of the failed load.
    base_model = model_utils.get_model(max_iter=500)
    experiment.clf = copy.deepcopy(base_model)
    experiment.clf.fit(experiment.X_train, experiment.y_tr_onehot)

    if save_model:
        model_utils.save_model(experiment.clf, model_path)

Training classifier for experiment: Census19_subgroup_col_name_SEX_n_25000_correlation_by_subgroup_values_[-0.4, -0.1]_rs0
Loaded classifier for experiment from file: Census19_subgroup_col_name_SEX_n_25000_correlation_by_subgroup_values_[-0.4, -0.1]_rs0
Training classifier for experiment: Texas100_subgroup_col_name_SEX_CODE_n_25000_correlation_by_subgroup_values_[-0.4, -0.1]_rs0
Loaded classifier for experiment from file: Texas100_subgroup_col_name_SEX_CODE_n_25000_correlation_by_subgroup_values_[-0.4, -0.1]_rs0


In [6]:
# Build the correlation-balanced training set for every experiment and store it
# on the experiment object next to the original training data.
for experiment_key, experiment in experiments.items():
    # Remember which column defines the subgroups; later cells read this attribute.
    experiment.subgroup_col_name = experiment.sampling_condition_dict['subgroup_col_name']

    # bcorr_sampling returns the resampled features, labels and one-hot labels, in that order.
    (
        experiment.X_train_balanced_corr,
        experiment.y_tr_balanced_corr,
        experiment.y_tr_onehot_balanced_corr,
    ) = bcorr_sampling(
        experiment,
        experiment.X_train,
        experiment.y_tr,
        experiment.y_tr_onehot,
        subgroup_col_name=experiment.subgroup_col_name,
    )

100%|██████████| 2/2 [00:00<00:00, 23.02it/s]
100%|██████████| 2/2 [00:00<00:00, 25.01it/s]


In [7]:
# Report, per subgroup value, the correlation returned by
# get_corr_btn_sens_and_out_per_subgroup before and after balancing.
for experiment_key, experiment in experiments.items():
    print(f"\nDataset: {experiment.name}, Subgroup: {experiment.subgroup_col_name}")

    # Subgroup values are the last '_'-separated token of every training column
    # whose name starts with the subgroup column name.
    # NOTE(review): presumably these are the one-hot columns '<subgroup>_<value>';
    # a plain prefix match would also pick up unrelated columns sharing the prefix - verify.
    experiment.subgroup_vals = [
        col.split('_')[-1]
        for col in experiment.X_train.columns
        if col.startswith(experiment.subgroup_col_name)
    ]

    # Original training data.
    correlations_dict_before = {
        val: round(
            get_corr_btn_sens_and_out_per_subgroup(
                experiment,
                experiment.X_train,
                experiment.y_tr,
                {experiment.subgroup_col_name: val},
            ),
            2,
        )
        for val in experiment.subgroup_vals
    }
    print(f"Correlations before balancing: {correlations_dict_before}")

    # Correlation-balanced training data.
    correlations_dict_after = {
        val: round(
            get_corr_btn_sens_and_out_per_subgroup(
                experiment,
                experiment.X_train_balanced_corr,
                experiment.y_tr_balanced_corr,
                {experiment.subgroup_col_name: val},
            ),
            2,
        )
        for val in experiment.subgroup_vals
    }
    print(f"Correlations after balancing: {correlations_dict_after}")


Dataset: Census19, Subgroup: SEX
Correlations before balancing: {'0': -0.4, '1': -0.1}
Correlations after balancing: {'0': -0.1, '1': -0.1}

Dataset: Texas100, Subgroup: SEX_CODE
Correlations before balancing: {'0': -0.4, '1': -0.1}
Correlations after balancing: {'0': -0.1, '1': -0.1}


In [12]:
# Obtain the classifier trained on the correlation-balanced data
# (`experiment.clf_balanced_corr`) for every experiment: reuse the pickled model
# when it can be loaded, otherwise train a fresh one and, if `save_model` is set,
# cache it to disk.
save_model=True
for experiment_key, experiment in experiments.items():
    # Single source of truth for the cache location (used for both load and save).
    model_path = f'<PATH_TO_MODEL>/{experiment.ds.ds.filenameroot}_target_model_bcorr.pkl'
    print(f"Training classifier for experiment: {experiment}")
    try:
        experiment.clf_balanced_corr = model_utils.load_model(model_path)
    except Exception as load_error:
        # `except Exception` instead of a bare `except`, so that KeyboardInterrupt /
        # SystemExit still abort the cell; the reason for the fallback is reported
        # rather than silently swallowed.
        print(f"Could not load cached classifier ({type(load_error).__name__}: {load_error}); training from scratch")
    else:
        print(f"Loaded classifier for experiment from file: {experiment}")
        continue

    # Training happens outside the exception handler so that a training failure
    # is not reported as a consequence of the failed load.
    base_model = model_utils.get_model(max_iter=500)
    experiment.clf_balanced_corr = copy.deepcopy(base_model)
    # NOTE(review): the unbalanced target model is fitted on the one-hot labels
    # (experiment.y_tr_onehot), while this one is fitted on y_tr_balanced_corr and
    # experiment.y_tr_onehot_balanced_corr is left unused. Kept as-is because the
    # reported results were produced this way - confirm the difference is intended.
    experiment.clf_balanced_corr.fit(experiment.X_train_balanced_corr, experiment.y_tr_balanced_corr)

    if save_model:
        model_utils.save_model(experiment.clf_balanced_corr, model_path)

Training classifier for experiment: Census19_subgroup_col_name_SEX_n_25000_correlation_by_subgroup_values_[-0.4, -0.1]_rs0
Training classifier for experiment: Texas100_subgroup_col_name_SEX_CODE_n_25000_correlation_by_subgroup_values_[-0.4, -0.1]_rs0


In [14]:
from fairlearn.reductions import ExponentiatedGradient, DemographicParity, EqualizedOdds, ErrorRate

# Obtain the fairness-constrained model (`experiment.mitigator`) for every
# experiment with at most two subgroup values: reuse the pickled mitigator when
# it can be loaded, otherwise train an ExponentiatedGradient reduction with an
# EqualizedOdds constraint and cache it to disk.
for experiment_key, experiment in experiments.items():

    # Experiments with more than two subgroup values get no mitigator.
    if len(experiment.subgroup_vals) > 2:
        continue

    # Single source of truth for the cache location (used for both load and save).
    mitigator_path = f'<PATH_TO_MODEL>/{experiment.ds.ds.filenameroot}_target_model_fairness_constraints.pkl'

    try:
        print(f"Loading mitigator for experiment: {experiment}")
        experiment.mitigator = model_utils.load_model(mitigator_path)
        continue
    except Exception as load_error:
        # `except Exception` instead of a bare `except`, so that KeyboardInterrupt /
        # SystemExit still abort the cell; the reason for the fallback is reported
        # rather than silently swallowed.
        print(f"Training mitigator for experiment: {experiment}")
        print(f"Could not load cached mitigator ({type(load_error).__name__}: {load_error})")

    # Training happens outside the exception handler so that a training failure
    # is not reported as a consequence of the failed load.
    clf2 = MLPClassifierFC(max_iter=500)
    # Hand the already-trained target model's weights to the new estimator.
    # These are the same list objects as on experiment.clf (no copy is made).
    clf2.coefs_ = experiment.clf.coefs_
    clf2.intercepts_ = experiment.clf.intercepts_
    constraint = EqualizedOdds()
    experiment.mitigator = ExponentiatedGradient(clf2, constraint)

    # The '<subgroup>_0' training column is used as the sensitive feature.
    experiment.mitigator.fit(experiment.X_train, experiment.y_tr, sensitive_features=experiment.X_train[f'{experiment.subgroup_col_name}_0'])

    model_utils.save_model(experiment.mitigator, mitigator_path)

Loading mitigator for experiment: Census19_subgroup_col_name_SEX_n_25000_correlation_by_subgroup_values_[-0.4, -0.1]_rs0
Loading mitigator for experiment: Texas100_subgroup_col_name_SEX_CODE_n_25000_correlation_by_subgroup_values_[-0.4, -0.1]_rs0


In [16]:
# Evaluate the three model variants of every experiment on the same test split
# and print one results table per experiment (one row per variant).
for experiment_key, experiment in experiments.items():
    res_dict = {
        # Target model trained on the original training data.
        'w/o BCorr': evaluate(experiment, experiment.clf, experiment.X_train, experiment.y_tr, experiment.X_test, experiment.y_te, subgroup_col_name=experiment.subgroup_col_name),
        # Target model trained on the correlation-balanced training data.
        'w Bcorr': evaluate(experiment, experiment.clf_balanced_corr, experiment.X_train_balanced_corr, experiment.y_tr_balanced_corr, experiment.X_test, experiment.y_te, subgroup_col_name=experiment.subgroup_col_name),
    }
    # The fairness-constrained row is only added for binary subgroups. Previously a
    # `None` placeholder was stored otherwise, which DataFrame.from_dict(orient='index')
    # cannot expand into a row (it calls `.items()` on every value).
    if len(experiment.subgroup_vals) == 2:
        res_dict['FC'] = evaluate(experiment, experiment.mitigator, experiment.X_train, experiment.y_tr, experiment.X_test, experiment.y_te, subgroup_col_name=experiment.subgroup_col_name)

    res_dict_df = pd.DataFrame.from_dict(res_dict, orient='index')
    print(f"Dataset: {experiment.name}, Subgroup: {experiment.subgroup_col_name}")
    print(res_dict_df)

Dataset: Census19, Subgroup: SEX
           ASRD_CSMIA  ASRD_LOMIA     EOD     DPD      MA
w/o BCorr       11.80       14.65  0.0726  0.1284  73.904
w Bcorr          0.42        1.01  0.0438  0.0914  73.770
FC               8.94       13.97  0.0566  0.1059  70.598
Dataset: Texas100, Subgroup: SEX_CODE
           ASRD_CSMIA  ASRD_LOMIA     EOD     DPD      MA
w/o BCorr       12.91       15.45  0.1768  0.1007  72.080
w Bcorr          1.31        0.87  0.0120  0.0701  74.624
FC              11.65       11.65  0.0938  0.0243  70.708
