In [None]:
%load_ext autoreload
%autoreload 2
%aimport logistic_regression_model
%aimport load_dataset
%aimport fairness_metrics

import numpy as np
from sklearn.linear_model import LogisticRegression
import disparate_impact_remover
from IPython.display import Markdown, display
from tqdm import tqdm

In [None]:
# INPUT PARAMS
# One constant per line so each value sits next to its name.

# Constants read by the visible cells of this notebook.
LABEL_COL = "income"
PROTECT_COLS = ["gender"]
FILE_PATH = "../Datasets/adult_dataset/processed_adult.csv"
BALANCE = 0

# Remaining settings; they are not referenced in the visible cells.
MODE = 0
START_EPOCH = 0
NUM_EPOCH = 40
ID = -1
NUM_TRIALS = 1
NUM_PROXIES = 0
VERBOSE = 1
LR_RATE = 0.001
UPDATE = "cluster"
WEIGHTS_INIT = 0
UPDATE_LR = 10
BATCH_SIZE = 1000

#### Load the dataset and split it into train and test sets


In [None]:

# Balancing options; None disables balancing entirely (BALANCE falsy).
balanced = {"train_label_only": True, "test_label_only": False, "downsample": True} if BALANCE else None

# Read the dataset from FILE_PATH and pass it through load_dataset.minmax_scale.
df = load_dataset.get_data(FILE_PATH)
df = load_dataset.minmax_scale(df)

# Column position of the first protected attribute in the full frame.
# NOTE(review): the repair loop later uses this as a column index into
# Dataset.features; that assumes the features keep df's column order — TODO confirm.
protected_index = df.columns.tolist().index(PROTECT_COLS[0])

# 75% of the rows go to training, the rest to testing.
train_df, test_df = load_dataset.split_train_test(df, train=0.75)

if balanced is not None:
    # Balance each split using its own rows only. Passing the full `df` here
    # (as the code previously did) would throw away the split above and let
    # test rows leak into the training set.
    train_df = load_dataset.balance_df(train_df, LABEL_COL, PROTECT_COLS,
                                       label_only=balanced["train_label_only"],
                                       downsample=balanced["downsample"])
    test_df = load_dataset.balance_df(test_df, LABEL_COL, PROTECT_COLS,
                                      label_only=balanced["test_label_only"],
                                      downsample=balanced["downsample"])

# Report per-split statistics before wrapping the frames.
print("Statistics")
load_dataset.statistics(train_df, LABEL_COL, PROTECT_COLS, verbose=1)
load_dataset.statistics(test_df, LABEL_COL, PROTECT_COLS, verbose=1)

# Wrap each split in the project's Dataset container.
train_dataset = load_dataset.Dataset(train_df, LABEL_COL, PROTECT_COLS)
test_dataset = load_dataset.Dataset(test_df, LABEL_COL, PROTECT_COLS)


print("---------- MAPPING ----------")
print("Train: ", train_dataset.mapping)
print("Test: ", test_dataset.mapping)
print("-----------------------------")


In [None]:
# Label values treated as the favorable / unfavorable outcome.
favorable_label = 1
unfavorable_label = 0

In [None]:
# Sweep the repair level from 0 to 1 in 11 steps. For every level: repair both
# splits, train a logistic regression without the protected column, and report
# accuracy and fairness metrics on the repaired test split.
#
# NOTE: `disparate_impact` is defined in the cell *after* this one, so on a
# fresh kernel that cell has to be executed before this loop can run.

# Value of the protected attribute passed to disparate_impact as the privileged group.
privileged_group = 0

# Per-level results, in the same order as the repair levels.
accs, avg_odd_difference = [], []
for level in tqdm(np.linspace(0., 1., 11)):
    di = disparate_impact_remover.DisparateImpactRemover(repair_level=level)
    train_repd = di.fit_transform(train_dataset, protected_index)
    test_repd = di.fit_transform(test_dataset, protected_index)

    # The classifier must not see the protected attribute itself.
    X_tr = np.delete(train_repd.features, protected_index, axis=1)
    X_te = np.delete(test_repd.features, protected_index, axis=1)
    y_tr = train_repd.label.ravel()

    # Share of feature cells the repair modified, as a scalar percentage.
    train_changed = np.mean(train_dataset.features != train_repd.features) * 100
    test_changed = np.mean(test_dataset.features != test_repd.features) * 100
    print(f"{train_changed}% training set changed")
    print(f"{test_changed}% test set changed")

    lmod = LogisticRegression(class_weight='balanced', solver='liblinear')
    lmod.fit(X_tr, y_tr)

    # Copy of the repaired test set whose labels are the model's predictions.
    test_repd_pred = test_repd.copy()
    test_repd_pred.label = lmod.predict(X_te)

    print()

    acc = np.sum(test_repd_pred.label.reshape(-1) == test_repd.label.reshape(-1))/len(test_dataset.label)
    odds = fairness_metrics.equalizing_odds(test_repd_pred.label, test_repd.label,
                                            test_repd.protect)
    # Spread (max - min) within each entry returned by equalizing_odds.
    diffs = [max(odd) - min(odd) for odd in odds]
    # Weight the spreads by how many test rows were predicted unfavorable / favorable.
    weighted_odds_diff = np.average(
        diffs,
        weights=[np.sum(test_repd_pred.label == unfavorable_label),
                 np.sum(test_repd_pred.label == favorable_label)],
    )

    # Keep the per-level results so they can be inspected or plotted afterwards.
    accs.append(acc)
    avg_odd_difference.append(weighted_odds_diff)

    print(f"Accuracy: {acc*100}%")
    print(f"Equalizing Odds: {odds}")
    print("Weighted average odds difference", weighted_odds_diff)
    print(f"Before - disparate impact: {disparate_impact(test_repd.label, test_repd.protect, privileged_group, favorable_label)}")
    print(f"After - disparate impact: {disparate_impact(test_repd_pred.label, test_repd_pred.protect, privileged_group, favorable_label)}")

In [None]:
def disparate_impact(predictions, group, privilege, favorable):
    """Return the disparate-impact ratio of unprivileged to privileged outcomes.

    The ratio is
    ``P(pred == favorable | unprivileged) / P(pred == favorable | privileged)``,
    where a sample is privileged when its ``group`` entry equals ``privilege``
    and unprivileged otherwise.

    Args:
        predictions: Iterable of predicted (or true) labels, one per sample.
        group: Iterable of protected-attribute values, aligned with
            ``predictions``. Extra items in the longer iterable are ignored
            because the two are paired with ``zip``.
        privilege: Value of ``group`` that marks the privileged group.
        favorable: Label value counted as the favorable outcome.

    Returns:
        The ratio as a float. ``nan`` when the ratio is undefined (either group
        has no samples, or neither group has a favorable outcome) and ``inf``
        when only the privileged group has no favorable outcome.
    """
    priv_total = priv_favorable = 0
    unpriv_total = unpriv_favorable = 0
    for pred, g in zip(predictions, group):
        got_favorable = 1 if pred == favorable else 0
        if g == privilege:
            priv_total += 1
            priv_favorable += got_favorable
        else:
            unpriv_total += 1
            unpriv_favorable += got_favorable

    # A favorable-outcome rate cannot be computed for an empty group.
    if priv_total == 0 or unpriv_total == 0:
        return float("nan")

    priv_rate = priv_favorable / priv_total
    unpriv_rate = unpriv_favorable / unpriv_total

    # Avoid ZeroDivisionError when the privileged group has no favorable outcome.
    if priv_rate == 0:
        return float("inf") if unpriv_rate > 0 else float("nan")
    return unpriv_rate / priv_rate