In [None]:
import numpy as np
import pandas as pd
from IPython.display import display, HTML

# Load the data using the tempeh package
from tempeh.configurations import datasets
dataset = datasets['adult_uci']()

X_train, X_test = dataset.get_X(format=pd.DataFrame)

# Keep 'Race' aside as the sensitive feature. Plain column indexing is used
# instead of .get() so that a missing column fails right here with a KeyError
# rather than silently producing None.
A_train, A_test = X_train['Race'], X_test['Race']

# Remove both 'Race' and 'Sex' from the model inputs.
X_train, X_test = X_train.drop(columns=['Race','Sex']), X_test.drop(columns=['Race','Sex'])

y_train, y_test = dataset.get_y(format=pd.Series)

# Combine all training data into a single data frame and glance at the first few rows
# (only the head is displayed; the full frame stays available as `all_train`).
all_train = pd.concat([X_train, y_train, A_train], axis=1)
display(all_train.head())

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline model: a plain logistic regression trained without any fairness
# constraint. It is the reference point the mitigated models are compared to.
unmitigated_predictor = LogisticRegression(fit_intercept=True, solver='liblinear')
unmitigated_predictor.fit(X=X_train, y=y_train)

In [None]:
from fairlearn.metrics import group_roc_auc_score

def group_metric_as_df(name, group_metric_result):
    """Convert the result of a group metric call into a one-column data frame.

    Parameters
    ----------
    name : str
        Label used for the single column of the returned frame.
    group_metric_result : object
        Any object exposing ``by_group`` (a mapping from group to metric
        value) and ``overall`` (the metric over all samples).

    Returns
    -------
    pandas.DataFrame
        One row per group, followed by a row labelled ``'overall'``.
    """
    per_group = pd.Series(group_metric_result.by_group)
    # The overall figure is stored under its own label next to the per-group rows.
    per_group['overall'] = group_metric_result.overall
    return per_group.to_frame(name=name)

# Score the test set with the baseline model, keeping only column 1 of
# predict_proba (the probability assigned to the classifier's second class).
proba_unmitigated = unmitigated_predictor.predict_proba(X_test)
scores_unmitigated = pd.Series(proba_unmitigated[:, 1], name="score_unmitigated")

# ROC AUC per sensitive group and overall, reshaped into a data frame.
auc_result_unmitigated = group_roc_auc_score(y_test, scores_unmitigated, A_test)
auc_unmitigated = group_metric_as_df("auc_unmitigated", auc_result_unmitigated)

# Emit the table between the opening and closing tags of an identifiable <span>.
display(
    HTML('<span id="auc_unmitigated">'),
    auc_unmitigated,
    HTML('</span>'),
)

In [None]:
def get_error(y, predicted_y):
    """Return the misclassification rate of ``predicted_y`` against ``y``.

    The two inputs are compared element by element *by position*. Any pandas
    index labels are ignored, so predictions wrapped in a fresh Series can be
    scored against labels that carry a different index.

    Parameters
    ----------
    y : array-like
        True labels.
    predicted_y : array-like
        Predicted labels, same length as ``y``.

    Returns
    -------
    float
        Fraction of positions where the two inputs differ (0.0 to 1.0).

    Raises
    ------
    ValueError
        If the inputs have different lengths or are empty.
    """
    y_true = np.asarray(y).ravel()
    y_pred = np.asarray(predicted_y).ravel()

    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y and predicted_y must have the same length, got "
            f"{y_true.shape[0]} and {y_pred.shape[0]}"
        )
    if y_true.size == 0:
        raise ValueError("cannot compute an error rate on empty input")

    # Mean of the boolean match vector is the accuracy; error is its complement.
    return float(1 - np.mean(y_true == y_pred))

In [None]:
# Hard class predictions of the baseline model on the test set.
unmitigated_predictions = unmitigated_predictor.predict(X_test)
unmitigated_y = pd.Series(unmitigated_predictions, name="unmitigated_predicted_y")

# Stored as a one-element list; the plotting cell passes it straight to plt.plot.
error_unmitigated = [get_error(y_test, unmitigated_y)]
print("The error for unmitigated is:", error_unmitigated, sep="\n")

In [None]:
def get_violation(predict_y, A_test, label_name):
    """Return the largest gap between a group's mean prediction and the overall mean.

    For every distinct value in ``A_test`` the mean of ``predict_y`` within
    that group is compared with the mean of ``predict_y`` over all samples.
    The maximum absolute difference is returned. With 0/1 predictions this is
    the worst-case difference in positive-prediction rate.

    Predictions and sensitive features are matched *by position*; pandas
    index labels are ignored. The sensitive feature may have any name (it is
    not required to be called ``'Race'``).

    Parameters
    ----------
    predict_y : array-like
        Predicted labels (assumed numeric, typically 0/1).
    A_test : array-like
        Sensitive-feature value for each prediction, same length as
        ``predict_y``.
    label_name : str
        Name attached to the prediction series. Kept for backward
        compatibility with existing callers; it does not affect the result.

    Returns
    -------
    float
        Maximum over groups of ``|mean(pred | group) - mean(pred)|``.

    Raises
    ------
    ValueError
        If the inputs have different lengths or are empty.
    """
    predictions = np.asarray(predict_y, dtype=float).ravel()
    groups = np.asarray(A_test).ravel()

    if predictions.shape != groups.shape:
        raise ValueError(
            "predict_y and A_test must have the same length, got "
            f"{predictions.shape[0]} and {groups.shape[0]}"
        )
    if predictions.size == 0:
        raise ValueError("cannot compute a violation on empty input")

    # The overall rate is the same for every group, so compute it once.
    overall_rate = predictions.mean()

    # Grouping by a plain array uses its values positionally as group keys.
    group_rates = pd.Series(predictions, name=label_name).groupby(groups).mean()

    return float((group_rates - overall_rate).abs().max())
    
# Fairness violation of the baseline predictions on the test set, stored as a
# one-element list so it can be handed directly to the plotting call later.
baseline_violation = get_violation(unmitigated_y, A_test, 'unmitigated_predicted_y')
violation_unmitigated = [baseline_violation]
print("The violation for unmitigated is:", violation_unmitigated, sep="\n")

In [None]:
from fairlearn.reductions import ExponentiatedGradient
from fairlearn.reductions import GridSearch, DemographicParity
import numpy as np
# Values of `eps` (the constraint slack handed to ExponentiatedGradient) to
# sweep over. One mitigated model is trained per value.
eps_list = [0.001, 0.002, 0.003, 0.005, 0.007, 0.01, 0.02, 0.03,
            0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]

# Parallel lists: entry i holds the metrics of the model trained with eps_list[i].
expgrad_error = []
expgrad_violation = []

for eps in eps_list:
    # Same base learner as the unmitigated baseline, wrapped in the
    # exponentiated-gradient reduction with a demographic-parity constraint.
    expgrad_X = ExponentiatedGradient(
        LogisticRegression(solver='liblinear', fit_intercept=True),
        constraints=DemographicParity(),
        eps=eps,
        nu=1e-6,
    )
    expgrad_X.fit(X_train, y_train, sensitive_features=A_train)

    # Evaluate on the held-out test split. The unmitigated baseline is scored
    # on X_test / y_test / A_test, so the mitigated models must use the same
    # split for the two to be comparable on one plot.
    # NOTE(review): in some fairlearn versions ExponentiatedGradient.predict
    # is randomized - consider seeding numpy if exact reproducibility matters.
    expgrad_y = pd.Series(expgrad_X.predict(X_test), name="expgrad_predicted_y")

    error_expgrad = get_error(y_test, expgrad_y)
    violation_expgrad = get_violation(expgrad_y, A_test, "expgrad_predicted_y")

    expgrad_error.append(error_expgrad)
    expgrad_violation.append(violation_expgrad)


In [None]:
import matplotlib.pyplot as plt

# Error vs. fairness-violation trade-off: one point per mitigated model plus
# a single red marker for the unmitigated baseline.
fig, ax = plt.subplots()
ax.scatter(expgrad_violation, expgrad_error, label="expgrad")
ax.plot(violation_unmitigated, error_unmitigated, 'ro', label="unmitigated")
ax.set_xlabel('Violation of the fairness constraint')
ax.set_ylabel('Error')
ax.set_title('Adult Uci/DP/log.reg')
ax.legend()
plt.show()