In [None]:
from sklearn import svm
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
import pandas as pd
import shap

In [None]:
# Load the Adult dataset that ships with shap; the call returns a pair that is
# unpacked into the feature table (X_raw) and the labels (Y).
X_raw, Y = shap.datasets.adult()

In [None]:
# Set the sensitive attributes aside so they are not part of the model inputs.
A = X_raw.loc[:, ['Sex', 'Race']]

# Every remaining column becomes a feature; get_dummies expands any categorical ones.
X = pd.get_dummies(X_raw.drop(columns=list(A.columns)))

In [None]:
# Encode the target labels as integers.
le = LabelEncoder()
Y = le.fit_transform(Y)

# Standardise the features, wrapping the scaled array back into a DataFrame
# so the column names are kept.
sc = StandardScaler()
X_scaled = pd.DataFrame(sc.fit_transform(X), columns=X.columns)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows, stratified on the label. Features, labels and
# sensitive attributes are split in one call so their rows stay aligned.
X_train, X_test, Y_train, Y_test, A_train, A_test = train_test_split(
    X_scaled, Y, A,
    test_size=0.2,
    random_state=0,
    stratify=Y,
)

# Work around indexing issue: give every split frame a fresh 0..n-1 index.
X_train, A_train, X_test, A_test = (
    split.reset_index(drop=True)
    for split in (X_train, A_train, X_test, A_test)
)

# Improve labels: replace the integer codes with readable names.
#
# Whole columns are rebuilt and assigned instead of writing through chained
# indexing (`A_test.Sex.loc[mask] = ...`). The chained form triggers
# SettingWithCopyWarning and is not guaranteed to write back into A_test
# (under pandas copy-on-write it never does).
sex_labels = {0: 'female', 1: 'male'}
race_labels = {
    0: 'Amer-Indian-Eskimo',
    1: 'Asian-Pac-Islander',
    2: 'Black',
    3: 'Other',
    4: 'White',
}

# Values without a mapping are passed through unchanged, so re-running this
# cell after the labels are already strings leaves them as they are.
A_test = A_test.assign(
    Sex=A_test['Sex'].map(lambda code: sex_labels.get(code, code)),
    Race=A_test['Race'].map(lambda code: race_labels.get(code, code)),
)

In [None]:
# Logistic-regression baseline: fit on the training split (fit returns the
# estimator itself), then predict the held-out rows.
lr_predictor = LogisticRegression(
    solver='liblinear',
    fit_intercept=True,
).fit(X_train, Y_train)
Y_pred_lr = lr_predictor.predict(X_test)

In [None]:
# Second model for comparison: a support-vector classifier with default settings,
# fitted on the training split and used to predict the held-out rows.
svm_predictor = svm.SVC().fit(X_train, Y_train)
Y_pred_svm = svm_predictor.predict(X_test)

# Sample APIs

In [None]:
from sklearn.metrics import accuracy_score, f1_score, fbeta_score
from fairlearn.metrics import group_summary, make_derived_metric, difference_from_summary, make_metric_group_summary
from fairlearn.metrics import demographic_parity_difference, balanced_accuracy_score_group_min
from fairlearn.metrics import false_negative_rate, false_positive_rate

## Report one disaggregated metric in a data frame

In [None]:
# Current
# Accuracy of the logistic-regression predictions, broken down by race.
bunch = group_summary(
    accuracy_score, Y_test, Y_pred_lr,
    sensitive_features=A_test['Race'],
)

# Per-group values only, then the same values with an extra 'overall' entry.
frame = pd.Series(bunch.by_group)
frame_o = pd.Series(dict(bunch.by_group, overall=bunch.overall))

print(frame, "=======================", frame_o, sep="\n")

In [None]:
# Proposed
# Sketch of how the same report would look with the proposed API.
# NOTE(review): GroupedMetric is not imported or defined in any cell shown in this
# notebook, so this cell presumably raises NameError when executed — confirm before
# including it in a Run All.
result = GroupedMetric(accuracy_score, Y_test, Y_pred_lr, sensitive_features=A_test['Race'])
frame = result.by_group
frame_o = result.to_df() # Throw if there is a group called 'overall'

## Report several disaggregated metrics in a data frame.

In [None]:
# Current
# One group summary per metric, both computed on the logistic-regression predictions.
bunch1 = group_summary(
    accuracy_score, Y_test, Y_pred_lr, sensitive_features=A_test['Race'])
bunch2 = group_summary(
    f1_score, Y_test, Y_pred_lr, sensitive_features=A_test['Race'])

# One column per metric, one row per group.
frame = pd.DataFrame({'accuracy': bunch1.by_group, 'f1': bunch2.by_group})

# Same table with an additional 'overall' row.
frame_o = pd.DataFrame({
    'accuracy': dict(bunch1.by_group, overall=bunch1.overall),
    'f1': dict(bunch2.by_group, overall=bunch2.overall),
})

print(frame, "=======================", frame_o, sep="\n")

In [None]:
# Proposed
# Sketch of the proposed API: a dict of metric functions is passed in place of a
# single metric function.
# NOTE(review): GroupedMetric is not imported or defined in any cell shown in this
# notebook, so this cell presumably raises NameError when executed — confirm.
result = GroupedMetric({ 'accuracy':accuracy_score, 'f1':f1_score}, Y_test, Y_pred_lr, sensitive_features=A_test['Race'])
frame = result.by_group
frame_o = result.to_df() # Throw if there is a group called 'overall'

## Report metrics for intersecting sensitive features

In [None]:
# Current
# User builds new column manually: one combined "<race>-<sex>" label per row.
sf = A_test['Race'] + '-' + A_test['Sex']

bunch = group_summary(
    accuracy_score, Y_test, Y_pred_lr,
    sensitive_features=sf,
)

# Per-group values only, then the same values with an extra 'overall' entry.
frame = pd.Series(bunch.by_group)
frame_o = pd.Series(dict(bunch.by_group, overall=bunch.overall))

print(frame, "=======================", frame_o, sep="\n")

In [None]:
# Proposed
# Sketch of the proposed API for intersecting sensitive features.
# The sensitive columns come from A_test so they have one row per entry of
# Y_test / Y_pred_lr (the unsplit table A covers the whole dataset).
# NOTE(review): GroupedMetric is not imported or defined in any cell shown in this
# notebook, so this cell presumably raises NameError when executed — confirm.
result = GroupedMetric(accuracy_score, Y_test, Y_pred_lr, sensitive_features=[A_test['Race'], A_test['Sex']])
frame = result.by_group # Will have a MultiIndex built from the two sensitive feature columns
frame_o = result.to_df() # Not sure how to handle adding the extra 'overall' row

## Report several performance and fairness metrics of several models in a data frame

In [None]:
# Current
def fb_s(y_t, y_p):
    """Return the F-beta score of predictions ``y_p`` against ``y_t`` with beta fixed at 0.5."""
    return fbeta_score(y_t, y_p, beta=0.5)


# Derived metric: fb_s is wrapped into a per-group summary, and that summary
# is then passed through difference_from_summary.
custom_difference1 = make_derived_metric(
    difference_from_summary,
    make_metric_group_summary(fb_s))

def custom_difference2(y_true, y_pred, sensitive_features):
    """Return the smallest per-group F-beta (beta=0.5) gap relative to the 'White' group.

    The score is computed for each group of ``sensitive_features``, the
    'White' group's score is subtracted from every group's score, and the
    minimum of those differences is returned.
    """
    summary = group_summary(
        fbeta_score, y_true, y_pred,
        sensitive_features=sensitive_features,
        beta=0.5,
    )
    per_group = pd.Series(summary.by_group)
    gaps = per_group - per_group['White']
    return gaps.min()

# Metrics that compare groups; each one is called with `sensitive_features`.
fairness_metrics = {
    'Custom difference 1': custom_difference1,
    'Custom difference 2': custom_difference2,
    'Demographic parity difference': demographic_parity_difference,
    'Worst-case balanced accuracy': balanced_accuracy_score_group_min,
}

# Metrics computed on the whole test set, without sensitive features.
performance_metrics = {
    'FPR': false_positive_rate,
    'FNR': false_negative_rate,
}

# Test-set predictions of each fitted model, keyed by a short model name.
predictions_by_estimator = {
    'logreg': Y_pred_lr,
    'svm': Y_pred_svm,
}

# Collect one column of scores per estimator and build the table in a single
# step, instead of enlarging an empty DataFrame one cell at a time via df.loc.
scores_by_estimator = {}
for pred_key, y_pred in predictions_by_estimator.items():
    # Fairness metrics need the sensitive feature; performance metrics do not.
    column = {
        fairm_key: fairm(Y_test, y_pred, sensitive_features=A_test['Race'])
        for fairm_key, fairm in fairness_metrics.items()
    }
    column.update(
        (perfm_key, perfm(Y_test, y_pred))
        for perfm_key, perfm in performance_metrics.items()
    )
    scores_by_estimator[pred_key] = column

# Rows are metrics (fairness first, then performance); columns are estimators.
df = pd.DataFrame(scores_by_estimator)

print(df)

In [None]:
# Proposed
# NOTE(review): this cell sketches a proposed API. GroupedMetric is not imported or
# defined in the cells shown in this notebook, and the `parms` keyword is part of
# the proposal — confirm before executing.
# `parms` is a dict mapping parameter name to value (a set literal such as
# {'beta', 0.5} would lose that pairing).
custom_difference1 = make_derived_metric('difference', fbeta_score, parms={'beta': 0.5})

def custom_difference2(y_true, y_pred, sensitive_features):
    """Proposed-API version: minimum difference of the per-group F-beta (beta=0.5) relative to 'White'."""
    grouped = GroupedMetric(fbeta_score, y_true, y_pred, sensitive_features=sensitive_features, parms={'beta': 0.5})
    return grouped.differences(relative_to='group', group='White', aggregate='min')

# The remainder as before

## Create a fairness-performance raster plot of several models

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Current
my_disparity_metric = custom_difference1
my_performance_metric = false_positive_rate

# One point per estimator: x is the performance metric on the whole test set,
# y is the disparity metric across the race groups.
xs, ys = [], []
for y_pred in predictions_by_estimator.values():
    xs.append(my_performance_metric(Y_test, y_pred))
    ys.append(my_disparity_metric(Y_test, y_pred, sensitive_features=A_test['Race']))

plt.scatter(xs, ys)
plt.xlabel('Performance Metric')
plt.ylabel('Disparity Metric')
plt.show()

In [None]:
# Proposed

# Would also reuse the definition of custom_difference1

## Run sklearn.model_selection.cross_validate

Use demographic parity difference and precision score as the metrics.

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, precision_score

In [None]:
# Current
# Keep the test labels in a Series: dpd_wrapper reads the index of the y it is
# handed in order to pick the matching rows of the sensitive feature.
y_t = pd.Series(data=Y_test)

# Precision takes no sensitive features, so the stock scorer wrapper is enough.
precision_scorer = make_scorer(score_func=precision_score)
def dpd_wrapper(y_t, y_p, sensitive_features):
    """Demographic parity difference for a slice of the data.

    ``sensitive_features`` holds the sensitive feature for every row, while
    ``y_t`` and ``y_p`` may cover only a subset of rows. The index of ``y_t``
    is used to select the matching sensitive-feature rows before the metric
    is computed.
    """
    # We need to slice up the sensitive feature to match y_t and y_p
    # See Adrin's reply to:
    # https://stackoverflow.com/questions/49581104/sklearn-gridsearchcv-not-using-sample-weight-in-score-function
    fold_rows = y_t.index.values
    sf_slice = sensitive_features.loc[fold_rows].values
    return demographic_parity_difference(
        y_t, y_p, sensitive_features=sf_slice.reshape(-1))
# The full sensitive-feature column is bound as a keyword argument of the scorer;
# dpd_wrapper cuts it down to the rows being scored.
# NOTE(review): make_scorer is called without greater_is_better, although a smaller
# demographic parity difference is the better outcome — confirm the sign convention
# is what is wanted when reading the reported scores.
dp_scorer = make_scorer(dpd_wrapper, sensitive_features=A_test['Race'])

scoring = dict(prec=precision_scorer, dp=dp_scorer)
clf = svm.SVC(kernel='linear', C=1, random_state=0)

# Cross-validate on the test split, reporting both metrics for every fold.
scores = cross_validate(clf, X_test, y_t, scoring=scoring)
scores

In [None]:
# Proposed

# Would be the same, until Adrin's SLEP/PR are accepted to help with input slicing

## Run sklearn.model_selection.GridSearchCV

Use demographic parity difference and precision score as the metrics. The goal is to find the lowest-error model whose demographic parity difference is <= 0.05.

In [None]:
# Current
from sklearn.model_selection import GridSearchCV

# Two sub-grids: a linear kernel searched over C only, and an RBF kernel
# searched over C and gamma.
param_grid = [
    dict(C=[1, 10, 100, 1000], kernel=['linear']),
    dict(C=[1, 10, 100, 1000], gamma=[0.001, 0.0001], kernel=['rbf']),
]
scoring = dict(prec=precision_scorer, dp=dp_scorer)

clf = svm.SVC(kernel='linear', C=1, random_state=0)

# Both metrics are recorded for every candidate; the final refit uses precision ('prec').
gscv = GridSearchCV(
    clf,
    param_grid=param_grid,
    scoring=scoring,
    refit='prec',
    verbose=1,
)
gscv.fit(X_test, y_t)

print("Best parameters set found on development set:", gscv.best_params_, sep="\n")
print("Best score:", gscv.best_score_)
print("", "Overall results", gscv.cv_results_, sep="\n")

In [None]:
# Proposed

# Would be the same, until Adrin's SLEP/PR are accepted to help with input slicing