# 1. Install and import libraries needed

In [1]:
!pip install azureml-contrib-fairness --q
!pip install fairlearn --q

In [2]:
from sklearn.model_selection import train_test_split
from fairlearn.widget import FairlearnDashboard
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pandas as pd
import numpy as np
import shap

{4: 27816, 2: 3124, 1: 1039, 0: 311, 3: 271}

# 2. Load the dataset

In [None]:
# Fetch the census ("adult") dataset that ships with shap
X_raw, Y = shap.datasets.adult()

# Show how many rows carry each encoded value of the "Race" column
race_counts = X_raw["Race"].value_counts()
race_counts.to_dict()

In [3]:
# Peek at the first two rows of the raw feature table
X_raw.head(n=2)

Unnamed: 0,Age,Workclass,Education-Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours per week,Country
0,39.0,7,13.0,4,1,0,4,1,2174.0,0.0,40.0,39
1,50.0,6,13.0,2,4,4,4,1,0.0,0.0,13.0,39


In [5]:
# Distinct label values together with the number of samples carrying each one
label_values, label_counts = np.unique(Y, return_counts=True)
(label_values, label_counts)

(array([False,  True]), array([24720,  7841]))

We can see that the classes are imbalanced: in the majority of cases, people are denied a loan.

## 2.1 Some feature transformation

In [9]:
# (Optional) Keep the sensitive features "Sex" and "Race" in their own frame and
# remove them from the data the model is trained on
SENSITIVE_COLUMNS = ['Sex', 'Race']
A = X_raw[SENSITIVE_COLUMNS]
X = pd.get_dummies(X_raw.drop(columns=SENSITIVE_COLUMNS))

# Standardize every remaining column and restore the column names that
# fit_transform drops when it returns a plain array
# NOTE(review): the scaler is fit on the full dataset, i.e. before the train/test
# split performed later in the notebook -- confirm this is intended.
sc = StandardScaler()
X_scaled = pd.DataFrame(sc.fit_transform(X), columns=X.columns)

# Encode the target labels as integers
le = LabelEncoder()
Y = le.fit_transform(Y)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


## 2.2 Split the dataset into train and test

In [None]:
# Split features, labels and sensitive features into train and test sets.
# The split is stratified on Y so both sets keep the same class proportions, and
# random_state is fixed so the split is reproducible.
# (train_test_split is already imported in the notebook's import cell.)
X_train, X_test, Y_train, Y_test, A_train, A_test = train_test_split(
    X_scaled,
    Y,
    A,
    test_size=0.2,
    random_state=0,
    stratify=Y,
)

# Work around indexing issue: give every frame a fresh 0..n-1 index
X_train = X_train.reset_index(drop=True)
A_train = A_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
A_test = A_test.reset_index(drop=True)

# Improve labels: swap the integer codes of the test-set sensitive features for
# readable names. Whole columns are rebuilt with replace/assign instead of chained
# `A_test.Sex.loc[...] = ...` assignment, which triggers SettingWithCopyWarning,
# is not guaranteed to write through to A_test, and puts strings into an integer
# column. Codes that are not listed below are left unchanged.
sex_labels = {0: 'female', 1: 'male'}
race_labels = {
    0: 'Amer-Indian-Eskimo',
    1: 'Asian-Pac-Islander',
    2: 'Black',
    3: 'Other',
    4: 'White',
}
A_test = A_test.assign(
    Sex=A_test['Sex'].replace(sex_labels),
    Race=A_test['Race'].replace(race_labels),
)

In [10]:
# Peek at the first two rows of the scaled training features
X_train.head(n=2)

Unnamed: 0,Age,Workclass,Education-Num,Marital Status,Occupation,Relationship,Capital Gain,Capital Loss,Hours per week,Country
0,-1.435581,-2.65732,-0.03136,0.921634,-1.554283,-0.281263,-0.14592,-0.21666,-0.845327,0.291569
1,0.837109,0.09005,-0.03136,-0.406212,0.101036,0.856261,-0.14592,-0.21666,-0.035429,0.291569


# 3. Train a classification model

In [11]:
# Baseline (unmitigated) classifier: logistic regression with the liblinear solver
lr_predictor = LogisticRegression(
    fit_intercept=True,
    solver='liblinear',
)
# fit() returns the estimator itself, so the cell displays the fitted model
lr_predictor.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
from sklearn.metrics import accuracy_score, roc_auc_score

# ROC AUC on the test set, computed from the second column of predict_proba
positive_class_scores = lr_predictor.predict_proba(X_test)[:, 1]
roc_auc_score(Y_test, positive_class_scores)

0.8801844524462971

In [13]:
# Share of test samples whose predicted label matches the true label
lr_test_predictions = lr_predictor.predict(X_test)
accuracy_score(Y_test, lr_test_predictions)

0.836480884385076

In [14]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the baseline model on the test set (true labels passed first)
lr_confusion_predictions = lr_predictor.predict(X_test)
confusion_matrix(Y_test, lr_confusion_predictions)

array([[4615,  330],
       [ 735,  833]])

In [15]:
# Peek at the first two rows of the test-set sensitive features
A_test.head(n=2)

Unnamed: 0,Sex,Race
0,male,White
1,male,Asian-Pac-Islander


## 3.1 View this model in Fairlearn's fairness dashboard, and see the disparities which appear

In [16]:
from fairlearn.widget import FairlearnDashboard

# Test-set predictions of the baseline model, keyed by a model name
unmitigated_predictions = {"lr_model": lr_predictor.predict(X_test)}

# Open the fairness dashboard for the baseline model across "Sex" and "Race"
FairlearnDashboard(
    y_true=Y_test,
    y_pred=unmitigated_predictions,
    sensitive_features=A_test,
    sensitive_feature_names=['Sex', 'Race'],
)

FairlearnWidget(value={'true_y': [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0…

<fairlearn.widget._fairlearn_dashboard.FairlearnDashboard at 0x7f2882a4ba58>

# 4. Mitigation of unfairness with the GridSearch algorithm

In [17]:
from fairlearn.reductions import GridSearch
from fairlearn.reductions import DemographicParity, ErrorRate

In [18]:
# Set up the GridSearch sweep: the same logistic regression as the baseline,
# constrained by demographic parity, with a grid of size 15
base_estimator = LogisticRegression(fit_intercept=True, solver='liblinear')
sweep = GridSearch(
    base_estimator,
    grid_size=15,
    constraints=DemographicParity(),
)

In [20]:
# Run the sweep on the training data, using only "Sex" as the sensitive feature
sweep.fit(X_train, Y_train, sensitive_features=A_train['Sex'])

# Collect the models produced by the sweep
# NOTE(review): _predictors is a private attribute of GridSearch; confirm it
# exists in the installed fairlearn version.
predictors = sweep._predictors

In [22]:
# Score every model from the sweep on the training data.
# The two moments are loaded with the same data for every model, so they are
# built once here instead of being rebuilt on every loop iteration.
train_labels = pd.Series(Y_train)
error = ErrorRate()
error.load_data(X_train, train_labels, sensitive_features=A_train.Sex)
disparity = DemographicParity()
disparity.load_data(X_train, train_labels, sensitive_features=A_train.Sex)

errors, disparities = [], []
for m in predictors:
    # gamma() takes a callable mapping features to predictions; the bound
    # predict method is passed directly
    errors.append(error.gamma(m.predict)[0])
    # The largest entry returned by the parity moment is used as the disparity
    disparities.append(disparity.gamma(m.predict).max())

all_results = pd.DataFrame({'predictor': predictors, 'error': errors, 'disparity': disparities})

# all_models_dict holds the baseline plus every grid model.
# dominant_models_dict holds the baseline plus only those grid models whose error
# is no higher than that of any model with equal or lower disparity.
all_models_dict = {'census_unmitigated': lr_predictor}
dominant_models_dict = {'census_unmitigated': lr_predictor}
base_name_format = 'census_grid_model_{0}'
for row_id, row in enumerate(all_results.itertuples()):
    model_name = base_name_format.format(row_id)
    all_models_dict[model_name] = row.predictor
    errors_for_lower_or_eq_disparity = all_results.loc[
        all_results['disparity'] <= row.disparity, 'error'
    ]
    if row.error <= errors_for_lower_or_eq_disparity.min():
        dominant_models_dict[model_name] = row.predictor

In [23]:
# Test-set predictions of every model (baseline and grid models), keyed by model name
dashboard_all = {
    model_name: model.predict(X_test)
    for model_name, model in all_models_dict.items()
}

In [24]:
# Open the fairness dashboard comparing all models across "Sex" and "Race"
FairlearnDashboard(
    y_true=Y_test,
    y_pred=dashboard_all,
    sensitive_features=A_test,
    sensitive_feature_names=['Sex', 'Race'],
)

FairlearnWidget(value={'true_y': [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0…

<fairlearn.widget._fairlearn_dashboard.FairlearnDashboard at 0x7f287c57b390>