In [1]:
import ot
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy
from sklearn.base import clone

In [2]:
import sklearn
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, StandardScaler

### Generate the problem / prepare selected datasets

In [3]:
numerical_features_index = [0, 2, 4, 10, 11, 12]

data_train = pd.read_csv('../datasets/Adult income dataset/adult.data', header=None)
y_train = data_train.loc[:, data_train.columns[-1]]
gender_train = data_train.loc[:, data_train.columns[9]]


data_test = pd.read_csv('../datasets/Adult income dataset/adult.test', skiprows=1, header=None)
y_test = data_test.loc[:, data_train.columns[-1]]
gender_test = data_test.loc[:, data_test.columns[9]]

data_train.drop(data_train.columns.difference(numerical_features_index), axis=1, inplace=True)
data_test.drop(data_test.columns.difference(numerical_features_index), axis=1, inplace=True)
data_train = data_train.astype(np.float64)
data_test = data_test.astype(np.float64)



gender_train = gender_train.apply(lambda x: x.strip())
gender_test = gender_test.apply(lambda x: x.strip())

In [4]:
data_train.drop_duplicates(inplace=True)
y_train = y_train.loc[data_train.index]
gender_train = gender_train.loc[data_train.index]

assert data_train.shape[0] == y_train.shape[0] == gender_train.shape[0]

In [5]:
binarizer = LabelBinarizer()
y_train = binarizer.fit_transform(y_train).ravel()
y_test = binarizer.fit_transform(y_test).ravel()

In [6]:
gender_binarizer = LabelBinarizer()
gender_binary = gender_binarizer.fit_transform(gender_test.values).ravel()

In [7]:
scaler = StandardScaler()
data_train = scaler.fit_transform(data_train)
data_test = scaler.transform(data_test)

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

  from numpy.core.umath_tests import inner1d


In [9]:
clf_base = RandomForestClassifier()

In [10]:
clf = clone(clf_base)

clf.fit(data_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [11]:
print('Initial accuracy of classification: {}'.format(accuracy_score(y_test, clf.predict(data_test))))

Initial accuracy of classification: 0.8113138013635526


In [12]:
male_entries_index = gender_train[gender_train == 'Male'].index
female_entries_index = gender_train[gender_train == 'Female'].index

male_entries_test_index = gender_test[gender_test == 'Male'].index
female_entries_test_index = gender_test[gender_test == 'Female'].index

In [13]:
n0 = data_train[gender_train == 'Male'].shape[0]
n1 = data_train[gender_train == 'Female'].shape[0]

emperical_histogram_male = np.ones_like(male_entries_index) / n0

emperical_histogram_female = np.ones_like(female_entries_index) / n1

In [14]:
C = ot.dist(data_train[gender_train == 'Male'], data_train[gender_train == 'Female'])

In [15]:
coupling = ot.emd(emperical_histogram_male, emperical_histogram_female, C, numItermax=1000000000)

In [16]:
def confusion_matrix(predictions, protected_attribute):
    n = predictions.shape[0]
    a = np.sum((predictions == 0) & (protected_attribute == 0)) / n
    b = np.sum((predictions == 1) & (protected_attribute == 0)) / n
    c = np.sum((predictions == 0) & (protected_attribute == 1)) / n
    d = np.sum((predictions == 1) & (protected_attribute == 1)) / n
    return a, b, c, d
      
def balanced_error_rate(y_true, y_pred, protected_attributes):
    pass

In [17]:
def likelihood_ratio(y_predicted, protected):
    #following https://arxiv.org/pdf/1412.3756.pdf
    a, b, c, d = confusion_matrix(y_predicted, protected)
    return d * (a + c) / c / (b + d)

In [18]:
def disparate_impact(y_true, protected):
    return 1. / likelihood_ratio(y_true, protected)

In [19]:
def construct_map(weights, data_first, data_second, coupling):
    weight_1, weight_2 = weights
    print(weights)
    n0 = data_train[gender_train == 'Male'].shape[0]
    n1 = data_train[gender_train == 'Female'].shape[0]
    mapped_class_0 = weight_1 * data_first  + n0 * weight_2 * (coupling @ data_second)
    mapped_class_1 = weight_1 * n1 * (coupling.T @ data_first) + weight_2 * data_second
    return mapped_class_0, mapped_class_1

In [20]:
def transform_dataset(weights, data_first, data_second, y, coupling):
    mapped_class_0, mapped_class_1 = construct_map(weights, data_first, data_second, coupling)
    y_mapped_0 = y[gender_train == 'Male']
    y_mapped_1 = y[gender_train == 'Female']
    return np.concatenate((mapped_class_0, mapped_class_1)), np.concatenate((y_mapped_0, y_mapped_1))

In [21]:
from tqdm import tqdm_notebook

In [32]:
print('class 0 size {}'.format(data_train[gender_train == 'Female'].shape))
print('class 1 size {}'.format(data_train[gender_train == 'Male'].shape))

class 0 size (10720, 6)
class 1 size (21614, 6)


In [22]:
for repair_value in tqdm_notebook(np.linspace(0, 1, num=10)):
    X_new, y_new = transform_dataset([repair_value, 1-repair_value], data_train[gender_train == 'Male'], data_train[gender_train == 'Female'], y_train, coupling)
    clf1 = clone(clf_base)
    clf1.fit(X_new, y_new)
    print(repair_value)
    print('Accuracy score {}'.format(accuracy_score(y_test, clf1.predict(data_test))))
    print('Disparate impact {}'.format(disparate_impact(clf1.predict(data_test), gender_binary)))

A Jupyter Widget

[0.0, 1.0]
0.0
Accuracy score 0.7818315828266077
Disparate impact 0.864289440051835
[0.1111111111111111, 0.8888888888888888]
0.1111111111111111
Accuracy score 0.7769178797371169
Disparate impact 0.8751509141164451
[0.2222222222222222, 0.7777777777777778]
0.2222222222222222
Accuracy score 0.7809102634973282
Disparate impact 0.8643577715213846
[0.3333333333333333, 0.6666666666666667]
0.3333333333333333
Accuracy score 0.7810945273631841
Disparate impact 0.8601833458554144
[0.4444444444444444, 0.5555555555555556]
0.4444444444444444
Accuracy score 0.7866838646274799
Disparate impact 0.8560101918465228
[0.5555555555555556, 0.4444444444444444]
0.5555555555555556
Accuracy score 0.7931330999324366
Disparate impact 0.8461476137251492
[0.6666666666666666, 0.33333333333333337]
0.6666666666666666
Accuracy score 0.7907376696763098
Disparate impact 0.8511425294991748
[0.7777777777777777, 0.22222222222222232]
0.7777777777777777
Accuracy score 0.7947300534365211
Disparate impact 0.8531187122736418
[0.8

In [23]:
X_new, y_new = transform_dataset([0.5, 0.5], data_train[gender_train == 'Male'], data_train[gender_train == 'Female'], y_train, coupling)

[0.5, 0.5]


In [24]:
disparate_impact(clf.predict(data_test), gender_binary)

0.8320550119016135

In [25]:
clf1 = clone(clf_base)
clf1.fit(X_new, y_new)

accuracy_score(y_test, clf1.predict(data_test))

0.7839199066396413

In [26]:
disparate_impact(clf1.predict(data_test), gender_binary)

0.8480934154728923

### Show that different solutions for the same transporation problem can have different fairness

### Explore how the regularized barycenters are defined, try to apply them to the problem

### Use the regularized transport maps instead of usual ones