In [1]:
import ot
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy

In [2]:
import sklearn
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, StandardScaler

### Generate the problem / prepare selected datasets

In [3]:
numerical_features_index = [0, 2, 4, 10, 11, 12]

data_train = pd.read_csv('../datasets/Adult income dataset/adult.data', header=None, nrows=10000)
y_train = data_train.loc[:, data_train.columns[-1]]
gender_train = data_train.loc[:, data_train.columns[9]]


data_test = pd.read_csv('../datasets/Adult income dataset/adult.test', skiprows=1, header=None)
y_test = data_test.loc[:, data_train.columns[-1]]
gender_test = data_test.loc[:, data_test.columns[9]]

data_train.drop(data_train.columns.difference(numerical_features_index), axis=1, inplace=True)
data_test.drop(data_test.columns.difference(numerical_features_index), axis=1, inplace=True)
data_train = data_train.astype(np.float64)
data_test = data_test.astype(np.float64)



gender_train = gender_train.apply(lambda x: x.strip())
gender_test = gender_test.apply(lambda x: x.strip())

In [4]:
data_train.drop_duplicates(inplace=True)
y_train = y_train.loc[data_train.index]
gender_train = gender_train.loc[data_train.index]

assert data_train.shape[0] == y_train.shape[0] == gender_train.shape[0]

In [5]:
binarizer = LabelBinarizer()
y_train = binarizer.fit_transform(y_train).ravel()
y_test = binarizer.fit_transform(y_test).ravel()

In [10]:
gender_binarizer = LabelBinarizer()
gender_binary = gender_binarizer.fit_transform(gender_test.values).ravel()

In [11]:
scaler = StandardScaler()
data_train = scaler.fit_transform(data_train)
data_test = scaler.transform(data_test)

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

In [13]:
from sklearn.metrics import accuracy_score, f1_score

In [14]:
clf = DecisionTreeClassifier()

In [15]:
clf.fit(data_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [16]:
print('Initial accuracy of classification: {}'.format(accuracy_score(y_test, clf.predict(data_test))))

Initial accuracy of classification: 0.7670290522695166


In [18]:
male_entries_index = gender_train[gender_train == 'Male'].index
female_entries_index = gender_train[gender_train == 'Female'].index

male_entries_test_index = gender_test[gender_test == 'Male'].index
female_entries_test_index = gender_test[gender_test == 'Female'].index

In [19]:
n0 = data_train[gender_train == 'Male'].shape[0]
n1 = data_train[gender_train == 'Female'].shape[0]

emperical_histogram_male = np.ones_like(male_entries_index) / n0

emperical_histogram_female = np.ones_like(female_entries_index) / n1

In [20]:
C = ot.dist(data_train[gender_train == 'Male'], data_train[gender_train == 'Female'])

In [21]:
coupling = ot.emd(emperical_histogram_male, emperical_histogram_female, C, numItermax=1000000000)

In [40]:
def confusion_matrix(predictions, protected_attribute):
    n = predictions.shape[0]
    a = np.sum((predictions == 0) & (protected_attribute == 0)) / n
    b = np.sum((predictions == 1) & (protected_attribute == 0)) / n
    c = np.sum((predictions == 0) & (protected_attribute == 1)) / n
    d = np.sum((predictions == 1) & (protected_attribute == 1)) / n
    return a, b, c, d
      
def balanced_error_rate(y_true, y_pred, protected_attributes):
    pass

In [41]:
def likelihood_ratio(y_predicted, protected):
    #following https://arxiv.org/pdf/1412.3756.pdf
    a, b, c, d = confusion_matrix(y_predicted, protected)
    return d * (a + c) / c / (b + d)

In [42]:
def disparate_impact(y_true, protected):
    return 1. / likelihood_ratio(y_true, protected)

In [43]:
def construct_map(weights, data_first, data_second, coupling):
    weight_1, weight_2 = weights
    print(weights)
    n0 = data_train[gender_train == 'Male'].shape[0]
    n1 = data_train[gender_train == 'Female'].shape[0]
    mapped_class_0 = weight_1 * data_first  + n0 * weight_2 * (coupling @ data_second)
    mapped_class_1 = weight_1 * n1 * (coupling.T @ data_first) + weight_2 * data_second
    return mapped_class_0, mapped_class_1

In [44]:
def transform_dataset(weights, data_first, data_second, y, coupling):
    mapped_class_0, mapped_class_1 = construct_map(weights, data_first, data_second, coupling)
    y_mapped_0 = y[gender_train == 'Male']
    y_mapped_1 = y[gender_train == 'Female']
    return np.concatenate((mapped_class_0, mapped_class_1)), np.concatenate((y_mapped_0, y_mapped_1))

In [68]:
X_new, y_new = transform_dataset([0.5, 0.5], data_train[gender_train == 'Male'], data_train[gender_train == 'Female'], y_train, coupling)

[0.5, 0.5]


In [69]:
disparate_impact(clf.predict(data_test), gender_binary)

0.8655925740717588

In [70]:
clf1 = DecisionTreeClassifier()
clf1.fit(X_new, y_new)

accuracy_score(y_test, clf1.predict(data_test))

0.7271052146674037

In [71]:
disparate_impact(clf1.predict(data_test), gender_binary)

0.8921330565083949

### Show that different solutions for the same transporation problem can have different fairness

### Explore how the regularized barycenters are defined, try to apply them to the problem

### Use the regularized transport maps instead of usual ones