In [1]:
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from fairlearn.postprocessing import _relabeling
from fairlearn.datasets import fetch_adult

In [2]:
round_value = 10

In [3]:
# Load the dataset
dataset = fetch_adult(as_frame=True)
df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
df['>50k'] = (dataset.target == '>50K') * 1

# Preprocessing
le = preprocessing.LabelEncoder()
df['sex'] = le.fit_transform(df['sex'])
onehot = ['workclass', 'education', 'marital-status', 'occupation', 'marital-status', 'occupation', 'relationship', 'race']
df = pd.get_dummies(df,prefix=onehot, columns = onehot, drop_first=True)

# Reverse the labels to have positive discrimination
tmp = df['sex'].to_list()
for i in range(0,len(tmp)):
    if tmp[i] == 1:
        tmp[i] = 0
    else:
        tmp[i] = 1
df['sex'] = tmp
sensitive = df['sex']
y = df[">50k"]
X = df.loc[:, ~df.columns.isin(['sex', '>50k', 'native-country'])]
print(_relabeling.discrimination_dataset(y, sensitive))

0.19451574596420296


In [4]:
y = y.to_numpy()
sensitive = sensitive.to_numpy()
X = X.to_numpy()

In [5]:
clf = DecisionTreeClassifier(max_depth=3, random_state=0)
clf.fit(X, list(y))
y_pred = clf.predict(X)
accuracy = round(accuracy_score(y, y_pred),round_value)
print("Accuracy of the prediction: ", accuracy)
discrimination= round(_relabeling.discrimination_dataset(y_pred, sensitive), round_value)
print("Discrimination of classifier on the prediction: ", discrimination)

Accuracy of the prediction:  0.8396666803
Discrimination of classifier on the prediction:  0.1886452274


In [6]:
threshold = 0.1

In [7]:
_relabeling.relabeling(clf, X, y, y_pred, sensitive, threshold)
y_pred_relabel = clf.predict(X)
accuracy_relabel = round(accuracy_score(y, y_pred_relabel),round_value)
print("Accuracy of the prediction: ", accuracy_relabel)
discrimination_relabel = round(_relabeling.discrimination_dataset(y_pred_relabel, sensitive), round_value)
print("Discrimination of classifier on the prediction: ", discrimination_relabel)

Accuracy of the prediction:  0.8047991483
Discrimination of classifier on the prediction:  0.0354752427


# Detailed operations

In [8]:
clf = DecisionTreeClassifier(max_depth=4, random_state=0)
clf.fit(X, list(y))
y_pred = clf.predict(X)
accuracy = round(accuracy_score(y, y_pred), round_value)
print("Accuracy of the prediction: ", accuracy)
discrimination= round(_relabeling.discrimination_dataset(y_pred, sensitive), round_value)
print("Discrimination of classifier on the prediction: ", discrimination)

Accuracy of the prediction:  0.8397076287
Discrimination of classifier on the prediction:  0.1885217096


In [9]:
threshold = 0

#### If you want to know which leaves will be relabeled, you can call "leaves_to_relabel(...)"
#### which returns a set of "Leaf()" objects

In [10]:
leaves_relabel = _relabeling.leaves_to_relabel(clf, X, y, y_pred, sensitive, threshold)

sum_acc = 0
sum_disc = 0
for leaf in leaves_relabel:
    print(leaf)
    print()
    sum_acc += leaf.acc
    sum_disc += leaf.disc
sum_acc = round(sum_acc, round_value) # The effect of relabeling the leaves on accuracy
sum_disc = round(sum_disc, round_value) # The effect of relabeling the leaves on discrimination

Path: format -> (node id, feature, way)
((0, 67, 'right'), (16, 2, 'right'), (24, 3, 'left'), (25, 4, 'left'), (26, -2, 'leaf')) 
node_id: 26 
The effect of relabeling the leaf on accuracy: -0.024917079562671474
The effect of relabeling the leaf on discrimination: -0.1372434915773354 
ratio: 5.508008722777498 
contingency table: 
[0.0, 0.0]
[0.03341386511608861, 0.058330944678760085]
transactions: [7, 11, 15, 19, 20, 24, 36, 40, 57, 133, 140, 164, 166, 201, 205, 213, 220, 231, 252, 284, 308, 318, 321, 325, 343, 355, 365, 373, 396, 398, 399, 435, 436, 441, 445, 452, 457, 462, 466, 522, 526, 541, 544, 545, 546, 552, 555, 556, 557, 558, 562, 570, 574, 578, 587, 594, 603, 626, 647, 654, 656, 673, 690, 699, 701, 708, 718, 721, 775, 776, 795, 799, 815, 829, 846, 847, 848, 858, 866, 868, 872, 878, 879, 885, 889, 905, 914, 920, 921, 922, 931, 935, 945, 964, 980, 984, 991, 998, 1008, 1015, 1016, 1052, 1055, 1068, 1078, 1082, 1089, 1101, 1121, 1127, 1140, 1158, 1170, 1190, 1202, 1239, 1245, 1249

#### The values are accurate to a certain decimal.

In [11]:
_relabeling.relabeling(clf, X, y, y_pred, sensitive, threshold)
y_pred_relabel = clf.predict(X)
accuracy_relabel = round(accuracy_score(y, y_pred_relabel), round_value)
discrimination_relabel = round(_relabeling.discrimination_dataset(y_pred_relabel, sensitive), round_value)
new_acc= round(accuracy+sum_acc, round_value)
new_disc = round(discrimination+sum_disc, round_value)
print(f"Accuracy:\n"
    f"    Before      : {accuracy}\n"
    f"    After       : {accuracy_relabel}\n"
    f"    Expected    : {new_acc}\n"
    f"    Leafs       : {sum_acc}\n"
    f"    Difference  : {round(-accuracy+accuracy_relabel, round_value)}\n"
    f"    Check       : {abs(accuracy_relabel-new_acc) <= 0.000000001}")
print(f"Discrimination:\n"
    f"    Before      : {discrimination}\n"
    f"    After       : {discrimination_relabel}\n"
    f"    Expected    : {new_disc}\n"
    f"    Leafs       : {sum_disc }\n"
    f"    Difference  : {round(-discrimination+discrimination_relabel, round_value)}\n"
    f"    Check       : {abs(discrimination_relabel-new_disc) <= 0.000000001}")

Accuracy:
    Before      : 0.8397076287
    After       : 0.7990049548
    Expected    : 0.7990049548
    Leafs       : -0.0407026739
    Difference  : -0.0407026739
    Check       : True
Discrimination:
    Before      : 0.1885217096
    After       : -0.029440035
    Expected    : -0.0294400351
    Leafs       : -0.2179617447
    Difference  : -0.2179617446
    Check       : True
