In [21]:
import numpy as np
np.random.seed(0)
import pandas as pd
from matplotlib import pyplot as plt
from pandas_profiling import ProfileReport
from aif360.datasets import GermanDataset
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.algorithms.preprocessing import Reweighing

In [4]:
# Load data, specify protected attribute
credit_data = GermanDataset(
    protected_attribute_names=['age'],
    privileged_classes=[lambda x: x >= 25],
    features_to_drop=['personal_status', 'sex']
)
old_group = [{'age': 1}]
young_group = [{'age': 0}]

In [5]:
# PANDAS PROFILE
credit_df = credit_data.convert_to_dataframe()[0]
credit_df.describe()
# credit_df.profile_report()

Unnamed: 0,month,credit_amount,investment_as_income_percentage,residence_since,age,number_of_credits,people_liable_for,status=A11,status=A12,status=A13,...,housing=A153,skill_level=A171,skill_level=A172,skill_level=A173,skill_level=A174,telephone=A191,telephone=A192,foreign_worker=A201,foreign_worker=A202,credit
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,20.903,3271.258,2.973,2.845,0.851,1.407,1.155,0.274,0.269,0.063,...,0.108,0.022,0.2,0.63,0.148,0.596,0.404,0.963,0.037,1.3
std,12.058814,2822.736876,1.118715,1.103718,0.356267,0.577654,0.362086,0.446232,0.443662,0.243085,...,0.310536,0.146757,0.4002,0.483046,0.355278,0.490943,0.490943,0.188856,0.188856,0.458487
min,4.0,250.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,12.0,1365.5,2.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
50%,18.0,2319.5,3.0,3.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
75%,24.0,3972.25,4.0,4.0,1.0,2.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0
max,72.0,18424.0,4.0,4.0,1.0,4.0,2.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0


In [8]:
# AIF 360 PROFILE
metrics = BinaryLabelDatasetMetric(credit_df, privileged_groups=old_group, unprivileged_groups=young_group)
print(metrics.consistency())
print(metrics.mean_difference())
print(metrics.disparate_impact())
print(metrics.smoothed_empirical_differential_fairness())
# so. what do any of these numbers mean...

TypeError: 'dataset' should be a BinaryLabelDataset or a MulticlassLabelDataset

In [10]:
# PARTITION
old_df = credit_df[credit_df['age'] == 1]
young_df = credit_df[credit_df['age'] == 0]
# pandas profile on each partition
print(old_df.describe())
print(young_df.describe())

            month  credit_amount  investment_as_income_percentage  \
count  851.000000     851.000000                       851.000000   
mean    20.975323    3325.972973                         3.008226   
std     11.926958    2813.391664                         1.100237   
min      4.000000     250.000000                         1.000000   
25%     12.000000    1386.000000                         2.000000   
50%     18.000000    2359.000000                         3.000000   
75%     24.000000    4085.000000                         4.000000   
max     60.000000   18424.000000                         4.000000   

       residence_since    age  number_of_credits  people_liable_for  \
count       851.000000  851.0         851.000000         851.000000   
mean          2.834313    1.0           1.443008           1.175088   
std           1.089867    0.0           0.596052           0.380266   
min           1.000000    1.0           1.000000           1.000000   
25%           2.000000 

In [37]:
print(plt.hist(credit_df['credit']))
print(plt.hist(old_df['credit']))
print(plt.hist(young_df['credit']))

(array([700.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0., 300.]), array([1. , 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. ]), <a list of 10 Patch objects>)
(array([612.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0., 239.]), array([1. , 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. ]), <a list of 10 Patch objects>)
(array([88.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0., 61.]), array([1. , 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. ]), <a list of 10 Patch objects>)


In [42]:
# PARTITION BY INVESTMENT PERCENTAGE
high_invest = credit_df[credit_df['investment_as_income_percentage'] < 3]
low_invest = credit_df[credit_df['investment_as_income_percentage'] >= 3]
print(plt.hist(credit_df['credit']))
print(plt.hist(high_invest['credit']))
print(plt.hist(low_invest['credit']))

(array([700.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0., 300.]), array([1. , 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. ]), <a list of 10 Patch objects>)
(array([271.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,  96.]), array([1. , 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. ]), <a list of 10 Patch objects>)
(array([429.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0., 204.]), array([1. , 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. ]), <a list of 10 Patch objects>)


In [43]:
# PARTITION BY PEOPLE LIABLE FOR
one_liable = credit_df[credit_df['people_liable_for'] == 1]
two_liable = credit_df[credit_df['people_liable_for'] == 2]
print(plt.hist(credit_df['credit']))
print(plt.hist(one_liable['credit']))
print(plt.hist(two_liable['credit']))

(array([700.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0., 300.]), array([1. , 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. ]), <a list of 10 Patch objects>)
(array([591.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0., 254.]), array([1. , 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. ]), <a list of 10 Patch objects>)
(array([109.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,  46.]), array([1. , 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. ]), <a list of 10 Patch objects>)
