# Fairness Tree Example

Xavier & Francis

Example of the **fairness tree** with the `german-risk-scoring.csv` dataset.

![Fairness Tree](./fairness_tree.png)

## Imports

In [46]:
# imports
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

## Data Load

In [2]:
# Load the German risk-scoring dataset (the CSV is expected in the working directory)
data = pd.read_csv('german-risk-scoring.csv')
# Summarise column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                                                    Non-Null Count  Dtype 
---  ------                                                    --------------  ----- 
 0   Status of existing checking account                       1000 non-null   object
 1   Duration in month                                         1000 non-null   int64 
 2   Credit history                                            1000 non-null   object
 3   Purpose                                                   1000 non-null   object
 4   Credit amount                                             1000 non-null   int64 
 5   Savings account/bonds                                     1000 non-null   object
 6   Present employment since                                  1000 non-null   object
 7   Installment rate in percentage of disposable income       1000 non-null   int64 
 8   Personal status and sex      

In [3]:
# Class balance of the target column (used as `y`)
data['Cost Matrix(Risk)'].value_counts()

Good Risk    700
Bad Risk     300
Name: Cost Matrix(Risk), dtype: int64

In [4]:
# Distribution of the combined "Personal status and sex" attribute
data["Personal status and sex"].value_counts()

male:single                          548
female:divorced/separated/married    310
male:married/widowed                  92
male:divorced/separated               50
Name: Personal status and sex, dtype: int64

## Data Prep

In [5]:
# create sex column
# Values look like "male:single"; keep only the part before the colon.
# The vectorised `.str` accessor replaces the per-row lambda, so a missing
# value stays missing instead of raising AttributeError.
data["sex"] = data["Personal status and sex"].str.split(":").str[0]

In [6]:
# Separate the feature frame (X) from the binary target (y)
target_col = 'Cost Matrix(Risk)'
risk_codes = {"Good Risk": 1, "Bad Risk": 0}

X = data.drop(columns=target_col)
y = data[target_col].map(risk_codes)

(    Status of existing checking account  Duration in month  \
 0                                 <0 DM                  6   
 1                          0 <= <200 DM                 48   
 2                   no checking account                 12   
 3                                 <0 DM                 42   
 4                                 <0 DM                 24   
 ..                                  ...                ...   
 995                 no checking account                 12   
 996                               <0 DM                 30   
 997                 no checking account                 12   
 998                               <0 DM                 45   
 999                        0 <= <200 DM                 45   
 
                                Credit history              Purpose  \
 0                            critical account     radio/television   
 1    existing credits paid back duly till now     radio/television   
 2                           

In [8]:
# type modifications

cols_cat = [
    'Status of existing checking account',
    'Credit history',
    'Purpose',
    'Savings account/bonds',
    'Present employment since',
    'Personal status and sex', 
    'Other debtors / guarantors',
    'Property',
    'Other installment plans',
    'Housing',
    'Job',
    'Telephone',
    'foreign worker',
    'sex'
       ]

cols_num = [
    'Duration in month',
    'Credit amount',
    'Installment rate in percentage of disposable income',
    'Present residence since',
    'Age in years',
    'Number of existing credits at this bank',
    'Number of people being liable to provide maintenance for',
    ]

# Target dtype per column: categorical -> str, numeric -> float.
dtype_by_col = {**dict.fromkeys(cols_cat, str), **dict.fromkeys(cols_num, float)}

# `X` was split off from `data` in an earlier cell via `drop`, which returns a
# new frame, so casting `data` alone never reaches the features that are
# actually encoded. Cast both frames so they stay consistent.
data = data.astype(dtype_by_col)
X = X.astype(dtype_by_col)

cols = cols_cat + cols_num

In [18]:
# Number of distinct values in each categorical feature column

X[cols_cat].nunique()

Status of existing checking account     4
Credit history                          5
Purpose                                10
Savings account/bonds                   5
Present employment since                5
Personal status and sex                 4
Other debtors / guarantors              3
Property                                4
Other installment plans                 3
Housing                                 3
Job                                     4
Telephone                               2
foreign worker                          2
sex                                     2
dtype: int64

In [27]:
# all to numbers

# One-hot encode the categorical features (converted to a dense array), then
# put the numeric features in front of them in a single 2-D array.
encoder = OneHotEncoder()
X_num = X[cols_num]
X_cat = encoder.fit_transform(X[cols_cat]).toarray()
X_prep = np.hstack((X_num.to_numpy(), X_cat))
X_prep.shape

(1000, 63)

In [34]:
# get values list from one hot encoding of categorical data

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out` and fall back only on older releases.
if hasattr(encoder, "get_feature_names_out"):
    ohe_feature_names = encoder.get_feature_names_out(input_features=X[cols_cat].columns)
else:
    ohe_feature_names = encoder.get_feature_names(input_features=X[cols_cat].columns)
ohe_feature_names

array(['Status of existing checking account_0 <= <200 DM',
       'Status of existing checking account_<0 DM',
       'Status of existing checking account_>= 200 DM ',
       'Status of existing checking account_no checking account',
       'Credit history_all credits at this bank paid back duly',
       'Credit history_critical account',
       'Credit history_delay in paying off',
       'Credit history_existing credits paid back duly till now',
       'Credit history_no credits taken', 'Purpose_business',
       'Purpose_car (new)', 'Purpose_car (used)',
       'Purpose_domestic appliances', 'Purpose_education',
       'Purpose_furniture/equipment', 'Purpose_others',
       'Purpose_radio/television', 'Purpose_repairs',
       'Purpose_retraining', 'Savings account/bonds_100 <= <500 DM',
       'Savings account/bonds_500 <= < 1000 DM',
       'Savings account/bonds_<100 DM',
       'Savings account/bonds_>= 1000 DM',
       'Savings account/bonds_no savings account',
       'Present

In [38]:
# data prepared

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out` and fall back only on older releases.
if hasattr(encoder, "get_feature_names_out"):
    ohe_names = encoder.get_feature_names_out(input_features=X[cols_cat].columns)
else:
    ohe_names = encoder.get_feature_names(input_features=X[cols_cat].columns)

# Column labels follow the layout of X_prep: numeric features first, then the
# one-hot columns.
cols = list(cols_num) + ohe_names.tolist()

# Reuse X's index explicitly: later cells look rows up in X through the index
# of the train/test splits taken from this frame.
data_prep = pd.DataFrame(X_prep, columns=cols, index=X.index)
data_prep.shape

(1000, 63)

In [39]:
# Preview the first rows of the prepared frame (numeric columns followed by one-hot columns)

data_prep.head()

Unnamed: 0,Duration in month,Credit amount,Installment rate in percentage of disposable income,Present residence since,Age in years,Number of existing credits at this bank,Number of people being liable to provide maintenance for,Status of existing checking account_0 <= <200 DM,Status of existing checking account_<0 DM,Status of existing checking account_>= 200 DM,...,Job_management/ highly qualified employee,Job_skilled employee / official,Job_unemployed/ unskilled - non-resident,Job_unskilled - resident,Telephone_none,Telephone_yes,foreign worker_no,foreign worker_yes,sex_female,sex_male
0,6.0,1169.0,4.0,4.0,67.0,2.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
1,48.0,5951.0,2.0,2.0,22.0,1.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
2,12.0,2096.0,2.0,3.0,49.0,1.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
3,42.0,7882.0,2.0,4.0,45.0,1.0,2.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
4,24.0,4870.0,3.0,4.0,53.0,2.0,2.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0


## Machine Learning

In [71]:
# split train test

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data_prep,
    y,
    test_size=0.2,
    random_state=42,
)
# Keep independent copies of the two feature frames.
X_train, X_test = X_train.copy(), X_test.copy()
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(800, 63) (200, 63) (800,) (200,)


In [73]:
# train model

# Fit a logistic-regression classifier on the training split
# (`fit` returns the estimator itself, which is then displayed).
clf = LogisticRegression(max_iter=500, n_jobs=8, random_state=0).fit(X_train, y_train)
clf

LogisticRegression(max_iter=500, n_jobs=8, random_state=0)

In [82]:
# function print_results()

def print_results(y_label, y_pred):
    """Print accuracy, F1 score and confusion matrix for a set of predictions.

    Parameters
    ----------
    y_label : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_label``.

    Returns
    -------
    None
        The three metrics are written to standard output.
    """
    scalar_metrics = (
        ("accuracy_score = {:.3f}", accuracy_score),
        ("f1_score = {:.3f}", f1_score),
    )
    # Each metric is computed right before it is printed, one line per metric.
    for template, metric in scalar_metrics:
        print(template.format(metric(y_label, y_pred)))
    print("confusion_matrix:\n", confusion_matrix(y_label, y_pred))

In [83]:
# predict on train set

# Stored under its own name so the train-set predictions cannot be mistaken
# for the test-set `y_pred` that the fairness analysis reads later on.
y_pred_train = clf.predict(X_train)
print_results(y_train, y_pred_train)

accuracy_score = 0.775
f1_score = 0.847
confusion_matrix:
 [[122 119]
 [ 61 498]]


In [85]:
# predict on test set

# NOTE: `y_pred` is read again when the fairness-analysis frame is built,
# so it must hold the test-set predictions at that point.
y_pred = clf.predict(X_test)
print_results(y_test, y_pred)

accuracy_score = 0.785
f1_score = 0.853
confusion_matrix:
 [[ 32  27]
 [ 16 125]]


## Fairness Analysis

In [99]:
# select columns to analyse

# Any entry of `cols_num` / `cols_cat` may be listed here; only the columns
# below are currently part of the fairness analysis.
cols_num_to_analyse = ['Credit amount', 'Age in years']
cols_cat_to_analyse = ['sex']

cols_to_analyse = [*cols_num_to_analyse, *cols_cat_to_analyse]

# build analysis
# Raw (un-encoded) attributes of the test rows, with the true label and the
# model prediction attached as two extra columns.
analysis = X.loc[X_test.index, cols_to_analyse].assign(
    label=y_test,
    prediction=y_pred,
)

In [118]:
# X_test for analysis

# Map each analysed categorical column to its one-hot columns in `data_prep`.
# One-hot columns are named "<column>_<category>", so match on "<column>_":
# a bare prefix match would also pick up any column whose name merely starts
# with the same text. Passing a tuple to `startswith` also prevents a column
# from being listed twice when it matches more than one prefix.
onehot_prefixes = tuple(name + "_" for name in cols_cat_to_analyse)
cols_to_analyse_in_X_space = cols_num_to_analyse + [
    col for col in data_prep.columns if col.startswith(onehot_prefixes)
]
X_test[cols_to_analyse_in_X_space].head()

Unnamed: 0,Credit amount,Age in years,sex_female,sex_male
521,3190.0,24.0,1.0,0.0
737,4380.0,35.0,0.0,1.0
740,2325.0,32.0,0.0,1.0
660,1297.0,23.0,0.0,1.0
411,7253.0,35.0,0.0,1.0


In [116]:
# Preview the analysis frame: selected attributes plus `label` and `prediction`

analysis.head()

Unnamed: 0,Credit amount,Age in years,sex,label,prediction
521,3190,24,female,0,1
737,4380,35,male,1,0
740,2325,32,male,1,0
660,1297,23,male,1,1
411,7253,35,male,1,1


In [86]:
# Metrics restricted to female applicants.
# `analysis` holds the raw `sex` column (values 'female' / 'male'), not the
# one-hot `sex_female` column, so filter on the raw value.
is_female = analysis['sex'] == 'female'
print_results(analysis.loc[is_female, 'label'], analysis.loc[is_female, 'prediction'])

accuracy_score = 0.821
f1_score = 0.878
confusion_matrix:
 [[10  7]
 [ 3 36]]


In [87]:
# Metrics restricted to male applicants.
# `analysis` holds the raw `sex` column (values 'female' / 'male'), not the
# one-hot `sex_male` column, so filter on the raw value.
is_male = analysis['sex'] == 'male'
print_results(analysis.loc[is_male, 'label'], analysis.loc[is_male, 'prediction'])

accuracy_score = 0.771
f1_score = 0.844
confusion_matrix:
 [[22 20]
 [13 89]]


In [104]:
# Group sizes: number of test-set rows per sex
analysis.groupby('sex').size()

sex
female     56
male      144
dtype: int64

In [106]:
# Share of female applicants in the test set, computed from the data rather
# than from numbers copied by hand out of an earlier output.
(analysis['sex'] == 'female').mean()

0.28

In [107]:
# Prediction balance: mean prediction per sex, i.e. the share of each group
# predicted as 1 ("Good Risk" in the target encoding)
analysis.groupby('sex')['prediction'].mean()

sex
female    0.767857
male      0.756944
Name: prediction, dtype: float64

In [113]:
# False positives per sex: rows labelled 0 ("Bad Risk") but predicted 1 ("Good Risk")
analysis.loc[(analysis['label']==0) & (analysis['prediction']==1)].groupby('sex').size()

sex
female     7
male      20
dtype: int64

**to be completed...**