In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from metrics import Metrics
from aif360.sklearn.metrics import statistical_parity_difference

%load_ext autoreload
%autoreload 2

# Analysis of Bias Symptoms

Features to consider on binary or categorical features:

- Correlation with the label
- Unbalance of values
- Value of different definitions of fairness metrics
 

In [160]:
def analysis(test, predicted_label, true_label, positive_value):
    binary_variables = [c for c in test.columns if test[c].nunique() == 2]
    test = test[binary_variables]
    correlation = test.corr()["prediction"].sort_values(ascending=False)
    correlation[abs(correlation) > 0.1]
    symptoms = (
        pd.DataFrame(correlation[abs(correlation) > 0.1])
        .drop([true_label, predicted_label])
        .rename(columns={"prediction": "correlation"})
    )
    symptoms.reset_index(inplace=True)
    symptoms.rename(columns={"index": "variable"}, inplace=True)
    metrics = Metrics(test, predicted_label, true_label, positive_value)
    sp = []
    eo = []
    unbalance = []
    for i in symptoms["variable"]:
        unbalance.append(metrics.group_ratio({i: 0}))
        sp.append(metrics.statistical_parity({i: 0}))
        eo.append(metrics.equalized_odds({i: 0}))
    symptoms["unbalance"] = unbalance
    symptoms["statistical_parity"] = sp
    symptoms["equalized_odds"] = eo
    return symptoms

## Test su dataset bias

In [2]:
adult = pd.read_csv('data/adult.csv', index_col=0)
train, test = train_test_split(adult, test_size=0.2, random_state=0)

In [3]:
model = LogisticRegression()
model.fit(train.drop(columns='income', axis=1), train['income'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [4]:
test['prediciton'] = model.predict(test.drop(columns='income', axis=1))

In [7]:
test

Unnamed: 0,race,sex,income,10th,11th,12th,1st-4th,5th-6th,7th-8th,9th,Assoc-acdm,Assoc-voc,Bachelors,Doctorate,HS-grad,Masters,Preschool,Prof-school,Some-college,Divorced,Married-AF-spouse,Married-civ-spouse,Married-spouse-absent,Never-married,Separated,Widowed,?,Adm-clerical,Armed-Forces,Craft-repair,Exec-managerial,Farming-fishing,Handlers-cleaners,Machine-op-inspct,Other-service,Priv-house-serv,Prof-specialty,Protective-serv,Sales,Tech-support,...,Hong,Hungary,India,Iran,Ireland,Italy,Jamaica,Japan,Mexico,Nicaragua,Outlying-US(Guam-USVI-etc),Peru,Philippines,Poland,Portugal,Puerto-Rico,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia,Not known,age<10,age between 10 and 20,age between 20 and 30,age between 30 and 40,age between 40 and 50,age between 50 and 60,age between 60 and 70,age>70,hour<10,hours between 10 and 20,hours between 20 and 30,hours between 30 and 40,hour between 40 and 50,hour>70,prediciton
17764,1.0,1.0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
3597,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
19478,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
13484,1.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0
27239,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30642,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
2187,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
5096,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
5428,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0


In [15]:
results = permutation_importance(model, test.drop(columns=['income', 'prediction'], axis=1), test['prediction'])

In [19]:
symptoms = pd.DataFrame(results.importances_mean, index=test.drop(columns=['income', 'prediction'], axis=1).columns, columns=['importance'])

In [21]:
symptoms.sort_values(by='importance', ascending=False, inplace=True)

In [23]:
symptoms = symptoms[symptoms['importance'] > 0]

In [25]:
binary_variables = [c for c in test.columns if test[c].nunique() == 2]

In [28]:
symptoms = symptoms[symptoms.index.isin(binary_variables)]

In [29]:
symptoms

Unnamed: 0,importance
Married-civ-spouse,0.078022
Bachelors,0.050097
hour>70,0.044570
Never-married,0.038009
Exec-managerial,0.036199
...,...
Preschool,0.000097
Haiti,0.000097
Trinadad&Tobago,0.000097
Outlying-US(Guam-USVI-etc),0.000032


In [13]:
test.shape

(6188, 103)

In [9]:
pd.DataFrame(test.columns, columns=['variable'])

Unnamed: 0,variable
0,race
1,sex
2,income
3,10th
4,11th
...,...
97,hours between 10 and 20
98,hours between 20 and 30
99,hours between 30 and 40
100,hour between 40 and 50


In [10]:
test['prediction'] = model.predict(test.drop(columns='income'))

In [164]:
test.shape

(6188, 103)

### Select only binary variables

In [165]:
## select only binary variables from the dataset
binary_variables = [c for c in test.columns if test[c].nunique() == 2]

In [166]:
test = test[binary_variables]

In [167]:
test.shape

(6188, 96)

### Correlation with the label

In [168]:
correlation = test.corr()['prediction'].sort_values(ascending=False)

In [169]:
# Select only variables with a correlation greater than 0.1 or less than -0.1

correlation[abs(correlation) > 0.1]

prediction                 1.000000
income                     0.535083
 Married-civ-spouse        0.494938
 Husband                   0.429015
 Exec-managerial           0.320516
 Bachelors                 0.290698
hour>70                    0.284638
 Prof-specialty            0.268540
 Masters                   0.224940
sex                        0.209143
 Prof-school               0.203825
age between 40 and 50      0.196858
 Doctorate                 0.192615
 Wife                      0.171138
age between 50 and 60      0.138003
race                       0.106336
 Handlers-cleaners        -0.103299
 Adm-clerical             -0.103940
hours between 20 and 30   -0.109743
 Machine-op-inspct        -0.114214
age between 10 and 20     -0.117210
 Craft-repair             -0.124041
 Unmarried                -0.159588
 Other-service            -0.167775
 Divorced                 -0.174746
 Own-child                -0.209554
 HS-grad                  -0.233300
age between 20 and 30     -0

In [170]:
symptoms = pd.DataFrame(correlation[abs(correlation) > 0.1]).drop(['prediction', 'income']).rename(columns={'prediction': 'correlation'})

In [171]:
symptoms.reset_index(inplace=True)

In [172]:
symptoms.rename(columns={'index': 'variable'}, inplace=True)

In [173]:
symptoms

Unnamed: 0,variable,correlation
0,Married-civ-spouse,0.494938
1,Husband,0.429015
2,Exec-managerial,0.320516
3,Bachelors,0.290698
4,hour>70,0.284638
5,Prof-specialty,0.26854
6,Masters,0.22494
7,sex,0.209143
8,Prof-school,0.203825
9,age between 40 and 50,0.196858


### Compute statistical parity for each variable selected

In [9]:
test

Unnamed: 0,race,sex,income,10th,11th,12th,1st-4th,5th-6th,7th-8th,9th,Assoc-acdm,Assoc-voc,Bachelors,Doctorate,HS-grad,Masters,Preschool,Prof-school,Some-college,Divorced,Married-AF-spouse,Married-civ-spouse,Married-spouse-absent,Never-married,Separated,Widowed,?,Adm-clerical,Armed-Forces,Craft-repair,Exec-managerial,Farming-fishing,Handlers-cleaners,Machine-op-inspct,Other-service,Priv-house-serv,Prof-specialty,Protective-serv,Sales,Tech-support,...,Hong,Hungary,India,Iran,Ireland,Italy,Jamaica,Japan,Mexico,Nicaragua,Outlying-US(Guam-USVI-etc),Peru,Philippines,Poland,Portugal,Puerto-Rico,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia,Not known,age<10,age between 10 and 20,age between 20 and 30,age between 30 and 40,age between 40 and 50,age between 50 and 60,age between 60 and 70,age>70,hour<10,hours between 10 and 20,hours between 20 and 30,hours between 30 and 40,hour between 40 and 50,hour>70,prediciton
17764,1.0,1.0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
3597,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
19478,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
13484,1.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0
27239,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30642,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
2187,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
5096,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
5428,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0


In [10]:
metrics = Metrics(test, 'prediciton', 'income', 1)

In [11]:
metrics.statistical_parity({'sex': 0})

-0.17613432795851464

In [15]:
test.set_index('sex', inplace=True)

In [16]:
statistical_parity_difference(test['income'], test['prediciton'], prot_attr=['sex'], priv_group=1)

-0.17613432795851464

In [175]:
sp = []
for i in symptoms['variable']:
    sp.append(metrics.statistical_parity({i: 0}))

In [176]:
symptoms['sp'] = sp

In [177]:
symptoms

Unnamed: 0,variable,correlation,sp
0,Married-civ-spouse,0.494938,-0.392832
1,Husband,0.429015,-0.344856
2,Exec-managerial,0.320516,-0.391635
3,Bachelors,0.290698,-0.318299
4,hour>70,0.284638,-0.282445
5,Prof-specialty,0.26854,-0.320418
6,Masters,0.22494,-0.391142
7,sex,0.209143,-0.176134
8,Prof-school,0.203825,-0.650189
9,age between 40 and 50,0.196858,-0.186059


### Compute equalized odds for each variable selected

In [178]:
eo = []

for i in symptoms['variable']:
    eo.append(metrics.equalized_odds({i: 0}))

In [179]:
symptoms['eo'] = eo

### Groups unbalance

In [180]:
ratio = []
for i in symptoms['variable']:
    ratio.append(metrics.group_ratio({i: 0}))
symptoms['unbalance'] = ratio

In [201]:
symptoms.rename(columns={'sp': 'statistical_parity', 'eo': 'equalized_odds'}, inplace=True)

In [50]:
symptoms.to_csv('symptoms_bias.csv')

## Test on unbiased dataset

In [182]:
data = pd.read_csv("data/kickstarter_proc.csv")
train, test = train_test_split(data, test_size=0.2, random_state=0)

In [183]:
model = LogisticRegression()
model.fit(train.drop(columns='State', axis=1), train['State'])

LogisticRegression()

In [184]:
test['prediction'] = model.predict(test.drop(columns='State'))

In [185]:
test.shape

(74971, 451)

In [186]:
test

Unnamed: 0,State,Goal,Pledged,Backers,Category_Art,Category_Comics,Category_Crafts,Category_Dance,Category_Design,Category_Fashion,...,Deadline_day_23,Deadline_day_24,Deadline_day_25,Deadline_day_26,Deadline_day_27,Deadline_day_28,Deadline_day_29,Deadline_day_30,Deadline_day_31,prediction
241131,0,11362,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
230891,1,11606,16962,393,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
96221,1,3707,10041,227,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
264059,1,150,580,24,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
367617,1,10000,10165,81,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335899,1,74,199,9,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
135631,0,1844,148,10,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
130416,0,60000,126,3,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3203,0,6000,2455,49,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [187]:
symptoms_nobias = analysis(test, 'prediction', 'State', 1)

In [189]:
symptoms_nobias.sort_values(by='statistical_parity', ascending=False)

Unnamed: 0,variable,correlation,unbalance,statistical_parity,equalized_odds
8,Launched_year_2015,-0.135186,1.10305,0.155026,-0.22294
7,Deadline_year_2015,-0.132857,1.10149,0.15216,-0.215987
5,Country_United States,0.10591,0.693583,-0.11766,0.184042
4,Launched_year_2012,0.111027,0.94045,-0.162706,0.237944
3,Deadline_year_2012,0.115031,0.937879,-0.167677,0.246439
0,Category_Music,0.144162,0.91425,-0.195314,0.177721
2,Deadline_year_2011,0.117962,0.951586,-0.215618,0.265871
1,Launched_year_2011,0.122533,0.948421,-0.219141,0.277985
6,Subcategory_Tabletop Games,0.104414,0.968749,-0.253451,0.195243


In [199]:
symptoms_nobias.to_csv('symptoms_nobias.csv')