In [194]:
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import dalex as dx
import numpy as np
import pandas as pd
import sklearn
from copy import copy
import re
import plotly.express as px

In [195]:
# load models 
%run backend_notebook.ipynb

loading data ...
loading models ...
loading pipes ...
all loaded


### Pick a model
Possibilities:
- "logreg": Use and train a logistic regression
- "decisiontree": Use and train a decision tree
- "randomforest": Use and train a random forest


### Pick a bias to check
__Please, age must be selected between 25 and 60.__

Possibilities:
- "sexm": to study a bias on gender in favor of men
- "sexf": to study a bias on gender in favor of women

- "xx+": to study a bias on age in favor of people over the threshold (example: "30+" means bias in favor of people over 30 years old")
- "xx-": to study a bias on age in favor of people under the threshold (example: "40-" means bias in favor of people below 40 years old")

- "sexX xx+": to study bias on cross-subgroups. use a space between gender and age (example: "sexm 25+")


In [216]:
MODEL_TYPE = "randomforest"
BIAS_TYPE = "sexf 30+"

dict_infos = get_model_bias(MODEL_TYPE, BIAS_TYPE)
#

Everything looks good, let's continue ! You can run the following cell ! 


In [208]:
%%capture
explainers_fairness = create_all_explainers_fairness_objects(dict_infos)

### Pick a plot type
- "default": display a grouped bar plot which can bench all models or all groups.
- "radar": display a radar plot which can bench all models or all groups.
- "stacked": display a stacked bar plot which can bench all models or all groups (it's cumulative).
- "heatmap": display a heatmap plot which can bench all models or all groups (models by row, metrics by columns).

### Pick bias metrics to control
You CANNOT change metrics for the default plot. Which is use on the base model.

- "TPR": Compare probabilities of correctly according the credit (TP)
- "TNR": Compare probabilities of correctly NOT according the credit (TN)
- "FPR": Compare probabilities of wrongly according the credit (FP)
- "FNR": Compare probabilities of wrongly NOT according the credit (FN)
- "PPV": Compare precision of according the credit (True Positive / (True Positive + False Positive))
- "NPV": Compare precision of NOT according the credit (True Negative / (True Negative + False Negative))
- "FDR": Compare ratio of wrongly according the credit over all predictions of according the credit (FP/(TP+FP))
- "FOR": Compare ratio of wrongly NOT according the credit over all predictions of NOT according the credit (FN/(TN+FN))
- "ACC": Compare accuracies (precision of correctly according and NOT according the credit) ((TP+TN)/(TP+TN+FP+FN))
- "STP": Compare ratio of according the credit ((TP+FP)/(TP+TN+FP+FN))


In [209]:
METRICS_LIST = ["TPR", "ACC", "PPV", "FPR", "STP", "FDR"]
GRAPHIC = "radar"

In [None]:
# %%javascript
# IPython.OutputArea.auto_scroll_threshold = 9999;

In [210]:
display_all(METRICS_LIST, GRAPHIC, explainers_fairness, dict_infos)

Graphic type is incorrect, the default one will be used
Let's check the fairness performance of the selected model according to the specific populations declared:
  This is a default graph produced by the dalex library. It does not take into account selection made for METRICS_LIST and GRAPHIC.
    
    
Bias detected in 2 metrics: FPR, STP

Conclusion: your model is not fair because 2 or more criteria exceeded acceptable limits set by epsilon.

Ratios of metrics, based on 'female_old'. Parameter 'epsilon' was set to 0.8 and therefore metrics should be within (0.8, 1.25)
                   TPR       ACC       PPV       FPR       STP
female_young  0.818278  0.841191  0.862027  0.815552  0.762874
male_old      0.995749  0.941687  0.956044  1.374322  1.068263
male_young    0.952179  0.918114  0.926740  1.041591  0.946108


[1m Doing nothing. Is that so bad, really ?[0m

  • If all bars are in the green area, then according to your criteria your model is not biased. However, if you set a threshold for the age, are you sure that moving it a little bit (± 1 to 5 years) will not return a biased result ? Try it to be sure !

  • If a bias has been detected, have a look below to see how you can mitigate it !
  
[1mTrying to mitigate a bias[0m
[1m  Option 1: Remove the sensitive variable[0m
  
    This is a default graph produced by the dalex library. It does not take into account selection made for METRICS_LIST and GRAPHIC.
    
    

Found NaN's or 0's for models: {'base_remove_columns'}
It is advisable to check 'metric_ratios'


How did you model evolved regarding to your fairness metrics ? Is it better without the column ? 

  • If yes, you're lucky this kind of naive preprocessing used to be useless most of the time. Usually the protected and biased variable is correlated with others explanatory variables and then removing it do not helps to unbias your model ! 

  • If no, well that kind of normal, let's see more appropriate ways to deal with biased models.
  
  
[1mTrying to mitigate a bias[0m
[1m  Option 2: Resampling training data[0m
  
    Did you look at the distribution of the biased variable ? Maybe some values of the variable are under-represented or over-represented. Resampling more equally training data would help to mitigate bias due to this king of issue.
    
    Let's compare the effect of this method to the default model:
    
Found NaNs in following models: {'base_preferential_resampling', 'base_uniform_resampling'}


So far, this solution may have resolved the unfairness issue. If not let's see another possibility !
    
    
[1mTrying to mitigate a bias[0m
[1m  Option 3: Reweighting observations[0m
  
    The reweighting algorithm looks at the protected attribute and on the real label. Then, it calculates the probability of assigning favorable label (y=1) assuming the protected attribute and y are independent. Of course, if there is bias, they will be statistically dependent. Then, the algorithm divides calculated theoretical probability by true, empirical probability of this event. That is how weight is created.
    
    Let's compare the effect of this method to the default model:
    
Found NaNs in following models: {'base_reweighted'}


So far, this solution may have resolved the unfairness issue. If not let's see another possibility !
    
    
[1mTrying to mitigate a bias[0m
[1m  Option 4: The ROC-Pivot method (Postprocessing)[0m
  
    This method of mitigation aims to change predictions for items close to the decision frontier.
It switches labels if an observation is from the unprivileged group and on the left (wrong side) of the cutoff. Note that It can also switches labels if an observation is from the privileged group and on the right of the cutoff.
    
    Let's compare the effect of this method to the default model:
    
Found NaNs in following models: {'base_roc-pivot'}


This 3 previous methods are implemented by the library dalex to mitigate bias.
    
    Let's see below all solutions in 1 graphic:
Found NaNs in following models: {'base_roc-pivot', 'base_reweighted', 'base_preferential_resampling', 'base_uniform_resampling'}


'tada'

__Let's go deeper on the 'best' model and compare metrics on differents groups__
### Pick a model to analyse fairness metrics on groups

- "base": for the model you picked initialy
- "remove": for the model without the biased variable
- "sampling_p": for the model with preferential resampling
- "sampling_u": for the model with uniform resampling
- "weights": for the model with reweighting
- "roc-pivot": for the model with switches close to the decision frontier


In [213]:
SELECTED_MODEL = "base"

In [214]:
display_groups_fairness(SELECTED_MODEL, explainers_fairness, METRICS_LIST)


# Manual version:

## First results
Looking for bias

In [28]:
# Define Dalex parameters
explainer = dx.Explainer(eval(dict_infos['model_name']), X, y, verbose=False)

protected = dict_infos['protected']
privileged = dict_infos['privileged']

In [109]:
#fairness_object = explainer.model_fairness(protected = protected, privileged = privileged)

fairness_object.fairness_check(epsilon = 0.8)
fairness_object.plot()
xx

Bias detected in 2 metrics: FPR, STP

Conclusion: your model is not fair because 2 or more criteria exceeded acceptable limits set by epsilon.

Ratios of metrics, based on 'female_old'. Parameter 'epsilon' was set to 0.8 and therefore metrics should be within (0.8, 1.25)
                   TPR       ACC       PPV       FPR       STP
female_young  0.818278  0.841191  0.862027  0.815552  0.762874
male_old      0.995749  0.941687  0.956044  1.374322  1.068263
male_young    0.952179  0.918114  0.926740  1.041591  0.946108


((<dalex._explainer.object.Explainer at 0x19226db5970>,
  <dalex.fairness._group_fairness.object.GroupFairnessClassification at 0x19226b22580>),
 (<dalex._explainer.object.Explainer at 0x1922681d580>,
  <dalex.fairness._group_fairness.object.GroupFairnessClassification at 0x1922681d8b0>),
 (<dalex._explainer.object.Explainer at 0x192267f4f10>,
  <dalex.fairness._group_fairness.object.GroupFairnessClassification at 0x19226a59a60>),
 (<dalex._explainer.object.Explainer at 0x19226a597c0>,
  <dalex.fairness._group_fairness.object.GroupFairnessClassification at 0x19226a2a0d0>),
 (<dalex._explainer.object.Explainer at 0x19226a2aeb0>,
  <dalex.fairness._group_fairness.object.GroupFairnessClassification at 0x19226a3b700>),
 (<dalex._explainer.object.Explainer at 0x19226a3eeb0>,
  <dalex.fairness._group_fairness.object.GroupFairnessClassification at 0x19226db5940>))

__Doing nothing. Is that so bad, really ?__

If all bars are within (0.8, 1.25), then according to your criteria you model is not biased. However, if you set a threshold for the age, are you sure that moving it a little bit (± 5 years) will not return a biased result ?

Try it to be sure !

__If a biased have been detected, have a look to see how you can mitigate it !__

## Mitigate in Preprocessing

### Mitigate option 1: Just remove the biased variable ?

Let's try the same but set the biased variable out of the training part.

In [45]:
explainer_rm = dx.Explainer(eval(dict_infos['model_rm_name']), X, y, verbose=False)
fobject_rm = explainer_rm.model_fairness(protected = protected, privileged = privileged, label='with_removed_columns')

fobject_rm.fairness_check(epsilon = 0.8)
fobject_rm.plot(objects=[fairness_object])

Bias detected in 2 metrics: ACC, PPV

Conclusion: your model is not fair because 2 or more criteria exceeded acceptable limits set by epsilon.

Ratios of metrics, based on 'female_old'. Parameter 'epsilon' was set to 0.8 and therefore metrics should be within (0.8, 1.25)
                   TPR       ACC       PPV       FPR       STP
female_young  0.937500  0.798278  0.788957  1.215889  0.956776
male_old      0.925000  0.928659  0.986503  1.091537  0.960280
male_young    0.940625  0.897909  0.916564  1.072539  0.946262


How did you model evolved regarding to your fairness metrics ? Is it better without the column ? 

If yes, you're lucky the kind of naive preprocessing used to be useless most of the time. Usually the protected and biased variable is correlated with others explanatory variables and then removing it do not hel to unbiased your model ! 

If no, well that kind of normal, let's see more appropriate ways to deal with biased models.

### Mitigate option 2: Re-sampling your data
Did you look at the distribution of the biased variable ? Maybe some values of the variable are under-represented or over-represented. Resampling training data would help to mitigate bias due to this king of issue.

In [46]:
model_p = resampling_model(eval(dict_infos['model_name']), explainer, protected, type_resampling="preferential")
model_u = resampling_model(eval(dict_infos['model_name']), explainer, protected, type_resampling="uniform")

explainer_p = dx.Explainer(model_p, X, y, verbose = False)
explainer_u = dx.Explainer(model_u, X, y, verbose = False)

fobject_p = explainer_p.model_fairness(protected, privileged, label='preferential_resampling')
fobject_u = explainer_u.model_fairness(protected, privileged, label='uniform_resampling')

# plotting
fairness_object.plot([fobject_p, fobject_u])

### Mitigate option 3: Re-weighting your data

In [48]:
model_w = reweighting_model(eval(dict_infos['model_name']))

explainer_w = dx.Explainer(model_w, X, y, verbose = False)

fobject_w = explainer_w.model_fairness(protected, privileged, label='reweighted')

fairness_object.plot([fobject_p, fobject_w])


In [49]:
fairness_object.plot([fobject_p, fobject_u, fobject_w], type = "radar", metrics=['FDR', 'STP', 'TPR', 'TNR', 'FOR'])

## Mitigate in Postprocessing

### Mitigate option 4: ROC-pivot method

This method of mitigation aims to change predictions for items close to the decision frontier.  
It switches labels if an observation is from the unprivileged group and on the left (wrong side) of the cutoff.
Note that It can also switches labels if an observation is from the privileged group and on the right of the cutoff.

In [56]:
explainer_roc = dx.Explainer(eval(dict_infos['model_name']), X, y, verbose=False)
explainer_roc = dx.fairness.roc_pivot(explainer_roc, protected, privileged, theta = 0.02, verbose = False)
fobject_roc = explainer_roc.model_fairness(protected, privileged, label='roc')

fairness_object.plot([fobject_roc, fobject_u, fobject_w])

In [76]:
#%%capture
xx = create_all_explainers_fairness_objects(dict_infos)
    

-
Bias detected in 2 metrics: FPR, STP

Conclusion: your model is not fair because 2 or more criteria exceeded acceptable limits set by epsilon.

Ratios of metrics, based on 'female_old'. Parameter 'epsilon' was set to 0.8 and therefore metrics should be within (0.8, 1.25)
                   TPR       ACC       PPV       FPR       STP
female_young  0.818278  0.841191  0.862027  0.815552  0.762874
male_old      0.995749  0.941687  0.956044  1.374322  1.068263
male_young    0.952179  0.918114  0.926740  1.041591  0.946108
---
Bias detected in 2 metrics: ACC, PPV

Conclusion: your model is not fair because 2 or more criteria exceeded acceptable limits set by epsilon.

Ratios of metrics, based on 'female_old'. Parameter 'epsilon' was set to 0.8 and therefore metrics should be within (0.8, 1.25)
                   TPR       ACC       PPV       FPR       STP
female_young  0.937500  0.798278  0.788957  1.215889  0.956776
male_old      0.925000  0.928659  0.986503  1.091537  0.960280
male_youn

In [None]:
from copy import copy
clf_u = copy(clf_logreg)
clf_p = copy(clf_logreg)


In [None]:
indices_uniform = dx.fairness.resample(protected, y, verbose = False)
indices_preferential = dx.fairness.resample(protected,
                                y, 
                                type = 'preferential', # different type 
                                probs = explainer.y_hat, # requires probabilities
                                verbose = False)


clf_u.fit(X.iloc[indices_uniform, :], y[indices_uniform])
clf_p.fit(X.iloc[indices_preferential, :], y[indices_preferential])

In [None]:
weights = dx.fairness.reweight(protected, y, verbose = False)

clf_weighted = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(random_state=123))])

kwargs = {clf_weighted.steps[-1][0] + '__sample_weight': weights}

clf_weighted.fit(X,y, **kwargs)

In [None]:
exp3 = dx.Explainer(clf_u, X, y, verbose = False)
exp4 = dx.Explainer(clf_p, X, y, verbose = False)
exp5 = dx.Explainer(clf_weighted, X, y, verbose = False)


fobject1 = explainer.model_fairness(protected, privileged, label='base')
fobject2 = explainer_no_sex.model_fairness(protected, privileged, label='remove')
fobject3 = exp3.model_fairness(protected, privileged, label='res_unif')
fobject4 = exp4.model_fairness(protected, privileged, label='res_pref')
fobject5 = exp5.model_fairness(protected, privileged, label='reweighted')


# plotting
fobject1.plot([fobject2, fobject4, fobject3, fobject5])