In [14]:
import dalex as dx
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

## Dataset

##### Utilizamos como dataset de análise o banco de dados de RH de uma empresa, disponível na biblioteca Dalex, que indica a situação futura de cada funcionário: promoção, demissão ou estagnação (permanência no mesmo cargo). Para fins de análise do exemplo dado, iremos apenas distinguir os funcionários demitidos (`fired`) dos demais (`ok` ou `promoted`), tratando o problema como uma classificação binária.

In [15]:
data = dx.datasets.load_hr()

# Features: every column except the original string target 'status'.
# X is built before 'status2' is added below, so the encoded target never
# leaks into the feature matrix.
X = data.drop(columns='status')

# Binary encoding of the string column 'status':
#   'fired'            -> 0
#   'promoted' / 'ok'  -> 1
status_to_label = {'fired': 0, 'promoted': 1, 'ok': 1}
status_encoded = data['status'].map(status_to_label)

# Fail loudly if the dataset contains a status the mapping does not cover,
# instead of silently producing a misaligned or partially-missing target.
if status_encoded.isna().any():
    unexpected = sorted(data.loc[status_encoded.isna(), 'status'].astype(str).unique())
    raise ValueError(f"Unexpected values in column 'status': {unexpected}")

data['status2'] = status_encoded.astype('int64')

# status2 (binary) is the target used by the model
y = data.status2
print(data)

# Column groups: 'gender' is one-hot encoded, the numeric columns are
# forwarded to the classifier unchanged.
categorical_features = ['gender']
numerical_features = ['age', 'hours', 'evaluation', 'salary']

# Categories not seen during fit are ignored by the encoder at transform time.
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
categorical_transformer = Pipeline(steps=[('onehot', onehot_encoder)])

column_transformers = [
    ('cat', categorical_transformer, categorical_features),
    ('num', 'passthrough', numerical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# Depth-limited tree with a fixed seed so repeated runs give the same model.
tree_classifier = DecisionTreeClassifier(max_depth=7, random_state=123)
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', tree_classifier),
])

# Last expression of the cell: fits the pipeline and displays its repr.
clf.fit(X, y)

       gender        age      hours  evaluation  salary    status  status2
1        male  32.582669  41.886256           3       1     fired        0
2      female  41.211040  36.343392           2       5     fired        0
3        male  37.705164  36.817179           3       0     fired        0
4      female  30.060513  38.960317           3       2     fired        0
5        male  21.102827  62.154639           5       3  promoted        1
...       ...        ...        ...         ...     ...       ...      ...
9996   female  50.175709  45.582520           5       0  promoted        1
9997   female  59.087273  40.664725           3       0     fired        0
9998   female  51.049296  37.810145           4       0     fired        0
9999     male  36.158740  35.062329           2       3        ok        1
10000  female  57.962541  54.786236           4       4  promoted        1

[7847 rows x 7 columns]


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['gender']),
                                                 ('num', 'passthrough',
                                                  ['age', 'hours', 'evaluation',
                                                   'salary'])])),
                ('classifier',
                 DecisionTreeClassifier(max_depth=7, random_state=123))])

In [16]:
# Wrap the fitted pipeline in a dalex Explainer, using the same X and y the
# pipeline was fitted on.
exp = dx.Explainer(model=clf, data=X, y=y)

Preparation of a new explainer is initiated

  -> data              : 7847 rows 5 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 7847 values
  -> model_class       : sklearn.tree._classes.DecisionTreeClassifier (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_proba_default at 0x0F756300> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.0, mean = 0.636, max = 1.0
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.995, mean = -1.58e-18, max = 0.975
  -> model_info        : package sklearn

A new explainer has been created!


In [17]:
# Compute the performance measures, then display them as a table.
model_perf = exp.model_performance()
model_perf.result

Unnamed: 0,recall,precision,f1,accuracy,auc
DecisionTreeClassifier,0.901643,0.8218,0.859872,0.81305,0.900965


#### Check fairness

In [18]:
# One subgroup label per row, combining gender with an age bucket
# (under 35 -> 'young', otherwise 'old'), e.g. 'female_old'.
age_bucket = np.where(data['age'] < 35, 'young', 'old')
protected = data['gender'] + '_' + age_bucket

# Reference subgroup the other subgroups are compared against.
privileged = 'male_young'

In [19]:
# Build the fairness object: each subgroup in `protected` is evaluated
# relative to the `privileged` subgroup.
fobject = exp.model_fairness(
    protected=protected,
    privileged=privileged,
)

In [20]:
# With epsilon = 0.8 the report expects each metric ratio to lie within
# (0.8, 1.25), i.e. (epsilon, 1 / epsilon).
fobject.fairness_check(epsilon=0.8)

Bias detected in 2 metrics: PPV, FPR

Conclusion: your model is not fair because 2 or more criteria exceeded acceptable limits set by epsilon.

Ratios of metrics, based on 'male_young'. Parameter 'epsilon' was set to 0.8 and therefore metrics should be within (0.8, 1.25)
                   TPR       ACC       PPV       FPR       STP
female_old    1.046784  1.008363  0.958019  1.287611  1.021038
female_young  1.125146  0.903226  0.706136  1.823009  0.925666
male_old      1.090058  0.946237  0.801938  1.646018  0.943899


#### Outros atributos

In [21]:
# Display the full table of per-subgroup metric values held by the fairness
# object (the privileged subgroup's row is all 1.0 in the recorded output).
fobject.result

Unnamed: 0,TPR,TNR,PPV,NPV,FNR,FPR,FDR,FOR,ACC,STP
female_old,1.046784,0.916021,0.958019,1.185855,0.724138,1.287611,1.549296,0.711735,1.008363,1.021038
female_young,1.125146,0.75969,0.706136,1.560855,0.262069,1.823009,4.84507,0.130102,0.903226,0.925666
male_old,1.090058,0.81137,0.801938,1.460526,0.468966,1.646018,3.591549,0.285714,0.946237,0.943899
male_young,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
