In [1]:
import dalex as dx
import numpy as np
import pandas as pd

In [54]:
file_path = "stackoverflow_full.csv"
survey_data = pd.read_csv(file_path)

survey_data = survey_data.drop(columns=["Unnamed: 0"])
survey_data = survey_data.drop(columns="HaveWorkedWith")

lookup = ["United States of America", "France", "Germany", "Sweden", "Canada", "Spain", "Singapore"]

survey_data[np.isin(survey_data["Country"], lookup)]["Country"] = "Rich"

print(survey_data.head())

ValueError: Item wrong length 7 instead of 73462.

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

X = survey_data.drop(columns='Employed')
y = survey_data.Employed


categorical_features = ['Age', 'Accessibility', 'EdLevel', 'Gender', 'MentalHealth', 'MainBranch', 'Country']
numerical_features = ['YearsCode', 'YearsCodePro', 'PreviousSalary', 'ComputerSkills']
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', 'passthrough', numerical_features)
])

clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(max_depth=7, random_state=123))
])

clf.fit(X, y)

In [6]:
exp = dx.Explainer(clf, X, y)

Preparation of a new explainer is initiated

  -> data              : 73462 rows 12 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 73462 values
  -> model_class       : sklearn.tree._classes.DecisionTreeClassifier (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_proba_default at 0x00000224A88DF820> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.0, mean = 0.536, max = 1.0
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.999, mean = -1.55e-18, max = 0.998
  -> model_info        : package sklearn

A new explainer has been created!


In [7]:
exp.model_performance().result

Unnamed: 0,recall,precision,f1,accuracy,auc
DecisionTreeClassifier,0.827097,0.787951,0.80705,0.787931,0.873348


In [50]:
protected = survey_data.Country

privileged = "United States of America"

In [48]:
fobject = exp.model_fairness(protected = protected, privileged=privileged)

ParameterCheckError: Parameter Check Error, privileged parameter must be in protected vector

In [34]:
fobject.fairness_check(epsilon = 0.8)

No bias was detected!

Conclusion: your model is fair in terms of checked fairness criteria.

Ratios of metrics, based on 'No'. Parameter 'epsilon' was set to 0.8 and therefore metrics should be within (0.8, 1.25)
          TPR       ACC       PPV       FPR       STP
Yes  1.046005  1.026684  1.030496  1.027237  1.074733


In [31]:
fobject.plot()