In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

pd.set_option("display.max_columns", None)

In [3]:
project_path = Path("..")
data_path = project_path/"datasets"

In [6]:
df_compas = pd.read_csv(data_path/"compas_for_fairlearn.csv")
df_compas.head()

Unnamed: 0,entity_id,sex,age_cat,race,priors_count,c_charge_degree,label_value
0,1,Male,Greater than 45,Other,0,F,0
1,3,Male,25 - 45,African-American,0,F,1
2,4,Male,Less than 25,African-American,4,F,1
3,5,Male,Less than 25,African-American,1,F,0
4,6,Male,25 - 45,Other,2,F,0


In [8]:
df_compas["label_value"].value_counts()

0    3963
1    3251
Name: label_value, dtype: int64

In [35]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import HistGradientBoostingClassifier
import optuna

optuna.logging.set_verbosity(optuna.logging.ERROR)

In [15]:
df_compas_train, df_compas_test = train_test_split(df_compas, 
                                                   test_size=0.2, 
                                                   stratify=df_compas["label_value"])
df_compas_train, df_compas_val = train_test_split(df_compas_train, 
                                                   test_size=0.2, 
                                                   stratify=df_compas_train["label_value"])

In [52]:
cat_columns_mask = df_compas_train.columns[:-1].map(
    lambda x: True if x in df_compas_train[:-1].select_dtypes(include="object").columns else False
)

In [56]:
def objective(trial):
    # Hyperparameters
    gb_max_iter = trial.suggest_int("max_iter", 20, 200),
    gb_max_depth = trial.suggest_int("max_depth", 2, 100, log=True)
    
    # Defining the model
    classifier_obj = HistGradientBoostingClassifier(max_iter=gb_max_iter,
                                                   max_depth=gb_max_depth,
                                                   categorical_features=cat_columns_mask)
    
    classifier_obj.fit(df_compas_train.drop(columns=["label_value"]),
                      df_compas_train["label_value"])

    score = classifier_obj.score(df_compas_val.drop(columns=["label_value"]),
                                df_compas_val["label_value"])

    return score

In [57]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, n_jobs=-1)



ValueError: could not convert string to float: 'Female'

In [16]:
df_compas_train["label_value"].value_counts(normalize=True)

0    0.549393
1    0.450607
Name: label_value, dtype: float64

In [18]:
df_compas_val["label_value"].value_counts(normalize=True)

0    0.548918
1    0.451082
Name: label_value, dtype: float64

In [17]:
df_compas_test["label_value"].value_counts(normalize=True)

0    0.54955
1    0.45045
Name: label_value, dtype: float64

In [None]:
initial_param_grid = {
    'n_estimators'=[50, 100, 200, 300]
}

initial