# Hyperparameter Optimisation for Machine Learning Practical Lab

### Chair for Artificial Intelligence Methodology (AIM), RWTH Aachen University.

# Assignment 3: Fair and Green Hyperparameter Optimization via Multi-objective and Multiple Information Source Bayesian Optimization

This notebook contains the results of our implementation of Fair and Green Hyperparameter Optimization via Multi-objective and
Multiple Information Source Bayesian Optimization, as described in the paper by Candelieri et al.

In [77]:
import warnings
from sklearn.exceptions import ConvergenceWarning

In [78]:
# Silence scikit-learn's ConvergenceWarning explicitly.
warnings.filterwarnings("ignore", category=ConvergenceWarning)
# NOTE(review): no category is given here, so this ignores every warning. That makes
# the specific filter above redundant and can hide useful diagnostics (deprecations,
# numerical issues) — confirm that suppressing everything is intended.
warnings.simplefilter(action="ignore")

In [79]:
import os
import sys

In [80]:
# Put the current working directory on the import path (once), presumably so that the
# project-local ``src`` package imported further down can be resolved.
directory_path = os.path.abspath(".")
if not any(entry == directory_path for entry in sys.path):
    sys.path.append(directory_path)

## Importing Libraries

In [81]:
import pandas as pd
import numpy as np
import json
import pickle

# visualization libraries
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

In [82]:
%matplotlib inline

In [83]:
# ML methods
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

In [84]:
# configspace
from ConfigSpace import ConfigurationSpace
from ConfigSpace import Categorical, Float, Integer
from ConfigSpace.hyperparameters import NormalFloatHyperparameter
from ConfigSpace.hyperparameters import UniformIntegerHyperparameter
from ConfigSpace.hyperparameters import Constant
from ConfigSpace.conditions import GreaterThanCondition, InCondition
from ConfigSpace.api.distributions import Normal, Beta, Uniform

In [85]:
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import MDS
from sklearn.base import clone

In [86]:
import concurrent.futures

In [87]:
%load_ext autoreload
%autoreload 2
from src.utils import evaluated_params_data, create_comparable_random_search
from src.utils import change_categorical_to_one_hot_encoded, lower_dimensional_representation_MDS
from src.utils import calculate_average_and_error_on_dict
from src.utils import plot_averages_with_errors
from src.utils import remove_specific_keys, create_dataframe_from_dict

from src.utils import load_fair_data, create_dataframe_and_dict_from_dict
from src.fang_hpo_core import fang_hpo

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [88]:
# EVALUATION_CONFIG selects the seed list built at the end of this cell:
# True -> five randomly drawn seeds, False -> the two fixed seeds [42, 43].
EVALUATION_CONFIG = True
# RERUN_EXPERIMENTS gates the experiment cell (``if RERUN_EXPERIMENTS:``).
RERUN_EXPERIMENTS = True
### random generator initializing ###
RANDOM_SEED = 4242
# Seed NumPy's global RNG so that the two draws below are reproducible.
np.random.seed(RANDOM_SEED)
# This draw consumes from the same global RNG as ``seeds``; removing or reordering it
# changes the seeds generated on the next line.
# NOTE(review): no use of this value is visible in this part of the notebook.
random_state_low_budget = np.random.randint(0, 100000)
# One entry per repetition of the experiment; each is passed on as ``random_state``.
seeds = np.random.randint(low=10, high=2**31, size=5) if EVALUATION_CONFIG else [42, 43]

In [89]:
def ask_user():
    """Ask for confirmation before overwriting data and return the answer.

    The prompt is repeated until the user answers ``yes`` or ``no``. Surrounding
    whitespace is ignored and the comparison is case-insensitive.

    Returns:
        str: The normalised answer, either ``"yes"`` or ``"no"``.
    """
    while True:
        response = (
            input("Are you sure you want to overwrite the data? (yes/no): ")
            .strip()
            .lower()
        )
        # Only leave the loop on a valid answer; previously the first input was
        # returned unconditionally, so the loop never repeated.
        if response in ("yes", "no"):
            return response
        print("Please answer 'yes' or 'no'.")

## The Data as used in the paper

In [90]:
# Per-dataset loading options, keyed by file stem: the loading loop reads
# "data/original_paper_data/<key>.txt" and forwards the value dict as keyword
# arguments to ``load_fair_data``.
# NOTE(review): "race.|sex." looks like a regular expression / prefix pattern selecting
# the sensitive columns — confirm against ``load_fair_data``.
file_details = {
    "ADULT_full": {"target_column": "income.leq.50k", "sensitive_columns": "race.|sex."}
}


# We decided to normalize the data
def normalize_data(X_train, X_test, cloumns_to_normalize):
    """Standardise the selected columns of a train/test split.

    A ``StandardScaler`` is fitted on the training rows only and then applied to both
    inputs, so no statistics of the test set enter the scaling.

    Args:
        X_train: Training features (presumably a pandas DataFrame); the selected
            columns are overwritten in place.
        X_test: Test features with the same columns; overwritten in place as well.
        cloumns_to_normalize: Labels of the columns to standardise. The misspelt
            parameter name is kept so that existing keyword callers keep working.

    Returns:
        tuple: ``(X_train, X_test)`` — the same objects that were passed in.
    """
    # Use the argument itself. The body used to read ``columns_to_normalize``, which is
    # not the parameter and only worked when a global of that name happened to exist.
    columns = cloumns_to_normalize
    if len(columns) == 0:
        # Nothing selected: avoid fitting a scaler on zero columns.
        return X_train, X_test

    scaler = StandardScaler()
    X_train[columns] = scaler.fit_transform(X_train[columns])
    X_test[columns] = scaler.transform(X_test[columns])
    return X_train, X_test


# Build ``all_data``: one entry per dataset in ``file_details``, each holding the
# train/test split of the features, the targets and the sensitive-attribute columns.
all_data = {}
for file_name, details in file_details.items():
    # Load features, targets and sensitive attributes; ``details`` is forwarded as
    # keyword arguments to ``load_fair_data``.
    data_x, data_y, data_sensitive = load_fair_data(
        "data/original_paper_data/{}.txt".format(file_name), **details
    )
    data_protected_group = data_sensitive
    # 80/20 split; the three inputs are split jointly so their rows stay aligned.
    # The split seed is a fixed offset from RANDOM_SEED.
    (
        data_x_train,
        data_x_test,
        data_y_train,
        data_y_test,
        data_protected_group_train,
        data_protected_group_test,
    ) = train_test_split(
        data_x,
        data_y,
        data_protected_group,
        train_size=0.8,
        random_state=RANDOM_SEED + 9971,
    )
    # Select the columns that contain at least one value outside [0, 1] in the full
    # (unsplit) data; columns lying entirely inside [0, 1] are left as they are.
    columns_to_normalize = data_x.columns[((data_x < 0) | (data_x > 1)).any()]
    data_x_train, data_x_test = normalize_data(
        data_x_train, data_x_test, columns_to_normalize
    )

    # Store the split under the dataset's name (a dictionary of dictionaries).
    all_data[file_name] = {
        "x_train": data_x_train,
        "x_test": data_x_test,
        "y_train": data_y_train,
        "y_test": data_y_test,
        "protected_group_train": data_protected_group_train,
        "protected_group_test": data_protected_group_test,
    }

In [91]:
# Show the sensitive-attribute columns of the ADULT training split.
all_data["ADULT_full"]["protected_group_train"]

Unnamed: 0,race.White,race.Asian.Pac.Islander,race.Amer.Indian.Eskimo,race.Other,sex.Female
19604,1,0,0,0,0
1825,1,0,0,0,0
25843,0,0,0,0,1
18871,0,0,0,1,1
18058,0,0,1,0,0
...,...,...,...,...,...
25679,1,0,0,0,1
5154,1,0,0,0,0
26554,1,0,0,0,0
12269,1,0,0,0,0


In [92]:
# Prototype estimators that are handed to ``fang_hpo`` as ``classifier``.
# The random forest uses library defaults; the MLP is limited to 80 iterations with
# early stopping after 5 iterations without improvement.
RF = RandomForestClassifier()
MLP = MLPClassifier(
    max_iter=80, early_stopping=True, n_iter_no_change=5
)  # TODO: change back

# Define the configuration space for the Random Forest
cls_1_config_space = ConfigurationSpace(
    name="Random_Forest_config_space",
    space={
        "n_estimators": Integer("n_estimators", bounds=(1, 200)),
        "criterion": Categorical("criterion", ["gini", "entropy", "log_loss"]),
        # Bounded normal prior centred on moderately deep trees.
        "max_depth": Integer(
            "max_depth", bounds=(2, 1000), distribution=Normal(mu=15, sigma=20)
        ),
        # Beta(2, 5) prior over the bounded interval.
        "min_samples_leaf": Float(
            "min_samples_leaf",
            bounds=(0.05, 0.3),
            distribution=Beta(alpha=2, beta=5),
        ),
        # All choices are floats on purpose: scikit-learn interprets a float as a
        # fraction of the training set but an int as an absolute sample count, so the
        # previous integer ``1`` meant "one sample per tree" instead of "all samples".
        "max_samples": Categorical("max_samples", [0.6, 0.8, 1.0]),
    },
)

# Search space for the MLP: the unconditional hyperparameters are declared in the
# constructor, the architecture hyperparameters are attached afterwards.
cls_2_config_space = ConfigurationSpace(  # TODO: reevaluate the bounds
    name="MLP_config_space",
    space={
        "activation": Categorical("activation", ["relu", "tanh", "logistic"]),
        "solver": Categorical("solver", ["adam", "sgd"]),
        "alpha": Float("alpha", bounds=(1e-5, 0.5), log=True),
        "learning_rate": Categorical("learning_rate", ["constant", "adaptive"]),
    },
)

# The depth of the network is itself a hyperparameter (1 to 4 hidden layers) ...
n_layers = UniformIntegerHyperparameter("n_layers", lower=1, upper=4)

# ... and every possible hidden layer gets its own width, all sharing one log-scaled range.
(
    n_neurons_layer1,
    n_neurons_layer2,
    n_neurons_layer3,
    n_neurons_layer4,
) = (
    UniformIntegerHyperparameter(layer_name, lower=2, upper=32, log=True)
    for layer_name in (
        "n_neurons_layer1",
        "n_neurons_layer2",
        "n_neurons_layer3",
        "n_neurons_layer4",
    )
)
cls_2_config_space.add_hyperparameters(
    [n_layers, n_neurons_layer1, n_neurons_layer2, n_neurons_layer3, n_neurons_layer4]
)

# The width of layer k (k = 2, 3, 4) is tied to the condition ``n_layers > k - 1``;
# the first layer's width is unconditional.
cond_neurons_layer2, cond_neurons_layer3, cond_neurons_layer4 = (
    GreaterThanCondition(layer_width, n_layers, threshold)
    for threshold, layer_width in enumerate(
        (n_neurons_layer2, n_neurons_layer3, n_neurons_layer4), start=1
    )
)
cls_2_config_space.add_conditions(
    [cond_neurons_layer2, cond_neurons_layer3, cond_neurons_layer4]
)


# Key both lookup tables by the estimator's class name, so the experiment loop can
# fetch the estimator and its search space with the same key.
cls_names = [type(estimator).__name__ for estimator in (RF, MLP)]

classifier_dict = {name: estimator for name, estimator in zip(cls_names, (RF, MLP))}

config_space_dict = {
    name: space
    for name, space in zip(cls_names, (cls_1_config_space, cls_2_config_space))
}

In [97]:
%%time

# Run FanG-HPO for every (seed, dataset, classifier) combination and persist the results.
if RERUN_EXPERIMENTS:
    cummuative_results = pd.DataFrame()
    cummulatitive_config_dict = {}

    # Create every output directory up front. Previously only "data/results/csv/"
    # was created, so writing into "hvs_slides/" or "data/results/pickle/" failed on
    # a fresh checkout — after the expensive optimisation had already run.
    for output_dir in (
        "data/results/csv/",
        "data/results/csv/hvs_slides/",
        "data/results/pickle/",
    ):
        Path(output_dir).mkdir(parents=True, exist_ok=True)

    for seed_number, seed in enumerate(seeds):
        # NOTE(review): the first seed is skipped. This looks like a leftover from
        # resuming an interrupted run — confirm before a full evaluation.
        if seed_number == 0:
            continue
        for dataset_name, data_dict in all_data.items():
            data_x = data_dict['x_train']
            data_y = data_dict['y_train']
            data_protected_group = data_dict['protected_group_train']

            for cls_name in cls_names:
                # ``hv`` is written to CSV below; presumably the hypervolume per
                # iteration (``compute_HV = True``). ``transformer`` is not used here.
                fang_results, transformer, hv = fang_hpo(
                    classifier = classifier_dict[cls_name],
                    data_x =  data_x,
                    data_y = data_y,
                    protected_features = data_protected_group,
                    budget = 80, #TODO: check
                    random_state = seed,
                    save_path= "data/results/DeepCAVE/mis", #TODO change
                    config_space = config_space_dict[cls_name],
                    compute_HV = True,
                )

                # Drop the bulky entries (input data, fitted models) before storing the run.
                fang_results = remove_specific_keys(fang_results, ['data_x', 'data_y', 'data_protected_group', 'models'])

                # Accumulate this run's rows and configurations across all runs.
                result_df2, config_dict = create_dataframe_and_dict_from_dict(fang_results, seed, cls_name, dataset_name)
                cummuative_results = pd.concat([cummuative_results, result_df2])
                cummulatitive_config_dict.update(config_dict)
                print("runc")
                np.savetxt(f"data/results/csv/hvs_slides/{cls_name}_{dataset_name}_{seed_number}_hv_presi_sequential2.csv", hv)
                print("first thing saved")

    # save the results
    print("Saving the results")
    cummuative_results.to_csv("data/results/csv/fang_run_seqi3.csv")
    # NOTE(review): only the results of the last run are pickled here, while the
    # accumulated ``cummulatitive_config_dict`` is never written — confirm which of
    # the two is meant to be stored.
    with open("data/results/pickle/fang_seqi3.pkl", "wb") as f:
        pickle.dump(fang_results, f)


started new run with parameters:  RandomForestClassifier(criterion='log_loss', max_depth=72, max_samples=0.6,
                       min_samples_leaf=0.08436168499540403, n_estimators=12,
                       random_state=503025858)
Initial Pf [[2.49451026e-01 2.77000645e-15]
 [2.49451026e-01 2.77000645e-15]
 [2.49451026e-01 2.77000645e-15]
 [2.49451026e-01 2.77000645e-15]
 [2.49451026e-01 2.77000645e-15]
 [2.46103403e-01 2.85882429e-15]
 [2.46103403e-01 2.85882429e-15]
 [2.46103403e-01 2.85882429e-15]
 [2.46103403e-01 2.85882429e-15]
 [2.46103403e-01 2.85882429e-15]]
Start Iteration
Time for EHVI 5.344125900000108
(1, 7) [[ 3.74169718  1.21520818 -1.25808875 -0.31357974  1.          0.
   0.        ]]
src.fang_hpo_core (INFO): Iteration 7.5: [2.49451026e-01 2.77000645e-15] 
src.fang_hpo_core (INFO): Time for iteration 5.903125:
hvs [0.7538965965860688]
Start Iteration
Time for EHVI 4.590031799999451
(1, 7) [[ 3.4654583   1.21520818 -1.31370195  1.08772774  0.          0.
   1.      

AttributeError: 'numpy.ndarray' object has no attribute 'values'

This was only a test run for the FanG-HPO implementation. The error occurred later, in a utility function that had not yet been adapted; that function transforms the output in preparation for the evaluation.

The logs show the development of the dominated hypervolume in each iteration (the list printed after `hvs`).