In [209]:
import xgboost as xgb
import numpy as np
import pandas as pd
import optuna
import random
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [225]:
df = pd.read_csv("normal_dist_data.csv").drop('Unnamed: 0', axis=1)
df.head()

Unnamed: 0,country,age,time,mood,sugar_level,coffee_type
0,DE,16,morning,Excited,2,Americano
1,IN,19,evening,Happy,2,Espresso
2,IN,54,evening,Stressed,2,Cappuccino
3,IN,16,evening,Excited,1,Espresso
4,ES,44,evening,Stressed,1,Americano


In [229]:
# encode categorical features
categorical_columns = ["country", "time", "mood", "coffee_type"]
numerical_columns = ["age", "sugar_level"]
df_encoded = pd.DataFrame({})
label_encoder = LabelEncoder()
for feature in categorical_columns:
    df_encoded[feature] = label_encoder.fit_transform(df[feature])
df_encoded = pd.concat([df[numerical_columns], df_encoded], axis=1)
df_encoded.head()

Unnamed: 0,age,sugar_level,country,time,mood,coffee_type
0,16,2,2,2,0,0
1,19,2,6,1,1,2
2,54,2,6,1,4,1
3,16,1,6,1,0,2
4,44,1,3,1,4,0


In [227]:
categorical_columns = ["country", "time", "mood"]
numerical_columns = ["age", "sugar_level"]

encoder = OneHotEncoder(sparse_output=False)
one_hot_encoded = encoder.fit_transform(df[categorical_columns])

one_hot_encoded = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))

df_encoded = pd.concat([df[numerical_columns], one_hot_encoded], axis=1)

label_encoder = LabelEncoder()

y = label_encoder.fit_transform(df["coffee_type"])

df_encoded["coffee_type"] = y

In [228]:
df_encoded

Unnamed: 0,age,sugar_level,country_AU,country_CA,country_DE,country_ES,country_FR,country_GB,country_IN,country_IT,...,country_US,time_afternoon,time_evening,time_morning,mood_Excited,mood_Happy,mood_Neutral,mood_Sad,mood_Stressed,coffee_type
0,16,2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0
1,19,2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2
2,54,2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1
3,16,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2
4,44,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,16,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
996,25,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0
997,40,2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,2
998,65,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2


In [233]:
# divide the dataset into test and train set
train, test = train_test_split(df_encoded, test_size=0.2)

In [242]:
X = df_encoded.drop("coffee_type", axis=1)
y = df_encoded["coffee_type"]
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=1)

"""
Xtrain = train.drop(["coffee_type"], axis=1)
ytrain = train["coffee_type"]

X_train, X_val, y_train, y_val = train_test_split(Xtrain, ytrain, test_size=0.2)

X_test = test.drop(["coffee_type"], axis=1)
y_test = test["coffee_type"]
"""

In [243]:
# objective function testing params on the xgb model

def objective(trial):
    params = {
        'verbosity': 0,
        'objective': 'multi:softmax',
        'num_class': len(set(y)),
        'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'eta': trial.suggest_float('eta', 0.01, 1.0, log=True),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide'])
    }
    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)
    preds = model.predict(X_train)
    accuracy = accuracy_score(y_train, preds)
    return accuracy

In [244]:
# make a study to find the best params

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Affichage des meilleurs hyperparamètres
print("Meilleurs hyperparamètres:", study.best_trial.params)

[I 2024-04-30 19:48:32,290] A new study created in memory with name: no-name-7ff718e1-e0a0-4095-8854-96e64327127a
[I 2024-04-30 19:48:32,347] Trial 0 finished with value: 0.7186666666666667 and parameters: {'booster': 'gbtree', 'lambda': 0.00010876706899346307, 'alpha': 4.3387764987173545e-05, 'max_depth': 8, 'eta': 0.4309098519903587, 'gamma': 0.38856892896038, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.7186666666666667.
[I 2024-04-30 19:48:34,911] Trial 1 finished with value: 0.968 and parameters: {'booster': 'dart', 'lambda': 0.001054558025979313, 'alpha': 1.1221333440534847e-07, 'max_depth': 9, 'eta': 0.815007952705715, 'gamma': 2.108607162409623e-07, 'grow_policy': 'lossguide'}. Best is trial 1 with value: 0.968.
[I 2024-04-30 19:48:34,989] Trial 2 finished with value: 0.47333333333333333 and parameters: {'booster': 'gbtree', 'lambda': 1.172677601754145e-08, 'alpha': 0.1452305405446275, 'max_depth': 3, 'eta': 0.031201057814550043, 'gamma': 0.10832179212503117, 'gro

[I 2024-04-30 19:49:16,921] Trial 27 finished with value: 0.968 and parameters: {'booster': 'dart', 'lambda': 5.523852695857918e-08, 'alpha': 7.315776408978187e-08, 'max_depth': 9, 'eta': 0.9904211448666581, 'gamma': 1.5015026673350752e-06, 'grow_policy': 'depthwise'}. Best is trial 1 with value: 0.968.
[I 2024-04-30 19:49:18,713] Trial 28 finished with value: 0.8253333333333334 and parameters: {'booster': 'dart', 'lambda': 5.317903937452213e-06, 'alpha': 1.6009012219974928e-06, 'max_depth': 4, 'eta': 0.23462573473103157, 'gamma': 1.510598920500137e-05, 'grow_policy': 'lossguide'}. Best is trial 1 with value: 0.968.
[I 2024-04-30 19:49:18,948] Trial 29 finished with value: 0.964 and parameters: {'booster': 'gbtree', 'lambda': 8.430155737896573e-05, 'alpha': 3.654234508688194e-05, 'max_depth': 8, 'eta': 0.1269062978913244, 'gamma': 2.0506228802024935e-07, 'grow_policy': 'depthwise'}. Best is trial 1 with value: 0.968.
[I 2024-04-30 19:49:20,795] Trial 30 finished with value: 0.957333333

[I 2024-04-30 19:50:02,400] Trial 54 finished with value: 0.968 and parameters: {'booster': 'dart', 'lambda': 0.00011643850818205405, 'alpha': 6.679668344161185e-05, 'max_depth': 8, 'eta': 0.3925703941909113, 'gamma': 5.888717188635285e-08, 'grow_policy': 'lossguide'}. Best is trial 1 with value: 0.968.
[I 2024-04-30 19:50:04,514] Trial 55 finished with value: 0.968 and parameters: {'booster': 'dart', 'lambda': 4.53073430255049e-08, 'alpha': 0.0005198503551904872, 'max_depth': 9, 'eta': 0.2722000578668834, 'gamma': 4.812972238979271e-05, 'grow_policy': 'lossguide'}. Best is trial 1 with value: 0.968.
[I 2024-04-30 19:50:06,290] Trial 56 finished with value: 0.968 and parameters: {'booster': 'dart', 'lambda': 1.9147667446962445e-06, 'alpha': 2.9039095580183492e-08, 'max_depth': 7, 'eta': 0.8255531981151742, 'gamma': 1.4644562400573e-05, 'grow_policy': 'depthwise'}. Best is trial 1 with value: 0.968.
[I 2024-04-30 19:50:06,587] Trial 57 finished with value: 0.968 and parameters: {'booste

[I 2024-04-30 19:50:41,753] Trial 81 finished with value: 0.968 and parameters: {'booster': 'dart', 'lambda': 2.7730745759802646e-08, 'alpha': 0.0002834592063209825, 'max_depth': 9, 'eta': 0.878648266814399, 'gamma': 4.346980978084776e-08, 'grow_policy': 'depthwise'}. Best is trial 1 with value: 0.968.
[I 2024-04-30 19:50:43,554] Trial 82 finished with value: 0.968 and parameters: {'booster': 'dart', 'lambda': 1.0564944774932835e-08, 'alpha': 0.07461844677427483, 'max_depth': 9, 'eta': 0.6693922287373637, 'gamma': 6.022278146028785e-08, 'grow_policy': 'depthwise'}. Best is trial 1 with value: 0.968.
[I 2024-04-30 19:50:45,397] Trial 83 finished with value: 0.968 and parameters: {'booster': 'dart', 'lambda': 8.055062308565183e-06, 'alpha': 2.74640054130405e-05, 'max_depth': 9, 'eta': 0.9954886344756959, 'gamma': 1.5666438534094074e-08, 'grow_policy': 'depthwise'}. Best is trial 1 with value: 0.968.
[I 2024-04-30 19:50:47,790] Trial 84 finished with value: 0.968 and parameters: {'booster

Meilleurs hyperparamètres: {'booster': 'dart', 'lambda': 0.001054558025979313, 'alpha': 1.1221333440534847e-07, 'max_depth': 9, 'eta': 0.815007952705715, 'gamma': 2.108607162409623e-07, 'grow_policy': 'lossguide'}


In [245]:
# Fit the model with best params

best_params = study.best_trial.params
model = xgb.XGBClassifier(**best_params)
model.fit(X_train, y_train)

In [246]:
# Make some predictions on the test set

predictions = model.predict(X_test)
predictions_labels = label_encoder.inverse_transform(predictions)

accuracy = accuracy_score(y_test, predictions)
print(f"accuracy : {accuracy}")
# print(f"Prédictions values : {predictions_labels}")
# print(f"Real values : {label_encoder.inverse_transform(y_test)}")

accuracy : 0.276


## Testing MLP model

In [239]:
from sklearn.neural_network import MLPClassifier

X = df_encoded.drop("coffee_type", axis=1)
y = df_encoded["coffee_type"]
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=1)
clf = MLPClassifier(random_state=1, max_iter=300)

In [240]:
clf.fit(X_train, y_train)
pred = clf.predict(X_train)
display(f"Train accuracy : {accuracy_score(y_train, pred)}")

'Train accuracy : 0.38133333333333336'

In [241]:
pred = clf.predict(X_test)
display(f"Test accuracy : {accuracy_score(y_test, pred)}")

'Test accuracy : 0.328'