In [209]:
import xgboost as xgb
import numpy as np
import pandas as pd
import optuna
import random
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [225]:
df = pd.read_csv("normal_dist_data.csv").drop('Unnamed: 0', axis=1)
df.head()

Unnamed: 0,country,age,time,mood,sugar_level,coffee_type
0,DE,16,morning,Excited,2,Americano
1,IN,19,evening,Happy,2,Espresso
2,IN,54,evening,Stressed,2,Cappuccino
3,IN,16,evening,Excited,1,Espresso
4,ES,44,evening,Stressed,1,Americano


In [229]:
# encode categorical features
categorical_columns = ["country", "time", "mood", "coffee_type"]
numerical_columns = ["age", "sugar_level"]
df_encoded = pd.DataFrame({})
label_encoder = LabelEncoder()
for feature in categorical_columns:
    df_encoded[feature] = label_encoder.fit_transform(df[feature])
df_encoded = pd.concat([df[numerical_columns], df_encoded], axis=1)
df_encoded.head()

Unnamed: 0,age,sugar_level,country,time,mood,coffee_type
0,16,2,2,2,0,0
1,19,2,6,1,1,2
2,54,2,6,1,4,1
3,16,1,6,1,0,2
4,44,1,3,1,4,0


In [227]:
categorical_columns = ["country", "time", "mood"]
numerical_columns = ["age", "sugar_level"]

encoder = OneHotEncoder(sparse_output=False)
one_hot_encoded = encoder.fit_transform(df[categorical_columns])

one_hot_encoded = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))

df_encoded = pd.concat([df[numerical_columns], one_hot_encoded], axis=1)

label_encoder = LabelEncoder()

y = label_encoder.fit_transform(df["coffee_type"])

df_encoded["coffee_type"] = y

In [228]:
df_encoded

Unnamed: 0,age,sugar_level,country_AU,country_CA,country_DE,country_ES,country_FR,country_GB,country_IN,country_IT,...,country_US,time_afternoon,time_evening,time_morning,mood_Excited,mood_Happy,mood_Neutral,mood_Sad,mood_Stressed,coffee_type
0,16,2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0
1,19,2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2
2,54,2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1
3,16,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2
4,44,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,16,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
996,25,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0
997,40,2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,2
998,65,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2


In [233]:
# divide the dataset into test and train set
train, test = train_test_split(df_encoded, test_size=0.2)

In [242]:
X = df_encoded.drop("coffee_type", axis=1)
y = df_encoded["coffee_type"]
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=1)

"""
Xtrain = train.drop(["coffee_type"], axis=1)
ytrain = train["coffee_type"]

X_train, X_val, y_train, y_val = train_test_split(Xtrain, ytrain, test_size=0.2)

X_test = test.drop(["coffee_type"], axis=1)
y_test = test["coffee_type"]
"""

In [247]:
# objective function testing params on the xgb model

def objective(trial):
    params = {
        'verbosity': 0,
        'objective': 'multi:softmax',
        'num_class': len(set(y)),
        'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'eta': trial.suggest_float('eta', 0.01, 1.0, log=True),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide'])
    }
    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    accuracy = accuracy_score(y_test, preds)
    return accuracy

In [248]:
# make a study to find the best params

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Affichage des meilleurs hyperparamètres
print("Meilleurs hyperparamètres:", study.best_trial.params)

[I 2024-04-30 19:53:36,912] A new study created in memory with name: no-name-5b3bc2b4-517f-43c4-b272-d207845dcaa8
[I 2024-04-30 19:53:39,197] Trial 0 finished with value: 0.276 and parameters: {'booster': 'dart', 'lambda': 0.00011057000903442877, 'alpha': 0.0005110939483348018, 'max_depth': 3, 'eta': 0.5880687674349577, 'gamma': 0.0012013412520845584, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.276.
[I 2024-04-30 19:53:41,061] Trial 1 finished with value: 0.276 and parameters: {'booster': 'dart', 'lambda': 2.0629348785760483e-06, 'alpha': 0.00043764963200646693, 'max_depth': 5, 'eta': 0.012611332853381103, 'gamma': 0.00013554541424154768, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.276.
[I 2024-04-30 19:53:42,754] Trial 2 finished with value: 0.22 and parameters: {'booster': 'dart', 'lambda': 5.9439114494584083e-08, 'alpha': 6.297527975300353e-06, 'max_depth': 3, 'eta': 0.16730487836010302, 'gamma': 4.2090048751301565e-08, 'grow_policy': 'depthwise'}. Best

[I 2024-04-30 19:53:53,643] Trial 27 finished with value: 0.28 and parameters: {'booster': 'gbtree', 'lambda': 1.7480653415965376e-06, 'alpha': 6.32319694208874e-08, 'max_depth': 5, 'eta': 0.10390633908609992, 'gamma': 0.0029926436511525916, 'grow_policy': 'lossguide'}. Best is trial 12 with value: 0.312.
[I 2024-04-30 19:53:53,758] Trial 28 finished with value: 0.312 and parameters: {'booster': 'gbtree', 'lambda': 8.993717273507845e-07, 'alpha': 2.8017797701956313e-06, 'max_depth': 4, 'eta': 0.016112145239030855, 'gamma': 0.022738663301417678, 'grow_policy': 'lossguide'}. Best is trial 12 with value: 0.312.
[I 2024-04-30 19:53:53,829] Trial 29 finished with value: 0.308 and parameters: {'booster': 'gbtree', 'lambda': 5.022080087864638e-05, 'alpha': 0.00019445125672014392, 'max_depth': 3, 'eta': 0.015164600685272613, 'gamma': 0.0005476245585310972, 'grow_policy': 'depthwise'}. Best is trial 12 with value: 0.312.
[I 2024-04-30 19:53:53,911] Trial 30 finished with value: 0.308 and parame

[I 2024-04-30 19:54:05,912] Trial 54 finished with value: 0.3 and parameters: {'booster': 'gbtree', 'lambda': 1.4267676439368392e-05, 'alpha': 2.0403649738208356e-05, 'max_depth': 4, 'eta': 0.023763904949328576, 'gamma': 0.035101151499400796, 'grow_policy': 'lossguide'}. Best is trial 12 with value: 0.312.
[I 2024-04-30 19:54:06,179] Trial 55 finished with value: 0.316 and parameters: {'booster': 'gbtree', 'lambda': 4.3546343777895095e-06, 'alpha': 4.804898486546243e-07, 'max_depth': 6, 'eta': 0.028893379537182132, 'gamma': 0.001258222243348886, 'grow_policy': 'lossguide'}. Best is trial 55 with value: 0.316.
[I 2024-04-30 19:54:06,448] Trial 56 finished with value: 0.296 and parameters: {'booster': 'gbtree', 'lambda': 4.491582340712357e-06, 'alpha': 8.352171511906719e-07, 'max_depth': 6, 'eta': 0.03838213555372933, 'gamma': 0.0017167671342072424, 'grow_policy': 'lossguide'}. Best is trial 55 with value: 0.316.
[I 2024-04-30 19:54:06,895] Trial 57 finished with value: 0.284 and paramet

[I 2024-04-30 19:54:17,824] Trial 81 finished with value: 0.312 and parameters: {'booster': 'gbtree', 'lambda': 0.00015609707350782223, 'alpha': 0.00011942936301341, 'max_depth': 3, 'eta': 0.016210858792823316, 'gamma': 0.0003002079175216812, 'grow_policy': 'depthwise'}. Best is trial 58 with value: 0.32.
[I 2024-04-30 19:54:18,034] Trial 82 finished with value: 0.296 and parameters: {'booster': 'gbtree', 'lambda': 0.00017231900338191924, 'alpha': 4.36594961183136e-05, 'max_depth': 3, 'eta': 0.02747109169103338, 'gamma': 0.0002983466671785168, 'grow_policy': 'depthwise'}. Best is trial 58 with value: 0.32.
[I 2024-04-30 19:54:18,323] Trial 83 finished with value: 0.272 and parameters: {'booster': 'gbtree', 'lambda': 0.9397535758625877, 'alpha': 0.0004061761661091484, 'max_depth': 4, 'eta': 0.023763047211407345, 'gamma': 0.0001730663630478944, 'grow_policy': 'lossguide'}. Best is trial 58 with value: 0.32.
[I 2024-04-30 19:54:18,412] Trial 84 finished with value: 0.312 and parameters: {

Meilleurs hyperparamètres: {'booster': 'gbtree', 'lambda': 0.025264724316604294, 'alpha': 4.2110496859401065e-07, 'max_depth': 6, 'eta': 0.0785032162186483, 'gamma': 0.014646734388079301, 'grow_policy': 'lossguide'}


In [249]:
# Fit the model with best params

best_params = study.best_trial.params
model = xgb.XGBClassifier(**best_params)
model.fit(X_train, y_train)

In [250]:
# Make some predictions on the test set

predictions = model.predict(X_test)
predictions_labels = label_encoder.inverse_transform(predictions)

accuracy = accuracy_score(y_test, predictions)
print(f"accuracy : {accuracy}")
# print(f"Prédictions values : {predictions_labels}")
# print(f"Real values : {label_encoder.inverse_transform(y_test)}")

accuracy : 0.32


## Testing MLP model

In [251]:
from sklearn.neural_network import MLPClassifier

X = df_encoded.drop("coffee_type", axis=1)
y = df_encoded["coffee_type"]
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=1)
clf = MLPClassifier(random_state=1, max_iter=300)

In [252]:
clf.fit(X_train, y_train)
pred = clf.predict(X_train)
display(f"Train accuracy : {accuracy_score(y_train, pred)}")

'Train accuracy : 0.38133333333333336'

In [253]:
pred = clf.predict(X_test)
display(f"Test accuracy : {accuracy_score(y_test, pred)}")

'Test accuracy : 0.328'