In [42]:
import xgboost as xgb
import numpy as np
import pandas as pd
import optuna
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [43]:
# load dataset

df = pd.read_csv("../../CoffeeCraft-server/backend/app/ml/data/model_data.csv")

X = df.drop('coffee_type', axis=1)
y = df['coffee_type']

df.head()

Unnamed: 0,country,age,date,mood,sugar_level,coffee_type
0,US,25,morning,happy,medium,espresso
1,ES,34,afternoon,normal,low,latte
2,IT,45,evening,depressed,high,americano
3,US,23,morning,ecstatic,low,cappuccino
4,IT,35,evening,sad,medium,espresso


In [45]:
# encode categorical features

categorical_features = ['country', 'date', 'mood', 'sugar_level']
label_encoder = LabelEncoder()
for feature in categorical_features:
    X[feature] = label_encoder.fit_transform(X[feature])
X["age"] = X["age"].astype(int)
y = label_encoder.fit_transform(y)
display(X.head())
display(y)

SyntaxError: unexpected EOF while parsing (3278831806.py, line 10)

In [33]:
# divide the dataset into test and train set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
# objective function testing params on the xgb model

def objective(trial):
    params = {
        'verbosity': 0,
        'objective': 'multi:softmax',
        'num_class': len(set(y)),
        'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'eta': trial.suggest_float('eta', 0.01, 1.0, log=True),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide'])
    }
    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    accuracy = accuracy_score(y_test, preds)
    return accuracy

In [35]:
# make a study to find the best params

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Affichage des meilleurs hyperparamètres
print("Meilleurs hyperparamètres:", study.best_trial.params)

[I 2024-04-09 18:30:44,066] A new study created in memory with name: no-name-8fb32696-f265-4d8c-999e-f46c5cd2e826
[I 2024-04-09 18:30:44,696] Trial 0 finished with value: 0.4 and parameters: {'booster': 'dart', 'lambda': 0.006905052191130745, 'alpha': 1.5562144266780798e-07, 'max_depth': 7, 'eta': 0.1658540356016752, 'gamma': 9.420379295292709e-08, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 0.4.
[I 2024-04-09 18:30:44,770] Trial 1 finished with value: 0.4 and parameters: {'booster': 'gbtree', 'lambda': 0.0007348248462766141, 'alpha': 0.0014005454694491811, 'max_depth': 9, 'eta': 0.22150345363627147, 'gamma': 5.632664409654516e-06, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.4.
[I 2024-04-09 18:30:45,358] Trial 2 finished with value: 0.6 and parameters: {'booster': 'dart', 'lambda': 2.9502099191213237e-06, 'alpha': 0.00016907109801160745, 'max_depth': 6, 'eta': 0.013257542765623603, 'gamma': 3.8787998253830526e-07, 'grow_policy': 'lossguide'}. Best is trial 

[I 2024-04-09 18:30:54,570] Trial 27 finished with value: 0.6 and parameters: {'booster': 'dart', 'lambda': 0.00023190708732000054, 'alpha': 0.010269990993488269, 'max_depth': 4, 'eta': 0.057809431188922365, 'gamma': 0.1288624945801765, 'grow_policy': 'lossguide'}. Best is trial 2 with value: 0.6.
[I 2024-04-09 18:30:54,650] Trial 28 finished with value: 0.4 and parameters: {'booster': 'gbtree', 'lambda': 0.26572542567060853, 'alpha': 4.6973771316159135e-06, 'max_depth': 8, 'eta': 0.119602922503855, 'gamma': 0.0012142723536296062, 'grow_policy': 'lossguide'}. Best is trial 2 with value: 0.6.
[I 2024-04-09 18:30:55,221] Trial 29 finished with value: 0.6 and parameters: {'booster': 'dart', 'lambda': 0.0047472453905907696, 'alpha': 1.9225049328402188e-07, 'max_depth': 7, 'eta': 0.016545539783862296, 'gamma': 7.26458454677853e-05, 'grow_policy': 'lossguide'}. Best is trial 2 with value: 0.6.
[I 2024-04-09 18:30:55,774] Trial 30 finished with value: 0.6 and parameters: {'booster': 'dart', '

Meilleurs hyperparamètres: {'booster': 'dart', 'lambda': 2.9502099191213237e-06, 'alpha': 0.00016907109801160745, 'max_depth': 6, 'eta': 0.013257542765623603, 'gamma': 3.8787998253830526e-07, 'grow_policy': 'lossguide'}


In [36]:
# Fit the model with best params

best_params = study.best_trial.params
model = xgb.XGBClassifier(**best_params, num_class=len(set(y)))
model.fit(X_train, y_train)

In [41]:
# Make some predictions on the test set

predictions = model.predict(X_test)
predictions_labels = label_encoder.inverse_transform(predictions)

accuracy = accuracy_score(y_test, predictions)
print(f"accuracy : {accuracy}")
print(f"Prédictions values : {predictions_labels}")
print(f"Real values : {label_encoder.inverse_transform(y_test)}")



accuracy : 0.6
Prédictions values : ['latte' 'cappuccino' 'espresso' 'espresso' 'latte']
Real values : ['espresso' 'cappuccino' 'espresso' 'latte' 'latte']
