In [1]:
import xgboost as xgb
import numpy as np
import pandas as pd
import optuna
import random
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv("normal_dist_data.csv").drop('Unnamed: 0', axis=1)
df.head()

Unnamed: 0,country,age,time,mood,sugar_level,coffee_type
0,DE,16,morning,Excited,2,Americano
1,IN,19,evening,Happy,2,Espresso
2,IN,54,evening,Stressed,2,Cappuccino
3,IN,16,evening,Excited,1,Espresso
4,ES,44,evening,Stressed,1,Americano


In [3]:
# encode categorical features
categorical_columns = ["country", "time", "mood", "coffee_type"]
numerical_columns = ["age", "sugar_level"]
df_encoded = pd.DataFrame({})
label_encoder = LabelEncoder()
for feature in categorical_columns:
    df_encoded[feature] = label_encoder.fit_transform(df[feature])
df_encoded = pd.concat([df[numerical_columns], df_encoded], axis=1)
df_encoded.head()

Unnamed: 0,age,sugar_level,country,time,mood,coffee_type
0,16,2,2,2,0,0
1,19,2,6,1,1,2
2,54,2,6,1,4,1
3,16,1,6,1,0,2
4,44,1,3,1,4,0


In [4]:
categorical_columns = ["country", "time", "mood"]
numerical_columns = ["age", "sugar_level"]

encoder = OneHotEncoder(sparse_output=False)
one_hot_encoded = encoder.fit_transform(df[categorical_columns])

one_hot_encoded = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))

df_encoded = pd.concat([df[numerical_columns], one_hot_encoded], axis=1)

label_encoder = LabelEncoder()

y = label_encoder.fit_transform(df["coffee_type"])

df_encoded["coffee_type"] = y

In [5]:
df_encoded

Unnamed: 0,age,sugar_level,country_AU,country_CA,country_DE,country_ES,country_FR,country_GB,country_IN,country_IT,...,country_US,time_afternoon,time_evening,time_morning,mood_Excited,mood_Happy,mood_Neutral,mood_Sad,mood_Stressed,coffee_type
0,16,2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0
1,19,2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2
2,54,2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1
3,16,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2
4,44,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,16,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
996,25,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0
997,40,2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,2
998,65,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2


In [6]:
# divide the dataset into test and train set
train, test = train_test_split(df_encoded, test_size=0.2)

In [7]:
X = df_encoded.drop("coffee_type", axis=1)
y = df_encoded["coffee_type"]
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=1)

"""
Xtrain = train.drop(["coffee_type"], axis=1)
ytrain = train["coffee_type"]

X_train, X_val, y_train, y_val = train_test_split(Xtrain, ytrain, test_size=0.2)

X_test = test.drop(["coffee_type"], axis=1)
y_test = test["coffee_type"]
"""

'\nXtrain = train.drop(["coffee_type"], axis=1)\nytrain = train["coffee_type"]\n\nX_train, X_val, y_train, y_val = train_test_split(Xtrain, ytrain, test_size=0.2)\n\nX_test = test.drop(["coffee_type"], axis=1)\ny_test = test["coffee_type"]\n'

In [8]:
# objective function testing params on the xgb model

def objective(trial):
    params = {
        'verbosity': 0,
        'objective': 'multi:softmax',
        'num_class': len(set(y)),
        'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'eta': trial.suggest_float('eta', 0.01, 1.0, log=True),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide'])
    }
    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    accuracy = accuracy_score(y_test, preds)
    return accuracy

In [9]:
# make a study to find the best params

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Affichage des meilleurs hyperparamètres
print("Meilleurs hyperparamètres:", study.best_trial.params)

[I 2024-06-08 19:28:53,512] A new study created in memory with name: no-name-b8d16168-f7d6-4af7-9c64-39d216349876
[I 2024-06-08 19:28:54,911] Trial 0 finished with value: 0.316 and parameters: {'booster': 'gbtree', 'lambda': 0.4207836159105567, 'alpha': 0.002674855990116001, 'max_depth': 6, 'eta': 0.49742269462522665, 'gamma': 5.715116419367767e-07, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.316.
[I 2024-06-08 19:28:55,237] Trial 1 finished with value: 0.272 and parameters: {'booster': 'gbtree', 'lambda': 0.004410485902316804, 'alpha': 0.0027918437247564823, 'max_depth': 5, 'eta': 0.09519320989989476, 'gamma': 4.5851473239419506e-05, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 0.316.
[I 2024-06-08 19:28:59,355] Trial 2 finished with value: 0.28 and parameters: {'booster': 'gbtree', 'lambda': 5.547922198728083e-05, 'alpha': 0.005815956434253947, 'max_depth': 9, 'eta': 0.9108164886011635, 'gamma': 3.196391281947093e-05, 'grow_policy': 'lossguide'}. Best is tr

Meilleurs hyperparamètres: {'booster': 'gbtree', 'lambda': 1.2171656818310786e-08, 'alpha': 0.00012263826938770604, 'max_depth': 5, 'eta': 0.30454544043454257, 'gamma': 0.4599172738566427, 'grow_policy': 'depthwise'}


In [10]:
# Fit the model with best params

best_params = study.best_trial.params
model = xgb.XGBClassifier(**best_params)
model.fit(X_train, y_train)

In [11]:
# Make some predictions on the test set

predictions = model.predict(X_test)
predictions_labels = label_encoder.inverse_transform(predictions)

accuracy = accuracy_score(y_test, predictions)
print(f"accuracy : {accuracy}")
# print(f"Prédictions values : {predictions_labels}")
# print(f"Real values : {label_encoder.inverse_transform(y_test)}")

accuracy : 0.324


## Testing MLP model

In [12]:
from sklearn.neural_network import MLPClassifier

X = df_encoded.drop("coffee_type", axis=1)
y = df_encoded["coffee_type"]
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=1)
clf = MLPClassifier(random_state=1, max_iter=300)

In [13]:
clf.fit(X_train, y_train)
pred = clf.predict(X_train)
display(f"Train accuracy : {accuracy_score(y_train, pred)}")

'Train accuracy : 0.448'

In [14]:
pred = clf.predict(X_test)
display(f"Test accuracy : {accuracy_score(y_test, pred)}")

'Test accuracy : 0.316'