In [None]:
import sklearn.metrics
import torch
from torch.utils.data import TensorDataset
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import os
# Move one step in the directory structure to access src
import sys
sys.path.append(os.path.abspath(os.path.join('..')))
from src.torch.torch_utils import *
from src.torch.torch_wrapper import *

In [None]:
# Load the cleaned dataset CSV that every experiment in this notebook reads from.
dataset_csv = '../data/health_lifestyle_dataset_cleaned.csv'
df = pd.read_csv(dataset_csv)

In [None]:
# Bare expression: lets the notebook render the loaded frame for a quick visual check.
df

In [None]:
# Target column(s) of each task.
regression_target = ['cholesterol', 'calories_consumed']
classification_target = 'disease_risk'

# Labels: the target column(s) of each task as arrays.
regression_labels = df[regression_target].values
classification_labels = df[classification_target].values

# Features: every column left once the task's own target(s) are dropped.
# Note that each task therefore keeps the other task's target column(s) as inputs.
regression_features = df.drop(columns=regression_target).values
classification_features = df.drop(columns=classification_target).values

In [None]:
# Hold out 20% of the rows of each task for the final evaluation.
# A fixed seed keeps both splits repeatable across runs.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    regression_features, regression_labels, test_size=0.2, random_state=42
)
# Stratify the classification split so the train and test sets keep the same
# class proportions, consistent with the StratifiedKFold used for
# cross-validation below.
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    classification_features,
    classification_labels,
    test_size=0.2,
    random_state=42,
    stratify=classification_labels,
)

# Shuffled, stratified 3-fold splitter used by the classification grid search.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Regression

In [None]:
# Hyper-parameter grid explored for the regression network.
# `loss_fn` (nn.MSELoss / nn.HuberLoss) and `optimizer` (torch.optim.SGD /
# torch.optim.Adam) are deliberately left out of the search for now, so the
# estimator is used with whatever it was constructed with for those.
param_grid = dict(
    layers=[3, 4],
    width=[32, 64],
    lr=[1e-3],
    epochs=[20],
    activation=[nn.Tanh],
    batch_size=[16, 32],
    dropout_rates=[[0.3], [0.5, 0.2]],
)

In [None]:
# Exhaustive search over `param_grid` for the regression wrapper:
# 3-fold cross-validation, candidates ranked by R2, all CPU cores used.
grid = GridSearchCV(
    estimator=TorchRegressor(),
    param_grid=param_grid,
    scoring="r2",
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid.fit(X_train_reg, y_train_reg)

# Report the winning configuration and its mean cross-validated score.
print("Meilleurs paramètres :", grid.best_params_)
print("Score :", grid.best_score_)

Meilleurs paramètres : {'activation': <class 'torch.nn.modules.activation.Tanh'>, 'dropout_rates': [0.5, 0.2], 'epochs': 30, 'layers': 3, 'loss_fn': <class 'torch.nn.modules.loss.HuberLoss'>, 'lr': 0.001, 'width': 64} \
Score : -0.00048325874797827684


10min pour 216 fits \
Meilleurs paramètres : {'activation': <class 'torch.nn.modules.activation.Tanh'>, 'batch_size': 32, 'dropout_rates': [0.5, 0.2], 'epochs': 20, 'layers': 4, 'loss_fn': <class 'torch.nn.modules.loss.MSELoss'>, 'lr': 0.001, 'width': 32} \
Score : -4.377366297199833e-05

In [None]:
# Best estimator found by the regression grid search.
best_model = grid.best_estimator_
# Training-loss history exposed by the wrapper (plotted against epochs further down).
hist = best_model.loss_history

In [None]:
# Regression metrics on the training split (optimistic; compare with the test split).
y_pred = best_model.predict(X_train_reg)
mse, mae, r2 = (
    metric(y_train_reg, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
print(f"Train MSE: {mse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")

In [None]:
# Same regression metrics, this time on the held-out test split.
y_pred = best_model.predict(X_test_reg)
mse, mae, r2 = (
    metric(y_test_reg, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
print(f"Test MSE: {mse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")

In [None]:
# Training-loss curve of the selected regression model.
fig, ax = plt.subplots()
ax.plot(hist, label='Train Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Training Loss over Epochs')
ax.legend()
plt.show()

## History

On n'avait pas standardisé les données, du coup les loss étaient énormes (de l'ordre de 1e36). Après ça, elles sont entre 1900 et 2400.

```python
model_reg = RegressionModel(X_train_reg.shape[1], width=512, dropout=[0]).to(device)
```

Ensuite j'ai essayé avec des architectures plus petites (width=128, puis width=64) et avec du dropout (0.2, 0.5) mais les performances étaient moins bonnes.

Final Evaluation:
Loss: 1.0004
MAE: 0.8677
R2: -0.0003

# Classification

In [None]:
# Class weights derived from the training labels.
# NOTE(review): `class_weights` is computed here but the grid below passes a
# hard-coded tensor instead - confirm which one is intended.
class_weights = compute_class_weights(y_train_clf)

# Hyper-parameter grid explored for the classification network.
param_grid = dict(
    layers=[4, 5],
    width=[128, 256],
    lr=[1e-3],
    epochs=[30],
    class_weights=[torch.tensor([1.0, 4.5])],
    loss_fn=[nn.CrossEntropyLoss],
    optimizer=[torch.optim.SGD],
    activation=[nn.ReLU],
    batch_size=[16],
    dropout_rates=[[0.3], [0.5, 0.2]],
)

In [None]:
# Grid search for the classifier, cross-validated with the stratified splitter `skf`.
# Candidates are ranked by balanced accuracy rather than plain accuracy: the
# runs recorded in this notebook report the exact same accuracy for different
# winning configurations, which suggests plain accuracy was not discriminating
# between candidates (presumably because of class imbalance - TODO confirm).
grid = GridSearchCV(
    TorchClassifier(),
    param_grid,
    cv=skf,
    scoring="balanced_accuracy",
    n_jobs=-1,
    verbose=2,
)
grid.fit(X_train_clf, y_train_clf)

# Report the winning configuration and its mean cross-validated score.
print("Meilleurs paramètres :", grid.best_params_)
print("Score :", grid.best_score_)

Fitting 3 folds for each of 64 candidates, totalling 192 fits \
Meilleurs paramètres : {'activation': <class 'torch.nn.modules.activation.ReLU'>, 'batch_size': 16, 'dropout_rates': [0.0], 'epochs': 30, 'layers': 3, 'loss_fn': <class 'torch.nn.modules.loss.CrossEntropyLoss'>, 'lr': 0.01, 'optimizer': <class 'torch.optim.sgd.SGD'>, 'width': 64}\
Score : 0.7517124999224092

6min pour 108 fits \
Meilleurs paramètres : {'activation': <class 'torch.nn.modules.activation.ReLU'>, 'batch_size': 16, 'class_weights': tensor([0.6651, 2.0138]), 'dropout_rates': [0.3], 'epochs': 30, 'layers': 3, 'loss_fn': <class 'torch.nn.modules.loss.CrossEntropyLoss'>, 'lr': 0.01, 'optimizer': <class 'torch.optim.sgd.SGD'>, 'width': 32} \
Score : 0.7517124999224092

In [None]:
# Sanity check: push a handful of test rows through the raw network and look
# at the shape and values of its outputs.
Xsmall = X_test_clf[:8]

# Take the estimator straight from the classification grid search. At this
# point `best_model` has not been rebound to the classifier yet (that happens
# in the next cell), so on a fresh top-to-bottom run it would still refer to
# the regression estimator.
clf_estimator = grid.best_estimator_

# Inspection only: no gradients are needed for this forward pass.
with torch.no_grad():
    logits = clf_estimator.model(torch.tensor(Xsmall, dtype=torch.float32))

print("logits shape:", logits.shape)  # expected to be (8, 2) or (8, 1)
print(logits[:5])

In [None]:
# Best estimator found by the classification grid search, and the
# training-loss history exposed by the wrapper.
best_model: Skwrapper = grid.best_estimator_
hist = best_model.loss_history

In [None]:
# Predicted probabilities on the test split; column 1 is read as class 1.
probs = best_model.predict_proba(X_test_clf)
class1_probs = probs[:, 1]
print("max prob class 1:", class1_probs.max())
print("mean prob class 1:", class1_probs.mean())


In [None]:
# Weighted F1 and accuracy on the training split (default decision rule of the wrapper).
y_pred = best_model.predict(X_train_clf)
accuracy = accuracy_score(y_train_clf, y_pred)
f1 = f1_score(y_train_clf, y_pred, average='weighted')
print(f"Train F1: {f1:.4f}, Accuracy: {accuracy:.4f}")

In [None]:
# Value passed as the second positional argument of `predict`
# (presumably the decision threshold on the class-1 probability - TODO confirm).
# NOTE(review): this looks like the `best_t` obtained from the precision/recall
# curve computed on the test split further down; tuning it on the same data it
# is evaluated on makes these test metrics optimistic.
decision_threshold = 0.5830

y_pred = best_model.predict(X_test_clf, decision_threshold)
accuracy = accuracy_score(y_test_clf, y_pred)
f1 = f1_score(y_test_clf, y_pred, average='weighted')
print(f"Test F1: {f1:.4f}, Accuracy: {accuracy:.4f}")

In [None]:
# Per-class precision / recall / F1 for the latest test predictions.
clf_report = classification_report(y_test_clf, y_pred)
print(clf_report)

In [None]:
from sklearn.metrics import precision_recall_curve

# Precision/recall at every candidate threshold on the class-1 probability.
precision, recall, thresholds = precision_recall_curve(y_test_clf, probs[:, 1])

# F1 at each point of the curve; the small epsilon avoids a 0/0 division.
f1 = (2 * precision * recall) / (precision + recall + 1e-8)

# `precision`/`recall` have one more entry than `thresholds`. That extra last
# point has recall 0, hence F1 0, and argmax returns the first maximum, so the
# index below always falls inside `thresholds`.
best_idx = f1.argmax()
best_t = thresholds[best_idx]

best_t


In [None]:
# Confusion matrix of the latest test predictions, drawn as an annotated heatmap.
conf_mat = confusion_matrix(y_test_clf, y_pred)
sns.heatmap(conf_mat, annot=True, fmt="d", cmap="Blues")
plt.title("Matrice de confusion")
plt.show()

In [None]:
# Training-loss curve of the selected classification model.
fig, ax = plt.subplots()
ax.plot(hist, label='Train Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Training Loss over Epochs')
ax.legend()
plt.show()

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest baseline for the classification task: 100 trees, fixed seed,
# and class_weight="balanced" to account for the class frequencies.
rf = RandomForestClassifier(
    n_estimators=100,
    class_weight="balanced",
    random_state=42,
)
rf.fit(X_train_clf, y_train_clf)

In [None]:
# Random-forest scores on the training split.
y_pred = rf.predict(X_train_clf)
accuracy = accuracy_score(y_train_clf, y_pred)
f1 = f1_score(y_train_clf, y_pred, average='weighted')
print(f"Train F1: {f1:.4f}, Accuracy: {accuracy:.4f}")

# Random-forest scores on the held-out test split, followed by the per-class report.
y_pred = rf.predict(X_test_clf)
accuracy = accuracy_score(y_test_clf, y_pred)
f1 = f1_score(y_test_clf, y_pred, average='weighted')
print(f"Test F1: {f1:.4f}, Accuracy: {accuracy:.4f}")
print(classification_report(y_test_clf, y_pred))

In [None]:
# Confusion matrix of the random-forest test predictions, drawn as an annotated heatmap.
rf_conf_mat = confusion_matrix(y_test_clf, y_pred)
sns.heatmap(rf_conf_mat, annot=True, fmt="d", cmap="Blues")
plt.title("Matrice de confusion")
plt.show()