In [None]:
import sklearn.metrics
from torch.utils.data import TensorDataset
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import os
# Move one step in the directory structure to access src
import sys
sys.path.append(os.path.abspath(os.path.join('..')))
#from src.torch.torch_utils import *
from src.torch.torch_wrapper import *

In [None]:
# Load the dataset from its CSV file into a DataFrame (relative path, so it
# is resolved against the current working directory).
df = pd.read_csv('../data/health_lifestyle_dataset_cleaned.csv')

In [None]:
# Bare expression: the DataFrame is rendered as this cell's output for a
# quick visual check of the loaded data.
df

In [None]:
# Columns predicted by each task: two targets for the regression task and a
# single label column for the classification task.
regression_target = ['cholesterol', 'calories_consumed']
classification_target = 'disease_risk'

# Regression task: labels are the two target columns, features are every
# other column of the frame.
# NOTE(review): 'disease_risk' therefore stays among the regression inputs;
# confirm this is intended.
regression_labels = df[regression_target].to_numpy()
regression_features = df.drop(columns=regression_target).to_numpy()

# Classification task: labels are the 'disease_risk' column, features are
# every other column of the frame.
# NOTE(review): the two regression targets therefore stay among the
# classification inputs; confirm this is intended.
classification_labels = df[classification_target].to_numpy()
classification_features = df.drop(columns=classification_target).to_numpy()

In [None]:
# Hold out 20% of the rows as a test set for each task; the fixed seed keeps
# both splits reproducible across runs.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    regression_features, regression_labels, test_size=0.2, random_state=42
)

# Stratify on the class labels so the train and test sets keep the same class
# proportions as the full dataset (a purely random split can skew them).
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    classification_features,
    classification_labels,
    test_size=0.2,
    random_state=42,
    stratify=classification_labels,
)

# Shuffled, stratified 5-fold splitter, available as the `cv` argument when
# cross-validating on the classification labels.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Regression

In [None]:
# Search space handed to the TorchRegressor wrapper. Every combination of
# these values is tried by the grid search below.
param_grid = dict(
    layers=[3, 4],
    width=[64, 128],
    lr=[1e-2, 1e-3],
    epochs=[10],
    loss_fn=[nn.MSELoss, nn.HuberLoss],
    activation=[nn.ReLU, nn.Tanh],
    dropout_rates=[[0.0], [0.5, 0.2]],
    # Not searched for now: "optimizer" (e.g. torch.optim.SGD) and
    # "batch_size" (e.g. 16, 32); they are simply not passed to the wrapper.
)

# Exhaustive 3-fold cross-validated search, scored with R², using all
# available CPU cores and verbose progress output.
grid = GridSearchCV(
    estimator=TorchRegressor(),
    param_grid=param_grid,
    scoring="r2",
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid.fit(X_train_reg, y_train_reg)

# Report the winning configuration and its mean cross-validated score.
print("Meilleurs paramètres :", grid.best_params_)
print("Score :", grid.best_score_)

In [None]:
# Estimator refitted by the grid search with the best parameter combination.
best_model = grid.best_estimator_
# Loss values recorded by the wrapper on that estimator; plotted further down
# as the training loss per epoch.
# NOTE(review): `loss_history` is defined by the project wrapper, not visible
# here; confirm it holds one value per epoch.
hist = best_model.loss_history

In [None]:
#hist = train(model_reg, training_data_reg, optimizer, epochs=30)

In [None]:
# Score the best regressor on the data it was trained on (optimistic
# reference point for the test-set figures).
y_pred = best_model.predict(X_train_reg)
mse, mae, r2 = (
    metric(y_train_reg, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
print(f"Train MSE: {mse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")

In [None]:
# Score the best regressor on the held-out test split.
y_pred = best_model.predict(X_test_reg)
mse, mae, r2 = (
    metric(y_test_reg, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
print(f"Test MSE: {mse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")

In [None]:
# Plot the recorded loss history of the best regressor on a dedicated
# figure/axes pair instead of the implicit pyplot state.
fig, ax = plt.subplots()
ax.plot(hist, label='Train Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Training Loss over Epochs')
ax.legend()
plt.show()

## History

On n'avait pas standardisé les données, donc les loss étaient énormes (de l'ordre de 1e36). Après standardisation, elles sont entre 1900 et 2400.

```python
model_reg = RegressionModel(X_train_reg.shape[1], width=512, dropout=[0]).to(device)
```

Ensuite j'ai essayé avec des architectures plus petites (width=128, puis width=64) et avec du dropout (0.2, 0.5) mais les performances étaient moins bonnes.

```
Final Evaluation:
Loss: 1.0004
MAE: 0.8677
R2: -0.0003
```

# Classification

In [None]:
# Search space handed to the TorchClassifier wrapper. Every combination of
# these values is tried by the grid search below.
# NOTE(review): "loss_fn" lists regression losses (MSE / Huber) for a
# classification task, and both "loss_fn" and "activation" are passed as
# instances here whereas the regression grid in this notebook passes the
# classes themselves. Confirm what TorchClassifier expects.
param_grid = {
    "layers": [3, 4],
    "width": [64, 128],
    "lr": [1e-2, 1e-3],
    "epochs": [30],
    "loss_fn": [nn.MSELoss(), nn.HuberLoss()],
    "optimizer": [torch.optim.SGD],
    "activation": [nn.ReLU(), nn.Tanh()],
    "batch_size": [16, 32],
    "dropout_rates": [[0.0], [0.5, 0.2]],
}

# Score with a classification metric: "r2" is a regression score and is not
# meaningful for predicted class labels. The folds come from the stratified
# splitter `skf` so that each fold keeps the class proportions.
grid = GridSearchCV(TorchClassifier(), param_grid, cv=skf, scoring="accuracy", n_jobs=-1)
grid.fit(X_train_clf, y_train_clf)

# Report the winning configuration and its mean cross-validated accuracy.
print("Meilleurs paramètres :", grid.best_params_)
print("Score :", grid.best_score_)

In [None]:
# Build the classifier with an input width matching the classification
# feature matrix and 2 outputs. The width must come from X_train_clf, not
# X_train_reg: the regression features drop two target columns while the
# classification features drop only one, so the two matrices differ in width.
# NOTE(review): `ClassificationModel` and `device` are expected to come from
# the project's star import; confirm they are in scope.
model_clf = ClassificationModel(X_train_clf.shape[1], 2).to(device)

# Adam optimizer over all model parameters with a learning rate of 1e-3.
optimizer = torch.optim.Adam(model_clf.parameters(), lr=1e-3)

In [None]:
# Run the project's training helper on the classifier with epochs=50.
# NOTE(review): `training_data_clf` is not defined in any visible cell of this
# notebook, so this call fails on a fresh kernel unless it is created
# elsewhere (presumably a dataset built from X_train_clf / y_train_clf); confirm.
train(model_clf, training_data_clf, optimizer, epochs=50)


In [None]:
# Evaluate the trained classifier with the project's helper, which returns a
# loss value and a mapping of metric name to metric value, then print both.
# NOTE(review): `testing_data_clf` is not defined in any visible cell; confirm
# where it is created.
loss, metrics = evaluate(model_clf, testing_data_clf)
print("Final Evaluation:")
print(f"Loss: {loss:.4f}")
for name, value in metrics.items():
    print(f"{name}: {value:.4f}")