In [None]:

from torch.utils.data import TensorDataset
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV

from src.torch.torch_utils import *
#from src.torch.torch_model import *
from src.torch.torch_wrapper import *

In [None]:
# Load the cleaned health/lifestyle dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../data/health_lifestyle_dataset_cleaned.csv')

In [None]:
# Bare expression: renders the loaded DataFrame as this cell's output for a quick visual check.
df

In [None]:
# Column names of the two prediction targets.
regression_target, classification_target = 'cholesterol', 'disease_risk'

# Every remaining column is a model input; both tasks share this feature matrix.
features = df.drop(columns=[regression_target, classification_target]).values

# One label array per task, pulled from its target column.
classification_labels = df[classification_target].values
regression_labels = df[regression_target].values

In [None]:
# 80/20 hold-out split for the regression task.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    features,
    regression_labels,
    random_state=42,
    test_size=0.2,
)

# Same split settings (and same seed) for the classification task.
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    features,
    classification_labels,
    random_state=42,
    test_size=0.2,
)

# Regression

In [None]:
# Wrap the regression splits as float tensors, pairing each feature row with its target.
# NOTE(review): the targets stay 1-D here — confirm that `train` / the model's loss
# compare them against predictions of a matching shape.
train_set_reg = TensorDataset(torch.FloatTensor(X_train_reg), torch.FloatTensor(y_train_reg))
test_set_reg = TensorDataset(torch.FloatTensor(X_test_reg), torch.FloatTensor(y_test_reg))

# Mini-batches of 32; only the training loader reshuffles its samples.
training_data_reg = DataLoader(train_set_reg, batch_size=32, shuffle=True)
testing_data_reg = DataLoader(test_set_reg, batch_size=32, shuffle=False)

In [None]:
# Hyperparameter search for the regression wrapper.
# `GridSearchCV` is already imported in the notebook's import cell, so it is not
# re-imported here.
reg = TorchRegressor()

# Candidate values handed to the estimator for each grid point.
param_grid = {
    "width": [64, 128, 256],
    "lr": [1e-2, 1e-3],
    "epochs": [20]
}

# 3-fold cross-validation. The scorer is the *negated* MSE, so scores are <= 0
# and values closer to zero are better.
grid = GridSearchCV(reg, param_grid, cv=3, scoring="neg_mean_squared_error")
grid.fit(X_train_reg, y_train_reg)

# `best_score_` is the mean cross-validated negated MSE of the best grid point.
print("Meilleurs paramètres :", grid.best_params_)
print("Score :", grid.best_score_)


In [None]:
# Second sweep: same widths and learning rates, but two epoch budgets.
param = dict(
    width=[64, 128, 256],
    lr=[1e-2, 1e-3],
    epochs=[10, 50],
)

# No `scoring` is given, so the search falls back to the estimator's own `score` method.
# This rebinds `grid`, replacing the search object from the previous cell.
grid = GridSearchCV(TorchRegressor(), param_grid=param, cv=3)

grid.fit(X_train_reg, y_train_reg)

In [None]:
# Regression network sized to the number of feature columns, trained with a Huber loss.
model_reg = RegressionModel(
    X_train_reg.shape[1],
    loss_fn=nn.HuberLoss(),
    width=64,
    dropout_rates=[0.5, 0.2],
)
model_reg = model_reg.to(device)

# Adam over all of the model's parameters.
optimizer = torch.optim.Adam(params=model_reg.parameters(), lr=1e-3)

In [None]:
# Train the regressor for 30 epochs; the returned history is kept for the loss plot.
hist = train(
    model_reg,
    training_data_reg,
    optimizer,
    epochs=30,
)

In [None]:
# Score the regressor on the held-out loader, then report the loss and each metric to 4 decimals.
loss, metrics = evaluate(model_reg, testing_data_reg)

print("Final Evaluation:")
print(f"Loss: {loss:.4f}")
for name in metrics:
    print(f"{name}: {metrics[name]:.4f}")

In [None]:
# Plot the history returned by the regression training run.
fig, ax = plt.subplots()
ax.plot(hist, label='Train Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Training Loss over Epochs')
ax.legend()
plt.show()

## History

On n'avait pas standardisé les données, du coup les loss étaient énormes (de l'ordre de 1e36). Après ça, elles sont entre 1900 et 2400.

```python
model_reg = RegressionModel(X_train_reg.shape[1], width=512, dropout=[0]).to(device)
```

Ensuite j'ai essayé avec des architectures plus petites (width=128, puis width=64) et avec du dropout (0.2, 0.5) mais les performances étaient moins bonnes.

Final Evaluation:
Loss: 1.0004
MAE: 0.8677
R2: -0.0003

# Classification

In [None]:
# Classification splits: float features paired with integer (long) class labels.
train_set_clf = TensorDataset(torch.FloatTensor(X_train_clf), torch.LongTensor(y_train_clf))
test_set_clf = TensorDataset(torch.FloatTensor(X_test_clf), torch.LongTensor(y_test_clf))

# Mini-batches of 32; only the training loader reshuffles its samples.
training_data_clf = DataLoader(train_set_clf, batch_size=32, shuffle=True)
testing_data_clf = DataLoader(test_set_clf, batch_size=32, shuffle=False)

In [None]:
# Classification network. The input width is read from the classification split
# (not the regression one) so this section does not depend on regression variables;
# both splits come from the same `features` matrix, so the value is unchanged.
# NOTE(review): the second argument (2) is presumably the number of classes —
# confirm that `disease_risk` is binary.
model_clf = ClassificationModel(X_train_clf.shape[1], 2).to(device)

# NOTE: this rebinds `optimizer`, the name the regression cells also use. Re-running
# the regression training cell after this one would step this optimizer instead.
optimizer = torch.optim.Adam(model_clf.parameters(), lr=1e-3)

In [None]:
# Train the classifier for 50 epochs. The return value is captured (as the regression
# cell does with `hist`) so it is not echoed as raw cell output and stays available
# for later inspection or plotting.
hist_clf = train(model_clf, training_data_clf, optimizer, epochs=50)


In [None]:
# Score the classifier on the held-out loader and report the loss plus every metric.
# This rebinds `loss` and `metrics` from the regression evaluation.
loss, metrics = evaluate(model_clf, testing_data_clf)

print("Final Evaluation:")
print(f"Loss: {loss:.4f}")
for label, score in metrics.items():
    print(f"{label}: {score:.4f}")