In [1]:
import sklearn.metrics
from torch.utils.data import TensorDataset
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV

from src.torch.torch_utils import *
#from src.torch.torch_model import *
from src.torch.torch_wrapper import *

In [2]:
# Load the cleaned health/lifestyle CSV; the path is relative to the
# directory the notebook is run from.
csv_path = '../data/health_lifestyle_dataset_cleaned.csv'
df = pd.read_csv(csv_path)

In [3]:
# Show the loaded frame as the cell output to check its columns and values.
df

Unnamed: 0,age,bmi,daily_steps,sleep_hours,water_intake_l,calories_consumed,smoker,alcohol,resting_hr,systolic_bp,diastolic_bp,cholesterol,family_history,disease_risk,gender_Female,gender_Male
0,0.417853,-1.341930,-1.145573,-1.281848,0.499875,-1.240382,-0.501468,-0.654685,1.562891,1.018033,1.238900,240,-0.653329,0,0.0,1.0
1,1.144651,0.672982,0.707405,1.240517,1.501933,-0.318774,-0.501468,1.527452,-0.447697,-0.715978,-1.412862,207,-0.653329,0,1.0,0.0
2,-0.141222,0.405377,-1.579777,0.053522,1.116526,-1.189594,-0.501468,1.527452,1.077577,-0.446243,0.547136,296,-0.653329,0,0.0,1.0
3,-0.923927,1.444316,0.965082,-1.430222,-0.579263,-0.177560,-0.501468,-0.654685,-0.239705,1.172167,0.316548,175,-0.653329,0,1.0,0.0
4,0.641484,0.720207,-0.810210,-1.331306,0.962363,1.427822,-0.501468,1.527452,1.632221,0.170294,-1.643450,294,-0.653329,0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0.250131,0.641499,-1.049286,-1.281848,-0.579263,0.637518,-0.501468,1.527452,-1.279664,-1.139847,-0.778745,282,-0.653329,0,0.0,1.0
99996,-1.483003,0.956329,0.195880,-0.985099,0.268631,-0.788249,-0.501468,-0.654685,-1.626317,0.555630,-0.721098,192,-0.653329,0,0.0,1.0
99997,-0.644390,-1.593794,-1.195540,-1.331306,-1.350076,-0.341071,-0.501468,-0.654685,-0.378366,-1.640783,1.584783,218,-0.653329,0,0.0,1.0
99998,1.312374,-0.192800,1.026721,-0.441060,-1.504239,0.606550,-0.501468,-0.654685,1.285568,1.133633,-1.009333,188,-0.653329,0,1.0,0.0


In [4]:
# Names of the two prediction targets; every other usable column is a feature.
regression_target = 'cholesterol'
classification_target = 'disease_risk'

# 'Unnamed: 0' shows up as a regular column in the loaded frame and is just a
# running row counter. It carries no signal and, unlike the other inputs, is
# not standardised, so it must not be fed to the models.
index_artifact = 'Unnamed: 0'
non_feature_columns = [regression_target, classification_target, index_artifact]

# errors='ignore' keeps this cell working if the CSV is later re-exported
# without the counter column. A misspelled target name is still caught by the
# df[...] lookups below, which raise KeyError.
features = df.drop(columns=non_feature_columns, errors='ignore').values
regression_labels = df[regression_target].values
classification_labels = df[classification_target].values

In [5]:
# Hold out 20% of the rows for testing. Both tasks are split from the same
# feature matrix with the same options (including the seed).
split_options = {"test_size": 0.2, "random_state": 42}

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    features, regression_labels, **split_options
)
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    features, classification_labels, **split_options
)

# Regression

In [6]:
# Pair features and regression targets as float tensors.
train_set_reg = TensorDataset(
    torch.FloatTensor(X_train_reg),
    torch.FloatTensor(y_train_reg),
)
test_set_reg = TensorDataset(
    torch.FloatTensor(X_test_reg),
    torch.FloatTensor(y_test_reg),
)

# Mini-batches of 32; only the training loader is shuffled.
training_data_reg = DataLoader(train_set_reg, batch_size=32, shuffle=True)
testing_data_reg = DataLoader(test_set_reg, batch_size=32, shuffle=False)

In [None]:
# Candidate hyper-parameters for the regressor. GridSearchCV evaluates every
# combination, so each extra value multiplies the number of fits.
param_grid = dict(
    layers=[2, 3, 4],
    width=[64, 128],
    lr=[1e-2, 1e-3],
    epochs=[20],
    loss_fn=[nn.MSELoss(), nn.HuberLoss()],
    optimizer=[torch.optim.Adam, torch.optim.SGD],
    activation=[nn.ReLU(), nn.Tanh()],
    batch_size=[16, 32],
    dropout_rates=[[0.0], [0.5, 0.2]],
)

# 3-fold cross-validation scored with R2; n_jobs=-1 requests all available cores.
grid = GridSearchCV(
    estimator=TorchRegressor(),
    param_grid=param_grid,
    scoring="r2",
    cv=3,
    n_jobs=-1,
)
grid.fit(X_train_reg, y_train_reg)

print(f"Meilleurs paramètres : {grid.best_params_}")
print(f"Score : {grid.best_score_}")

In [None]:
# Build the regression network with one input per feature column and move it
# to the configured device.
n_inputs_reg = X_train_reg.shape[1]
model_reg = RegressionModel(
    n_inputs_reg,
    loss_fn=nn.HuberLoss(),
    width=64,
    dropout_rates=[0.5, 0.2],
)
model_reg = model_reg.to(device)

# Adam over all of the model's parameters.
optimizer = torch.optim.Adam(model_reg.parameters(), lr=1e-3)

In [None]:
# Fit the regression model for 30 epochs and keep the returned history so it
# can be plotted afterwards.
hist = train(
    model_reg,
    training_data_reg,
    optimizer,
    epochs=30,
)

In [None]:
# Score the regression model on the held-out loader and print one line per
# reported value, formatted to four decimals.
loss, metrics = evaluate(model_reg, testing_data_reg)

report_lines = ["Final Evaluation:", f"Loss: {loss:.4f}"]
report_lines.extend(f"{name}: {value:.4f}" for name, value in metrics.items())
print("\n".join(report_lines))

In [None]:
# Plot the training history returned by train() on an explicit Axes.
fig, ax = plt.subplots()
ax.plot(hist, label='Train Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Training Loss over Epochs')
ax.legend()
plt.show()

## History

On n'avait pas standardisé les données, du coup les loss étaient énormes (de l'ordre de 1e36). Après ça, elles sont entre 1900 et 2400.

```python
model_reg = RegressionModel(X_train_reg.shape[1], width=512, dropout=[0]).to(device)
```

Ensuite j'ai essayé avec des architectures plus petites (width=128, puis width=64) et avec du dropout (0.2, 0.5) mais les performances étaient moins bonnes.

Final Evaluation:
Loss: 1.0004
MAE: 0.8677
R2: -0.0003

# Classification

In [None]:
# Pair float features with integer (long) class labels.
train_set_clf = TensorDataset(
    torch.FloatTensor(X_train_clf),
    torch.LongTensor(y_train_clf),
)
test_set_clf = TensorDataset(
    torch.FloatTensor(X_test_clf),
    torch.LongTensor(y_test_clf),
)

# Mini-batches of 32; only the training loader is shuffled.
training_data_clf = DataLoader(train_set_clf, batch_size=32, shuffle=True)
testing_data_clf = DataLoader(test_set_clf, batch_size=32, shuffle=False)

In [None]:
# Size the classifier from its own training split rather than the regression
# one, so this section no longer depends on regression-only variables.
n_inputs_clf = X_train_clf.shape[1]

# Second argument is 2, as in the original call.
# NOTE(review): presumably the number of output classes for a binary
# disease_risk label; confirm against ClassificationModel's signature.
model_clf = ClassificationModel(n_inputs_clf, 2).to(device)

# Adam over all of the classifier's parameters.
optimizer = torch.optim.Adam(model_clf.parameters(), lr=1e-3)

In [None]:
# Train the classifier for 50 epochs. The result is stored, as the regression
# section does with `hist`, so it stays available for plotting and is not
# echoed as raw cell output.
hist_clf = train(model_clf, training_data_clf, optimizer, epochs=50)


In [None]:
# Score the classifier on the held-out loader and print one line per reported
# value, formatted to four decimals.
loss, metrics = evaluate(model_clf, testing_data_clf)

report_lines = ["Final Evaluation:", f"Loss: {loss:.4f}"]
report_lines.extend(f"{name}: {value:.4f}" for name, value in metrics.items())
print("\n".join(report_lines))