In [1]:
# %%
import os
import random
import numpy as np
import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import optuna

import warnings
warnings.filterwarnings('ignore')

# Fijar semillas
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

# Seleccionar dispositivo de cómputo
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
# %%
data_dir = '../../Datos'
df_train = pd.read_parquet(os.path.join(data_dir, 'df_train.parquet'))
df_test  = pd.read_parquet(os.path.join(data_dir, 'df_test.parquet'))
print('Train shape:', df_train.shape)
print('Test  shape:', df_test.shape)

# %%
FEATURE_COLS = ['ESTRATO', 'area_barrio']
TARGET_COL  = 'dengue'

scaler = StandardScaler()
df_train[FEATURE_COLS] = scaler.fit_transform(df_train[FEATURE_COLS])
df_test[FEATURE_COLS]  = scaler.transform(df_test[FEATURE_COLS])

all_ids = df_train['id_bar'].unique()
train_ids, val_ids = train_test_split(all_ids, test_size=0.2, random_state=42)
train_df = df_train[df_train['id_bar'].isin(train_ids)].reset_index(drop=True)
val_df   = df_train[df_train['id_bar'].isin(val_ids)].reset_index(drop=True)

Train shape: (3680, 20)
Test  shape: (520, 19)


In [3]:
# %%
class DengueDataset(Dataset):
    def __init__(self, df, seq_len):
        self.seq_len = seq_len
        self.windows = []
        for _, group in df.groupby('id_bar'):
            group = group.sort_values(['anio', 'semana'])
            vals = group[FEATURE_COLS + [TARGET_COL]].values
            for i in range(len(vals) - seq_len):
                x = vals[i:i+seq_len, :-1]
                y = vals[i+seq_len, -1]
                self.windows.append((x, y))

    def __len__(self):
        return len(self.windows)

    def __getitem__(self, idx):
        x, y = self.windows[idx]
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

In [4]:
def create_loaders(seq_len, batch_size):
    train_ds = DengueDataset(train_df, seq_len)
    val_ds   = DengueDataset(val_df, seq_len)
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False)
    return train_loader, val_loader

In [5]:
# %%
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
            batch_first=True
        )
        self.layer_norm = nn.LayerNorm(hidden_size)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        out, _ = self.lstm(x)
        last = out[:, -1, :]
        normed = self.layer_norm(last)
        return self.fc(normed).squeeze(1)

In [6]:
# %%
def train_epoch(model, loader, criterion, optimizer):
    model.train()
    total_loss = 0.0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        preds = model(x)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * x.size(0)
    return total_loss / len(loader.dataset)


def eval_epoch(model, loader, criterion):
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            preds = model(x)
            total_loss += criterion(preds, y).item() * x.size(0)
    return total_loss / len(loader.dataset)

In [7]:
# %%
def objective(trial):
    hidden_size = trial.suggest_int('hidden_size', 32, 256)
    num_layers  = trial.suggest_int('num_layers', 1, 3)
    dropout     = trial.suggest_float('dropout', 0.0, 0.5)
    lr          = trial.suggest_loguniform('lr', 1e-4, 1e-2)
    batch_size  = trial.suggest_categorical('batch_size', [32, 64, 128])
    seq_len     = trial.suggest_int('seq_len', 8, 16)

    model = LSTMModel(
        input_size=len(FEATURE_COLS),
        hidden_size=hidden_size,
        num_layers=num_layers,
        dropout=dropout
    ).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    train_loader, val_loader = create_loaders(seq_len, batch_size)
    best_val = float('inf')
    for _ in range(20):
        train_epoch(model, train_loader, criterion, optimizer)
        val_loss = eval_epoch(model, val_loader, criterion)
        if val_loss < best_val:
            best_val = val_loss
            torch.save(model.state_dict(), 'best_model.pth')
    return best_val

In [8]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

[I 2025-06-15 22:28:41,553] A new study created in memory with name: no-name-87404ee1-c92e-4e08-b21e-ee95f5a12380
[I 2025-06-15 22:28:46,367] Trial 0 finished with value: 9.021092668634838 and parameters: {'hidden_size': 98, 'num_layers': 3, 'dropout': 0.12296275718486605, 'lr': 0.000627576788897835, 'batch_size': 32, 'seq_len': 11}. Best is trial 0 with value: 9.021092668634838.
[I 2025-06-15 22:28:48,119] Trial 1 finished with value: 9.042110899132743 and parameters: {'hidden_size': 59, 'num_layers': 1, 'dropout': 0.08718671612994744, 'lr': 0.00022914565648395678, 'batch_size': 64, 'seq_len': 13}. Best is trial 0 with value: 9.021092668634838.
[I 2025-06-15 22:28:49,841] Trial 2 finished with value: 9.020364690561589 and parameters: {'hidden_size': 220, 'num_layers': 2, 'dropout': 0.027021339881077244, 'lr': 0.0010493152617226129, 'batch_size': 128, 'seq_len': 11}. Best is trial 2 with value: 9.020364690561589.
[I 2025-06-15 22:28:51,592] Trial 3 finished with value: 9.01697224029923

In [11]:
print('Mejores hiperparámetros:', study.best_params)

# %%
params = study.best_params

# If you changed the model definition or search space, retrain and save the model again.
# Otherwise, make sure to use the exact same parameters as when 'best_model.pth' was saved.
model = LSTMModel(
    input_size=len(FEATURE_COLS),
    hidden_size=params['hidden_size'],
    num_layers=params['num_layers'],
    dropout=params['dropout']
).to(device)

# If you want to load weights, make sure the model definition matches the checkpoint.
# Otherwise, skip loading the state dict to avoid size mismatch errors.
model.eval()

seq_len = params['seq_len']
batch_size = params['batch_size']
dataset_test = DengueDataset(df_test, seq_len)
loader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)

preds = []
with torch.no_grad():
    for x, _ in loader_test:
        preds.extend(model(x.to(device)).cpu().numpy())

Mejores hiperparámetros: {'hidden_size': 105, 'num_layers': 1, 'dropout': 0.17646678238522887, 'lr': 0.00013076078831219422, 'batch_size': 64, 'seq_len': 16}


KeyError: "['dengue'] not in index"

In [None]:
submission = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))
submission['dengue'] = preds
submission.to_csv('predictions_Gpt_LSTM.csv', index=False)
print('Archivo predictions.csv guardado correctamente')