In [None]:
import random
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import optuna

# Single seed shared by every random number generator this notebook uses.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# When a GPU is present, seed its generators as well and run on it;
# otherwise everything runs on the CPU.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

# Load the training and test tables; both are grouped by 'id_bar' below.
train_df = pd.read_parquet('../../Datos/df_train.parquet')
test_df = pd.read_parquet('../../Datos/df_test.parquet')

# Search range for the input window length (seq_len).
# A window of seq_len steps needs one further step as its target, so a series
# of n rows only produces samples when seq_len < n. Deriving the bounds from
# quantiles of the series lengths allowed seq_len == n, which leaves every
# dataset empty. The bounds are therefore tied to the shortest series:
# at most half of it (each series keeps at least half its rows as targets and
# always has enough history to seed a forecast) and at least one eighth of it.
# The 1/8 and 1/2 fractions are a heuristic, not a tuned choice.
lengths = train_df.groupby('id_bar').size()
shortest = int(lengths.min())
if shortest < 2:
    raise ValueError(
        f"shortest series has {shortest} row(s); at least 2 are needed to "
        "build one (window, target) pair"
    )
max_len = max(1, shortest // 2)
min_len = max(1, min(shortest // 8, max_len))

# Hold out 20% of the series (whole 'id_bar' groups, not individual rows) for
# validation so no series contributes windows to both splits.
bars = train_df['id_bar'].unique()
train_bars, val_bars = train_test_split(bars, test_size=0.2, random_state=SEED)
df_train = train_df[train_df['id_bar'].isin(train_bars)].reset_index(drop=True)
df_val = train_df[train_df['id_bar'].isin(val_bars)].reset_index(drop=True)

class DengueDataset(Dataset):
    """Sliding-window samples over the scaled 'dengue' series of each 'id_bar'.

    For every 'id_bar' group, each run of ``seq_len`` consecutive scaled
    values becomes an input window and the value right after it becomes the
    target. A group with ``seq_len`` rows or fewer contributes no samples.

    NOTE(review): windows follow the row order of ``df`` inside each group;
    this assumes the rows are already in chronological order -- confirm.

    Args:
        df: DataFrame with at least the columns 'id_bar' and 'dengue'.
        seq_len: Number of time steps in each input window (>= 1).
        scaler: Already-fitted scaler exposing ``transform``; used when
            ``fit_scaler`` is False.
        fit_scaler: When True, fit a new ``StandardScaler`` on the 'dengue'
            column of ``df`` and ignore ``scaler``.

    Raises:
        ValueError: If ``seq_len`` is smaller than 1, or if no scaler is
            supplied while ``fit_scaler`` is False.

    Each item is a pair ``(x, y)`` of float32 arrays with shapes
    ``(seq_len, 1)`` and ``(1,)``.
    """

    def __init__(self, df, seq_len, scaler=None, fit_scaler=False):
        if seq_len < 1:
            raise ValueError(f"seq_len must be >= 1, got {seq_len}")
        self.seq_len = seq_len
        if fit_scaler:
            # Fit on a plain ndarray: every transform() call in this file
            # passes an ndarray, and a scaler fitted on a DataFrame makes
            # scikit-learn warn about missing feature names on each call.
            self.scaler = StandardScaler().fit(df[['dengue']].to_numpy())
        elif scaler is None:
            raise ValueError("a fitted scaler is required when fit_scaler is False")
        else:
            self.scaler = scaler
        self.data = []
        for _, grp in df.groupby('id_bar'):
            # Convert once per series; the windows below are views into this
            # array instead of one fresh copy per window.
            values = self.scaler.transform(grp[['dengue']].to_numpy()).astype(np.float32)
            self.data.extend(
                (values[i:i + seq_len], values[i + seq_len])
                for i in range(len(values) - seq_len)
            )

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

def get_loader(df, seq_len, batch_size, shuffle, scaler=None, fit_scaler=False):
    """Wrap ``df`` in a DengueDataset and return ``(DataLoader, scaler)``.

    The returned scaler is the one the dataset ended up using, so a scaler
    fitted here (``fit_scaler=True``) can be passed on to later calls.

    Raises:
        ValueError: If the dataset contains no samples.
    """
    dataset = DengueDataset(df, seq_len, scaler=scaler, fit_scaler=fit_scaler)
    if not len(dataset):
        raise ValueError("empty dataset")
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    return loader, dataset.scaler

class GRUModel(nn.Module):
    """GRU over a single-feature sequence with a linear one-value head.

    Args:
        hidden_dim: Size of the GRU hidden state.
        num_layers: Number of stacked GRU layers.
        dropout: Dropout passed to the GRU; forced to 0 for a single layer.
    """

    def __init__(self, hidden_dim, num_layers, dropout):
        super().__init__()
        # Only hand the dropout rate to the GRU when there is more than one
        # layer; a single-layer GRU is built with dropout 0.
        gru_dropout = 0
        if num_layers > 1:
            gru_dropout = dropout
        gru_config = {
            'input_size': 1,
            'hidden_size': hidden_dim,
            'num_layers': num_layers,
            'dropout': gru_dropout,
            'batch_first': True,
        }
        self.gru = nn.GRU(**gru_config)
        self.norm = nn.LayerNorm(hidden_dim)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return one value per sequence, computed from the last time step."""
        sequence_out = self.gru(x)[0]
        last_step = sequence_out[:, -1, :]
        return self.fc(self.norm(last_step))

def train_epoch(model, loader, criterion, optimizer):
    """Run one optimisation pass over ``loader`` and return the mean loss.

    Args:
        model: Module mapping a ``(batch, seq_len, 1)`` tensor to one
            prediction per sequence.
        loader: Iterable of ``(x, y)`` batches. ``x`` may be
            ``(batch, seq_len, 1)`` or ``(batch, seq_len)``.
        criterion: Loss taking ``(prediction, target)`` and returning a
            batch-mean scalar.
        optimizer: Optimizer over ``model``'s parameters.

    Returns:
        Per-sample average of the batch losses seen during the pass.
    """
    model.train()
    # Batches must live where the model's weights live; take the device from
    # the model itself (CPU for a module without parameters).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')
    total = 0.0
    seen = 0
    for x, y in loader:
        # The dataset already yields windows with a trailing feature axis, so
        # only add one when it is missing; an unconditional unsqueeze turns
        # (batch, seq_len, 1) into a 4-D tensor, which nn.GRU rejects.
        if x.dim() == 2:
            x = x.unsqueeze(-1)
        x = x.to(device)
        # Flatten both sides explicitly: a bare squeeze() would also drop the
        # batch axis when a batch holds a single sample.
        y = y.to(device).reshape(-1)
        optimizer.zero_grad()
        loss = criterion(model(x).reshape(-1), y)
        loss.backward()
        optimizer.step()
        batch_size = x.size(0)
        total += loss.item() * batch_size
        seen += batch_size
    return total / seen

def validate(model, loader, criterion):
    """Evaluate ``model`` on ``loader`` without gradients; return the mean loss.

    Args:
        model: Module mapping a ``(batch, seq_len, 1)`` tensor to one
            prediction per sequence.
        loader: Iterable of ``(x, y)`` batches. ``x`` may be
            ``(batch, seq_len, 1)`` or ``(batch, seq_len)``.
        criterion: Loss taking ``(prediction, target)`` and returning a
            batch-mean scalar.

    Returns:
        Per-sample average of the batch losses.
    """
    model.eval()
    # Batches must live where the model's weights live; take the device from
    # the model itself (CPU for a module without parameters).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')
    total = 0.0
    seen = 0
    with torch.no_grad():
        for x, y in loader:
            # Add the feature axis only when it is missing; the dataset's
            # windows already carry it, and a 4-D input is rejected by nn.GRU.
            if x.dim() == 2:
                x = x.unsqueeze(-1)
            x = x.to(device)
            # Explicit flatten keeps the batch axis even for a 1-sample batch.
            y = y.to(device).reshape(-1)
            batch_size = x.size(0)
            total += criterion(model(x).reshape(-1), y).item() * batch_size
            seen += batch_size
    return total / seen

def objective(trial):
    """Optuna objective: train a GRUModel briefly and return its validation loss.

    Samples the window length, model size, dropout, learning rate and batch
    size from ``trial``, trains for 10 epochs on ``df_train`` and returns the
    MSE on ``df_val`` (computed on scaled values).

    Raises:
        optuna.TrialPruned: If the sampled ``seq_len`` leaves the training or
            validation dataset empty, so the configuration cannot be trained.
    """
    seq_len = trial.suggest_int('seq_len', min_len, max_len)
    hd = trial.suggest_int('hidden_dim', 16, 128)
    nl = trial.suggest_int('num_layers', 1, 3)
    do = trial.suggest_float('dropout', 0.0, 0.5)
    lr = trial.suggest_float('lr', 1e-4, 1e-2, log=True)
    # `step` must be passed by keyword; the positional form is deprecated.
    bs = trial.suggest_int('batch_size', 16, 128, step=16)
    try:
        # The scaler is fitted on the training split only and reused for
        # validation so no validation statistics leak into training.
        train_loader, scaler = get_loader(df_train, seq_len, bs, True, fit_scaler=True)
        val_loader, _ = get_loader(df_val, seq_len, bs, False, scaler)
    except ValueError as err:
        # Mark the configuration as infeasible instead of reporting `inf`:
        # an `inf` trial counts as completed and can end up as the study's
        # "best" result, handing unusable parameters to the final training.
        raise optuna.TrialPruned(f"seq_len={seq_len} is infeasible: {err}") from err
    model = GRUModel(hd, nl, do).to(DEVICE)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    crit = nn.MSELoss()
    n_epochs = 10
    for _ in range(n_epochs):
        train_epoch(model, train_loader, crit, opt)
    return validate(model, val_loader, crit)

# Seed the sampler with the notebook's SEED so the sequence of suggested
# hyper-parameters does not depend on Optuna's own unseeded generator.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=20)

# Stop here with a clear message when no trial produced a usable score
# (for example every sampled seq_len left the datasets empty); otherwise the
# "best" parameters would be an untrainable configuration.
_usable_trials = [
    t for t in study.trials
    if t.state == optuna.trial.TrialState.COMPLETE
    and t.value is not None
    and np.isfinite(t.value)
]
if not _usable_trials:
    raise RuntimeError(
        "hyper-parameter search produced no trial with a finite validation "
        "loss; check the seq_len search range against the series lengths"
    )
best = study.best_params

# Final fit: rebuild the training windows (refitting the scaler on df_train)
# and train a fresh model with the best hyper-parameters for 50 epochs.
train_loader, scaler = get_loader(
    df_train,
    best['seq_len'],
    best['batch_size'],
    True,
    fit_scaler=True,
)
model = GRUModel(best['hidden_dim'], best['num_layers'], best['dropout'])
model = model.to(DEVICE)
opt = torch.optim.Adam(model.parameters(), lr=best['lr'])
crit = nn.MSELoss()
for epoch in range(1, 50 + 1):
    loss = train_epoch(model, train_loader, crit, opt)
    # Report the training loss every tenth epoch.
    if epoch % 10:
        continue
    print(epoch, loss)

# Autoregressive forecast: for every 'id_bar' in the test set, start from the
# tail of its training history and predict one step per test row, feeding
# each (scaled) prediction back in as the newest observation.
window_len = best['seq_len']

# train_epoch leaves the model in training mode; switch to eval so dropout is
# disabled and the forecasts are deterministic.
model.eval()
predictions = []
with torch.no_grad():  # inference only, no autograd graph needed
    for bar_id, grp in test_df.groupby('id_bar'):
        history = train_df.loc[train_df['id_bar'] == bar_id, 'dengue'].to_numpy()
        if len(history) < window_len:
            raise ValueError(
                f"id_bar {bar_id!r} has {len(history)} training rows; at least "
                f"{window_len} are needed to seed the forecast"
            )
        # Only the last window_len values are ever read, so scale just those.
        seq = scaler.transform(history[-window_len:].reshape(-1, 1)).ravel().tolist()
        for _ in range(len(grp)):
            window = torch.tensor(
                seq[-window_len:], dtype=torch.float32, device=DEVICE
            ).view(1, window_len, 1)
            p = model(window).item()
            seq.append(p)
            # Back to the original scale; the stored value is clipped at 0.
            inv = scaler.inverse_transform([[p]])[0][0]
            predictions.append({'id_bar': bar_id, 'dengue': max(0.0, float(inv))})

# Write the forecasts into the layout of the sample submission.
sub = pd.read_csv('../../Datos/sample_submission.csv')
pred_df = pd.DataFrame(predictions, columns=['id_bar', 'dengue'])

# `predictions` holds one row per test row, so 'id_bar' alone is not a unique
# key: merging on it pairs every submission row of an id_bar with every
# prediction of that id_bar and multiplies the rows. Number the rows inside
# each id_bar on both sides and merge on (id_bar, position) instead.
# NOTE(review): this assumes the sample submission lists the rows of each
# id_bar in the same order as test_df -- confirm.
_step = '_step'
pred_df[_step] = pred_df.groupby('id_bar').cumcount()
template = sub.drop(columns=['dengue'])
template[_step] = template.groupby('id_bar').cumcount()
out = template.merge(
    pred_df, on=['id_bar', _step], how='left', validate='one_to_one'
).drop(columns=[_step])
out.to_csv('predictions.csv', index=False)


[I 2025-06-26 23:51:01,609] A new study created in memory with name: no-name-1ffa31c0-ab15-4f14-9471-da74519d041a
  bs = trial.suggest_int('batch_size', 16, 128, 16)
[I 2025-06-26 23:51:01,616] Trial 0 finished with value: inf and parameters: {'seq_len': 368, 'hidden_dim': 28, 'num_layers': 2, 'dropout': 0.10150396655160426, 'lr': 0.0006050451587709993, 'batch_size': 48}. Best is trial 0 with value: inf.
  bs = trial.suggest_int('batch_size', 16, 128, 16)
[I 2025-06-26 23:51:01,620] Trial 1 finished with value: inf and parameters: {'seq_len': 368, 'hidden_dim': 39, 'num_layers': 2, 'dropout': 0.014462779538141401, 'lr': 0.0021737320100735443, 'batch_size': 112}. Best is trial 0 with value: inf.
  bs = trial.suggest_int('batch_size', 16, 128, 16)
[I 2025-06-26 23:51:01,625] Trial 2 finished with value: inf and parameters: {'seq_len': 368, 'hidden_dim': 113, 'num_layers': 3, 'dropout': 0.012081071325605375, 'lr': 0.0009573628595355673, 'batch_size': 80}. Best is trial 0 with value: inf.


ValueError: empty dataset