In [3]:
import os
import random
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import optuna
from IPython.display import display

# Reproducibility: seed Python, NumPy and PyTorch (CPU and, when present, every GPU).
SEED = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
print(f"Usando dispositivo: {DEVICE}")

# Load the training and test tables, then preview the first rows and the shape of each.
train_df, test_df = map(
    pd.read_parquet,
    ('../../Datos/df_train.parquet', '../../Datos/df_test.parquet'),
)
for frame in (train_df, test_df):
    display(frame.head(), frame.shape)

# Chronological train/validation split.
# The model consumes sliding windows over each neighbourhood's weekly series, so
# rows must stay contiguous and time-ordered inside every split. A random
# row-level split scrambles the series and leaks future weeks into training;
# instead, the most recent VAL_FRACTION of the observed weeks is held out.
VAL_FRACTION = 0.2
MIN_SEQ_LEN = 4  # shortest look-back window offered to the hyper-parameter search

# NOTE(review): assumes `anio` / `semana` are integer-like year and week-of-year
# columns (that is how they appear in the preview of train_df) — confirm.
_period = train_df['anio'].astype(int) * 100 + train_df['semana'].astype(int)
_periods = np.sort(_period.unique())
if len(_periods) < 2:
    raise ValueError("Need at least two distinct weeks to build a train/validation split.")
_n_val_periods = min(len(_periods) - 1, max(1, int(round(len(_periods) * VAL_FRACTION))))
_cutoff = _periods[-_n_val_periods]  # first week that belongs to validation

_ordered = train_df.assign(_period=_period).sort_values(['id_bar', '_period'], kind='stable')
_is_val = _ordered['_period'] >= _cutoff
df_train = _ordered.loc[~_is_val].drop(columns='_period')
df_val = _ordered.loc[_is_val].drop(columns='_period')

# Series length per neighbourhood over the full training table (kept for inspection).
lengths = train_df.groupby('id_bar').size().reset_index(name='length')

# Bounds for the window length explored by the search. A series of length n
# yields n - seq_len windows, so seq_len must be smaller than the shortest
# series in either split; capping it at about half of that series keeps a
# meaningful number of validation windows.
_shortest = int(min(df_train.groupby('id_bar').size().min(),
                    df_val.groupby('id_bar').size().min()))
if _shortest < 2:
    raise ValueError("Every neighbourhood needs at least two weeks in each split.")
max_len = max(1, (_shortest - 1) // 2)
min_len = min(MIN_SEQ_LEN, max_len)

Usando dispositivo: cuda


Unnamed: 0,id,id_bar,anio,semana,ESTRATO,area_barrio,dengue,concentraciones,vivienda,equipesado,sumideros,maquina,lluvia_mean,lluvia_var,lluvia_max,lluvia_min,temperatura_mean,temperatura_var,temperatura_max,temperatura_min
0,4_2015_01,4,2015,1,3.0,0.56,0.0,0.0,0.0,0.0,0.0,0.0,0.000651,4.1e-05,0.0625,0.0,26.163889,11.588928,31.8,20.9
1,5_2015_01,5,2015,1,3.0,0.842,0.0,0.0,0.0,0.0,0.0,0.0,0.000651,4.1e-05,0.0625,0.0,26.163889,11.588928,31.8,20.9
2,3_2015_01,3,2015,1,1.0,0.781,0.0,0.0,0.0,0.0,0.0,0.0,0.000651,4.1e-05,0.0625,0.0,26.163889,11.588928,31.8,20.9
3,8_2015_01,8,2015,1,2.0,0.394,0.0,0.0,0.0,0.0,0.0,0.0,0.000651,4.1e-05,0.0625,0.0,26.163889,11.588928,31.8,20.9
4,9_2015_01,9,2015,1,2.0,0.292,0.0,0.0,0.0,0.0,0.0,0.0,0.000651,4.1e-05,0.0625,0.0,26.163889,11.588928,31.8,20.9


(3680, 20)

Unnamed: 0,id,id_bar,anio,semana,ESTRATO,area_barrio,concentraciones,vivienda,equipesado,sumideros,maquina,lluvia_mean,lluvia_var,lluvia_max,lluvia_min,temperatura_mean,temperatura_var,temperatura_max,temperatura_min
3680,4_2022_01,4,2022,1,3.0,0.56,0.0,0.0,0.0,0.0,0.0,0.24358,0.413774,3.842857,0.0,24.257738,8.391759,31.257143,20.814286
3681,5_2022_01,5,2022,1,3.0,0.842,0.0,0.0,0.0,0.0,0.0,0.24358,0.413774,3.842857,0.0,24.257738,8.391759,31.257143,20.814286
3682,3_2022_01,3,2022,1,1.0,0.781,0.0,0.0,0.0,0.0,0.0,0.24358,0.413774,3.842857,0.0,24.257738,8.391759,31.257143,20.814286
3683,8_2022_01,8,2022,1,2.0,0.394,0.0,0.0,0.0,0.0,0.0,0.24358,0.413774,3.842857,0.0,24.257738,8.391759,31.257143,20.814286
3684,9_2022_01,9,2022,1,2.0,0.292,0.0,0.0,0.0,0.0,0.0,0.24358,0.413774,3.842857,0.0,24.257738,8.391759,31.257143,20.814286


(520, 19)

In [4]:
class DengueDataset(Dataset):
    """Sliding-window dataset over the scaled ``dengue`` series of each ``id_bar``.

    For every group of rows sharing an ``id_bar``, the ``dengue`` column is
    transformed with the scaler and cut into overlapping windows: the input is
    ``seq_len`` consecutive values with shape ``(seq_len, 1)`` and the target is
    the value that follows, with shape ``(1,)``. Both are float32 NumPy arrays.

    Args:
        df: DataFrame with at least the columns ``id_bar`` and ``dengue``. When
            both ``anio`` and ``semana`` are present, each group is sorted by
            them first so windows do not depend on the incoming row order.
        seq_len: Number of past values in each input window.
        scaler: Already-fitted object exposing ``transform``; required unless
            ``fit_scaler`` is true.
        fit_scaler: When true, fit a new ``StandardScaler`` on ``df['dengue']``
            and ignore ``scaler``.

    Raises:
        ValueError: If ``fit_scaler`` is false and no ``scaler`` is given.
    """

    def __init__(self, df, seq_len, scaler=None, fit_scaler=False):
        self.seq_len = seq_len
        if fit_scaler:
            # Fit on a plain array: transform() below is also fed arrays, which
            # avoids scikit-learn's feature-name mismatch warning.
            self.scaler = StandardScaler().fit(df[['dengue']].values)
        elif scaler is None:
            raise ValueError("Provide a fitted `scaler` or set fit_scaler=True.")
        else:
            self.scaler = scaler

        # Sort only when both time columns exist; sorting by week alone would
        # interleave different years.
        has_time = {'anio', 'semana'} <= set(df.columns)

        self.data = []
        for _, grp in df.groupby('id_bar'):
            if has_time:
                grp = grp.sort_values(['anio', 'semana'], kind='stable')
            vals = np.asarray(
                self.scaler.transform(grp[['dengue']].values), dtype=np.float32
            )
            # A series of length n yields n - seq_len (input, target) pairs;
            # shorter series contribute nothing.
            for i in range(len(vals) - seq_len):
                self.data.append((vals[i:i + seq_len], vals[i + seq_len]))

    def __len__(self):
        """Return the number of (window, target) pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(window, target)`` pair stored at position ``idx``."""
        return self.data[idx]

def get_loader(df, seq_len, batch_size, shuffle, scaler=None, fit_scaler=False):
    """Build a ``DataLoader`` over a ``DengueDataset`` made from ``df``.

    Args:
        df: DataFrame handed to ``DengueDataset``.
        seq_len: Window length handed to ``DengueDataset``.
        batch_size: Batch size of the loader.
        shuffle: Whether the loader shuffles its samples.
        scaler: Optional scaler forwarded to ``DengueDataset``.
        fit_scaler: Forwarded to ``DengueDataset``.

    Returns:
        A ``(loader, scaler)`` tuple, where ``scaler`` is the one held by the
        dataset.
    """
    dataset = DengueDataset(df, seq_len, scaler=scaler, fit_scaler=fit_scaler)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    return loader, dataset.scaler

In [None]:
class GRUModel(nn.Module):
    """GRU regressor: univariate sequence in, one value out.

    The last time step of the GRU output is layer-normalised and projected to
    a single value by a linear layer.

    Args:
        hidden_dim: Size of the GRU hidden state.
        num_layers: Number of stacked GRU layers.
        dropout: Dropout probability between GRU layers; applied only when
            ``num_layers > 1`` (with one layer there is nothing between layers
            to drop, so it is set to 0).
    """

    def __init__(self, hidden_dim, num_layers, dropout):
        super().__init__()
        # Keyword arguments on purpose: the fourth positional parameter of
        # nn.GRU is `bias`, so passing dropout positionally would silently
        # configure the bias and leave dropout at 0.
        self.gru = nn.GRU(
            input_size=1,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.layer_norm = nn.LayerNorm(hidden_dim)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Map ``x`` of shape ``(batch, seq_len, 1)`` to predictions of shape ``(batch, 1)``."""
        out, _ = self.gru(x)
        h = out[:, -1, :]  # hidden output at the last time step
        hn = self.layer_norm(h)
        return self.fc(hn)

In [5]:
def train_epoch(model, loader, criterion, optimizer):
    """Run one optimisation pass over ``loader`` and return the mean loss per sample.

    Args:
        model: Network called as ``model(x)`` with ``x`` of shape
            ``(batch, seq_len, 1)``.
        loader: Iterable of ``(x, y)`` tensor batches. ``x`` may be
            ``(batch, seq_len)`` or ``(batch, seq_len, 1)``.
        criterion: Loss called as ``criterion(pred, target)`` on 1-D tensors.
        optimizer: Optimizer stepped once per batch.

    Returns:
        Sum of per-batch losses weighted by batch size, divided by the number
        of samples seen.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    model.train()
    total, seen = 0.0, 0
    for x, y in loader:
        x, y = x.to(DEVICE), y.to(DEVICE)
        # Add the feature axis only when it is missing; unsqueezing a batch
        # that is already 3-D would hand the GRU a 4-D tensor.
        if x.dim() == 2:
            x = x.unsqueeze(-1)
        optimizer.zero_grad()
        pred = model(x)
        # reshape(-1) keeps a 1-D batch axis even for a batch of one sample.
        loss = criterion(pred.reshape(-1), y.reshape(-1))
        loss.backward()
        optimizer.step()
        total += loss.item() * x.size(0)
        seen += x.size(0)
    if seen == 0:
        raise ValueError("train_epoch received an empty loader (no windows to train on).")
    return total / seen

def validate(model, loader, criterion):
    """Evaluate ``model`` on ``loader`` without gradients and return the mean loss per sample.

    Args:
        model: Network called as ``model(x)`` with ``x`` of shape
            ``(batch, seq_len, 1)``.
        loader: Iterable of ``(x, y)`` tensor batches. ``x`` may be
            ``(batch, seq_len)`` or ``(batch, seq_len, 1)``.
        criterion: Loss called as ``criterion(pred, target)`` on 1-D tensors.

    Returns:
        Sum of per-batch losses weighted by batch size, divided by the number
        of samples seen.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    model.eval()
    total, seen = 0.0, 0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(DEVICE), y.to(DEVICE)
            # Add the feature axis only when it is missing; unsqueezing a batch
            # that is already 3-D would hand the GRU a 4-D tensor.
            if x.dim() == 2:
                x = x.unsqueeze(-1)
            pred = model(x)
            # reshape(-1) keeps a 1-D batch axis even for a batch of one sample.
            loss = criterion(pred.reshape(-1), y.reshape(-1))
            total += loss.item() * x.size(0)
            seen += x.size(0)
    if seen == 0:
        raise ValueError("validate received an empty loader (no windows to evaluate).")
    return total / seen

In [6]:
# Hyper-parameter optimisation scored on the validation split
def objective(trial, n_epochs=10):
    """Optuna objective: train a ``GRUModel`` briefly and return its validation MSE.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        n_epochs: Number of training epochs run before scoring (default 10).

    Returns:
        Mean validation loss (MSE on the scaled target) after ``n_epochs``.
    """
    # A series of length n yields n - seq_len windows, so the window must be
    # shorter than the shortest series in both splits; otherwise a loader would
    # be empty. The bounds are derived from the data only, so they are the
    # same for every trial.
    shortest = int(min(df_train.groupby('id_bar').size().min(),
                       df_val.groupby('id_bar').size().min()))
    seq_hi = max(1, min(max_len, shortest - 1))
    seq_lo = max(1, min(min_len, seq_hi))

    seq_len = trial.suggest_int('seq_len', seq_lo, seq_hi)
    hd      = trial.suggest_int('hidden_dim', 16, 128)
    nl      = trial.suggest_int('num_layers', 1, 3)
    do      = trial.suggest_float('dropout', 0.0, 0.5)
    lr      = trial.suggest_float('lr', 1e-4, 1e-2, log=True)
    # `step` is keyword-only in current Optuna releases.
    bs      = trial.suggest_int('batch_size', 16, 128, step=16)

    # The scaler is fitted on the training split only and reused for validation.
    train_loader, scaler = get_loader(df_train, seq_len, bs, shuffle=True, fit_scaler=True)
    val_loader, _        = get_loader(df_val,   seq_len, bs, shuffle=False, scaler=scaler)

    model = GRUModel(hd, nl, do).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    for _ in range(n_epochs):
        train_epoch(model, train_loader, criterion, optimizer)
    return validate(model, val_loader, criterion)

In [None]:
# Seed the sampler so the hyper-parameter search is repeatable across runs;
# TPE is Optuna's default sampler, only the seed is new.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=20)
best = study.best_params
print(f"Mejores hiperparámetros: {best}")

In [None]:
# Final fit on the complete training table (train_df) using the best hyper-parameters;
# the scaler is refitted on that same table.
full_loader, scaler = get_loader(
    train_df,
    seq_len=best['seq_len'],
    batch_size=best['batch_size'],
    shuffle=True,
    fit_scaler=True,
)
model = GRUModel(
    hidden_dim=best['hidden_dim'],
    num_layers=best['num_layers'],
    dropout=best['dropout'],
)
model = model.to(DEVICE)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=best['lr'])
for e in range(1, 51):
    loss = train_epoch(model, full_loader, criterion, optimizer)
    # Report progress every tenth epoch only.
    if e % 10:
        continue
    print(f"Época {e}/50, Loss: {loss:.4f}")

In [None]:
# Autoregressive forecast over the test rows: each prediction is appended to
# the neighbourhood's scaled history and becomes input for the next step.
model.eval()  # inference mode: without this, dropout stays active after training
window_len = best['seq_len']
time_cols = ['anio', 'semana']  # both present in train_df and test_df

output = []
with torch.no_grad():  # no gradients needed for forecasting
    for bid, grp in test_df.groupby('id_bar'):
        # Sort explicitly so the window never depends on the stored row order.
        history = (
            train_df.loc[train_df['id_bar'] == bid]
            .sort_values(time_cols, kind='stable')['dengue']
            .values
        )
        seq = scaler.transform(history.reshape(-1, 1)).flatten().tolist()
        # One step per test row, in chronological order, keyed by the row's `id`
        # so every prediction can be matched to exactly one submission row.
        for row_id in grp.sort_values(time_cols, kind='stable')['id']:
            window = np.asarray(seq[-window_len:], dtype=np.float32).reshape(1, window_len, 1)
            pred = model(torch.from_numpy(window).to(DEVICE)).item()
            seq.append(pred)
            cases = float(scaler.inverse_transform([[pred]])[0][0])
            # Case counts cannot be negative.
            output.append({'id': row_id, 'id_bar': bid, 'dengue': max(0.0, cases)})

sub = pd.DataFrame(output, columns=['id', 'id_bar', 'dengue'])
tpl = pd.read_csv('../../Datos/sample_submission.csv')
# NOTE(review): the template's columns are not visible here. Joining on the
# per-row `id` gives one prediction per submission row; joining on `id_bar`
# (the previous behaviour, kept only as a fallback) pairs every template row of
# a neighbourhood with every prediction for it. Confirm the template has `id`.
merge_key = 'id' if 'id' in tpl.columns else 'id_bar'
out = tpl.drop(columns=['dengue']).merge(sub[[merge_key, 'dengue']], on=merge_key, how='left')
output_file = 'sample_submission_predictions.csv'
out.to_csv(output_file, index=False)
print(f"Predicciones guardadas en {output_file}")