In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
import plotly.graph_objects as go
import optuna
from tqdm.notebook import tqdm
import warnings

# Ignorar advertencias de Optuna y otras
warnings.filterwarnings('ignore', category=optuna.exceptions.ExperimentalWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

# Configuración del dispositivo (GPU o CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Usando dispositivo: {device}")

Usando dispositivo: cuda


In [2]:
# Carga de datos
df = pd.read_parquet('../../Datos/df_train.parquet')

df = df.drop(columns=['lluvia_min'])

# Crear columna de fecha a partir de año y semana ISO
# %G: Año ISO, %V: Semana ISO, %u: Día de la semana (1=Lunes)
df['fecha'] = pd.to_datetime(df['anio'].astype(str) + df['semana'].astype(str) + '1', format='%G%V%u')

# Establecer la fecha como índice y ordenar
df = df.set_index('fecha').sort_index()

print("Datos cargados y preparados:")
print(df.info())
print(df.head())

Datos cargados y preparados:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3680 entries, 2014-12-29 to 2022-01-03
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                3680 non-null   object 
 1   id_bar            3680 non-null   int64  
 2   anio              3680 non-null   int64  
 3   semana            3680 non-null   UInt32 
 4   ESTRATO           3680 non-null   float64
 5   area_barrio       3680 non-null   float64
 6   dengue            3680 non-null   float64
 7   concentraciones   3680 non-null   float64
 8   vivienda          3680 non-null   float64
 9   equipesado        3680 non-null   float64
 10  sumideros         3680 non-null   float64
 11  maquina           3680 non-null   float64
 12  lluvia_mean       3680 non-null   float64
 13  lluvia_var        3680 non-null   float64
 14  lluvia_max        3680 non-null   float64
 15  temperatura_mean  3680 non-null   float64


In [3]:
# --- División de Datos ---
# Creamos una copia para evitar advertencias de SettingWithCopyWarning
df_train_val = df[df['anio'] <= 2021].copy()
df_forecast_inputs = df.copy() # Copia completa para la fase de pronóstico

# --- Tratamiento de Variables Categóricas (para Embeddings) ---
# PRIMERO, convertimos las columnas a tipo 'category' en el DataFrame base
categorical_cols = ['id_bar', 'ESTRATO']
for col in categorical_cols:
    df_train_val[col] = df_train_val[col].astype('category')

# SEGUNDO, creamos los mapeos a partir del DataFrame con el tipo ya convertido
cat_mappings = {col: {cat: i for i, cat in enumerate(df_train_val[col].cat.categories)} for col in categorical_cols}

# TERCERO, ahora creamos los dataframes de entrenamiento y validación
# Estos heredarán el tipo 'category' de df_train_val
train_df = df_train_val[df_train_val['anio'] < 2021].copy()
val_df = df_train_val[df_train_val['anio'] == 2021].copy()

# AHORA SÍ, aplicamos .cat.codes a las columnas que ya son de tipo categórico
for col in categorical_cols:
    train_df[col] = train_df[col].cat.codes
    val_df[col] = val_df[col].cat.codes

print(f"Tamaño del set de Entrenamiento: {train_df.shape}")
print(f"Tamaño del set de Validación: {val_df.shape}")

# --- Escalado de Variables Numéricas ---
target_col = 'dengue'
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
# Quitar las columnas que no son covariables
for col in ['anio', 'semana'] + categorical_cols:
    if col in numerical_cols:
        numerical_cols.remove(col)
        
# El objetivo 'dengue' también se escala
numerical_features_to_scale = numerical_cols.copy()
if target_col not in numerical_features_to_scale:
    numerical_features_to_scale.append(target_col)

# Ajustar el escalador SÓLO con datos de entrenamiento
scaler = StandardScaler()
# Usamos .loc para asegurar la asignación correcta y evitar advertencias
train_df.loc[:, numerical_features_to_scale] = scaler.fit_transform(train_df[numerical_features_to_scale])

# Transformar los conjuntos de validación con el mismo escalador
val_df.loc[:, numerical_features_to_scale] = scaler.transform(val_df[numerical_features_to_scale])


# --- Creación de Secuencias (Ventanas Deslizantes) ---
def create_sequences(data, window_size, target_col, categorical_cols, numerical_cols):
    sequences = []
    labels = []
    # Agrupar por cada serie temporal individual (cada barrio)
    for id_bar_code, group in data.groupby('id_bar'):
        # Usamos las columnas numéricas que ya fueron escaladas y las categóricas ya codificadas
        feature_data = group[categorical_cols + numerical_cols].values
        target_data = group[target_col].values
        
        for i in range(len(group) - window_size):
            sequences.append(feature_data[i:i + window_size])
            labels.append(target_data[i + window_size])
            
    return np.array(sequences), np.array(labels)

# Indices de las columnas para separar en el modelo
all_feature_cols = categorical_cols + numerical_cols
cat_indices = [i for i, col in enumerate(all_feature_cols) if col in categorical_cols]
num_indices = [i for i, col in enumerate(all_feature_cols) if col in numerical_cols]

print("\nPreprocesamiento completado.")
print(f"Columnas categóricas: {categorical_cols}")
print(f"Columnas numéricas escaladas: {numerical_features_to_scale}")

Tamaño del set de Entrenamiento: (3150, 19)
Tamaño del set de Validación: (530, 19)

Preprocesamiento completado.
Columnas categóricas: ['id_bar', 'ESTRATO']
Columnas numéricas escaladas: ['area_barrio', 'dengue', 'concentraciones', 'vivienda', 'equipesado', 'sumideros', 'maquina', 'lluvia_mean', 'lluvia_var', 'lluvia_max', 'temperatura_mean', 'temperatura_var', 'temperatura_max', 'temperatura_min']


In [4]:
class GRUModel(nn.Module):
    def __init__(self, embedding_sizes, n_continuous, hidden_size, num_layers, dropout_rate, output_size=1):
        super(GRUModel, self).__init__()
        
        # Capas de Embedding para variables categóricas
        self.embeddings = nn.ModuleList([nn.Embedding(num_categories, embed_dim) for num_categories, embed_dim in embedding_sizes])
        
        n_embedding_dims = sum(embed_dim for _, embed_dim in embedding_sizes)
        
        # Capa GRU
        self.gru = nn.GRU(
            input_size=n_embedding_dims + n_continuous,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout_rate if num_layers > 1 else 0
        )
        
        self.dropout = nn.Dropout(dropout_rate)
        
        # Capa de salida
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x_cat, x_cont):
        # Procesar embeddings
        x_embedded = [embedding(x_cat[:, :, i]) for i, embedding in enumerate(self.embeddings)]
        x_embedded = torch.cat(x_embedded, dim=2)
        
        # Concatenar embeddings con características continuas
        x = torch.cat([x_embedded, x_cont], dim=2)
        
        # Pasar por la GRU
        # output shape: (batch_size, seq_len, hidden_size)
        # hidden shape: (num_layers, batch_size, hidden_size)
        output, _ = self.gru(x)
        
        # Tomar la salida del último paso de tiempo
        last_output = output[:, -1, :]
        last_output = self.dropout(last_output)
        
        # Pasar por la capa lineal final
        out = self.fc(last_output)
        return out

# Tamaños de los embeddings (regla general: min(50, num_categorias/2))
embedding_sizes = []
for col in categorical_cols:
    num_categories = len(cat_mappings[col])
    embed_dim = min(50, (num_categories + 1) // 2)
    embedding_sizes.append((num_categories, embed_dim))
    print(f"Embedding para '{col}': {num_categories} categorías, dim={embed_dim}")

n_continuous = len(numerical_cols)

Embedding para 'id_bar': 10 categorías, dim=5
Embedding para 'ESTRATO': 3 categorías, dim=2


In [5]:
def objective(trial):
    # --- Definir espacio de búsqueda de hiperparámetros ---
    window_size = trial.suggest_int('window_size', 8, 52)
    hidden_size = trial.suggest_categorical('hidden_size', [16, 32, 64, 128])
    num_layers = trial.suggest_int('num_layers', 1, 3)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 128])

    # --- Crear secuencias y DataLoaders para este trial ---
    all_cols = categorical_cols + numerical_cols
    X_train_seq, y_train_seq = create_sequences(train_df, window_size, target_col, categorical_cols, numerical_cols)
    X_val_seq, y_val_seq = create_sequences(val_df, window_size, target_col, categorical_cols, numerical_cols)

    # Convertir a tensores
    X_train_tensor = torch.tensor(X_train_seq, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train_seq, dtype=torch.float32).view(-1, 1)
    X_val_tensor = torch.tensor(X_val_seq, dtype=torch.float32)
    y_val_tensor = torch.tensor(y_val_seq, dtype=torch.float32).view(-1, 1)

    # Crear Datasets y DataLoaders
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # --- Instanciar y entrenar modelo ---
    model = GRUModel(embedding_sizes, n_continuous, hidden_size, num_layers, dropout_rate).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()

    # Bucle de entrenamiento
    num_epochs = 200
    for epoch in range(num_epochs):
        model.train()
        for sequences, labels in train_loader:
            sequences = sequences.to(device)
            labels = labels.to(device)
            
            x_cat = sequences[:, :, cat_indices].long()
            x_cont = sequences[:, :, num_indices].float()

            optimizer.zero_grad()
            outputs = model(x_cat, x_cont)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

    # --- Evaluación ---
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for sequences, labels in val_loader:
            sequences = sequences.to(device)
            labels = labels.to(device)
            
            x_cat = sequences[:, :, cat_indices].long()
            x_cont = sequences[:, :, num_indices].float()
            
            outputs = model(x_cat, x_cont)
            val_loss += criterion(outputs, labels).item()

    avg_val_loss = val_loss / len(val_loader)
    
    # Optuna puede usar este valor para podar trials no prometedores (pruning)
    trial.report(avg_val_loss, epoch)
    if trial.should_prune():
        raise optuna.exceptions.TrialPruned()
        
    return avg_val_loss



In [6]:
# --- Ejecutar el estudio de Optuna ---
study = optuna.create_study(direction='minimize', pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=200, show_progress_bar=True)

print("\nOptimización de hiperparámetros completada.")
print("Mejores hiperparámetros encontrados:")
best_params = study.best_params
print(best_params)

[I 2025-06-23 15:01:01,199] A new study created in memory with name: no-name-1d15bda6-0cf4-4724-baab-de318e7afac6


  0%|          | 0/200 [00:00<?, ?it/s]

[I 2025-06-23 15:01:22,573] Trial 0 finished with value: 0.08618620907266934 and parameters: {'window_size': 37, 'hidden_size': 32, 'num_layers': 2, 'dropout_rate': 0.44365750535906623, 'learning_rate': 0.007306199442475917, 'batch_size': 64}. Best is trial 0 with value: 0.08618620907266934.
[I 2025-06-23 15:02:44,309] Trial 1 finished with value: 0.2502003490924835 and parameters: {'window_size': 14, 'hidden_size': 64, 'num_layers': 2, 'dropout_rate': 0.18129529945655665, 'learning_rate': 0.00012652675320423551, 'batch_size': 16}. Best is trial 0 with value: 0.08618620907266934.
[I 2025-06-23 15:03:18,827] Trial 2 finished with value: 0.16794285391058242 and parameters: {'window_size': 32, 'hidden_size': 16, 'num_layers': 1, 'dropout_rate': 0.24797569606260328, 'learning_rate': 0.00626973198836374, 'batch_size': 32}. Best is trial 0 with value: 0.08618620907266934.
[I 2025-06-23 15:03:58,853] Trial 3 finished with value: 0.1057263407856226 and parameters: {'window_size': 30, 'hidden_s

In [7]:
# --- Preparar datos combinados (Train + Val) ---
final_train_df = df_train_val.copy()

# Aplicar mapeos categóricos
for col, mapping in cat_mappings.items():
    final_train_df[col] = final_train_df[col].cat.codes

# Re-escalar variables numéricas con todos los datos (2018-2021)
final_scaler = StandardScaler()
final_scaler.fit(final_train_df[numerical_features_to_scale])
final_train_df.loc[:, numerical_features_to_scale] = final_scaler.transform(final_train_df[numerical_features_to_scale])

# --- Crear secuencias con los mejores parámetros ---
best_window_size = best_params['window_size']
X_final_train_seq, y_final_train_seq = create_sequences(final_train_df, best_window_size, target_col, categorical_cols, numerical_cols)

X_final_train_tensor = torch.tensor(X_final_train_seq, dtype=torch.float32)
y_final_train_tensor = torch.tensor(y_final_train_seq, dtype=torch.float32).view(-1, 1)

# --- DataLoader final ---
final_train_dataset = TensorDataset(X_final_train_tensor, y_final_train_tensor)
final_train_loader = DataLoader(final_train_dataset, batch_size=best_params['batch_size'], shuffle=True)

# --- Instanciar y entrenar el modelo final ---
final_model = GRUModel(
    embedding_sizes=embedding_sizes,
    n_continuous=n_continuous,
    hidden_size=best_params['hidden_size'],
    num_layers=best_params['num_layers'],
    dropout_rate=best_params['dropout_rate']
).to(device)

optimizer = optim.Adam(final_model.parameters(), lr=best_params['learning_rate'])
criterion = nn.MSELoss()

# Bucle de entrenamiento final
num_epochs_final = 200 # Entrenar por más épocas en el modelo final
train_losses = []

print("\nIniciando entrenamiento del modelo final...")
for epoch in tqdm(range(num_epochs_final)):
    final_model.train()
    epoch_loss = 0
    for sequences, labels in final_train_loader:
        sequences, labels = sequences.to(device), labels.to(device)
        
        x_cat = sequences[:, :, cat_indices].long()
        x_cont = sequences[:, :, num_indices].float()

        optimizer.zero_grad()
        outputs = final_model(x_cat, x_cont)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    
    avg_epoch_loss = epoch_loss / len(final_train_loader)
    train_losses.append(avg_epoch_loss)
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs_final}], Loss: {avg_epoch_loss:.6f}')

print("Entrenamiento final completado.")

# --- Graficar curva de pérdida ---
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=list(range(1, num_epochs_final + 1)), 
    y=train_losses, 
    mode='lines',
    name='Pérdida de Entrenamiento (MSE)'
))
fig.update_layout(
    title='Curva de Pérdida del Modelo Final',
    xaxis_title='Época',
    yaxis_title='Error Cuadrático Medio (MSE)',
    template='plotly_white'
)
fig.show()


Iniciando entrenamiento del modelo final...


  0%|          | 0/200 [00:00<?, ?it/s]

Epoch [10/200], Loss: 0.394250
Epoch [20/200], Loss: 0.334029
Epoch [30/200], Loss: 0.309835
Epoch [40/200], Loss: 0.314164
Epoch [50/200], Loss: 0.300228
Epoch [60/200], Loss: 0.290639
Epoch [70/200], Loss: 0.294850
Epoch [80/200], Loss: 0.278476
Epoch [90/200], Loss: 0.272100
Epoch [100/200], Loss: 0.270139
Epoch [110/200], Loss: 0.265394
Epoch [120/200], Loss: 0.261844
Epoch [130/200], Loss: 0.254873
Epoch [140/200], Loss: 0.247764
Epoch [150/200], Loss: 0.233945
Epoch [160/200], Loss: 0.234790
Epoch [170/200], Loss: 0.232264
Epoch [180/200], Loss: 0.221984
Epoch [190/200], Loss: 0.227818
Epoch [200/200], Loss: 0.223093
Entrenamiento final completado.


In [8]:
print("\nIniciando generación de pronósticos autorregresivos para 2022...")

# Preparar el dataframe de entrada para el pronóstico
df_forecast_proc = df_forecast_inputs.copy()
for col, mapping in cat_mappings.items():
    # Asegurarse de que el tipo sea category antes de usar .cat.codes
    df_forecast_proc[col] = pd.Categorical(df_forecast_proc[col], categories=mapping.keys())
    df_forecast_proc[col] = df_forecast_proc[col].cat.codes
    
df_forecast_proc.loc[:, numerical_features_to_scale] = final_scaler.transform(df_forecast_proc[numerical_features_to_scale])

# --- **CORRECCIÓN**: Crear un mapeo inverso para obtener las categorías originales a partir de los códigos ---
reverse_cat_mappings = {
    col: {code: cat for cat, code in mapping.items()}
    for col, mapping in cat_mappings.items()
}

predictions = []
final_model.eval()

# Bucle por cada barrio. El 'id_bar_code' que devuelve groupby es el código numérico (0, 1, 2...)
for id_bar_code, group in tqdm(df_forecast_proc.groupby('id_bar'), desc="Pronosticando por Barrio"):
    
    # Obtener la última secuencia de datos reales (final de 2021)
    history = group[group['anio'] == 2021].iloc[-best_window_size:]
    
    # Bucle por cada semana de 2022
    for week in range(1, 53):
        
        # Preparar la secuencia de entrada
        input_seq_df = history[categorical_cols + numerical_cols]
        input_seq_tensor = torch.tensor([input_seq_df.values], dtype=torch.float32).to(device)

        # Separar en categóricas y continuas
        x_cat = input_seq_tensor[:, :, cat_indices].long()
        x_cont = input_seq_tensor[:, :, num_indices].float()

        # Realizar la predicción
        with torch.no_grad():
            pred_scaled = final_model(x_cat, x_cont)

        # Crear un array temporal para des-escalar solo la predicción
        dummy_array = np.zeros((1, len(numerical_features_to_scale)))
        target_idx_in_scaler = numerical_features_to_scale.index(target_col)
        dummy_array[0, target_idx_in_scaler] = pred_scaled.item()
        
        # Des-escalar la predicción
        pred_descaled = final_scaler.inverse_transform(dummy_array)[0, target_idx_in_scaler]
        pred_descaled = max(0, int(round(pred_descaled))) # El dengue no puede ser negativo ni fraccionario

        # Guardar la predicción
        # --- **CORRECCIÓN**: Usar el mapeo inverso para una búsqueda directa ---
        id_bar_original = reverse_cat_mappings['id_bar'][id_bar_code]
        predictions.append({
            'id_bar': id_bar_original,
            'anio': 2022,
            'semana': week,
            'dengue': pred_descaled
        })

        # --- Actualización Autorregresiva ---
        # Obtener las covariables conocidas para la semana que estamos prediciendo
        next_step_features = group[(group['anio'] == 2022) & (group['semana'] == week)]
        if next_step_features.empty:
            # Si no hay datos para esa semana (poco probable), reutilizar la última conocida y actualizar la semana
            next_step_features = history.iloc[-1:].copy()
            next_step_features['semana'] = week
        else:
            next_step_features = next_step_features.copy()

        # Reemplazar el valor de 'dengue' con nuestra predicción escalada
        next_step_features.loc[:, target_col] = pred_scaled.item()

        # Añadir el nuevo paso de tiempo al historial y eliminar el más antiguo
        history = pd.concat([history.iloc[1:], next_step_features])

print("Pronósticos para 2022 generados.")


Iniciando generación de pronósticos autorregresivos para 2022...


Pronosticando por Barrio:   0%|          | 0/10 [00:00<?, ?it/s]


Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at C:\actions-runner\_work\pytorch\pytorch\pytorch\torch\csrc\utils\tensor_new.cpp:257.)



Pronósticos para 2022 generados.


In [9]:
from datetime import datetime

# Crear DataFrame con los resultados
df_results = pd.DataFrame(predictions)

# Crear la columna 'id' con el formato especificado
df_results['id'] = df_results.apply(
    lambda row: f"{row['id_bar']}_{row['anio']}_{row['semana']:02d}",
    axis=1
)

# Seleccionar y ordenar las columnas finales
df_submission = df_results[['id', 'dengue']]

# Guardar en archivo CSV
fecha_actual = datetime.now().strftime('%Y%m%d')
output_filename = f'pronosticos_dengue_2022_{fecha_actual}.csv'
df_submission.to_csv(output_filename, index=False)

print(f"\nArchivo de predicciones guardado como: '{output_filename}'")
print(df_submission.head())
print(f"Número total de predicciones: {len(df_submission)}")


Archivo de predicciones guardado como: 'pronosticos_dengue_2022_20250623.csv'
          id  dengue
0  0_2022_01       2
1  0_2022_02       2
2  0_2022_03       2
3  0_2022_04       2
4  0_2022_05       2
Número total de predicciones: 520
