In [None]:
# Importación de librerías
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Plot styling shared by every figure in the notebook
plt.style.use('seaborn-v0_8-darkgrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 12,
})

# Load the data
df_train = pd.read_parquet('../../Datos/df_train.parquet')
df_test = pd.read_parquet('../../Datos/df_test.parquet')
sample_submission = pd.read_csv('../../Datos/sample_submission.csv')

print("Forma de los datos:")
print(f"Train: {df_train.shape}")
print(f"Test: {df_test.shape}")
print(f"Sample submission: {sample_submission.shape}")

# Basic information about the training set.
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
print("\nInformación del dataset de entrenamiento:")
df_train.info()
print("\nPrimeras filas del dataset de entrenamiento:")
print(df_train.head())

# Descriptive statistics
print("\nEstadísticas descriptivas - Variables numéricas:")
print(df_train.describe())

# Missing values per column
print("\nValores faltantes en train:")
print(df_train.isnull().sum())
print("\nValores faltantes en test:")
print(df_test.isnull().sum())

# Build a date column for the temporal analysis
def _week_start(frame):
    """Return, per row, the Monday of the (anio, semana) pair as a datetime Series."""
    week_label = (
        frame['anio'].astype(str)
        + '-W'
        + frame['semana'].astype(str).str.zfill(2)
        + '-1'
    )
    return pd.to_datetime(week_label, format='%Y-W%W-%w')


df_train['fecha'] = _week_start(df_train)
df_test['fecha'] = _week_start(df_test)

# Overview of barrios and year coverage
n_barrios = df_train['id_bar'].nunique()
print(f"\nNúmero de barrios únicos: {n_barrios}")
print(f"Rango de años en train: {df_train['anio'].min()} - {df_train['anio'].max()}")
print(f"Rango de años en test: {df_test['anio'].min()} - {df_test['anio'].max()}")

# Time series per barrio, one panel each on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.ravel()

# Use the first four barrios (in order of appearance) as the sample
barrios_muestra = df_train['id_bar'].unique()[:4]

for ax, barrio in zip(axes, barrios_muestra):
    data_barrio = df_train.loc[df_train['id_bar'] == barrio].sort_values('fecha')
    ax.plot(data_barrio['fecha'], data_barrio['dengue'], linewidth=2)
    ax.set(
        title=f'Casos de Dengue - Barrio {barrio}',
        xlabel='Fecha',
        ylabel='Casos de Dengue',
    )
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Seasonality: mean and standard deviation of cases per calendar month
df_train['mes'] = df_train['fecha'].dt.month
casos_por_mes = df_train.groupby('mes')['dengue'].agg(['mean', 'std'])

fig_mes, ax_mes = plt.subplots(figsize=(10, 6))
ax_mes.errorbar(
    casos_por_mes.index,
    casos_por_mes['mean'],
    yerr=casos_por_mes['std'],
    fmt='o-',
    capsize=5,
    capthick=2,
)
ax_mes.set_xlabel('Mes')
ax_mes.set_ylabel('Casos promedio de Dengue')
ax_mes.set_title('Estacionalidad de casos de Dengue')
ax_mes.set_xticks(range(1, 13))
ax_mes.grid(True, alpha=0.3)
plt.show()

# Pairwise correlation between the target and the numeric covariates
_barrio_vars = ['ESTRATO', 'area_barrio', 'concentraciones',
                'vivienda', 'equipesado', 'sumideros', 'maquina']
_lluvia_vars = ['lluvia_mean', 'lluvia_var', 'lluvia_max', 'lluvia_min']
_temperatura_vars = ['temperatura_mean', 'temperatura_var',
                     'temperatura_max', 'temperatura_min']
variables_numericas = ['dengue'] + _barrio_vars + _lluvia_vars + _temperatura_vars

correlation_matrix = df_train[variables_numericas].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
)
plt.title('Matriz de Correlación de Variables')
plt.tight_layout()
plt.show()

# Target distribution: raw counts (left) and log1p-transformed counts (right)
fig_dist, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(12, 6))

ax_raw.hist(df_train['dengue'], bins=50, edgecolor='black', alpha=0.7)
ax_raw.set_xlabel('Casos de Dengue')
ax_raw.set_ylabel('Frecuencia')
ax_raw.set_title('Distribución de casos de Dengue')

ax_log.hist(np.log1p(df_train['dengue']), bins=50, edgecolor='black', alpha=0.7)
ax_log.set_xlabel('log(1 + Casos de Dengue)')
ax_log.set_ylabel('Frecuencia')
ax_log.set_title('Distribución logarítmica de casos de Dengue')

plt.tight_layout()
plt.show()

# Aggregated evolution: total cases over all barrios for each week
casos_semanales = df_train.groupby('fecha', as_index=False)['dengue'].sum()

fig_total, ax_total = plt.subplots(figsize=(14, 6))
ax_total.plot(casos_semanales['fecha'], casos_semanales['dengue'], linewidth=2)
ax_total.set_xlabel('Fecha')
ax_total.set_ylabel('Total de casos de Dengue')
ax_total.set_title('Evolución temporal del total de casos de Dengue')
ax_total.tick_params(axis='x', rotation=45)
ax_total.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import optuna
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pickle

# Select the compute device: CUDA when available, otherwise CPU
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)
print(f"Usando dispositivo: {device}")

# Data preparation for time series
class DengueTimeSeriesDataset(Dataset):
    """Sliding-window dataset built per barrio from a weekly data frame.

    Rows are sorted by ``id_bar`` and ``fecha``. For every barrio with at least
    ``sequence_length + target_length`` rows, each window of ``sequence_length``
    consecutive rows of scaled features becomes one sample, and the following
    ``target_length`` scaled ``dengue`` values become its target.

    Args:
        data: DataFrame containing ``id_bar``, ``fecha`` and every column in
            ``feature_cols`` (plus ``dengue`` unless ``is_test`` is true).
        sequence_length: Number of rows in each input window.
        target_length: Number of rows in each target.
        is_test: When true the ``dengue`` column is never read and all targets
            are zeros.
        feature_scaler: Optional already-fitted object exposing ``transform``.
            When omitted, a ``StandardScaler`` is fitted on all rows of ``data``.
        target_scaler: Optional already-fitted object exposing ``transform``.
            When omitted, a ``MinMaxScaler`` is created and (unless ``is_test``)
            fitted on all rows of ``data``.

    Attributes:
        feature_scaler, target_scaler: The scalers actually used; they describe
            the whole frame, so they can be reused for any barrio at inference.
        sequences: Array of shape (n_samples, sequence_length, n_features).
        targets: Array of shape (n_samples, target_length, 1).
    """

    def __init__(self, data, sequence_length=12, target_length=1, is_test=False,
                 feature_scaler=None, target_scaler=None):
        self.sequence_length = sequence_length
        self.target_length = target_length
        self.is_test = is_test

        # Sort by barrio and date so that windows are chronological
        self.data = data.sort_values(['id_bar', 'fecha'])

        # Model input columns
        self.feature_cols = ['ESTRATO', 'area_barrio', 'concentraciones',
                            'vivienda', 'equipesado', 'sumideros', 'maquina',
                            'lluvia_mean', 'lluvia_var', 'lluvia_max', 'lluvia_min',
                            'temperatura_mean', 'temperatura_var', 'temperatura_max',
                            'temperatura_min', 'semana', 'mes']

        self.target_col = 'dengue'

        # Scalers are fitted ONCE on the full frame. Refitting them for every
        # barrio would give each barrio its own scale and leave the stored
        # scalers describing only the last barrio processed.
        if feature_scaler is None:
            feature_scaler = StandardScaler().fit(self.data[self.feature_cols])
        self.feature_scaler = feature_scaler

        if target_scaler is None:
            target_scaler = MinMaxScaler()
            if not self.is_test:
                target_scaler.fit(self.data[[self.target_col]])
        self.target_scaler = target_scaler

        # Build the windows barrio by barrio
        window = sequence_length + target_length
        sequences = []
        targets = []

        for _, barrio_data in self.data.groupby('id_bar', sort=True):
            n_rows = len(barrio_data)
            if n_rows < window:
                continue  # not enough history for a single window

            features = self.feature_scaler.transform(barrio_data[self.feature_cols])
            if self.is_test:
                # Unlabelled data: placeholder targets, 'dengue' is not read
                barrio_targets = np.zeros((n_rows, 1))
            else:
                barrio_targets = self.target_scaler.transform(
                    barrio_data[[self.target_col]])

            for start in range(n_rows - window + 1):
                split = start + sequence_length
                sequences.append(features[start:split])
                targets.append(barrio_targets[split:split + target_length])

        self.sequences = np.array(sequences)
        self.targets = np.array(targets)

    def __len__(self):
        """Number of (window, target) samples."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a pair of float32 tensors (features, target)."""
        return (torch.as_tensor(self.sequences[idx], dtype=torch.float32),
                torch.as_tensor(self.targets[idx], dtype=torch.float32))

# ConvLSTM model
class ConvLSTMCell(nn.Module):
    """Single ConvLSTM step whose four gates come from one 1-D convolution.

    Args:
        input_dim: Number of channels of the input tensor.
        hidden_dim: Number of channels of the hidden and cell states.
        kernel_size: Odd convolution width, given either as an int or as a
            one-element tuple/list such as ``(3,)``.
        bias: Whether the convolution has a bias term.

    Raises:
        ValueError: If ``kernel_size`` is a sequence with other than one
            element, or if the kernel width is even (padding of
            ``kernel_size // 2`` would then change the sequence length and the
            states could no longer be concatenated with the next input).
    """

    def __init__(self, input_dim, hidden_dim, kernel_size, bias=True):
        super(ConvLSTMCell, self).__init__()

        # Accept the tuple form as well: `(k,) // 2` is a TypeError.
        if isinstance(kernel_size, (tuple, list)):
            if len(kernel_size) != 1:
                raise ValueError('`kernel_size` must be an int or a 1-element tuple for a 1-D convolution')
            kernel_size = kernel_size[0]
        kernel_size = int(kernel_size)
        if kernel_size % 2 == 0:
            raise ValueError('`kernel_size` must be odd so that the sequence length is preserved')

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.kernel_size = kernel_size
        self.padding = kernel_size // 2  # "same" padding for an odd kernel
        self.bias = bias

        # One convolution produces the stacked pre-activations of all 4 gates
        self.conv = nn.Conv1d(in_channels=self.input_dim + self.hidden_dim,
                              out_channels=4 * self.hidden_dim,
                              kernel_size=self.kernel_size,
                              padding=self.padding,
                              bias=self.bias)

    def forward(self, input_tensor, cur_state):
        """Advance the cell by one step.

        Args:
            input_tensor: Tensor of shape (batch, input_dim, length).
            cur_state: Pair ``(h, c)``, each of shape (batch, hidden_dim, length).

        Returns:
            ``(h_next, c_next)`` with the same shapes as the incoming states.
        """
        h_cur, c_cur = cur_state

        # Concatenate along the channel axis and convolve once
        combined = torch.cat([input_tensor, h_cur], dim=1)
        combined_conv = self.conv(combined)

        # Split into input, forget, output and candidate pre-activations
        cc_i, cc_f, cc_o, cc_g = torch.split(combined_conv, self.hidden_dim, dim=1)
        i = torch.sigmoid(cc_i)
        f = torch.sigmoid(cc_f)
        o = torch.sigmoid(cc_o)
        g = torch.tanh(cc_g)

        c_next = f * c_cur + i * g
        h_next = o * torch.tanh(c_next)

        return h_next, c_next

    def init_hidden(self, batch_size, length):
        """Return zero ``(h, c)`` states on the same device as the conv weights."""
        state_shape = (batch_size, self.hidden_dim, length)
        weight_device = self.conv.weight.device
        return (torch.zeros(state_shape, device=weight_device),
                torch.zeros(state_shape, device=weight_device))

class ConvLSTM(nn.Module):
    """Stack of ``ConvLSTMCell`` layers unrolled over the time axis.

    Args:
        input_dim: Number of channels of the input at each time step.
        hidden_dims: Hidden size shared by all layers, or a list with one
            hidden size per layer.
        kernel_sizes: A tuple shared by all layers, or a list of tuples with
            one entry per layer.
        num_layers: Number of stacked cells.
        batch_first: If false, the first two input axes are swapped on entry.
        bias: Whether the cell convolutions use a bias term.
        return_all_layers: If true, ``forward`` returns the outputs of every
            layer instead of only the last one.

    Raises:
        ValueError: If ``kernel_sizes`` is not a tuple or list of tuples, or if
            a per-layer list does not contain exactly ``num_layers`` entries.
    """

    def __init__(self, input_dim, hidden_dims, kernel_sizes, num_layers,
                 batch_first=True, bias=True, return_all_layers=False):
        super(ConvLSTM, self).__init__()

        self._check_kernel_size_consistency(kernel_sizes)

        # Broadcast single values so that each layer has its own entry
        kernel_sizes = self._extend_for_multilayer(kernel_sizes, num_layers)
        hidden_dims = self._extend_for_multilayer(hidden_dims, num_layers)

        # Fail early with a clear message instead of an IndexError while
        # building the cells.
        if not (len(kernel_sizes) == len(hidden_dims) == num_layers):
            raise ValueError(
                '`hidden_dims` and `kernel_sizes` must provide one entry per layer '
                f'(num_layers={num_layers}, got {len(hidden_dims)} hidden dims '
                f'and {len(kernel_sizes)} kernel sizes)'
            )

        self.input_dim = input_dim
        self.hidden_dims = hidden_dims
        self.kernel_sizes = kernel_sizes
        self.num_layers = num_layers
        self.batch_first = batch_first
        self.bias = bias
        self.return_all_layers = return_all_layers

        cell_list = []
        for i in range(0, self.num_layers):
            # The first layer reads the input; later layers read the layer below
            cur_input_dim = self.input_dim if i == 0 else self.hidden_dims[i - 1]

            cell_list.append(ConvLSTMCell(input_dim=cur_input_dim,
                                         hidden_dim=self.hidden_dims[i],
                                         kernel_size=self.kernel_sizes[i],
                                         bias=self.bias))

        self.cell_list = nn.ModuleList(cell_list)

    def forward(self, input_tensor, hidden_state=None):
        """Run every layer over the full sequence.

        Args:
            input_tensor: 4-D tensor, (batch, seq_len, channels, length) when
                ``batch_first`` is true, otherwise with the first two axes
                swapped.
            hidden_state: Optional list with one ``(h, c)`` pair per layer;
                zero states are created when omitted.

        Returns:
            ``(outputs, last_states)``. ``outputs`` is the last layer's tensor
            of shape (batch, seq_len, hidden_dim, length), or the list of all
            layers' tensors when ``return_all_layers`` is true. ``last_states``
            is a list of ``[h, c]`` per layer.
        """
        if not self.batch_first:
            input_tensor = input_tensor.permute(1, 0, 2, 3)

        batch_size, seq_len, channels, length = input_tensor.size()

        if hidden_state is None:
            hidden_state = self._init_hidden(batch_size, length)

        layer_output_list = []
        last_state_list = []

        cur_layer_input = input_tensor

        for layer_idx in range(self.num_layers):
            h, c = hidden_state[layer_idx]
            output_inner = []

            # Unroll this layer through time, carrying (h, c) forward
            for t in range(seq_len):
                h, c = self.cell_list[layer_idx](input_tensor=cur_layer_input[:, t, :, :],
                                                 cur_state=[h, c])
                output_inner.append(h)

            # Stack on the time axis; this feeds the next layer
            layer_output = torch.stack(output_inner, dim=1)
            cur_layer_input = layer_output

            layer_output_list.append(layer_output)
            last_state_list.append([h, c])

        if self.return_all_layers:
            return layer_output_list, last_state_list
        else:
            return layer_output_list[-1], last_state_list

    def _init_hidden(self, batch_size, length):
        """Create the zero ``(h, c)`` pair of every layer."""
        init_states = []
        for i in range(self.num_layers):
            init_states.append(self.cell_list[i].init_hidden(batch_size, length))
        return init_states

    @staticmethod
    def _check_kernel_size_consistency(kernel_sizes):
        """Raise ValueError unless ``kernel_sizes`` is a tuple or a list of tuples."""
        if not (isinstance(kernel_sizes, tuple) or
                (isinstance(kernel_sizes, list) and all([isinstance(elem, tuple) for elem in kernel_sizes]))):
            raise ValueError('`kernel_sizes` must be tuple or list of tuples')

    @staticmethod
    def _extend_for_multilayer(param, num_layers):
        """Repeat a non-list ``param`` ``num_layers`` times; lists pass through unchanged."""
        if not isinstance(param, list):
            param = [param] * num_layers
        return param

# Full model
class DenguePredictor(nn.Module):
    """ConvLSTM encoder followed by a three-layer dense head with one output.

    Args:
        input_dim: Number of features per time step.
        hidden_dims: List of ConvLSTM hidden sizes; the last entry sizes the head.
        kernel_sizes: Kernel specification forwarded to ``ConvLSTM``.
        num_layers: Number of ConvLSTM layers.
        dropout: Dropout probability applied before the first two dense layers.
    """

    def __init__(self, input_dim, hidden_dims, kernel_sizes, num_layers, dropout=0.2):
        super().__init__()

        # Kept as an attribute; forward() adds the trailing axis with indexing
        self.input_reshape = nn.Unflatten(2, (input_dim, 1))

        # Recurrent encoder
        self.convlstm = ConvLSTM(
            input_dim,
            hidden_dims,
            kernel_sizes,
            num_layers,
            batch_first=True,
        )

        # Dense head: hidden_dims[-1] -> 64 -> 32 -> 1
        self.dropout = nn.Dropout(dropout)
        head_width = hidden_dims[-1]
        self.fc1 = nn.Linear(head_width, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.activation = nn.ReLU()

    def forward(self, x):
        """Map a (batch, seq_len, features) tensor to a (batch, 1) prediction."""
        # Unpacking doubles as a check that the input is 3-D
        _n_batch, _n_steps, _n_features = x.shape

        # Add a length-1 trailing axis: (batch, seq_len, features, 1)
        sequence = x[..., None]

        encoded, _states = self.convlstm(sequence)

        # Last time step, trailing axis dropped: (batch, hidden_dim)
        final_step = encoded[:, -1, :, :].squeeze(-1)

        hidden = self.activation(self.fc1(self.dropout(final_step)))
        hidden = self.activation(self.fc2(self.dropout(hidden)))
        return self.fc3(hidden)

# Objective function for Optuna
def objective(trial):
    """Train one candidate model and return its best validation MSE.

    Hyperparameters sampled from ``trial``: ``hidden_dim1``, ``hidden_dim2``,
    ``kernel_size``, ``num_layers``, ``sequence_length``, ``batch_size``,
    ``dropout`` and ``lr``.

    Validation uses the last year present in ``df_train`` as a labelled
    hold-out; the remaining years are used for training. The test frame is not
    used here, so model selection never depends on unlabelled or future data.
    Validation windows are built only from rows of the hold-out year.

    Args:
        trial: The ``optuna.trial.Trial`` driving this evaluation.

    Returns:
        The lowest mean validation loss observed over the epochs.

    Raises:
        optuna.exceptions.TrialPruned: When the pruner stops the trial, or when
            the sampled ``sequence_length`` leaves no training or validation
            windows.
    """
    # Hyperparameters to optimise
    hidden_dim1 = trial.suggest_int('hidden_dim1', 32, 128, step=32)
    hidden_dim2 = trial.suggest_int('hidden_dim2', 16, 64, step=16)
    kernel_size = trial.suggest_int('kernel_size', 3, 7, step=2)
    # Only two hidden sizes are sampled, so at most two layers can be built
    num_layers = trial.suggest_int('num_layers', 1, 2)
    sequence_length = trial.suggest_int('sequence_length', 8, 24, step=4)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 128])
    dropout = trial.suggest_float('dropout', 0.1, 0.5)
    lr = trial.suggest_float('lr', 1e-4, 1e-2, log=True)

    # Temporal split of the labelled data: last year -> validation
    holdout_mask = df_train['anio'] == df_train['anio'].max()
    train_dataset = DengueTimeSeriesDataset(df_train[~holdout_mask],
                                            sequence_length=sequence_length)
    val_dataset = DengueTimeSeriesDataset(df_train[holdout_mask],
                                          sequence_length=sequence_length)
    if len(train_dataset) == 0 or len(val_dataset) == 0:
        # Nothing to train or score on for this sequence length
        raise optuna.exceptions.TrialPruned()

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Build the model
    model = DenguePredictor(
        input_dim=len(train_dataset.feature_cols),
        hidden_dims=[hidden_dim1, hidden_dim2][:num_layers],
        kernel_sizes=(kernel_size,),
        num_layers=num_layers,
        dropout=dropout
    ).to(device)

    # Training setup
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5)

    n_epochs = 50
    early_stopping_patience = 10
    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(n_epochs):
        # Training pass
        model.train()
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            y_pred = model(X_batch)
            loss = criterion(y_pred.squeeze(), y_batch.squeeze())
            loss.backward()
            optimizer.step()

        # Validation pass
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                y_pred = model(X_batch)
                loss = criterion(y_pred.squeeze(), y_batch.squeeze())
                val_loss += loss.item()

        val_loss /= len(val_loader)
        scheduler.step(val_loss)

        # Early stopping bookkeeping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= early_stopping_patience:
            break

        # Report to Optuna so the pruner can act on intermediate values
        trial.report(val_loss, epoch)

        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return best_val_loss

# The dataset class needs a 'mes' column in the test frame as well
df_test['mes'] = df_test['fecha'].dt.month

# Hyperparameter search: at most 30 trials or one hour, whichever comes first
print("Iniciando optimización de hiperparámetros con Optuna...")
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30, timeout=60 * 60)

print("Mejores hiperparámetros encontrados:", study.best_params, sep="\n")
print(f"Mejor valor de pérdida: {study.best_value}")

# Persist the study object
study_bytes = pickle.dumps(study)
with open('optuna_study.pkl', 'wb') as study_file:
    study_file.write(study_bytes)

In [None]:
# Combine all data (kept as `df_all`, which later cells use as history)
df_all = pd.concat([df_train, df_test], ignore_index=True)
df_all = df_all.sort_values(['id_bar', 'fecha'])

# Use the best hyperparameters found by the study
best_params = study.best_params
print("Entrenando modelo final con los mejores hiperparámetros...")

# Train only on rows with a known target. Rows that carry no 'dengue' value
# become NaN after the concat and would turn the MSE loss into NaN.
df_labeled = df_all[df_all['dengue'].notna()]

final_dataset = DengueTimeSeriesDataset(
    df_labeled,
    sequence_length=best_params['sequence_length']
)
assert len(final_dataset) > 0, "no training windows could be built from the labelled rows"

final_loader = DataLoader(
    final_dataset,
    batch_size=best_params['batch_size'],
    shuffle=True
)

# One hidden size per layer: repeat the last size if more layers were selected
# than hidden sizes were tuned.
n_layers = best_params['num_layers']
layer_dims = [best_params['hidden_dim1'],
              best_params.get('hidden_dim2', best_params['hidden_dim1'] // 2)]
layer_dims = (layer_dims + [layer_dims[-1]] * n_layers)[:n_layers]

# Build the final model
final_model = DenguePredictor(
    input_dim=len(final_dataset.feature_cols),
    hidden_dims=layer_dims,
    kernel_sizes=(best_params['kernel_size'],),
    num_layers=n_layers,
    dropout=best_params['dropout']
).to(device)

# Training setup
criterion = nn.MSELoss()
optimizer = optim.Adam(final_model.parameters(), lr=best_params['lr'])
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=10, factor=0.5)

# Train the final model
n_epochs = 100
train_losses = []

for epoch in range(n_epochs):
    final_model.train()
    epoch_loss = 0

    for X_batch, y_batch in final_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        y_pred = final_model(X_batch)
        loss = criterion(y_pred.squeeze(), y_batch.squeeze())
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(final_loader)
    train_losses.append(avg_loss)
    # No validation set here: the scheduler follows the training loss
    scheduler.step(avg_loss)

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/{n_epochs}, Loss: {avg_loss:.6f}")

# Save the model together with everything needed to rebuild its inputs
torch.save({
    'model_state_dict': final_model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'best_params': best_params,
    'feature_scaler': final_dataset.feature_scaler,
    'target_scaler': final_dataset.target_scaler,
    'feature_cols': final_dataset.feature_cols
}, 'final_dengue_model.pth')

# Plot the training loss
plt.figure(figsize=(10, 6))
plt.plot(train_losses)
plt.xlabel('Época')
plt.ylabel('Pérdida MSE')
plt.title('Pérdida de entrenamiento del modelo final')
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# Prediction generation
def generate_predictions_2022(model, df_historical, scalers, sequence_length, feature_cols,
                              year=2022, n_weeks=52):
    """Roll a fixed-length feature window forward and predict one value per week.

    For every barrio with at least ``sequence_length`` rows, the last
    ``sequence_length`` rows (by ``fecha``) form the initial window. After each
    prediction the oldest row is dropped and a copy of the newest row is
    appended with ``semana`` and ``mes`` set to the predicted week (and
    ``dengue`` set to the prediction), whenever those names are in
    ``feature_cols``; every other feature keeps the value of the last row.
    Barrios with fewer rows are skipped and produce no ids.

    Args:
        model: Module mapping a (1, sequence_length, n_features) float tensor
            to a single scaled prediction.
        df_historical: DataFrame with ``id_bar``, ``fecha`` and ``feature_cols``.
        scalers: Dict with ``'feature_scaler'`` (``transform``) and
            ``'target_scaler'`` (``inverse_transform``).
        sequence_length: Number of rows in the model input window.
        feature_cols: Ordered feature column names expected by the model.
        year: Year used in the ids and for deriving the month of each week.
        n_weeks: Number of weeks to predict, starting at week 1.

    Returns:
        ``(ids, predictions)``: ids formatted as ``"{barrio}_{year}_{week:02d}"``
        and the matching non-negative predictions as Python floats.
    """
    feature_cols = list(feature_cols)
    position = {name: idx for idx, name in enumerate(feature_cols)}
    week_pos = position.get('semana')
    month_pos = position.get('mes')
    target_pos = position.get('dengue')

    # Month of the Monday of each target week; computed once, not per barrio
    month_of_week = {
        week: pd.to_datetime(f'{year}-W{week:02d}-1', format='%Y-W%W-%w').month
        for week in range(1, n_weeks + 1)
    }

    # Run on whatever device the model lives on instead of relying on a global
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    feature_scaler = scalers['feature_scaler']
    target_scaler = scalers['target_scaler']

    ids = []
    predictions = []

    model.eval()
    with torch.no_grad():
        # sort=False keeps barrios in order of first appearance
        for barrio, barrio_data in df_historical.groupby('id_bar', sort=False):
            if len(barrio_data) < sequence_length:
                continue

            window = (barrio_data.sort_values('fecha')[feature_cols]
                      .tail(sequence_length)
                      .to_numpy(dtype=float))

            for week in range(1, n_weeks + 1):
                scaled = feature_scaler.transform(window)
                X = torch.as_tensor(scaled, dtype=torch.float32,
                                    device=run_device).unsqueeze(0)

                y_scaled = model(X).cpu().numpy().reshape(-1, 1)
                y_pred = float(target_scaler.inverse_transform(y_scaled)[0, 0])
                y_pred = max(0.0, y_pred)  # case counts cannot be negative

                ids.append(f"{barrio}_{year}_{week:02d}")
                predictions.append(y_pred)

                # Slide the window: drop the oldest row, append the new week
                next_row = window[-1].copy()
                if week_pos is not None:
                    next_row[week_pos] = week
                if month_pos is not None:
                    next_row[month_pos] = month_of_week[week]
                if target_pos is not None:
                    next_row[target_pos] = y_pred
                window = np.vstack([window[1:], next_row])

    return ids, predictions

# Load the model and the scalers.
# The checkpoint stores pickled scaler objects next to the weights, so it
# cannot be read with weights_only=True. Unpickling executes code: only load
# checkpoint files produced by this notebook.
checkpoint = torch.load('final_dengue_model.pth', map_location=device, weights_only=False)
final_model.load_state_dict(checkpoint['model_state_dict'])
final_model.eval()

scalers = {
    'feature_scaler': checkpoint['feature_scaler'],
    'target_scaler': checkpoint['target_scaler']
}

# Generate the predictions
print("Generando predicciones para 2022...")
pred_ids, pred_values = generate_predictions_2022(
    final_model,
    df_all,
    scalers,
    best_params['sequence_length'],
    checkpoint['feature_cols']
)

# Build the submission frame
submission_df = pd.DataFrame({
    'id': pred_ids,
    'dengue': pred_values
})

# Compare against the expected number of rows
print(f"Predicciones generadas: {len(submission_df)}")
print(f"Formato esperado: {len(sample_submission)}")

# Align with the order of sample_submission. reindex (unlike .loc) does not
# raise when an expected id has no prediction, e.g. a barrio skipped for lack
# of history; such rows are reported and filled with 0 cases.
expected_ids = pd.Index(sample_submission['id'], name='id')
submission_df = submission_df.set_index('id').reindex(expected_ids)
n_missing = int(submission_df['dengue'].isna().sum())
if n_missing:
    print(f"Aviso: {n_missing} ids sin predicción; se rellenan con 0")
submission_df = submission_df.fillna({'dengue': 0.0}).reset_index()

# Save the submission
submission_df.to_csv('submission_convlstm.csv', index=False)
print("Archivo de submission guardado como 'submission_convlstm.csv'")

# Plot history and forecast for a few barrios
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.ravel()

barrios_muestra = df_all['id_bar'].unique()[:4]

for i, barrio in enumerate(barrios_muestra):
    # Historical data
    hist_data = df_all[df_all['id_bar'] == barrio].sort_values('fecha')

    # 2022 predictions; .copy() so the new column is added to an independent
    # frame rather than to a filtered view of submission_df
    pred_data = submission_df[submission_df['id'].str.startswith(f"{barrio}_2022")].copy()
    # Date each forecast with the same year-week convention used for 'fecha'
    week_label = pred_data['id'].str.extract(r'_(\d+)$')[0].str.zfill(2)
    pred_data['fecha'] = pd.to_datetime('2022-W' + week_label + '-1', format='%Y-W%W-%w')

    # Draw both series
    axes[i].plot(hist_data['fecha'], hist_data['dengue'],
                 label='Histórico', linewidth=2)
    axes[i].plot(pred_data['fecha'], pred_data['dengue'],
                 label='Predicción 2022', linewidth=2, linestyle='--', color='red')
    axes[i].set_title(f'Barrio {barrio}')
    axes[i].set_xlabel('Fecha')
    axes[i].set_ylabel('Casos de Dengue')
    axes[i].legend()
    axes[i].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Summary statistics of the predictions
print("\nEstadísticas de las predicciones:")
print(submission_df['dengue'].describe())

# Compare with the historical statistics
print("\nEstadísticas históricas (train):")
print(df_train['dengue'].describe())