In [None]:
# ============================================================
# NOTEBOOK 17: TRANSFORMER - VERSIÓN SIMPLIFICADA (4 AÑOS / 1 AÑO)
# ============================================================
# OBJETIVO: Entrenar con 4 años exactos, predecir 1 año exacto
# CARACTERÍSTICAS: Simple, robusto, sin errores
# ============================================================

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# ============================================================
# CONFIGURATION
# ============================================================
print("=" * 60)
print("CONFIGURACI√ìN")
print("=" * 60)

# Series (product id) that this notebook models
PRODUCTO = "FOODS_3_090_CA_3_validation"

# Windowing: days of history given to the model / days predicted per window
LOOKBACK, HORIZON = 60, 7

# Optimisation hyperparameters
BATCH_SIZE, EPOCHS, LR = 16, 100, 0.001

# Chronological split: one year (365 days) for test, four times that for train
TEST_DAYS = 365
TRAIN_DAYS = 4 * TEST_DAYS
TOTAL_DAYS = TRAIN_DAYS + TEST_DAYS

# Compute device: CUDA when available, CPU otherwise
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Echo the effective configuration
print(f"device: {device}")
print(f"lookback: {LOOKBACK}, horizon: {HORIZON}")
print(f"train: {TRAIN_DAYS} d√≠as, test: {TEST_DAYS} d√≠as")

# ============================================================
# LOAD DATA
# ============================================================
print("\n" + "=" * 60)
print("CARGANDO DATOS")
print("=" * 60)

# Read the sales table, parse the date column and order rows by series and day
df = (
    pd.read_csv("../data/raw/sales_train_validation.csv")
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values(['id', 'date'])
    .reset_index(drop=True)
)

# Keep only the rows of the selected product (copied so later column
# assignments do not touch `df`)
df_prod = df.loc[df['id'] == PRODUCTO].copy()
n_rows = len(df_prod)
print(f"producto: {PRODUCTO}")
print(f"registros: {n_rows}")
first_day, last_day = df_prod['date'].min(), df_prod['date'].max()
print(f"rango: {first_day.date()} ‚Üí {last_day.date()}")

# Abort early when the series is shorter than the train + test span
if n_rows < TOTAL_DAYS:
    raise ValueError(f"Se necesitan {TOTAL_DAYS} d√≠as, pero solo hay {n_rows}")

# ============================================================
# FEATURES (essentials only)
# ============================================================
print("\n" + "="*60)
print("CREANDO FEATURES")
print("="*60)

# Calendar features derived from the date column
df_prod['dow'] = df_prod['date'].dt.dayofweek
df_prod['month'] = df_prod['date'].dt.month
df_prod['is_weekend'] = (df_prod['dow'] >= 5).astype(int)  # dayofweek 5/6 = Sat/Sun

# Lagged sales (7, 14 and 28 rows back)
df_prod['lag_7'] = df_prod['sales'].shift(7)
df_prod['lag_14'] = df_prod['sales'].shift(14)
df_prod['lag_28'] = df_prod['sales'].shift(28)

# Fill gaps: forward-fill, then backward-fill the leading NaNs, then 0 for
# anything still missing. `fillna(method=...)` is deprecated in pandas >= 2.1;
# `ffill()` / `bfill()` are the supported equivalents.
# NOTE(review): the backward fill gives the first rows of each lag column a
# value taken from a later row — confirm this is acceptable for the earliest
# training days.
df_prod = df_prod.ffill().bfill().fillna(0)

# Model inputs (the target column `sales` is deliberately NOT included)
feature_cols = ['sell_price', 'snap', 'dow', 'month', 'is_weekend',
                'lag_7', 'lag_14', 'lag_28']

print(f"features: {feature_cols}")

# ============================================================
# 4-YEAR / 1-YEAR SPLIT (critical!)
# ============================================================
print("\n" + "=" * 60)
print("SPLIT 4 A√ëOS / 1 A√ëO")
print("=" * 60)

# Keep only the last TOTAL_DAYS rows of the product series
start_idx = len(df_prod) - TOTAL_DAYS
df_final = df_prod.iloc[start_idx:].reset_index(drop=True)

# Strict chronological split: first TRAIN_DAYS rows train, the rest test
train_data, test_data = df_final.iloc[:TRAIN_DAYS], df_final.iloc[TRAIN_DAYS:]

# Report size and date range of each part
for label, part in (("TRAIN:", train_data), ("TEST: ", test_data)):
    print(f"{label} {len(part)} d√≠as")
    print(f"      {part['date'].min().date()} ‚Üí {part['date'].max().date()}")

# ============================================================
# PREPARE DATA FOR THE TRANSFORMER
# ============================================================
print("\n" + "=" * 60)
print("PREPARANDO DATOS")
print("=" * 60)

# One scaler for the features, one for the target
scaler_X, scaler_y = StandardScaler(), StandardScaler()

# Raw arrays; the target is reshaped to a single column for the scaler
X_train_raw, X_test_raw = (
    part[feature_cols].values for part in (train_data, test_data)
)
y_train_raw, y_test_raw = (
    part['sales'].values.reshape(-1, 1) for part in (train_data, test_data)
)

# Scaling statistics come from the training split ONLY; the test split is
# transformed with those same statistics
X_train = scaler_X.fit_transform(X_train_raw)
y_train = scaler_y.fit_transform(y_train_raw).flatten()
X_test = scaler_X.transform(X_test_raw)
y_test = scaler_y.transform(y_test_raw).flatten()

print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

# ============================================================
# CREAR VENTANAS
# ============================================================
def crear_ventanas(X, y, lookback, horizon):
    """Build stride-1 sliding windows over aligned feature/target sequences.

    For every start position ``s`` the input window is ``X[s:s+lookback]`` and
    the target window is the ``horizon`` values of ``y`` that follow it.

    Returns a pair ``(inputs, targets)`` of NumPy arrays. When the sequence is
    shorter than ``lookback + horizon`` no window fits and both arrays are
    empty.
    """
    starts = range(len(X) - lookback - horizon + 1)
    inputs = [X[s:s + lookback] for s in starts]
    targets = [y[s + lookback:s + lookback + horizon] for s in starts]
    return np.array(inputs), np.array(targets)

# Training windows (sliding, stride 1)
X_train_w, y_train_w = crear_ventanas(X_train, y_train, LOOKBACK, HORIZON)

# Test windows: non-overlapping blocks of HORIZON days covering the test split.
# The last LOOKBACK training rows are prepended as context so that the very
# first test day already has a full history window. Taking the history from
# inside the test split instead would leave the first LOOKBACK test days
# without any prediction.
if len(X_train) < LOOKBACK:
    raise ValueError(
        f"Se necesitan al menos {LOOKBACK} filas de train como contexto, "
        f"pero solo hay {len(X_train)}"
    )
X_test_ctx = np.concatenate([X_train[-LOOKBACK:], X_test])

X_test_w, y_test_w = [], []
for i in range(0, len(y_test) - HORIZON + 1, HORIZON):
    # Row j of X_test_ctx is test day j - LOOKBACK, so this slice holds the
    # LOOKBACK days immediately before test day i.
    X_test_w.append(X_test_ctx[i:i+LOOKBACK])
    y_test_w.append(y_test[i:i+HORIZON])
X_test_w = np.array(X_test_w)
y_test_w = np.array(y_test_w)

print(f"train windows: {X_train_w.shape}")
print(f"test windows:  {X_test_w.shape}")

# Chronological train/val split (80/20). An explicit split index is used
# because slicing with -val_size inverts when val_size == 0
# (X[-0:] is everything and X[:-0] is empty).
# NOTE(review): windows are stride-1, so the first validation windows share
# days with the last training windows — consider a gap if that matters.
val_size = int(0.2 * len(X_train_w))
split_idx = len(X_train_w) - val_size
X_val_w = X_train_w[split_idx:]
y_val_w = y_train_w[split_idx:]
X_train_w = X_train_w[:split_idx]
y_train_w = y_train_w[:split_idx]

print(f"final train: {X_train_w.shape[0]} windows")
print(f"final val:   {X_val_w.shape[0]} windows")
print(f"final test:  {X_test_w.shape[0]} windows")

# ============================================================
# DATALOADERS
# ============================================================
# Wrap each (inputs, targets) pair of window arrays as float tensors
train_ds = TensorDataset(torch.FloatTensor(X_train_w), torch.FloatTensor(y_train_w))
val_ds = TensorDataset(torch.FloatTensor(X_val_w), torch.FloatTensor(y_val_w))
test_ds = TensorDataset(torch.FloatTensor(X_test_w), torch.FloatTensor(y_test_w))

# Only the training loader shuffles; validation and test keep window order
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

# ============================================================
# SIMPLE TRANSFORMER MODEL
# ============================================================
print("\n" + "="*60)
print("CREANDO MODELO")
print("="*60)

class SimpleTransformer(nn.Module):
    """Encoder-only Transformer that maps a feature window to a forecast.

    The input is projected to ``d_model`` dimensions, a learned positional
    embedding is added, the sequence goes through a ``nn.TransformerEncoder``
    and the encoding of the last time step is mapped linearly to ``horizon``
    outputs.

    Args:
        input_dim: number of features per time step.
        d_model: width of the encoder.
        nhead: number of attention heads.
        num_layers: number of stacked encoder layers.
        horizon: number of values predicted per window.
        max_len: longest sequence the learned positional table supports.
        dropout: dropout rate passed to each encoder layer.
    """

    def __init__(self, input_dim, d_model=64, nhead=4, num_layers=3, horizon=7,
                 max_len=5000, dropout=0.1):
        super().__init__()
        self.input_proj = nn.Linear(input_dim, d_model)
        # Learned positional embedding, one row per position, small random init
        self.pos_encoder = nn.Parameter(torch.randn(1, max_len, d_model) * 0.1)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, batch_first=True, dropout=dropout
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(d_model, horizon)

    def forward(self, x):
        """Return a ``[batch, horizon]`` forecast for ``x`` of shape
        ``[batch, seq_len, features]``.

        Raises:
            ValueError: if ``seq_len`` exceeds the positional table length.
        """
        seq_len = x.size(1)
        max_len = self.pos_encoder.size(1)
        if seq_len > max_len:
            # Without this check the addition below fails with an opaque
            # shape-mismatch error.
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {max_len}"
            )
        x = self.input_proj(x)
        x = x + self.pos_encoder[:, :seq_len, :]
        x = self.encoder(x)
        x = x[:, -1, :]  # keep only the last time step
        return self.fc(x)

# Build the network with one input per feature column and move it to the device
model = SimpleTransformer(
    len(feature_cols), d_model=64, nhead=4, num_layers=3, horizon=HORIZON
)
model = model.to(device)

# Report the total number of parameters
n_params = sum(param.numel() for param in model.parameters())
print(f"par√°metros: {n_params:,}")

# Mean-squared-error loss on the scaled targets, optimised with Adam
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

# ============================================================
# TRAINING
# ============================================================
print("\n" + "=" * 60)
print("ENTRENANDO")
print("=" * 60)

best_val_loss = float('inf')
train_losses, val_losses = [], []

for epoch in range(1, EPOCHS + 1):
    # --- training pass: one optimiser step per batch ---
    model.train()
    train_loss = 0
    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        optimizer.zero_grad()
        batch_loss = criterion(model(batch_x), batch_y)
        batch_loss.backward()
        optimizer.step()
        train_loss += batch_loss.item()

    # --- validation pass: no gradients, no weight updates ---
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch_x, batch_y in val_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            val_loss += criterion(model(batch_x), batch_y).item()

    # Average the summed batch losses over the number of batches
    train_loss = train_loss / len(train_loader)
    val_loss = val_loss / len(val_loader)
    train_losses.append(train_loss)
    val_losses.append(val_loss)

    # Checkpoint whenever the validation loss improves
    improved = val_loss < best_val_loss
    if improved:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "best_transformer.pth")

    # Log the first epoch and every tenth one
    if epoch == 1 or epoch % 10 == 0:
        print(f"epoch {epoch:3d}/{EPOCHS} | train loss: {train_loss:.6f} | val loss: {val_loss:.6f}")

print(f"\n‚úÖ mejor val loss: {best_val_loss:.6f}")

# ============================================================
# PREDICTIONS
# ============================================================
print("\n" + "="*60)
print("PREDICIENDO")
print("="*60)

# Restore the checkpoint with the lowest validation loss. map_location makes
# the load work even if the file was written on a different device than the
# one in use now (e.g. saved on GPU, reloaded on a CPU-only machine).
model.load_state_dict(torch.load("best_transformer.pth", map_location=device))
model.eval()

# train_loader shuffles, so iterating it would return the training
# predictions in random order. Iterate the same dataset without shuffling so
# the rows of train_preds / train_trues stay in window order.
train_eval_loader = DataLoader(train_loader.dataset, batch_size=BATCH_SIZE, shuffle=False)


def _predict(loader):
    """Run the model over `loader`; return (predictions, targets) as arrays."""
    preds, trues = [], []
    with torch.no_grad():
        for Xb, yb in loader:
            preds.append(model(Xb.to(device)).cpu().numpy())
            trues.append(yb.numpy())
    return np.concatenate(preds), np.concatenate(trues)


def _to_sales_units(values):
    """Undo the target scaling while keeping the [windows, horizon] shape."""
    return scaler_y.inverse_transform(values.reshape(-1, 1)).reshape(values.shape)


# Inference
train_preds, train_trues = _predict(train_eval_loader)
test_preds, test_trues = _predict(test_loader)

# Back to the original sales scale
train_preds = _to_sales_units(train_preds)
train_trues = _to_sales_units(train_trues)
test_preds = _to_sales_units(test_preds)
test_trues = _to_sales_units(test_trues)

# Sales cannot be negative: clip predictions at zero
train_preds = np.maximum(train_preds, 0)
test_preds = np.maximum(test_preds, 0)

# ============================================================
# METRICS
# ============================================================
print("\n" + "=" * 60)
print("M√âTRICAS")
print("=" * 60)

def metricas(y_true, y_pred, nombre):
    """Print and return ``(MAE, RMSE, MAPE)`` for a set of predictions.

    Both arrays are flattened before scoring. MAPE is computed only over the
    positions where the true value is strictly positive (to avoid dividing by
    zero) and is NaN when there is no such position. ``nombre`` is the label
    printed in front of the scores.
    """
    actual = y_true.flatten()
    forecast = y_pred.flatten()

    mae = mean_absolute_error(actual, forecast)
    rmse = np.sqrt(mean_squared_error(actual, forecast))

    positive = actual > 0
    if positive.any():
        rel_err = np.abs((actual[positive] - forecast[positive]) / actual[positive])
        mape = np.mean(rel_err) * 100
    else:
        mape = np.nan

    print(f"{nombre}: MAE={mae:.2f}, RMSE={rmse:.2f}, MAPE={mape:.2f}%")
    return mae, rmse, mape

# Overall scores on the training windows
print("TRAIN:")
mae_tr, rmse_tr, mape_tr = metricas(train_trues, train_preds, "  ")

# Overall scores on the test windows
print("\nTEST:")
mae_te, rmse_te, mape_te = metricas(test_trues, test_preds, "  ")

# Test MAE broken down by forecast step (column of the window arrays)
print("\nPor horizonte (TEST):")
for step in range(1, HORIZON + 1):
    col = step - 1
    mae_h = mean_absolute_error(test_trues[:, col], test_preds[:, col])
    print(f"  d√≠a +{step}: MAE={mae_h:.2f}")

# ============================================================
# PLOTS
# ============================================================
print("\n" + "=" * 60)
print("GR√ÅFICAS")
print("=" * 60)

# Training and validation loss per epoch
fig_loss, ax_loss = plt.subplots(figsize=(12, 4))
ax_loss.plot(train_losses, label='train loss')
ax_loss.plot(val_losses, label='val loss')
ax_loss.set_title('Transformer Training Loss')
ax_loss.set_xlabel('epoch')
ax_loss.set_ylabel('loss')
ax_loss.legend()
ax_loss.grid(True)
fig_loss.tight_layout()
fig_loss.savefig('transformer_loss.png', dpi=150)
plt.show()

# Test predictions: the first 20 windows, flattened into one curve
# (20 windows x HORIZON days each)
fig_pred, ax_pred = plt.subplots(figsize=(14, 5))
ax_pred.plot(test_trues[:20].flatten(), label='real', color='black')
ax_pred.plot(test_preds[:20].flatten(), label='predicho', color='red', linestyle='--')
ax_pred.set_title('Transformer - Predicciones Test (primeros 20 windows)')
ax_pred.set_xlabel('d√≠as')
ax_pred.set_ylabel('ventas')
ax_pred.legend()
ax_pred.grid(True)
fig_pred.tight_layout()
fig_pred.savefig('transformer_predictions.png', dpi=150)
plt.show()

# ============================================================
# FINAL SUMMARY
# ============================================================
print("\n" + "=" * 60)
print("RESUMEN FINAL")
print("=" * 60)

# Date range of each split, as shown in the report
train_start = train_data['date'].min().date()
train_end = train_data['date'].max().date()
test_start = test_data['date'].min().date()
test_end = test_data['date'].max().date()

# Product, split ranges, test scores and the files written by this notebook
resumen = f"""
üìä TRANSFORMER - 4 A√ëOS / 1 A√ëO

producto: {PRODUCTO}
train: {TRAIN_DAYS} d√≠as ({train_start} ‚Üí {train_end})
test:  {TEST_DAYS} d√≠as ({test_start} ‚Üí {test_end})

resultados TEST:
   MAE:  {mae_te:.2f}
   RMSE: {rmse_te:.2f}
   MAPE: {mape_te:.2f}%

archivos:
   - modelo: best_transformer.pth
   - loss:   transformer_loss.png
   - pred:   transformer_predictions.png
"""
print(resumen)

print("=" * 60)
print("‚úÖ NOTEBOOK 17 COMPLETADO")
print("=" * 60)

CONFIGURACI√ìN
device: cpu
lookback: 60, horizon: 7
train: 1460 d√≠as, test: 365 d√≠as

CARGANDO DATOS
producto: FOODS_3_090_CA_3_validation
registros: 486
rango: 2011-02-12 ‚Üí 2012-06-11


ValueError: Se necesitan 1825 d√≠as, pero solo hay 486