In [10]:
# ============================================================================
# PREDICCIÓN AVANZADA NOCTURNA - MÚLTIPLES MODELOS Y SERIES DE TIEMPO
# ============================================================================
"""
Proyecto: Predicción de Ventas - Competencia Kaggle (Versión Nocturna Avanzada)
Objetivo: Predecir ventas para febrero 2020 (mes +2)
Modelos: RF, XGB, LGB, CatBoost, GradientBoosting, ExtraTrees, SVR,
         ARIMA, SARIMA, Prophet, AutoArima, Ensemble Avanzado
"""

# ============================================================================
# 1. IMPORTS Y CONFIGURACIÓN EXTENDIDA
# ============================================================================
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Machine Learning Básico
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor, 
                             ExtraTreesRegressor, VotingRegressor)
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import (train_test_split, TimeSeriesSplit, 
                                   cross_val_score, GridSearchCV, RandomizedSearchCV)
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression

# Modelos avanzados
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

# Series de tiempo
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller, kpss

# Prophet para series de tiempo
try:
    from prophet import Prophet
    PROPHET_AVAILABLE = True
except ImportError:
    print("⚠️ Prophet no disponible, se omitirá")
    PROPHET_AVAILABLE = False

# AutoML para series de tiempo
try:
    from pmdarima import auto_arima
    AUTO_ARIMA_AVAILABLE = True
except ImportError:
    print("⚠️ Auto-ARIMA no disponible, se omitirá")
    AUTO_ARIMA_AVAILABLE = False

# Utilidades
from tqdm import tqdm
import pickle
import os
import joblib
from scipy import stats
from scipy.optimize import minimize
import itertools
from multiprocessing import Pool, cpu_count

# Hyperparameter optimization
import optuna
from sklearn.model_selection import ParameterGrid

print("📦 Librerías cargadas exitosamente!")

# ============================================================================
# 2. CONFIGURACIÓN GLOBAL EXTENDIDA
# ============================================================================

# Configuración
RANDOM_STATE = 42
TARGET_DATE = '2020-02-01'
VALIDATION_MONTHS = 2
OUTPUT_DIR = 'kaggle_predictions_advanced'
N_JOBS = -1  # Usar todos los cores disponibles

# Crear directorio de salida
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Configuración de modelos
MODEL_CONFIG = {
    'use_hyperparameter_tuning': True,
    'use_ensemble_stacking': True,
    'use_time_series_models': True,
    'use_prophet': PROPHET_AVAILABLE,
    'use_auto_arima': AUTO_ARIMA_AVAILABLE,
    'cross_validation_folds': 3,
    'optuna_trials': 100
}

# Configuración visual
plt.style.use('default')
sns.set_palette("husl")

print(f"🖥️ Configuración del sistema:")
print(f"   CPUs disponibles: {cpu_count()}")
print(f"   Directorio de salida: {OUTPUT_DIR}")
print(f"   Optimización de hiperparámetros: {MODEL_CONFIG['use_hyperparameter_tuning']}")
print(f"   Ensemble stacking: {MODEL_CONFIG['use_ensemble_stacking']}")
print(f"   Modelos de series de tiempo: {MODEL_CONFIG['use_time_series_models']}")

# ============================================================================
# 3. FUNCIONES DE CARGA Y PREPARACIÓN (MEJORADAS)
# ============================================================================

def load_and_prepare_data():
    """Carga y prepara todos los datasets con validaciones mejoradas"""
    print("🔄 Cargando datasets...")
    
    try:
        # Cargar datasets con validaciones
        sales = pd.read_csv("../datasets/sell-in.txt", sep="\t", dtype={"periodo": str})
        stocks = pd.read_csv("../datasets/tb_stocks.txt", sep="\t", dtype={"periodo": str}) 
        product_info = pd.read_csv("../datasets/tb_productos.txt", sep="\t")
        products_to_predict = pd.read_csv('../datasets/product_id_apredecir201912.txt')
        
        # Validaciones básicas
        assert not sales.empty, "Sales dataset está vacío"
        assert not products_to_predict.empty, "Products to predict está vacío"
        
        # Convertir periodos con validación
        sales['periodo'] = pd.to_datetime(sales['periodo'], format='%Y%m', errors='coerce')
        stocks['periodo'] = pd.to_datetime(stocks['periodo'], format='%Y%m', errors='coerce')
        
        # Eliminar fechas inválidas
        sales = sales.dropna(subset=['periodo'])
        stocks = stocks.dropna(subset=['periodo'])
        
        print(f"✅ Sales: {sales.shape[0]:,} filas, {sales.shape[1]} columnas")
        print(f"✅ Stocks: {stocks.shape[0]:,} filas, {stocks.shape[1]} columnas") 
        print(f"✅ Products: {product_info.shape[0]:,} productos")
        print(f"✅ Products to predict: {len(products_to_predict):,} productos")
        print(f"📅 Rango de fechas: {sales['periodo'].min()} a {sales['periodo'].max()}")
        
        return sales, stocks, product_info, products_to_predict
        
    except Exception as e:
        print(f"❌ Error cargando datos: {e}")
        return None, None, None, None

def load_indec_data():
    """Carga y procesa datos del INDEC con limpieza mejorada"""
    print("🇦🇷 Cargando datos del IPC INDEC...")
    
    try:
        INDEC = pd.read_csv('../datasets/serie_ipc_aperturas.csv', sep=';', encoding='latin-1')
        INDEC['periodo'] = INDEC['periodo'].astype(str)
        
        INDEC_filtered = INDEC[
            (INDEC['periodo'] >= '201701') & 
            (INDEC['periodo'] <= '202012') & 
            (INDEC['Descripcion_aperturas'] == 'Nivel general')
        ].copy()
        
        def clean_and_convert(value):
            if isinstance(value, str):
                try:
                    # Limpiar múltiples formatos posibles
                    cleaned = value.replace(',', '.').replace(' ', '')
                    # Extraer números
                    import re
                    numbers = re.findall(r'-?\d+\.?\d*', cleaned)
                    if numbers:
                        return float(numbers[0])
                    return np.nan
                except:
                    return np.nan
            return float(value) if pd.notna(value) else np.nan
        
        INDEC_filtered['v_m_IPC'] = INDEC_filtered['v_m_IPC'].apply(clean_and_convert)
        INDEC_filtered = INDEC_filtered.dropna(subset=['v_m_IPC'])
        INDEC_processed = INDEC_filtered.groupby('periodo')['v_m_IPC'].mean().reset_index()
        INDEC_processed['periodo'] = pd.to_datetime(INDEC_processed['periodo'], format='%Y%m')
        
        # Suavizado de valores extremos
        q1, q3 = INDEC_processed['v_m_IPC'].quantile([0.25, 0.75])
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        
        # Cap outliers
        INDEC_processed['v_m_IPC'] = np.clip(INDEC_processed['v_m_IPC'], lower_bound, upper_bound)
        
        print(f"✅ IPC INDEC procesado: {len(INDEC_processed)} períodos")
        print(f"📊 Rango IPC: {INDEC_processed['v_m_IPC'].min():.2f} a {INDEC_processed['v_m_IPC'].max():.2f}")
        
        return INDEC_processed
        
    except Exception as e:
        print(f"❌ Error procesando INDEC: {e}")
        return None

def create_advanced_features_v2(sales, stocks, product_info, indec_data):
    """Versión mejorada de feature engineering con más features"""
    print("🔧 Creando features avanzadas v2...")
    
    # Merge inicial con validaciones
    data = sales.copy()
    initial_shape = data.shape[0]
    
    # Agregar información de productos
    if product_info is not None:
        data = data.merge(product_info, on='product_id', how='left')
        print(f"   → Después de merge productos: {data.shape[0]} filas")
    
    # Agregar stocks
    if stocks is not None:
        data = data.merge(stocks, on=['periodo', 'product_id'], how='left')
        print(f"   → Después de merge stocks: {data.shape[0]} filas")
    
    # Agregar IPC
    if indec_data is not None:
        data = data.merge(indec_data, on='periodo', how='left')
        print(f"   → Después de merge IPC: {data.shape[0]} filas")
    
    # FEATURE ENGINEERING COMPLETO Y MEJORADO
    print("📊 Aplicando feature engineering avanzado...")
    
    # 1. Features temporales extendidas
    data['year'] = data['periodo'].dt.year
    data['month'] = data['periodo'].dt.month
    data['quarter'] = data['periodo'].dt.quarter
    data['day_of_year'] = data['periodo'].dt.dayofyear
    data['week_of_year'] = data['periodo'].dt.isocalendar().week
    data['is_weekend'] = data['periodo'].dt.dayofweek.isin([5, 6]).astype(int)
    data['is_month_start'] = data['periodo'].dt.is_month_start.astype(int)
    data['is_month_end'] = data['periodo'].dt.is_month_end.astype(int)
    data['is_quarter_start'] = data['periodo'].dt.is_quarter_start.astype(int)
    data['is_quarter_end'] = data['periodo'].dt.is_quarter_end.astype(int)
    data['days_in_month'] = data['periodo'].dt.days_in_month
    
    # 2. Features de estacionalidad múltiples
    for period in [3, 4, 6, 12]:
        data[f'sin_month_{period}'] = np.sin(2 * np.pi * data['month'] / period)
        data[f'cos_month_{period}'] = np.cos(2 * np.pi * data['month'] / period)
    
    # Features cíclicas para quarter
    data['sin_quarter'] = np.sin(2 * np.pi * data['quarter'] / 4)
    data['cos_quarter'] = np.cos(2 * np.pi * data['quarter'] / 4)
    
    # 3. Lags extendidos con múltiples targets
    print("🔄 Creando lags extendidos...")
    lag_periods = [1, 2, 3, 4, 5, 6, 9, 12, 15, 18, 24, 36]
    
    for lag in tqdm(lag_periods, desc="Creando lags"):
        data[f'sales_lag_{lag}'] = data.groupby(['product_id', 'customer_id'])['tn'].shift(lag)
        
    # Lags por producto solamente (agregados)
    for lag in [1, 3, 6, 12]:
        data[f'product_sales_lag_{lag}'] = data.groupby('product_id')['tn'].shift(lag)

    # 4. Rolling windows extendidos
    print("🔄 Creando rolling features extendidos...")
    windows = [2, 3, 4, 6, 9, 12, 18, 24]
    operations = ['mean', 'std', 'min', 'max', 'median', 'skew']
    
    for window in tqdm(windows, desc="Rolling windows"):
        # Por producto-cliente
        rolling_group = data.groupby(['product_id', 'customer_id'])['tn']
        data[f'sales_rolling_mean_{window}'] = rolling_group.transform(lambda x: x.rolling(window, min_periods=1).mean())
        data[f'sales_rolling_std_{window}'] = rolling_group.transform(lambda x: x.rolling(window, min_periods=1).std())
        data[f'sales_rolling_min_{window}'] = rolling_group.transform(lambda x: x.rolling(window, min_periods=1).min())
        data[f'sales_rolling_max_{window}'] = rolling_group.transform(lambda x: x.rolling(window, min_periods=1).max())
        data[f'sales_rolling_median_{window}'] = rolling_group.transform(lambda x: x.rolling(window, min_periods=1).median())
        
        # EWMA con diferentes alphas
        for alpha in [0.1, 0.3, 0.5, 0.7, 0.9]:
            data[f'sales_ewma_{window}_alpha_{str(alpha).replace(".", "")}'] = rolling_group.transform(
                lambda x: x.ewm(alpha=alpha, min_periods=1).mean()
            )

    # 5. Features de tendencia y momentum
    print("📈 Creando features de tendencia...")
    
    # Diferencias y cambios porcentuales
    for lag in [1, 3, 6, 12]:
        data[f'sales_diff_{lag}'] = data.groupby(['product_id', 'customer_id'])['tn'].diff(periods=lag)
        data[f'sales_pct_change_{lag}'] = data.groupby(['product_id', 'customer_id'])['tn'].pct_change(periods=lag)
    
    # Momentum indicators
    for short, long in [(3, 6), (6, 12), (12, 24)]:
        data[f'momentum_{short}_{long}'] = (
            data[f'sales_rolling_mean_{short}'] - data[f'sales_rolling_mean_{long}']
        )
        
    # Acceleration (second derivative)
    data['sales_acceleration'] = data.groupby(['product_id', 'customer_id'])['tn'].diff().diff()
    
    # 6. Features estadísticas avanzadas
    print("📊 Creando features estadísticas...")
    
    # Ratios y volatilidad
    for window in [3, 6, 12]:
        data[f'cv_{window}'] = data[f'sales_rolling_std_{window}'] / (data[f'sales_rolling_mean_{window}'] + 1e-8)
        data[f'zscore_{window}'] = (data['tn'] - data[f'sales_rolling_mean_{window}']) / (data[f'sales_rolling_std_{window}'] + 1e-8)
        data[f'range_ratio_{window}'] = (data[f'sales_rolling_max_{window}'] - data[f'sales_rolling_min_{window}']) / (data[f'sales_rolling_mean_{window}'] + 1e-8)
    
    # 7. Agregaciones categóricas extendidas
    print("🏷️ Creando agregaciones categóricas...")
    
    categorical_cols = ['product_id', 'customer_id']
    if 'brand' in data.columns:
        categorical_cols.extend(['brand', 'cat1', 'cat2', 'cat3'])
    
    aggregations = ['mean', 'std', 'median', 'min', 'max', 'count', 'sum']
    
    for cat in tqdm(categorical_cols, desc="Agregaciones categóricas"):
        if cat in data.columns:
            grouped = data.groupby(cat)['tn']
            for agg in aggregations:
                try:
                    data[f'{cat}_{agg}'] = grouped.transform(agg)
                except:
                    continue
    
    # 8. Features de interacción avanzadas
    print("🔗 Creando features de interacción...")
    
    # Interacciones temporales
    data['sales_month_interaction'] = data['tn'] * data['month']
    data['sales_quarter_interaction'] = data['tn'] * data['quarter']
    data['sales_year_interaction'] = data['tn'] * (data['year'] - data['year'].min())
    
    # Interacciones con lags
    for lag in [1, 3, 6, 12]:
        data[f'sales_lag_ratio_{lag}'] = data['tn'] / (data[f'sales_lag_{lag}'] + 1e-8)
        data[f'sales_lag_diff_{lag}'] = data['tn'] - data[f'sales_lag_{lag}']
    
    # 9. Features de stock mejoradas
    if 'stock_final' in data.columns:
        print("📦 Creando features de stock avanzadas...")
        
        data['stock_turnover'] = data['tn'] / (data['stock_final'] + 1e-8)
        data['days_of_stock'] = data['stock_final'] / (data['tn'] + 1e-8) * 30
        data['stock_ratio'] = data['stock_final'] / (data['stock_final'].mean() + 1e-8)
        
        # Lags de stock
        for lag in [1, 3, 6]:
            data[f'stock_lag_{lag}'] = data.groupby(['product_id', 'customer_id'])['stock_final'].shift(lag)
            data[f'stock_change_{lag}'] = data['stock_final'] - data[f'stock_lag_{lag}']
    
    # 10. Features de IPC avanzadas
    if 'v_m_IPC' in data.columns:
        print("💰 Creando features de IPC avanzadas...")
        
        # Lags de IPC
        for lag in [1, 2, 3, 6, 12]:
            data[f'ipc_lag_{lag}'] = data['v_m_IPC'].shift(lag)
        
        # Rolling IPC
        for window in [3, 6, 12]:
            data[f'ipc_rolling_{window}'] = data['v_m_IPC'].rolling(window, min_periods=1).mean()
            data[f'ipc_std_{window}'] = data['v_m_IPC'].rolling(window, min_periods=1).std()
        
        # Interacciones IPC-ventas
        data['sales_ipc_interaction'] = data['tn'] * data['v_m_IPC']
        data['sales_ipc_ratio'] = data['tn'] / (data['v_m_IPC'] + 1e-8)
        
        # Cambios en IPC
        data['ipc_change'] = data['v_m_IPC'].diff()
        data['ipc_acceleration'] = data['ipc_change'].diff()
    
    # 11. Features de ranking y percentiles
    print("🏆 Creando features de ranking...")
    
    # Rankings por período
    data['product_rank_period'] = data.groupby('periodo')['tn'].rank(pct=True)
    data['customer_rank_period'] = data.groupby(['periodo', 'customer_id'])['tn'].rank(pct=True)
    
    # Rankings históricos
    data['product_rank_historical'] = data.groupby('product_id')['tn'].rank(pct=True)
    data['customer_rank_historical'] = data.groupby('customer_id')['tn'].rank(pct=True)
    
    # 12. Features de frecuencia y consistencia
    print("🔄 Creando features de frecuencia...")
    
    # Conteo de períodos activos
    data['periods_active'] = data.groupby(['product_id', 'customer_id']).cumcount() + 1
    data['total_periods'] = data.groupby(['product_id', 'customer_id'])['periodo'].transform('count')
    data['activity_ratio'] = data['periods_active'] / data['total_periods']
    
    # Consistencia de ventas
    for window in [6, 12]:
        # Porcentaje de períodos con ventas > 0
        rolling_group = data.groupby(['product_id', 'customer_id'])['tn']
        data[f'consistency_{window}'] = rolling_group.transform(
            lambda x: (x.rolling(window, min_periods=1) > 0).mean()
        )
    
    # 13. Features avanzadas de detección de patrones
    print("🔍 Creando features de patrones...")
    
    # Detectar picos y valles
    for window in [3, 6]:
        rolling_mean = data[f'sales_rolling_mean_{window}']
        rolling_std = data[f'sales_rolling_std_{window}']
        data[f'is_peak_{window}'] = (data['tn'] > rolling_mean + 2 * rolling_std).astype(int)
        data[f'is_valley_{window}'] = (data['tn'] < rolling_mean - rolling_std).astype(int)
    
    # Detectar estacionalidad
    data['seasonal_strength'] = np.abs(data['sin_month_12'])
    
    # 14. Features de target encoding
    print("🎯 Creando target encoding...")
    
    # Target encoding con validación cruzada para evitar overfitting
    for cat_col in ['cat1', 'cat2', 'cat3', 'brand']:
        if cat_col in data.columns:
            # Media global como fallback
            global_mean = data['tn'].mean()
            
            # Target encoding suavizado
            cat_stats = data.groupby(cat_col)['tn'].agg(['mean', 'count']).reset_index()
            cat_stats['smoothed_mean'] = (cat_stats['mean'] * cat_stats['count'] + global_mean * 10) / (cat_stats['count'] + 10)
            
            # Merge back
            data = data.merge(cat_stats[['cat1' if cat_col == 'cat1' else cat_col, 'smoothed_mean']], 
                            on=cat_col, how='left', suffixes=('', f'_{cat_col}_target'))
            data = data.rename(columns={'smoothed_mean': f'{cat_col}_target_encoded'})
    
    # 15. Limpieza final y validaciones
    print("🧹 Limpieza final...")
    
    # Reemplazar infinitos y valores extremos
    data = data.replace([np.inf, -np.inf], np.nan)
    
    # Imputación inteligente
    numeric_cols = data.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if col != 'tn':  # No imputar el target
            if data[col].isna().sum() > 0:
                # Usar mediana para imputación robusta
                data[col] = data[col].fillna(data[col].median())
    
    # Llenar categóricas
    categorical_cols = data.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        data[col] = data[col].fillna('unknown')
    
    print(f"✅ Features v2 creadas. Shape final: {data.shape}")
    print(f"📊 Features numéricas: {len(data.select_dtypes(include=[np.number]).columns)}")
    print(f"📊 Features categóricas: {len(data.select_dtypes(include=['object']).columns)}")
    
    return data

# ============================================================================
# 4. MODELOS DE SERIES DE TIEMPO
# ============================================================================

class TimeSeriesModels:
    """Clase para manejar modelos específicos de series de tiempo"""
    
    def __init__(self, random_state=42):
        self.random_state = random_state
        self.models = {}
        self.fitted_models = {}
    
    def prepare_time_series_data(self, data, product_id):
        """Prepara datos para series de tiempo de un producto específico"""
        product_data = data[data['product_id'] == product_id].copy()
        product_data = product_data.sort_values('periodo')
        
        # Crear series temporal completa (rellenar períodos faltantes)
        date_range = pd.date_range(
            start=product_data['periodo'].min(),
            end=product_data['periodo'].max(),
            freq='M'
        )
        
        # Reindexar y rellenar
        product_data = product_data.set_index('periodo')
        product_data = product_data.reindex(date_range, fill_value=0)
        product_data.index.name = 'periodo'
        product_data = product_data.reset_index()
        
        return product_data
    
    def fit_arima(self, ts_data, order=(1,1,1)):
        """Ajusta modelo ARIMA"""
        try:
            if len(ts_data) < 10:
                return None, None
                
            # Eliminar ceros iniciales si existen
            first_nonzero = np.argmax(ts_data > 0)
            if first_nonzero > 0:
                ts_data = ts_data[first_nonzero:]
            
            if len(ts_data) < 5:
                return None, None
            
            model = ARIMA(ts_data, order=order)
            fitted_model = model.fit()
            return fitted_model, None
            
        except Exception as e:
            return None, str(e)
    
    def fit_sarima(self, ts_data, order=(1,1,1), seasonal_order=(0,1,1,12)):
        """Ajusta modelo SARIMA"""
        try:
            if len(ts_data) < 24:  # Necesita al menos 2 años para estacionalidad
                return None, "Insufficient data for SARIMA"
                
            model = SARIMAX(ts_data, order=order, seasonal_order=seasonal_order)
            fitted_model = model.fit(disp=False)
            return fitted_model, None
            
        except Exception as e:
            return None, str(e)
    
    def fit_prophet(self, data, product_id):
        """Ajusta modelo Prophet"""
        if not PROPHET_AVAILABLE:
            return None, "Prophet not available"
            
        try:
            product_data = self.prepare_time_series_data(data, product_id)
            
            if len(product_data) < 10:
                return None, "Insufficient data"
            
            # Preparar datos para Prophet
            prophet_data = pd.DataFrame({
                'ds': product_data['periodo'],
                'y': product_data['tn']
            })
            
            # Configurar Prophet
            model = Prophet(
                yearly_seasonality=True,
                weekly_seasonality=False,
                daily_seasonality=False,
                seasonality_mode='multiplicative',
                changepoint_prior_scale=0.05
            )
            
            # Agregar regresores externos si están disponibles
            if 'v_m_IPC' in product_data.columns:
                model.add_regressor('ipc')
                prophet_data['ipc'] = product_data['v_m_IPC'].fillna(method='ffill')
            
            model.fit(prophet_data)
            return model, None
            
        except Exception as e:
            return None, str(e)
    
    def fit_auto_arima(self, ts_data):
        """Ajusta Auto-ARIMA"""
        if not AUTO_ARIMA_AVAILABLE:
            return None, "Auto-ARIMA not available"
            
        try:
            if len(ts_data) < 10:
                return None, "Insufficient data"
            
            model = auto_arima(
                ts_data,
                start_p=0, start_q=0,
                max_p=3, max_q=3,
                seasonal=True,
                start_P=0, start_Q=0,
                max_P=2, max_Q=2,
                m=12,
                stepwise=True,
                suppress_warnings=True,
                error_action='ignore'
            )
            
            return model, None
            
        except Exception as e:
            return None, str(e)
    
    def predict_time_series(self, data, products_list, target_date='2020-02-01'):
        """Genera predicciones usando modelos de series de tiempo"""
        print("⏰ Generando predicciones con modelos de series de tiempo...")
        
        predictions = {}
        errors = {}
        
        for product_id in tqdm(products_list, desc="Time Series Predictions"):
            try:
                product_data = self.prepare_time_series_data(data, product_id)
                product_sales = data[data['product_id'] == product_id]['tn'].values
                
                if len(product_sales) == 0:
                    predictions[product_id] = {'arima': 0, 'sarima': 0, 'prophet': 0, 'auto_arima': 0}
                    continue
                
                pred_dict = {}
                
                # ARIMA
                arima_model, arima_error = self.fit_arima(product_sales)
                if arima_model is not None:
                    try:
                        forecast = arima_model.forecast(steps=1)
                        pred_dict['arima'] = max(0, forecast[0])
                    except:
                        pred_dict['arima'] = product_sales[-1] if len(product_sales) > 0 else 0
                else:
                    pred_dict['arima'] = product_sales[-1] if len(product_sales) > 0 else 0
                
                # SARIMA
                sarima_model, sarima_error = self.fit_sarima(product_sales)
                if sarima_model is not None:
                    try:
                        forecast = sarima_model.forecast(steps=1)
                        pred_dict['sarima'] = max(0, forecast[0])
                    except:
                        pred_dict['sarima'] = product_sales[-1] if len(product_sales) > 0 else 0
                else:
                    pred_dict['sarima'] = product_sales[-1] if len(product_sales) > 0 else 0
                
                # Prophet
                prophet_model, prophet_error = self.fit_prophet(data, product_id)
                if prophet_model is not None:
                    try:
                        future = prophet_model.make_future_dataframe(periods=1, freq='M')
                        if 'ipc' in future.columns:
                            future['ipc'] = future['ipc'].fillna(method='ffill')
                        forecast = prophet_model.predict(future)
                        pred_dict['prophet'] = max(0, forecast['yhat'].iloc[-1])
                    except:
                        pred_dict['prophet'] = product_sales[-1] if len(product_sales) > 0 else 0
                else:
                    pred_dict['prophet'] = product_sales[-1] if len(product_sales) > 0 else 0
                
                # Auto-ARIMA
                auto_arima_model, auto_arima_error = self.fit_auto_arima(product_sales)
                if auto_arima_model is not None:
                    try:
                        forecast = auto_arima_model.predict(n_periods=1)
                        pred_dict['auto_arima'] = max(0, forecast[0])
                    except:
                        pred_dict['auto_arima'] = product_sales[-1] if len(product_sales) > 0 else 0
                else:
                    pred_dict['auto_arima'] = product_sales[-1] if len(product_sales) > 0 else 0
                
                predictions[product_id] = pred_dict
                
            except Exception as e:
                errors[product_id] = str(e)
                predictions[product_id] = {'arima': 0, 'sarima': 0, 'prophet': 0, 'auto_arima': 0}
        
        return predictions, errors

# ============================================================================
# 5. ENSEMBLE AVANZADO Y STACKING
# ============================================================================

class AdvancedEnsemble:
    """Ensemble avanzado con stacking multinivel"""
    
    def __init__(self, random_state=42):
        self.random_state = random_state
        self.level1_models = {}
        self.level2_models = {}
        self.stacking_model = None
        self.model_weights = {}
        
    def create_level1_models(self):
        """Crea modelos de primer nivel"""
        models = {
            'rf': RandomForestRegressor(
                n_estimators=200,
                max_depth=15,
                min_samples_split=10,
                min_samples_leaf=5,
                random_state=self.random_state,
                n_jobs=-1
            ),
            'xgb': xgb.XGBRegressor(
                n_estimators=300,
                max_depth=8,
                learning_rate=0.05,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=self.random_state,
                n_jobs=-1
            ),
            'lgb': lgb.LGBMRegressor(
                n_estimators=300,
                max_depth=8,
                learning_rate=0.05,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=self.random_state,
                n_jobs=-1,
                verbose=-1
            ),
            'catboost': CatBoostRegressor(
                iterations=300,
                depth=8,
                learning_rate=0.05,
                subsample=0.8,
                random_state=self.random_state,
                verbose=False
            ),
            'gb': GradientBoostingRegressor(
                n_estimators=200,
                max_depth=8,
                learning_rate=0.05,
                subsample=0.8,
                random_state=self.random_state
            ),
            'extra_trees': ExtraTreesRegressor(
                n_estimators=200,
                max_depth=15,
                min_samples_split=10,
                min_samples_leaf=5,
                random_state=self.random_state,
                n_jobs=-1
            ),
            'ridge': Ridge(alpha=1.0),
            'lasso': Lasso(alpha=0.1, random_state=self.random_state),
            'elastic': ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=self.random_state)
        }
        
        self.level1_models = models
        return models
    
    def create_stacking_model(self):
        """Crea modelo de stacking de segundo nivel"""
        stacking_models = [
            ('ridge', Ridge(alpha=0.1)),
            ('lasso', Lasso(alpha=0.01, random_state=self.random_state)),
            ('xgb_stack', xgb.XGBRegressor(
                n_estimators=100,
                max_depth=4,
                learning_rate=0.1,
                random_state=self.random_state,
                n_jobs=-1
            ))
        ]
        
        self.stacking_model = VotingRegressor(
            estimators=stacking_models,
            n_jobs=-1
        )
        
        return self.stacking_model
    
    def fit_ensemble(self, X_train, y_train, X_val, y_val):
        """Entrena ensemble con validación cruzada"""
        print("🎯 Entrenando ensemble avanzado...")
        
        # Crear modelos
        self.create_level1_models()
        
        # Entrenar modelos de nivel 1
        level1_predictions_train = np.zeros((len(X_train), len(self.level1_models)))
        level1_predictions_val = np.zeros((len(X_val), len(self.level1_models)))
        
        model_scores = {}
        
        for i, (name, model) in enumerate(tqdm(self.level1_models.items(), desc="Training Level 1")):
            try:
                # Entrenar modelo
                model.fit(X_train, y_train)
                
                # Predicciones
                pred_train = model.predict(X_train)
                pred_val = model.predict(X_val)
                
                level1_predictions_train[:, i] = pred_train
                level1_predictions_val[:, i] = pred_val
                
                # Evaluar
                val_mae = mean_absolute_error(y_val, pred_val)
                val_rmse = np.sqrt(mean_squared_error(y_val, pred_val))
                val_r2 = r2_score(y_val, pred_val)
                
                model_scores[name] = {
                    'mae': val_mae,
                    'rmse': val_rmse,
                    'r2': val_r2
                }
                
                print(f"   ✅ {name}: MAE={val_mae:.3f}, RMSE={val_rmse:.3f}, R2={val_r2:.3f}")
                
            except Exception as e:
                print(f"   ❌ Error en {name}: {e}")
                level1_predictions_train[:, i] = y_train.mean()
                level1_predictions_val[:, i] = y_train.mean()
        
        # Entrenar modelo de stacking
        self.create_stacking_model()
        self.stacking_model.fit(level1_predictions_train, y_train)
        
        # Predicción final del ensemble
        final_pred = self.stacking_model.predict(level1_predictions_val)
        
        # Evaluar ensemble
        ensemble_mae = mean_absolute_error(y_val, final_pred)
        ensemble_rmse = np.sqrt(mean_squared_error(y_val, final_pred))
        ensemble_r2 = r2_score(y_val, final_pred)
        
        print(f"🏆 ENSEMBLE FINAL: MAE={ensemble_mae:.3f}, RMSE={ensemble_rmse:.3f}, R2={ensemble_r2:.3f}")
        
        return model_scores, {
            'mae': ensemble_mae,
            'rmse': ensemble_rmse,
            'r2': ensemble_r2
        }
    
    def predict_ensemble(self, X_test):
        """Genera predicciones del ensemble"""
        level1_predictions = np.zeros((len(X_test), len(self.level1_models)))
        
        for i, (name, model) in enumerate(self.level1_models.items()):
            try:
                level1_predictions[:, i] = model.predict(X_test)
            except:
                level1_predictions[:, i] = 0
        
        return self.stacking_model.predict(level1_predictions)

# ============================================================================
# 6. OPTIMIZACIÓN DE HIPERPARÁMETROS CON OPTUNA
# ============================================================================

def optimize_hyperparameters(X_train, y_train, X_val, y_val, model_type='xgb', n_trials=100):
    """Optimiza hiperparámetros usando Optuna"""
    print(f"🔧 Optimizando {model_type} con Optuna...")
    
    def objective(trial):
        if model_type == 'xgb':
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 100, 500),
                'max_depth': trial.suggest_int('max_depth', 3, 12),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
                'subsample': trial.suggest_float('subsample', 0.6, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
                'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
                'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
                'random_state': RANDOM_STATE,
                'n_jobs': -1
            }
            model = xgb.XGBRegressor(**params)
            
        elif model_type == 'lgb':
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 100, 500),
                'max_depth': trial.suggest_int('max_depth', 3, 12),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
                'subsample': trial.suggest_float('subsample', 0.6, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
                'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
                'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
                'random_state': RANDOM_STATE,
                'n_jobs': -1,
                'verbose': -1
            }
            model = lgb.LGBMRegressor(**params)
            
        elif model_type == 'catboost':
            params = {
                'iterations': trial.suggest_int('iterations', 100, 500),
                'depth': trial.suggest_int('depth', 3, 12),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
                'subsample': trial.suggest_float('subsample', 0.6, 1.0),
                'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
                'random_state': RANDOM_STATE,
                'verbose': False
            }
            model = CatBoostRegressor(**params)
        
        elif model_type == 'rf':
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 100, 300),
                'max_depth': trial.suggest_int('max_depth', 5, 20),
                'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
                'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
                'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
                'random_state': RANDOM_STATE,
                'n_jobs': -1
            }
            model = RandomForestRegressor(**params)
        
        else:
            raise ValueError(f"Modelo {model_type} no soportado")
        
        # Entrenar y evaluar
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        
        return rmse
    
    # Crear estudio
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
    
    print(f"   🏆 Mejor RMSE: {study.best_value:.4f}")
    print(f"   📊 Mejores parámetros: {study.best_params}")
    
    return study.best_params

# ============================================================================
# 7. PIPELINE PRINCIPAL EXTENDIDO
# ============================================================================

def run_advanced_nocturnal_pipeline():
    """Pipeline principal con todos los modelos y optimizaciones"""
    print("🚀 INICIANDO PIPELINE AVANZADO NOCTURNO")
    print("=" * 60)
    
    # 1. Carga de datos
    print("\n1️⃣ CARGA DE DATOS")
    sales, stocks, product_info, products_to_predict = load_and_prepare_data()
    if sales is None:
        print("❌ Error cargando datos principales")
        return
    
    indec_data = load_indec_data()
    
    # 2. Feature Engineering Avanzado
    print("\n2️⃣ FEATURE ENGINEERING AVANZADO")
    data = create_advanced_features_v2(sales, stocks, product_info, indec_data)
    
    # 3. Agregación y preparación
    print("\n3️⃣ AGREGACIÓN DE DATOS")
    print("📊 Agregando datos por producto-cliente...")
    
    # Filtrar datos hasta diciembre 2019
    data_filtered = data[data['periodo'] <= '2019-12-01'].copy()
    
    # Agregación mensual
    agg_data = data_filtered.groupby(['product_id', 'customer_id', 'periodo']).agg({
        'tn': 'sum',
        **{col: 'first' for col in data_filtered.columns if col not in ['tn', 'product_id', 'customer_id', 'periodo']}
    }).reset_index()
    
    print(f"✅ Datos agregados: {agg_data.shape}")
    
    # 4. Splits temporales
    print("\n4️⃣ CREACIÓN DE SPLITS TEMPORALES")
    cutoff_date = pd.to_datetime('2019-10-01')
    
    train_data = agg_data[agg_data['periodo'] <= cutoff_date]
    val_data = agg_data[agg_data['periodo'] > cutoff_date]
    
    print(f"📊 Training: {len(train_data):,} registros (hasta {cutoff_date.strftime('%Y-%m')})")
    print(f"📊 Validation: {len(val_data):,} registros (desde {cutoff_date.strftime('%Y-%m')})")
    
    # Preparar features
    feature_cols = [col for col in agg_data.columns if col not in ['tn', 'periodo', 'product_id', 'customer_id']]
    
    X_train = train_data[feature_cols]
    y_train = train_data['tn']
    X_val = val_data[feature_cols]
    y_val = val_data['tn']
    
    # Handling missing values y encoding
    print("🔧 Procesando features...")
    
    # Separar numéricas y categóricas
    numeric_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    
    # Preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', Pipeline([
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', RobustScaler())
            ]), numeric_features),
            ('cat', Pipeline([
                ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
                ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
            ]), categorical_features)
        ])
    
    # Ajustar preprocessor
    X_train_processed = preprocessor.fit_transform(X_train)
    X_val_processed = preprocessor.transform(X_val)
    
    print(f"✅ Features procesadas: {X_train_processed.shape[1]} dimensiones")
    
    # 5. Modelos de Machine Learning
    print("\n5️⃣ EVALUACIÓN DE MODELOS DE ML")
    
    models_to_test = {
        'RandomForest': RandomForestRegressor(
            n_estimators=200, max_depth=15, min_samples_split=10,
            min_samples_leaf=5, random_state=RANDOM_STATE, n_jobs=-1
        ),
        'XGBoost': xgb.XGBRegressor(
            n_estimators=300, max_depth=8, learning_rate=0.05,
            subsample=0.8, colsample_bytree=0.8, random_state=RANDOM_STATE, n_jobs=-1
        ),
        'LightGBM': lgb.LGBMRegressor(
            n_estimators=300, max_depth=8, learning_rate=0.05,
            subsample=0.8, colsample_bytree=0.8, random_state=RANDOM_STATE, 
            n_jobs=-1, verbose=-1
        ),
        'CatBoost': CatBoostRegressor(
            iterations=300, depth=8, learning_rate=0.05,
            subsample=0.8, random_state=RANDOM_STATE, verbose=False
        ),
        'GradientBoosting': GradientBoostingRegressor(
            n_estimators=200, max_depth=8, learning_rate=0.05,
            subsample=0.8, random_state=RANDOM_STATE
        ),
        'ExtraTrees': ExtraTreesRegressor(
            n_estimators=200, max_depth=15, min_samples_split=10,
            min_samples_leaf=5, random_state=RANDOM_STATE, n_jobs=-1
        ),
        'Ridge': Ridge(alpha=1.0),
        'Lasso': Lasso(alpha=0.1, random_state=RANDOM_STATE),
        'ElasticNet': ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=RANDOM_STATE)
    }
    
    # Evaluar modelos
    ml_results = {}
    best_models = {}
    
    for name, model in tqdm(models_to_test.items(), desc="Evaluando modelos ML"):
        try:
            # Entrenar
            model.fit(X_train_processed, y_train)
            
            # Predecir
            y_pred = model.predict(X_val_processed)
            
            # Métricas
            mae = mean_absolute_error(y_val, y_pred)
            rmse = np.sqrt(mean_squared_error(y_val, y_pred))
            r2 = r2_score(y_val, y_pred)
            mape = mean_absolute_percentage_error(y_val, y_pred) * 100
            
            ml_results[name] = {
                'MAE': mae,
                'RMSE': rmse,
                'R2': r2,
                'MAPE': mape
            }
            
            best_models[name] = model
            
            print(f"✅ {name} - MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}, MAPE: {mape:.1f}%")
            
        except Exception as e:
            print(f"❌ Error en {name}: {e}")
    
    # 6. Optimización de hiperparámetros (para los mejores modelos)
    if MODEL_CONFIG['use_hyperparameter_tuning']:
        print("\n6️⃣ OPTIMIZACIÓN DE HIPERPARÁMETROS")
        
        # Seleccionar top 3 modelos
        top_models = sorted(ml_results.items(), key=lambda x: x[1]['RMSE'])[:3]
        optimized_models = {}
        
        for model_name, _ in top_models:
            if model_name.lower() in ['xgboost', 'lightgbm', 'catboost']:
                print(f"🔧 Optimizando {model_name}...")
                
                try:
                    best_params = optimize_hyperparameters(
                        X_train_processed, y_train, X_val_processed, y_val,
                        model_type=model_name.lower().replace('boost', 'b').replace('gradient', 'gb'),
                        n_trials=MODEL_CONFIG['optuna_trials']
                    )
                    
                    # Crear modelo optimizado
                    if 'xg' in model_name.lower():
                        optimized_model = xgb.XGBRegressor(**best_params)
                    elif 'light' in model_name.lower():
                        optimized_model = lgb.LGBMRegressor(**best_params)
                    elif 'cat' in model_name.lower():
                        optimized_model = CatBoostRegressor(**best_params)
                    
                    # Evaluar modelo optimizado
                    optimized_model.fit(X_train_processed, y_train)
                    y_pred_opt = optimized_model.predict(X_val_processed)
                    
                    mae_opt = mean_absolute_error(y_val, y_pred_opt)
                    rmse_opt = np.sqrt(mean_squared_error(y_val, y_pred_opt))
                    r2_opt = r2_score(y_val, y_pred_opt)
                    mape_opt = mean_absolute_percentage_error(y_val, y_pred_opt) * 100
                    
                    optimized_models[f"{model_name}_Optimized"] = {
                        'model': optimized_model,
                        'MAE': mae_opt,
                        'RMSE': rmse_opt,
                        'R2': r2_opt,
                        'MAPE': mape_opt
                    }
                    
                    print(f"✅ {model_name} Optimizado - MAE: {mae_opt:.2f}, RMSE: {rmse_opt:.2f}, R2: {r2_opt:.3f}")
                    
                except Exception as e:
                    print(f"❌ Error optimizando {model_name}: {e}")
    
    # 7. Modelos de Series de Tiempo
    if MODEL_CONFIG['use_time_series_models']:
        print("\n7️⃣ MODELOS DE SERIES DE TIEMPO")
        
        ts_models = TimeSeriesModels(random_state=RANDOM_STATE)
        products_list = products_to_predict.iloc[:, 0].tolist()[:50]  # Limitamos a 50 para prueba nocturna
        
        ts_predictions, ts_errors = ts_models.predict_time_series(
            data_filtered, products_list, TARGET_DATE
        )
        
        print(f"✅ Predicciones de series de tiempo completadas para {len(ts_predictions)} productos")
        if ts_errors:
            print(f"⚠️ Errores en {len(ts_errors)} productos")
    
    # 8. Ensemble Avanzado
    if MODEL_CONFIG['use_ensemble_stacking']:
        print("\n8️⃣ ENSEMBLE AVANZADO")
        
        ensemble = AdvancedEnsemble(random_state=RANDOM_STATE)
        model_scores, ensemble_score = ensemble.fit_ensemble(
            X_train_processed, y_train, X_val_processed, y_val
        )
        
        # Agregar ensemble a resultados
        ml_results['AdvancedEnsemble'] = ensemble_score
        best_models['AdvancedEnsemble'] = ensemble
    
    # 9. Selección del mejor modelo
    print("\n9️⃣ SELECCIÓN DEL MEJOR MODELO")
    
    # Combinar resultados normales y optimizados
    all_results = ml_results.copy()
    for name, result in optimized_models.items():
        all_results[name] = {
            'MAE': result['MAE'],
            'RMSE': result['RMSE'],
            'R2': result['R2'],
            'MAPE': result['MAPE']
        }
        best_models[name] = result['model']
    
    # Crear DataFrame de resultados
    results_df = pd.DataFrame(all_results).T
    results_df = results_df.sort_values('RMSE')
    
    print("\n📊 RESUMEN DE RESULTADOS")
    print("=" * 60)
    print(results_df.round(3))
    
    # Mejor modelo
    best_model_name = results_df.index[0]
    best_model = best_models[best_model_name]
    
    print(f"\n🏆 MEJOR MODELO: {best_model_name}")
    print(f"   RMSE: {results_df.loc[best_model_name, 'RMSE']:.3f}")
    print(f"   MAE: {results_df.loc[best_model_name, 'MAE']:.3f}")
    print(f"   R2: {results_df.loc[best_model_name, 'R2']:.3f}")
    
    # 10. Reentrenamiento con datos completos
    print("\n🔟 REENTRENAMIENTO CON DATOS COMPLETOS")
    print("🔄 Reentrenando el mejor modelo con todos los datos disponibles...")
    
    # Combinar train y validation
    X_full = np.vstack([X_train_processed, X_val_processed])
    y_full = np.concatenate([y_train, y_val])
    
    # Reentrenar
    best_model.fit(X_full, y_full)
    
    # 11. Predicciones finales
    print("\n1️⃣1️⃣ PREDICCIONES FINALES")
    print(f"🎯 Generando predicciones para {TARGET_DATE}...")
    
    # Preparar datos para predicción
    products_list = products_to_predict.iloc[:, 0].unique()
    predictions_final = []
    
    for product_id in tqdm(products_list, desc="Generando predicciones"):
        try:
            # Obtener último período del producto
            product_data = agg_data[agg_data['product_id'] == product_id]
            
            if len(product_data) == 0:
                # Usar predicción por defecto
                predictions_final.append({
                    'product_id': product_id,
                    'prediction': 0,
                    'method': 'default'
                })
                continue
            
            # Tomar el último registro disponible como base
            last_record = product_data.iloc[-1:][feature_cols]
            
            # Procesar features
            last_record_processed = preprocessor.transform(last_record)
            
            # Predecir
            if best_model_name == 'AdvancedEnsemble':
                pred = best_model.predict_ensemble(last_record_processed)[0]
            else:
                pred = best_model.predict(last_record_processed)[0]
            
            predictions_final.append({
                'product_id': product_id,
                'prediction': max(0, pred),  # No predicciones negativas
                'method': best_model_name
            })
            
        except Exception as e:
            predictions_final.append({
                'product_id': product_id,
                'prediction': 0,
                'method': 'error'
            })
    
    # Convertir a DataFrame
    predictions_df = pd.DataFrame(predictions_final)
    
    print(f"✅ Predicciones completadas:")
    print(f"   Total: {len(predictions_df)}")
    print(f"   Con modelo: {len(predictions_df[predictions_df['method'] != 'error'])}")
    print(f"   Errores: {len(predictions_df[predictions_df['method'] == 'error'])}")
    
    # 12. Guardar resultados
    print("\n1️⃣2️⃣ GUARDANDO RESULTADOS")
    
    # Crear submission
    # Crear submission
    submission = pd.DataFrame({
        'product_id': predictions_df['product_id'],
        'target_202002': predictions_df['prediction']
    })
    
    # Estadísticas de predicciones
    print(f"\n📈 ESTADÍSTICAS DE PREDICCIONES:")
    print(f"   Media: {submission['target_202002'].mean():.2f}")
    print(f"   Mediana: {submission['target_202002'].median():.2f}")
    print(f"   Std: {submission['target_202002'].std():.2f}")
    print(f"   Min: {submission['target_202002'].min():.2f}")
    print(f"   Max: {submission['target_202002'].max():.2f}")
    print(f"   Predicciones = 0: {(submission['target_202002'] == 0).sum()}")
    
    # Guardar submission
    submission_filename = f'submission_nocturnal_{best_model_name.lower()}_{pd.Timestamp.now().strftime("%Y%m%d_%H%M")}.csv'
    submission.to_csv(submission_filename, index=False)
    print(f"💾 Submission guardada: {submission_filename}")
    
    # Guardar resultados detallados
    results_filename = f'results_nocturnal_{pd.Timestamp.now().strftime("%Y%m%d_%H%M")}.csv'
    results_df.to_csv(results_filename)
    print(f"📊 Resultados detallados guardados: {results_filename}")
    
    # Guardar modelo
    if MODEL_CONFIG['save_model']:
        model_filename = f'best_model_nocturnal_{best_model_name.lower()}_{pd.Timestamp.now().strftime("%Y%m%d_%H%M")}.pkl'
        joblib.dump(best_model, model_filename)
        
        # Guardar también el preprocessor
        preprocessor_filename = f'preprocessor_nocturnal_{pd.Timestamp.now().strftime("%Y%m%d_%H%M")}.pkl'
        joblib.dump(preprocessor, preprocessor_filename)
        
        print(f"🤖 Modelo guardado: {model_filename}")
        print(f"🔧 Preprocessor guardado: {preprocessor_filename}")
    
    # 13. Análisis de feature importance
    if hasattr(best_model, 'feature_importances_'):
        print("\n1️⃣3️⃣ ANÁLISIS DE FEATURE IMPORTANCE")
        
        # Obtener nombres de features después del preprocessing
        feature_names = []
        
        # Features numéricas
        feature_names.extend(numeric_features)
        
        # Features categóricas (después de one-hot encoding)
        if categorical_features:
            cat_encoder = preprocessor.named_transformers_['cat'].named_steps['encoder']
            cat_feature_names = cat_encoder.get_feature_names_out(categorical_features)
            feature_names.extend(cat_feature_names)
        
        # Crear DataFrame de importancias
        importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': best_model.feature_importances_
        }).sort_values('importance', ascending=False)
        
        print(f"\n🔍 TOP 20 FEATURES MÁS IMPORTANTES:")
        print("=" * 50)
        for i, (_, row) in enumerate(importance_df.head(20).iterrows(), 1):
            print(f"{i:2d}. {row['feature']:<30} {row['importance']:.4f}")
        
        # Guardar importancias
        importance_filename = f'feature_importance_nocturnal_{pd.Timestamp.now().strftime("%Y%m%d_%H%M")}.csv'
        importance_df.to_csv(importance_filename, index=False)
        print(f"\n💾 Feature importance guardada: {importance_filename}")
    
    # 14. Métricas de calidad de predicción
    print("\n1️⃣4️⃣ MÉTRICAS DE CALIDAD")
    
    # Distribución de predicciones
    pred_stats = {
        'zero_predictions': (submission['target_202002'] == 0).sum(),
        'low_predictions': (submission['target_202002'] < 1).sum(),
        'medium_predictions': ((submission['target_202002'] >= 1) & (submission['target_202002'] < 10)).sum(),
        'high_predictions': (submission['target_202002'] >= 10).sum(),
        'extreme_predictions': (submission['target_202002'] >= 100).sum()
    }
    
    print(f"📊 Distribución de predicciones:")
    for key, value in pred_stats.items():
        percentage = (value / len(submission)) * 100
        print(f"   {key}: {value} ({percentage:.1f}%)")
    
    # Comparación con datos históricos
    historical_mean = data_filtered['tn'].mean()
    historical_median = data_filtered['tn'].median()
    
    print(f"\n📈 Comparación con históricos:")
    print(f"   Media histórica: {historical_mean:.2f}")
    print(f"   Media predicha: {submission['target_202002'].mean():.2f}")
    print(f"   Ratio: {submission['target_202002'].mean() / historical_mean:.2f}")
    print(f"   Mediana histórica: {historical_median:.2f}")
    print(f"   Mediana predicha: {submission['target_202002'].median():.2f}")
    
    # 15. Validación cruzada temporal (si hay tiempo)
    if MODEL_CONFIG['use_cross_validation']:
        print("\n1️⃣5️⃣ VALIDACIÓN CRUZADA TEMPORAL")
        
        cv_scores = []
        n_splits = 3
        
        # Crear splits temporales para CV
        periods = sorted(agg_data['periodo'].unique())
        split_size = len(periods) // (n_splits + 1)
        
        for i in range(n_splits):
            # Definir fechas de corte
            train_end_idx = (i + 1) * split_size
            val_start_idx = train_end_idx
            val_end_idx = train_end_idx + split_size
            
            if val_end_idx > len(periods):
                break
            
            train_end_date = periods[train_end_idx - 1]
            val_start_date = periods[val_start_idx]
            val_end_date = periods[val_end_idx - 1]
            
            # Crear splits
            cv_train = agg_data[agg_data['periodo'] <= train_end_date]
            cv_val = agg_data[(agg_data['periodo'] >= val_start_date) & 
                             (agg_data['periodo'] <= val_end_date)]
            
            if len(cv_train) == 0 or len(cv_val) == 0:
                continue
            
            try:
                # Preparar datos
                X_cv_train = cv_train[feature_cols]
                y_cv_train = cv_train['tn']
                X_cv_val = cv_val[feature_cols]
                y_cv_val = cv_val['tn']
                
                # Procesar
                X_cv_train_processed = preprocessor.fit_transform(X_cv_train)
                X_cv_val_processed = preprocessor.transform(X_cv_val)
                
                # Entrenar y evaluar
                cv_model = clone(best_model)
                cv_model.fit(X_cv_train_processed, y_cv_train)
                cv_pred = cv_model.predict(X_cv_val_processed)
                
                cv_rmse = np.sqrt(mean_squared_error(y_cv_val, cv_pred))
                cv_scores.append(cv_rmse)
                
                print(f"   Fold {i+1}: RMSE = {cv_rmse:.3f}")
                
            except Exception as e:
                print(f"   Error en fold {i+1}: {e}")
        
        if cv_scores:
            print(f"\n📊 CV Results:")
            print(f"   Mean RMSE: {np.mean(cv_scores):.3f} ± {np.std(cv_scores):.3f}")
            print(f"   Scores: {[f'{score:.3f}' for score in cv_scores]}")
    
    # 16. Resumen final
    print("\n" + "="*80)
    print("🎯 RESUMEN FINAL DEL PIPELINE NOCTURNO")
    print("="*80)
    print(f"⏰ Tiempo total de ejecución: {time.time() - start_time:.1f} segundos")
    print(f"🏆 Mejor modelo: {best_model_name}")
    print(f"📊 RMSE de validación: {results_df.loc[best_model_name, 'RMSE']:.3f}")
    print(f"📊 MAE de validación: {results_df.loc[best_model_name, 'MAE']:.3f}")
    print(f"📊 R² de validación: {results_df.loc[best_model_name, 'R2']:.3f}")
    print(f"🎯 Predicciones generadas: {len(submission)}")
    print(f"💾 Archivo de submission: {submission_filename}")
    print(f"✅ Pipeline completado exitosamente!")
    
    return {
        'best_model': best_model,
        'preprocessor': preprocessor,
        'results': results_df,
        'submission': submission,
        'predictions': predictions_df,
        'model_name': best_model_name
    }

# ============================================================================
# 8. EJECUCIÓN PRINCIPAL
# ============================================================================

if __name__ == "__main__":
    print("🌙 INICIANDO PIPELINE NOCTURNO AVANZADO")
    print("⏰ Hora de inicio:", pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S"))
    
    start_time = time.time()
    
    try:
        # Ejecutar pipeline principal
        results = run_advanced_nocturnal_pipeline()
        
        if results:
            print("\n🎉 PIPELINE COMPLETADO EXITOSAMENTE!")
            print(f"🏆 Mejor modelo: {results['model_name']}")
            print(f"📊 Submission generada con {len(results['submission'])} predicciones")
            
        else:
            print("\n❌ ERROR EN EL PIPELINE")
            
    except KeyboardInterrupt:
        print("\n⏹️ Pipeline interrumpido por el usuario")
    except Exception as e:
        print(f"\n💥 ERROR CRÍTICO: {e}")
        import traceback
        traceback.print_exc()
    finally:
        total_time = time.time() - start_time
        print(f"\n⏰ Tiempo total: {total_time:.1f} segundos ({total_time/60:.1f} minutos)")
        print("🌙 Fin del pipeline nocturno")

⚠️ Auto-ARIMA no disponible, se omitirá
📦 Librerías cargadas exitosamente!
🖥️ Configuración del sistema:
   CPUs disponibles: 8
   Directorio de salida: kaggle_predictions_advanced
   Optimización de hiperparámetros: True
   Ensemble stacking: True
   Modelos de series de tiempo: True
🌙 INICIANDO PIPELINE NOCTURNO AVANZADO
⏰ Hora de inicio: 2025-06-04 08:33:34
🚀 INICIANDO PIPELINE AVANZADO NOCTURNO

1️⃣ CARGA DE DATOS
🔄 Cargando datasets...
✅ Sales: 2,945,818 filas, 7 columnas
✅ Stocks: 13,691 filas, 3 columnas
✅ Products: 1,251 productos
✅ Products to predict: 780 productos
📅 Rango de fechas: 2017-01-01 00:00:00 a 2019-12-01 00:00:00
🇦🇷 Cargando datos del IPC INDEC...
✅ IPC INDEC procesado: 48 períodos
📊 Rango IPC: 1.13 a 5.68

2️⃣ FEATURE ENGINEERING AVANZADO
🔧 Creando features avanzadas v2...
   → Después de merge productos: 2945818 filas
   → Después de merge stocks: 2945818 filas
   → Después de merge IPC: 2945818 filas
📊 Aplicando feature engineering avanzado...
🔄 Creando lags ex

Creando lags: 100%|██████████| 12/12 [00:06<00:00,  1.95it/s]


🔄 Creando rolling features extendidos...


Rolling windows: 100%|██████████| 8/8 [1:44:50<00:00, 786.28s/it]


📈 Creando features de tendencia...
📊 Creando features estadísticas...
🏷️ Creando agregaciones categóricas...


Agregaciones categóricas: 100%|██████████| 6/6 [00:02<00:00,  2.13it/s]


🔗 Creando features de interacción...
📦 Creando features de stock avanzadas...
💰 Creando features de IPC avanzadas...
🏆 Creando features de ranking...
🔄 Creando features de frecuencia...

💥 ERROR CRÍTICO: '>' not supported between instances of 'Rolling' and 'int'


Traceback (most recent call last):
  File "C:\Users\m\AppData\Local\Temp\ipykernel_1640\3219245548.py", line 1436, in <module>
    results = run_advanced_nocturnal_pipeline()
  File "C:\Users\m\AppData\Local\Temp\ipykernel_1640\3219245548.py", line 939, in run_advanced_nocturnal_pipeline
    data = create_advanced_features_v2(sales, stocks, product_info, indec_data)
  File "C:\Users\m\AppData\Local\Temp\ipykernel_1640\3219245548.py", line 404, in create_advanced_features_v2
    data[f'consistency_{window}'] = rolling_group.transform(
                                    ~~~~~~~~~~~~~~~~~~~~~~~^
        lambda x: (x.rolling(window, min_periods=1) > 0).mean()
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\Users\m\Documents\Labo3\labo3-2025r\.venv\Lib\site-packages\pandas\core\groupby\generic.py", line 517, in transform
    return self._transform(
           ~~~~~~~~~~~~~~~^
        func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
  


⏰ Tiempo total: 6434.7 segundos (107.2 minutos)
🌙 Fin del pipeline nocturno
