In [0]:
%pip install prophet lightgbm prefect holidays tqdm --no-deps --quiet
%pip install -U opentelemetry-api --quiet
# NOTE(review): neither install above pins a version, and the first one passes
# --no-deps, so the transitive dependencies of prophet/lightgbm/prefect are
# presumably expected to exist already in the cluster image -- confirm, and
# consider pinning versions so re-runs are reproducible.
# Restart the Python process after installing; presumably so the freshly
# installed packages are the ones picked up by the import cell that follows.
dbutils.library.restartPython()

In [0]:
# Imports and one-time session setup
import sys
# Make the project's `src` package importable; the imports below depend on it.
# NOTE(review): absolute, user-specific workspace path -- confirm it is valid
# for everyone who runs this notebook.
sys.path.append("/Workspace/Repos/desareca/santiago-weather-forecast")

from src.data.ingestion import load_from_delta_table
from src.data.preprocessing import prepare_time_series
from src.models.arima_model import ARIMAPredictor
from src.models.prophet_model import ProphetPredictor
from src.models.lightgbm_model import LightGBMPredictor
from src.evaluation.cross_validation import TimeSeriesSplit
# NOTE(review): wildcard import. EXPERIMENT_NAME (used below) is not defined
# anywhere else in this notebook, so it presumably comes from here -- confirm
# and consider importing it explicitly.
from src.utils.config import *
import mlflow
import pandas as pd
import warnings
# Suppresses every warning for the rest of the session, including any emitted
# while the models are being fitted.
warnings.filterwarnings('ignore')

# Select the MLflow experiment before any model cell runs with log_mlflow=True.
mlflow.set_experiment(EXPERIMENT_NAME)
print("‚úÖ Setup completo")

In [0]:
# Read the raw weather table and build the precipitation series that the
# later cells cross-validate on.
df = load_from_delta_table("weather_raw", spark)
serie = prepare_time_series(df, target_col="precipitacion")

# Report the size and date span of the prepared series.
first_day = serie.index.min().date()
last_day = serie.index.max().date()
print(
    "\nüìä Datos preparados:",
    f"  Serie completa: {len(serie)} d√≠as",
    f"  Fecha inicio: {first_day}",
    f"  Fecha fin: {last_day}",
    sep="\n",
)

In [0]:
# Section banner.
separator = "=" * 70
print(f"\n{separator}")
print("VISUALIZACI√ìN DE FOLDS")
print(separator)

# Uses the same n_splits / test_size values that the model cells pass to
# train_and_evaluate_cv, so the picture matches the folds being evaluated.
cv = TimeSeriesSplit(n_splits=5, test_size=30)
cv.visualize_splits(serie)

In [0]:
print("\n" + "="*70)
print("GRID SEARCH: ARIMA")
print("="*70)

# ARIMA hyper-parameter grid. Every entry is a plain, non-seasonal (p, d, q)
# order; `name` is the label shown in the logs and in the results table.
arima_grid = [
    # Simple models
    {'p': 1, 'd': 0, 'q': 0, 'name': 'AR(1)'},
    {'p': 0, 'd': 0, 'q': 1, 'name': 'MA(1)'},
    {'p': 1, 'd': 1, 'q': 0, 'name': 'ARIMA(1,1,0)'},
    {'p': 0, 'd': 1, 'q': 1, 'name': 'ARIMA(0,1,1)'},
    {'p': 1, 'd': 1, 'q': 1, 'name': 'ARIMA(1,1,1) - Baseline'},

    # Models with more lags
    {'p': 2, 'd': 1, 'q': 1, 'name': 'ARIMA(2,1,1)'},
    {'p': 1, 'd': 1, 'q': 2, 'name': 'ARIMA(1,1,2)'},
    {'p': 2, 'd': 1, 'q': 2, 'name': 'ARIMA(2,1,2)'},

    # Undifferenced ARMA plus third-order AR / MA variants.
    # Note: these are NOT seasonal models -- only p, d and q are passed to
    # ARIMAPredictor, so no annual cycle is modelled here.
    {'p': 1, 'd': 0, 'q': 1, 'name': 'ARMA(1,1) - Sin diferenciaci√≥n'},
    {'p': 3, 'd': 1, 'q': 1, 'name': 'ARIMA(3,1,1)'},
    {'p': 1, 'd': 1, 'q': 3, 'name': 'ARIMA(1,1,3)'},

    # More complex models
    {'p': 2, 'd': 2, 'q': 2, 'name': 'ARIMA(2,2,2) - Dos diferenciaciones'},
    {'p': 3, 'd': 1, 'q': 3, 'name': 'ARIMA(3,1,3)'},
]

# Cross-validate every configuration and keep the fold-averaged metrics.
arima_metric_cols = ['mae', 'rmse', 'r2', 'f1_score']
results_arima_grid = []
failed_arima = []  # (name, error text) of configurations that raised

for position, params in enumerate(arima_grid, start=1):
    print(f"\n[{position}/{len(arima_grid)}] Probando {params['name']}...")

    arima = ARIMAPredictor(p=params['p'], d=params['d'], q=params['q'])

    try:
        results_cv = arima.train_and_evaluate_cv(
            data=serie,
            n_splits=5,
            test_size=30,
            log_mlflow=True,
            run_description=f"ARIMA Grid Search - {params['name']}"
        )

        # Mean of each metric across the CV folds.
        avg_metrics = results_cv[arima_metric_cols].mean()

        results_arima_grid.append({
            'model': params['name'],
            'p': params['p'],
            'd': params['d'],
            'q': params['q'],
            **{metric: avg_metrics[metric] for metric in arima_metric_cols},
        })

    except Exception as e:
        # Best-effort search: one failing order must not abort the grid, but
        # the failure is recorded so it is not silently lost.
        print(f"  ‚ö†Ô∏è  Error: {str(e)}")
        failed_arima.append((params['name'], f"{type(e).__name__}: {e}"))

if failed_arima:
    print(
        f"\n{len(failed_arima)}/{len(arima_grid)} configuraciones ARIMA fallaron: "
        + ", ".join(name for name, _ in failed_arima)
    )

# Without this guard an all-failed search builds an empty DataFrame and the
# sort below dies with an opaque KeyError on 'f1_score'.
if not results_arima_grid:
    raise RuntimeError(
        f"ARIMA grid search produced no results; all {len(arima_grid)} "
        f"configurations failed: {failed_arima}"
    )

# Summary table, best F1 first (row 0 is read by the final comparison cell).
df_arima_grid = pd.DataFrame(results_arima_grid)
df_arima_grid = df_arima_grid.sort_values('f1_score', ascending=False)

print("\n" + "="*70)
print("RESULTADOS GRID SEARCH ARIMA")
print("="*70)
print(df_arima_grid.to_string(index=False))
print(f"\nüèÜ Mejor ARIMA: {df_arima_grid.iloc[0]['model']} (F1={df_arima_grid.iloc[0]['f1_score']:.3f})")

In [0]:
print("\n" + "="*70)
print("GRID SEARCH: Prophet")
print("="*70)

# Prophet hyper-parameter grid. Every configuration is the baseline below with
# a few keys overridden; `name` is the label used in logs and result tables.
prophet_defaults = {
    'yearly_seasonality': True,
    'weekly_seasonality': False,
    'daily_seasonality': False,
    'changepoint_prior_scale': 0.05,
    'seasonality_prior_scale': 10.0,
}

prophet_grid = [
    # Seasonality / trend-flexibility variations
    {**prophet_defaults,
     'name': 'Prophet - Solo anual (baseline)'},
    {**prophet_defaults,
     'weekly_seasonality': True,
     'name': 'Prophet - Anual + Semanal'},
    {**prophet_defaults,
     'changepoint_prior_scale': 0.01,  # less flexible trend
     'name': 'Prophet - Conservador'},
    {**prophet_defaults,
     'changepoint_prior_scale': 0.1,  # more flexible trend
     'name': 'Prophet - Flexible'},
    {**prophet_defaults,
     'changepoint_prior_scale': 0.5,  # very flexible trend
     'name': 'Prophet - Muy flexible'},

    # Seasonality prior variations
    {**prophet_defaults,
     'seasonality_prior_scale': 1.0,  # weak seasonality
     'name': 'Prophet - Estacionalidad d√©bil'},
    {**prophet_defaults,
     'seasonality_prior_scale': 20.0,  # strong seasonality
     'name': 'Prophet - Estacionalidad fuerte'},

    # Combinations
    {**prophet_defaults,
     'weekly_seasonality': True,
     'changepoint_prior_scale': 0.1,
     'seasonality_prior_scale': 15.0,
     'name': 'Prophet - Flexible + Semanal'},
    {**prophet_defaults,
     'changepoint_prior_scale': 0.001,  # very rigid trend
     'seasonality_prior_scale': 5.0,
     'name': 'Prophet - Muy conservador'},
    {**prophet_defaults,
     'weekly_seasonality': True,
     'changepoint_prior_scale': 0.2,
     'seasonality_prior_scale': 20.0,
     'name': 'Prophet - M√°xima flexibilidad'},
]

# Cross-validate every configuration and keep the fold-averaged metrics.
prophet_metric_cols = ['mae', 'rmse', 'r2', 'f1_score']
results_prophet_grid = []
failed_prophet = []  # (name, error text) of configurations that raised

for position, params in enumerate(prophet_grid, start=1):
    print(f"\n[{position}/{len(prophet_grid)}] Probando {params['name']}...")

    # Everything except the display label is a ProphetPredictor keyword.
    model_kwargs = {key: value for key, value in params.items() if key != 'name'}
    prophet_model = ProphetPredictor(**model_kwargs)

    try:
        results_cv = prophet_model.train_and_evaluate_cv(
            data=serie,
            n_splits=5,
            test_size=30,
            log_mlflow=True,
            run_description=f"Prophet Grid Search - {params['name']}"
        )

        # Mean of each metric across the CV folds.
        avg_metrics = results_cv[prophet_metric_cols].mean()

        results_prophet_grid.append({
            'model': params['name'],
            'yearly': params['yearly_seasonality'],
            'weekly': params['weekly_seasonality'],
            'cp_scale': params['changepoint_prior_scale'],
            'season_scale': params['seasonality_prior_scale'],
            **{metric: avg_metrics[metric] for metric in prophet_metric_cols},
        })

    except Exception as e:
        # Best-effort search: one failing configuration must not abort the
        # grid, but the failure is recorded so it is not silently lost.
        print(f"  ‚ö†Ô∏è  Error: {str(e)}")
        failed_prophet.append((params['name'], f"{type(e).__name__}: {e}"))

if failed_prophet:
    print(
        f"\n{len(failed_prophet)}/{len(prophet_grid)} configuraciones Prophet fallaron: "
        + ", ".join(name for name, _ in failed_prophet)
    )

# Without this guard an all-failed search builds an empty DataFrame and the
# sort below dies with an opaque KeyError on 'f1_score'.
if not results_prophet_grid:
    raise RuntimeError(
        f"Prophet grid search produced no results; all {len(prophet_grid)} "
        f"configurations failed: {failed_prophet}"
    )

# Summary table, best F1 first (row 0 is read by the final comparison cell).
df_prophet_grid = pd.DataFrame(results_prophet_grid)
df_prophet_grid = df_prophet_grid.sort_values('f1_score', ascending=False)

print("\n" + "="*70)
print("RESULTADOS GRID SEARCH PROPHET")
print("="*70)
print(df_prophet_grid[['model', 'mae', 'rmse', 'r2', 'f1_score']].to_string(index=False))
print(f"\nüèÜ Mejor Prophet: {df_prophet_grid.iloc[0]['model']} (F1={df_prophet_grid.iloc[0]['f1_score']:.3f})")

In [0]:
print("\n" + "="*70)
print("CROSS-VALIDATION: LightGBM - Grid Search")
print("="*70)


def _lgbm_config(name, **overrides):
    """Return one LightGBM grid entry: the baseline settings plus `overrides`.

    The baseline dict is rebuilt on every call so each configuration owns its
    own `lags` / `rolling_windows` lists (no list is shared between entries).
    `name` is the display label and is stored under the 'name' key.
    """
    config = {
        'n_estimators': 100,
        'learning_rate': 0.1,
        'max_depth': 5,
        'num_leaves': 31,
        'min_child_samples': 20,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'reg_alpha': 0.0,
        'reg_lambda': 0.0,
        'lags': [1, 7, 30],
        'rolling_windows': [7, 30],
    }
    config.update(overrides)
    config['name'] = name
    return config


# LightGBM hyper-parameter grid.
# NOTE(review): subsample / colsample_bytree are below 1 and no seed is passed
# here -- confirm LightGBMPredictor fixes one, otherwise runs may not repeat.
lightgbm_grid = [
    # Simple baseline
    _lgbm_config('Baseline'),

    # More trees, lower learning rate
    _lgbm_config('M√°s √°rboles (200)', n_estimators=200, learning_rate=0.05),

    # Deep trees
    _lgbm_config('√Årboles profundos', max_depth=10, num_leaves=63, min_child_samples=10),

    # Conservative (less overfitting)
    _lgbm_config('Conservador', n_estimators=150, learning_rate=0.08,
                 max_depth=3, num_leaves=15, min_child_samples=30),

    # L1 regularisation
    _lgbm_config('Regularizaci√≥n L1', reg_alpha=1.0),

    # L2 regularisation
    _lgbm_config('Regularizaci√≥n L2', reg_lambda=1.0),

    # More lags
    _lgbm_config('M√°s lags (6)', lags=[1, 2, 3, 7, 14, 30], rolling_windows=[7, 14, 30]),

    # Low row / column subsampling
    _lgbm_config('Subsample bajo', n_estimators=150, subsample=0.5, colsample_bytree=0.5),
]

# Cross-validate every configuration and keep the fold-averaged metrics.
lgbm_metric_cols = ['mae', 'rmse', 'r2', 'f1_score']
results_lgbm_grid = []
failed_lgbm = []  # (name, error text) of configurations that raised

for position, params in enumerate(lightgbm_grid, start=1):
    print(f"\n[{position}/{len(lightgbm_grid)}] Probando LightGBM - {params['name']}...")

    # Everything except the display label is a LightGBMPredictor keyword.
    model_kwargs = {key: value for key, value in params.items() if key != 'name'}
    lgbm = LightGBMPredictor(**model_kwargs)

    try:
        results_cv = lgbm.train_and_evaluate_cv(
            data=serie,
            n_splits=5,
            test_size=30,
            log_mlflow=True,
            run_description=f"LightGBM Grid Search - {params['name']}"
        )

        # Mean of each metric across the CV folds.
        avg_metrics = results_cv[lgbm_metric_cols].mean()

        results_lgbm_grid.append({
            'model': params['name'],
            'n_estimators': params['n_estimators'],
            'learning_rate': params['learning_rate'],
            'max_depth': params['max_depth'],
            'n_lags': len(params['lags']),
            **{metric: avg_metrics[metric] for metric in lgbm_metric_cols},
        })

    except Exception as e:
        # Best-effort search: one failing configuration must not abort the
        # grid, but the failure is recorded so it is not silently lost.
        print(f"  ‚ö†Ô∏è  Error: {str(e)}")
        failed_lgbm.append((params['name'], f"{type(e).__name__}: {e}"))

if failed_lgbm:
    print(
        f"\n{len(failed_lgbm)}/{len(lightgbm_grid)} configuraciones LightGBM fallaron: "
        + ", ".join(name for name, _ in failed_lgbm)
    )

# Without this guard an all-failed search builds an empty DataFrame and the
# sort below dies with an opaque KeyError on 'f1_score'.
if not results_lgbm_grid:
    raise RuntimeError(
        f"LightGBM grid search produced no results; all {len(lightgbm_grid)} "
        f"configurations failed: {failed_lgbm}"
    )

# Summary table, best F1 first (row 0 is read by the final comparison cell).
df_lgbm_grid = pd.DataFrame(results_lgbm_grid)
df_lgbm_grid = df_lgbm_grid.sort_values('f1_score', ascending=False)

print("\n" + "="*70)
print("RESULTADOS GRID SEARCH LIGHTGBM")
print("="*70)
print(df_lgbm_grid[['model', 'mae', 'rmse', 'r2', 'f1_score']].to_string(index=False))
print(f"\nüèÜ Mejor LightGBM: {df_lgbm_grid.iloc[0]['model']} (F1={df_lgbm_grid.iloc[0]['f1_score']:.3f})")

In [0]:
print("\n" + "="*70)
print("COMPARACI√ìN FINAL: MEJOR ARIMA vs MEJOR PROPHET vs MEJOR LIGHTGBM")
print("="*70)

# Each grid table was sorted by F1 (descending) in its own cell, so row 0 is
# that family's best configuration.
best_arima = df_arima_grid.iloc[0]
best_prophet = df_prophet_grid.iloc[0]
best_lgbm = df_lgbm_grid.iloc[0]

# One column per family, one row per metric.
best_by_family = {
    'ARIMA': best_arima,
    'Prophet': best_prophet,
    'LightGBM': best_lgbm,
}
comparison_metric_cols = ['mae', 'rmse', 'r2', 'f1_score']
comparison_final = pd.DataFrame(
    {
        family: [best[metric] for metric in comparison_metric_cols]
        for family, best in best_by_family.items()
    },
    index=['MAE', 'RMSE', 'R¬≤', 'F1-Score'],
)

print("\nüìä Mejores modelos de cada familia:")
print(comparison_final.round(3))

print(f"\nü•á Mejor ARIMA: {best_arima['model']}")
print(f"   p={best_arima['p']}, d={best_arima['d']}, q={best_arima['q']}")

print(f"\nü•á Mejor Prophet: {best_prophet['model']}")
print(f"   changepoint_prior_scale={best_prophet['cp_scale']}")
print(f"   seasonality_prior_scale={best_prophet['season_scale']}")

print(f"\nü•á Mejor LightGBM: {best_lgbm['model']}")
print(f"   n_estimators={best_lgbm['n_estimators']}")
print(f"   learning_rate={best_lgbm['learning_rate']}")
print(f"   max_depth={best_lgbm['max_depth']}")
print(f"   n_lags={best_lgbm['n_lags']}")

# Overall winner by F1.
winner_scores = {family: best['f1_score'] for family, best in best_by_family.items()}

# Comparisons against NaN are always False, so a plain max() would keep a NaN
# entry that happens to come first. Rank only the families with a real score.
ranked_scores = {
    family: score for family, score in winner_scores.items() if pd.notna(score)
}
if not ranked_scores:
    raise ValueError(
        f"Cannot pick a champion: every family has a NaN F1-score ({winner_scores})"
    )

# Ties keep the first family in ARIMA, Prophet, LightGBM order.
champion = max(ranked_scores, key=ranked_scores.get)

print(f"\n{'='*70}")
print(f"üèÜ CAMPE√ìN ABSOLUTO: {champion}")
print(f"{'='*70}")
if champion == 'ARIMA':
    print(f"   Configuraci√≥n: {best_arima['model']}")
    print(f"   F1-Score: {best_arima['f1_score']:.3f}")
    print(f"   Par√°metros: p={best_arima['p']}, d={best_arima['d']}, q={best_arima['q']}")
elif champion == 'Prophet':
    print(f"   Configuraci√≥n: {best_prophet['model']}")
    print(f"   F1-Score: {best_prophet['f1_score']:.3f}")
    print(f"   changepoint_prior_scale={best_prophet['cp_scale']}")
    print(f"   seasonality_prior_scale={best_prophet['season_scale']}")
else:  # LightGBM
    print(f"   Configuraci√≥n: {best_lgbm['model']}")
    print(f"   F1-Score: {best_lgbm['f1_score']:.3f}")
    print(f"   n_estimators={best_lgbm['n_estimators']}, lr={best_lgbm['learning_rate']}, depth={best_lgbm['max_depth']}")