# Ensemble Demand Forecasting with Voting Regressor

This notebook uses Optuna to optimize a `VotingRegressor` ensemble, which combines predictions from XGBoost, RandomForest, and SVR models. The goal is to find the best hyperparameters for each base model and the optimal weights for combining them.

In [None]:
# Core Libraries
import numpy as np
import pandas as pd
import datetime as dt
import yaml
import pickle
import os
import json

# Modeling & Optimization
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.compose import TransformedTargetRegressor
import optuna

# Our functions
import utils.voting_utils_gpu as utils

import warnings
warnings.filterwarnings('ignore')

## 1. Load Configuration

In [None]:
# Read the experiment configuration. The encoding is pinned so the YAML file is
# decoded the same way regardless of the platform's default locale.
with open("params/voting_params.yml", "r", encoding="utf-8") as f:
    configs = yaml.safe_load(f)

# Do not truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

# Top-level experiment settings. A missing key raises KeyError here, so a
# malformed configuration fails before any data is loaded.
RANDOM_STATE = configs['RANDOM_STATE']        # seed passed to the models and the objective
NUMBER_OF_FOLDS = configs['NUMBER_OF_FOLDS']  # n_splits for TimeSeriesSplit
FEATURE_COLUMNS = configs['FEATURES']         # columns selected as model inputs
LABEL_COLUMNS = configs['TARGET']             # used below as a single target column name
PREPROCESSING = configs['PREPROCESSING']      # settings forwarded to the preprocessing pipeline
OPTUNA_PARAMS = configs['OPTUNA_PARAMS']      # provides at least 'MODELS' and 'TRIALS'
MODEL_RANGE_PARAMS = OPTUNA_PARAMS['MODELS']  # hyperparameter ranges handed to the objective

## 2. Load and Prepare Data

In [None]:
print("Loading and preparing data...")
df = pd.read_parquet('../../data/processed/processed_data.parquet')
df['week_of_year'] = df['week_of_year'].astype(int)
# Fix the row order: the cross-validation splitter used later cuts folds by
# row position, so rows must already be sorted by week.
# NOTE(review): sorting by week_of_year alone presumes the data covers a
# single year -- confirm against the upstream processing step.
df = df.sort_values(["week_of_year", "internal_product_id", "internal_store_id"]).reset_index(drop=True)

# Manual pre-processing from original notebook
# Drop rows without a premise label. The explicit copy makes the column
# assignments below write to an independent frame instead of a slice of the
# loaded one (the chained-assignment pattern pandas warns about).
df = df[df['premise'].notna()].copy()

# Encode premise as 0/1. A label outside the mapping becomes NaN; report it by
# name instead of letting the integer cast fail with an opaque message.
premise_codes = df['premise'].map({'Off Premise': 1, 'On Premise': 0})
if premise_codes.isna().any():
    unknown_labels = sorted(df.loc[premise_codes.isna(), 'premise'].astype(str).unique())
    raise ValueError(f"Unexpected 'premise' values: {unknown_labels}")
df['premise'] = premise_codes.astype(int)

# Columns whose name contains "sum": a missing value is replaced with zero.
cols_sum = df.columns[df.columns.str.contains("sum")]
df[cols_sum] = df[cols_sum].fillna(0)

print(f"Dataset contains {df.shape[0]} rows and {df.shape[1]} columns.")

## 3. Feature Separation and Data Splitting

In [None]:
# Split the configured features by dtype. Numeric features that are listed as
# cyclical in the preprocessing settings are left out of the numeric group.
feature_frame = df[FEATURE_COLUMNS]
cyclical_features = PREPROCESSING.get('CYCLICAL_FEATURES', {})

NUMERIC_COLUMNS = [
    name
    for name in feature_frame.select_dtypes(np.number).columns
    if name not in cyclical_features
]
CATEGORICAL_COLUMNS = list(feature_frame.select_dtypes('object').columns)

# Every month except December is used for training/validation.
outside_december = df.month != 12
X_train = df.loc[outside_december, FEATURE_COLUMNS]
y_train = df.loc[outside_december, [LABEL_COLUMNS]]

# Time-series cross-validation splitter with the configured number of folds.
tscv = TimeSeriesSplit(n_splits=NUMBER_OF_FOLDS)

print("Data split and CV folds created.")

## 4. Optimize Ensemble with Optuna

In [None]:
# Two-objective study; both objectives are minimized and are named
# "score_validation" and "score_overfitting" below.
# The sampler is passed explicitly only to seed it: NSGA-II is what Optuna
# selects by default for multi-objective studies, so the search algorithm is
# unchanged while the sequence of suggested trials becomes repeatable.
# NOTE(review): full reproducibility also requires the objective itself to be
# deterministic -- confirm in utils.create_objective_function.
study = optuna.create_study(
    study_name='voting_regressor_optimization',
    directions=['minimize', 'minimize'],
    sampler=optuna.samplers.NSGAIISampler(seed=RANDOM_STATE),
)
study.set_metric_names(["score_validation", "score_overfitting"])

# Build the objective from the search ranges, the CV splitter, the training
# data and the column groups defined in the earlier cells.
objective_function = utils.create_objective_function(
    hyperparam_ranges=MODEL_RANGE_PARAMS,
    tscv=tscv,
    x_data=[X_train, y_train],
    columns_to_use=[FEATURE_COLUMNS, LABEL_COLUMNS, NUMERIC_COLUMNS, CATEGORICAL_COLUMNS],
    preprocessing=PREPROCESSING,
    random_state=RANDOM_STATE
)

# Run the configured number of trials.
study.optimize(objective_function, n_trials=OPTUNA_PARAMS['TRIALS'])

print("Optimization finished.")

In [None]:
# Ask the helper for the selected trial's parameters; only the second element
# of the returned pair is needed in the rest of the notebook.
_, best_params = utils.get_best_params(study)

print("Best parameters found for the Voting Regressor ensemble:")
best_params_json = json.dumps(best_params, indent=2)
print(best_params_json)

## 5. Fit Final Ensemble Model

In [None]:
print("Fitting final VotingRegressor with all available data...")

# Use all data from Jan-Dec for the final model. December was excluded from
# the tuning data above, so this cell refits on it without evaluating on it.
X_train_full = df[FEATURE_COLUMNS]
y_train_full = df[[LABEL_COLUMNS]]

# --- Reconstruct the optimized models and VotingRegressor ---


def _params_with_prefix(params, prefix):
    """Return the entries of *params* whose key starts with *prefix*.

    Only the leading prefix is removed from each key. ``str.replace`` would
    also delete later occurrences of the same text inside the parameter name
    (for example ``'rf_perf_metric'`` would become ``'pemetric'``).
    """
    return {
        key[len(prefix):]: value
        for key, value in params.items()
        if key.startswith(prefix)
    }


# Extract best params for each model
xgb_final_params = _params_with_prefix(best_params, 'xgb_')
rf_final_params = _params_with_prefix(best_params, 'rf_')
svr_final_params = _params_with_prefix(best_params, 'svr_')
# Weights are listed in the same order as the estimators passed below.
weights_final = [best_params['w_xgb'], best_params['w_rf'], best_params['w_svr']]

# Instantiate final models
final_xgb = XGBRegressor(**xgb_final_params, device='cuda', tree_method='hist', random_state=RANDOM_STATE)
final_rf = RandomForestRegressor(**rf_final_params, random_state=RANDOM_STATE, n_jobs=-1)
final_svr = SVR(**svr_final_params)

# Create the final Voting Regressor
# NOTE(review): n_jobs=-1 here fits the base estimators in parallel while the
# random forest also requests all cores and XGBoost targets the GPU -- confirm
# this combination behaves well on the target machine.
final_voting_regressor = VotingRegressor(
    estimators=[('xgb', final_xgb), ('rf', final_rf), ('svr', final_svr)],
    weights=weights_final,
    n_jobs=-1
)

print("Final ensemble configured:")
print(final_voting_regressor)

# --- Build and fit the final pipeline ---
pipe_preproc = utils.create_preprocessing_pipeline(PREPROCESSING, NUMERIC_COLUMNS, CATEGORICAL_COLUMNS)
# The ensemble is trained on log1p(target); predictions are mapped back with expm1.
ttr = TransformedTargetRegressor(regressor=final_voting_regressor, func=np.log1p, inverse_func=np.expm1)
final_pipeline = Pipeline(steps=[('preprocessor', pipe_preproc), ('model', ttr)])

# The target is flattened to 1-D before fitting.
final_pipeline.fit(X_train_full, y_train_full.values.ravel())

print("\nFinal pipeline fitted successfully.")

## 6. Save Results

In [None]:
# Each run gets its own folder, named after the current time to the minute.
run_stamp = dt.datetime.now().strftime('%Y%m%d%H%M')
output_directory = f"outputs/{run_stamp}_VOTING_REGRESSOR/"

# One sub-folder per artifact type: fitted model, parameters, Optuna study.
output_model_directory, output_params_directory, output_optuna_directory = (
    os.path.join(output_directory, subfolder)
    for subfolder in ('model', 'params', 'study')
)

# Create the folders; an already existing folder is not an error.
for directory in (output_model_directory, output_params_directory, output_optuna_directory):
    os.makedirs(directory, exist_ok=True)

print(f"Created output directory: {output_directory}")

In [None]:
# Save the final pipeline
with open(os.path.join(output_model_directory, 'pipeline.pkl'), 'wb') as f:
    pickle.dump(final_pipeline, f)

# Save the parameters used
# Convert NumPy scalars to the matching native Python type for JSON
# serialization. `.item()` keeps integers as integers (a float() cast would
# store an integer hyperparameter such as 100 as 100.0) and also covers
# other NumPy scalar kinds such as np.bool_.
serializable_params = {
    k: (v.item() if isinstance(v, np.generic) else v)
    for k, v in best_params.items()
}
configs_all = {**configs, 'BEST_PARAMS': serializable_params}
# Serialize before opening the file so that a value JSON cannot encode raises
# here instead of leaving a truncated experiment_params.json behind.
configs_all_json = json.dumps(configs_all, indent=4)
with open(os.path.join(output_params_directory, 'experiment_params.json'), 'w', encoding='utf-8') as f:
    f.write(configs_all_json)

# Save the Optuna study object
with open(os.path.join(output_optuna_directory, 'study.pkl'), 'wb') as f:
    pickle.dump(study, f)

print("All artifacts saved successfully.")