In [0]:
%pip install -r requirements.txt
%restart_python

In [0]:
from sktime.split import temporal_train_test_split
from sktime.forecasting.base import ForecastingHorizon
from sktime.performance_metrics.forecasting import mean_absolute_percentage_error, mean_squared_error
from sktime.forecasting.residual_booster import ResidualBoostingForecaster
from sklearn.ensemble import GradientBoostingRegressor
from sktime.forecasting.compose import make_reduction
from sktime.forecasting.ets import AutoETS
from sktime.forecasting.compose import TransformedTargetForecaster
from sklearn.preprocessing import MinMaxScaler

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import random

# Seed Python's and NumPy's global random number generators with a fixed value
# so that repeated runs of the notebook draw the same random numbers.
random.seed(42)
np.random.seed(42)

## Forecast for Total Household Deposits

This notebook generates future forecasts for total household deposits at the 2, 5, 8, and 14-month horizons using the most recent available data. 

Additionally, a bonus 1-month-ahead forecast is produced by applying the 2-month model with the most recent data point held back — effectively generating a 2-month forecast from an earlier point in time (e.g., predicting September from July rather than from August, the latest available data point). This approach accounts for real-world data lags and provides an estimate for months where official data may not yet be available.


In [0]:
def create_features(df, target_col, predictor_cols, lags=[1,3,4, 6,12], rolling_windows=[3,4, 6,12]):
    """
    Build lag, difference, rate-of-change and rolling-window features for the
    target and every predictor column, then append calendar features.

    Parameters:
    - df (pd.DataFrame): Input frame with a 'date' column, the target and the predictors.
      The caller's frame is left untouched (a copy is transformed).
    - target_col (str): Name of the target column
    - predictor_cols (list): Names of the predictor columns
    - lags (list): Offsets used for the `_lag`, `_diff` and `_roc` features
    - rolling_windows (list): Window sizes used for the `_ma`, `_std`, `_min` and `_max` features

    Returns:
    - pd.DataFrame: Original columns plus the engineered features and 'year',
      'month', 'quarter', with 'date' restored as a regular column. Any row that
      contains a missing value (e.g. from shifting/rolling) is dropped.
    """
    # Work on a chronologically sorted copy indexed by the parsed date
    by_date = (
        df.assign(date=pd.to_datetime(df['date']))
        .sort_values('date')
        .set_index('date')
    )

    engineered = {}
    for name in [target_col] + predictor_cols:
        series = by_date[name]

        # Plain lags
        engineered.update({f'{name}_lag{step}': series.shift(step) for step in lags})

        # Differences, plus rate of change only for columns without any zero.
        # The zero check does not depend on the lag, so it is evaluated once per column.
        zero_free = (series != 0).all()
        for step in lags:
            engineered[f'{name}_diff{step}'] = series.diff(step)
            if zero_free:
                engineered[f'{name}_roc{step}'] = series.pct_change(step)

        # Rolling mean / std / min / max for every window size
        for window in rolling_windows:
            rolled = series.rolling(window)
            engineered[f'{name}_ma{window}'] = rolled.mean()
            engineered[f'{name}_std{window}'] = rolled.std()
            engineered[f'{name}_min{window}'] = rolled.min()
            engineered[f'{name}_max{window}'] = rolled.max()

    # Original columns first, engineered columns after, all on the same date index
    combined = pd.concat(
        [by_date, pd.DataFrame(engineered, index=by_date.index)],
        axis=1,
    )

    # Calendar features derived from the date index
    combined = combined.assign(
        year=combined.index.year,
        month=combined.index.month,
        quarter=combined.index.quarter,
    )

    # Discard incomplete rows and hand 'date' back as a column
    return combined.dropna().reset_index()

def prepare_features(df, target, lags, rolling_windows, use_cols):
    """
    Turn the raw frame into a model-ready frame indexed by monthly period.

    Steps:
    - Engineer features with create_features, treating every column other than
      'date' and the target as a predictor
    - Keep only 'date', the target and the requested feature columns
    - Index by 'date' at month-start frequency and convert to a monthly PeriodIndex

    Parameters:
    - df (pd.DataFrame): Input DataFrame with 'date', target and predictors
    - target (str): Target column name
    - lags (list): Lag values forwarded to create_features
    - rolling_windows (list): Rolling window sizes forwarded to create_features
    - use_cols (list): Feature columns to retain alongside the target

    Returns:
    - pd.DataFrame: Frame with a monthly PeriodIndex containing the target and `use_cols`
    """
    excluded = ('date', target)
    predictor_cols = [name for name in df.columns if name not in excluded]

    featured = create_features(
        df,
        target,
        predictor_cols,
        lags=lags,
        rolling_windows=rolling_windows,
    )

    keep = ['date', target] + use_cols
    monthly = featured[keep].copy().set_index('date').asfreq('MS')  # month-start frequency

    # sktime is fed a monthly PeriodIndex rather than timestamps
    monthly.index = monthly.index.to_period("M")
    return monthly

def build_forecaster(sp, n_estimators, learning_rate, max_depth, window_length, min_samples_leaf, subsample):
    """
    Assemble the composite forecaster: an AutoETS base model paired with a
    gradient-boosting forecaster for the residuals.

    Parameters:
    - sp (int): Seasonal period passed to AutoETS (e.g. 12 for monthly data with yearly seasonality)
    - n_estimators (int): Number of boosting trees
    - learning_rate (float): Learning rate of the gradient boosting regressor
    - max_depth (int): Maximum tree depth
    - window_length (int): Window length passed to make_reduction for the residual model
    - min_samples_leaf (int): Minimum number of samples in a leaf node
    - subsample (float): Fraction of samples used for fitting each tree

    Returns:
    - ResidualBoostingForecaster: wraps the AutoETS base forecaster and the
      scaled, reduction-based residual forecaster
    """
    # Gradient-boosted trees with a fixed random_state for repeatable fits
    gbt_settings = dict(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        subsample=subsample,
        random_state=42,
    )
    base_model = AutoETS(auto=True, sp=sp, n_jobs=-1)
    booster = GradientBoostingRegressor(**gbt_settings)

    # Wrap the regressor as a forecaster using the "direct" reduction strategy
    reduced = make_reduction(booster, window_length=window_length, strategy="direct")

    # Residual pipeline: MinMaxScaler step followed by the reduced regressor
    residual_pipeline = TransformedTargetForecaster([
        ("scaler", MinMaxScaler()),
        ("regressor", reduced),
    ])

    return ResidualBoostingForecaster(base_model, residual_pipeline)

def forecast_future(raw_df, target='household_deposits', use_cols=[], lags=[], rolling_windows=[], forecast_horizon=12, sp=12, n_estimators=200, learning_rate=0.01, max_depth=4, min_samples_leaf=1, subsample=1, window_length=160):
    """
    Fit the composite model on all available data and predict a single point
    `forecast_horizon` months after the last row of the prepared data.

    Parameters:
    - raw_df (pd.DataFrame): Raw input DataFrame with 'date', target and predictors
    - target (str): Target variable to forecast
    - use_cols (list): Feature columns to keep (chosen by the SHAP feature selection)
    - lags (list): Lags used for feature engineering (chosen by the SHAP feature selection)
    - rolling_windows (list): Rolling window sizes used for feature engineering
    - forecast_horizon (int): Number of months ahead to forecast, counted from the
      last available data point (e.g. 2 = 2 months after the latest row)
    - sp (int): Seasonal period (12 for the deposits data, which shows strong yearly seasonality)
    - n_estimators, learning_rate, max_depth, min_samples_leaf, subsample: gradient boosting parameters
    - window_length (int): Number of past observations used by the residual model

    Returns:
    - pd.DataFrame: One-row frame with a 'forecast' column, indexed by the forecast date (timestamp)
    """
    # Generate features and index the frame by monthly period
    features = prepare_features(raw_df, target, lags, rolling_windows, use_cols)

    forecaster = build_forecaster(
        sp=sp,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        window_length=window_length,
        min_samples_leaf=min_samples_leaf,
        subsample=subsample,
    )

    # Fit on the complete history for the single requested step
    fh = [forecast_horizon]
    y = features[target]
    X = features.drop(columns=[target])
    forecaster.fit(y, X, fh=fh)

    # NOTE(review): the training-period X is passed again at predict time —
    # confirm this matches the intended handling of exogenous data in sktime.
    y_pred = forecaster.predict(fh=fh, X=X)

    # Forecast date = last observed period shifted by the horizon
    last_period = features.index[-1]
    forecast_date = (last_period + forecast_horizon).to_timestamp()

    return pd.DataFrame({"forecast": [y_pred.values[0]]}, index=[forecast_date])

def get_forecast_params(h, param_dict, raw_df):
    """
    Resolve the model configuration, input data and effective horizon for a
    requested forecast horizon.

    Horizon 1 has no model of its own: it reuses the horizon-2 configuration on
    the data with the last row removed, so the 2-step forecast is made from one
    step earlier.

    Parameters:
    - h (int): Requested forecast horizon
    - param_dict (dict): Model configurations keyed as 'model_forecast_h<horizon>'
    - raw_df: Input data; the last row is dropped positionally when h == 1

    Returns:
    - parameters for model
    - copy of raw_df (without its last row for horizon 1)
    - actual forecast horizon to use (2 for horizon 1, otherwise the original)
    """
    held_back = (h == 1)

    # Horizon 1 is served by the 2-step model
    actual_h = 2 if held_back else h
    params = param_dict[f'model_forecast_h{actual_h}']

    # Drop the latest row to simulate forecasting from the previous month
    source = raw_df[:-1] if held_back else raw_df
    return params, source.copy(), actual_h


In [0]:
## Read Data
# Parse 'date' while reading so the sort below is chronological rather than a
# lexicographic sort of date strings. get_forecast_params drops the last row
# positionally for the 1-month horizon, so the row order must follow time.
raw_df = pd.read_csv(
    '../data/processed/combined_total_household_data_interpolate.csv',
    parse_dates=['date'],
)
raw_df = raw_df.sort_values('date')

# Read config: one parameter set per horizon, keyed 'model_forecast_h<horizon>'
with open("model_configs.json", "r") as f:
    param_dict = json.load(f)


In [0]:
# Forecast every requested horizon and collect the one-row result frames
target = 'household_deposits'
horizons = [1,2,5,8,14] 
all_forecasts = []

for h in horizons:
    print(f"Forecasting horizon: {h} month(s) ahead")

    # Resolve the model config, the data to fit on and the horizon handed to the model
    params, df_subset, actual_h = get_forecast_params(h, param_dict, raw_df)

    # Restrict the data to the date, the target and this model's predictors
    predictors = params['predictors']
    df_subset = df_subset[['date', target] + predictors].copy()

    # Fit on the subset and predict `actual_h` steps ahead; the model
    # hyper-parameters are forwarded from the config under the same names
    forecast_df = forecast_future(
        raw_df=df_subset,
        target=target,
        forecast_horizon=actual_h,
        use_cols=params['best_features'],
        lags=params['best_lags'],
        rolling_windows=params['best_rolling_windows'],
        **{name: params[name] for name in (
            'sp', 'n_estimators', 'learning_rate', 'max_depth',
            'min_samples_leaf', 'subsample', 'window_length',
        )},
    )

    # Tag with the intended horizon (not the underlying `fh` given to the model)
    forecast_df["horizon"] = h
    all_forecasts.append(forecast_df)

# Stack all horizons into one frame ordered by forecast date
final_forecast_df = pd.concat(all_forecasts).sort_index()
final_forecast_df
