# Preparation

Install and import libraries

In [1]:
!pip install sktime pandas numpy xgboost -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.5/35.5 MB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0mm
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m145.7/145.7 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sktime.forecasting.base import ForecastingHorizon
from sktime.forecasting.compose import DirectTabularRegressionForecaster, DirRecTabularRegressionForecaster, MultioutputTabularRegressionForecaster, RecursiveTabularRegressionForecaster
import time
import os

Read data

In [2]:
# Load the consumption series indexed by its 'time' column, drop the 'No'
# column and the final row, re-grid onto an hourly frequency, and forward-fill
# any missing values. The single remaining column is named 'y' (the target).
data = (
    pd.read_excel(
        '/kaggle/input/listrik-malay/data.xlsx',
        parse_dates=['time'],
        index_col='time',
    )
    .drop(columns=['No'])
    .iloc[:-1]
    .asfreq('h')
    .ffill()
)
data.columns = ['y']

Create exogenous data (calendar features)

In [3]:
# Calendar features read from the index of `data`. Each key is the new column
# name; each value is the index attribute that supplies it. Dict order is the
# column order.
calendar_attributes = {
    'year': 'year',
    'month': 'month',
    'dayofyear': 'dayofyear',
    'dayofmonth': 'day',
    'dayofweek': 'dayofweek',
    'hour': 'hour',
}

for column_name, index_attribute in calendar_attributes.items():
    data[column_name] = getattr(data.index, index_attribute)

Train/test split and cross-validation folds

In [4]:
# Features are every column except the target; the target is the 'y' column.
X = data.drop(columns=['y'])
y = data['y']

# Sliding-window evaluation layout: every fold trains on TRAIN_SIZE consecutive
# rows and tests on the HORIZON rows that follow; each fold starts HORIZON rows
# after the previous one.
TRAIN_SIZE = 16775
HORIZON = 24
RANGE_FOLD = 25

# Each entry is (X_train, X_test, y_train, y_test) for one fold.
# NOTE(review): .iloc slices that run past the end of the data are silently
# truncated rather than raising -- confirm the data holds at least
# TRAIN_SIZE + RANGE_FOLD * HORIZON rows so the last test window is complete.
fold = []

for i in range(RANGE_FOLD):
    train_start = i * HORIZON
    train_end = train_start + TRAIN_SIZE
    test_end = train_end + HORIZON
    fold.append(
        (
            X.iloc[train_start:train_end],
            X.iloc[train_end:test_end],
            y.iloc[train_start:train_end],
            y.iloc[train_end:test_end],
        )
    )

# The first fold is also exposed as module-level train/test variables.
X_train, X_test, y_train, y_test = fold[0]

print(len(X_train), len(X_test), len(y_train), len(y_test))

16775 24 16775 24


Model training and prediction

In [7]:
# Base regressor handed to every reduction forecaster below.
model = xgb.XGBRegressor(random_state=42)

# Relative horizon 1..len(y_test): one step per row of the test window.
fh = ForecastingHorizon(np.arange(1, len(y_test) + 1))

# All four strategies use the same lag window of 7 * 3 * 24 = 504 observations.
window_length = 7 * 3 * 24

direct, dirrec, multi, recursive = (
    forecaster_cls(model, window_length=window_length)
    for forecaster_cls in (
        DirectTabularRegressionForecaster,
        DirRecTabularRegressionForecaster,
        MultioutputTabularRegressionForecaster,
        RecursiveTabularRegressionForecaster,
    )
)

In [None]:
def model_predict(
    model: (
        "DirectTabularRegressionForecaster"
        " | DirRecTabularRegressionForecaster"
        " | MultioutputTabularRegressionForecaster"
        " | RecursiveTabularRegressionForecaster"
    ),
    fold: "list[tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]]",
    fh: "ForecastingHorizon",
    name: str,
    exogeneous: bool,
):
    """
    Fit and evaluate a forecaster on every cross-validation fold and save the results.

    For each fold the model is refitted on the training window, asked to predict the
    forecasting horizon, and the predictions are compared with the actual test values.
    The prediction table and the error table are rewritten to CSV after every fold, so
    partial results survive an interrupted run.

    Parameters
    ----------
    model : DirectTabularRegressionForecaster | DirRecTabularRegressionForecaster | MultioutputTabularRegressionForecaster | RecursiveTabularRegressionForecaster
        The forecaster to evaluate. It is used through ``fit(y, X=..., fh=...)`` and
        ``predict(fh, X=...)``; ``predict`` must return an object exposing ``.values``.

    fold : list[tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]]
        Cross-validation folds. Each fold is a tuple of:
        - X_train: training exogenous features
        - X_test: test exogenous features
        - y_train: training target values
        - y_test: test target values
        Every fold must yield the same number of predictions and test values, because
        each fold becomes one column of the output tables.

    fh : ForecastingHorizon
        Forecasting horizon passed unchanged to ``model.fit`` and ``model.predict``.

    name : str
        Identifier for the run. Used in the output filenames and progress messages.

    exogeneous : bool
        - True: fit and predict with the exogenous features (X_train, X_test)
        - False: fit and predict from the target series only

    Returns
    -------
    None
        Results are written to CSV files in the 'predictions' directory.

    Side Effects
    ------------
    - Creates the 'predictions' directory (relative to the working directory) if it
      does not exist.
    - Prints one progress message per fold.
    - Writes, after every fold:
        - exogeneous=True:  'predictions/{name}_predictions_exo.csv' and
          'predictions/{name}_mape_exo.csv'
        - exogeneous=False: 'predictions/{name}_predictions.csv' and
          'predictions/{name}_mape.csv'

    Notes
    -----
    - Both tables have one column per fold ("Fold 1", "Fold 2", ...) and one row per
      predicted step.
    - The error table holds the absolute percentage error of every step, computed as
      ``|actual - predicted| / actual * 100``. Average a column to obtain that fold's
      MAPE.
    - The type annotations are string literals, so they are not evaluated when the
      function is defined.

    Examples
    --------
    >>> model = DirectTabularRegressionForecaster(xgb.XGBRegressor(), window_length=504)
    >>> fh = ForecastingHorizon(np.arange(1, 25))
    >>> model_predict(model, fold, fh, "XGBoost_Direct", exogeneous=True)
    >>> model_predict(model, fold, fh, "XGBoost_Direct", exogeneous=False)
    """
    # Create the output directory once, before any fold is processed.
    if not os.path.exists("predictions"):
        os.makedirs("predictions")
        print("Created 'predictions' directory")

    # Exogenous runs get an '_exo' suffix so both variants can share one `name`.
    suffix = "_exo" if exogeneous else ""
    predictions_path = f"predictions/{name}_predictions{suffix}.csv"
    mape_path = f"predictions/{name}_mape{suffix}.csv"

    predictions_df = pd.DataFrame()
    test_df = pd.DataFrame()

    for i, (X_train, X_test, y_train, y_test) in enumerate(fold):
        print(f"Processing fold {i + 1} - {name}")
        column = f"Fold {i + 1}"

        if exogeneous:
            model.fit(y_train, X=X_train, fh=fh)
            y_pred = model.predict(fh, X=X_test)
        else:
            model.fit(y_train, fh=fh)
            y_pred = model.predict(fh)

        # Record predictions AND actuals for both variants. The error table is
        # computed from the two frames, so an unfilled `test_df` would turn
        # every error into NaN.
        predictions_df[column] = y_pred.values
        test_df[column] = y_test.values

        mape_df = np.abs(test_df - predictions_df) / test_df * 100

        # Rewrite both files after every fold (checkpointing).
        predictions_df.to_csv(predictions_path)
        mape_df.to_csv(mape_path)


In [None]:
print("=== FORECASTING MODEL EXAMPLES ===\n")

# One entry per strategy, in run order:
# (label, forecaster, strategy, use case, pros, cons).
# The label is used in the section title, the progress messages and the run names.
strategy_specs = [
    (
        "Direct",
        direct,
        "Direct forecasting (separate model for each horizon step)",
        "When you want independent models for each prediction step",
        "No error propagation, parallel training possible",
        "More models to train, doesn't capture temporal dependencies between steps",
    ),
    (
        "DirRec",
        dirrec,
        "Direct-Recursive hybrid (combines both approaches)",
        "When you want benefits of both direct and recursive methods",
        "Balances accuracy and computational efficiency",
        "More complex than pure direct or recursive",
    ),
    (
        "Recursive",
        recursive,
        "Recursive forecasting (uses previous predictions as input)",
        "When temporal dependencies are important",
        "Captures temporal patterns, single model",
        "Error propagation, sequential prediction",
    ),
    (
        "Multioutput",
        multi,
        "Multi-output regression (single model predicts all steps)",
        "When you want to predict all horizon steps simultaneously",
        "Single model, captures dependencies between steps",
        "Requires multi-output capable base model",
    ),
]

separator = "\n" + "=" * 50

for number, (label, forecaster, strategy, use_case, pros, cons) in enumerate(
    strategy_specs, start=1
):
    # Section banner describing the strategy.
    print(f"{number}. {label} Tabular Regression Forecaster")
    print(f"   - Strategy: {strategy}")
    print(f"   - Use case: {use_case}")
    print(f"   - Pros: {pros}")
    print(f"   - Cons: {cons}\n")

    # Run with exogenous features first, then without.
    print(f"Running {label} model with exogenous features...")
    model_predict(forecaster, fold, fh, f"XGBoost_{label}_Exo", exogeneous=True)

    print(f"Running {label} model without exogenous features...")
    model_predict(forecaster, fold, fh, f"XGBoost_{label}_NoExo", exogeneous=False)

    # Every section but the last follows its separator with an extra blank line.
    print(separator + "\n" if number < len(strategy_specs) else separator)

print("All model examples completed!")
print("Check the 'predictions' directory for results.")


In [None]:
# Baseline: a plain XGBoost regressor fitted directly on X_train / y_train,
# without a forecasting horizon or lagged target values.
print("Training simple XGBoost model...")
simple_model = xgb.XGBRegressor(random_state=42)
simple_model.fit(X_train, y_train)

print("Making predictions...")
predictions = simple_model.predict(X_test)

# Residuals are computed once; every metric below is derived from them.
errors = y_test - predictions
abs_errors = errors.abs()

mape = abs_errors / y_test * 100  # percentage error per test row
mae = abs_errors.mean()
rmse = np.sqrt((errors ** 2).mean())

print("\n=== RESULTS ===")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Square Error (RMSE): {rmse:.4f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape.mean():.4f}%")
print(f"MAPE Range: {mape.min():.4f}% - {mape.max():.4f}%")

# One row per test observation: actual, predicted, signed error, percentage error.
results_df = pd.DataFrame(
    {
        "Actual": y_test.values,
        "Predicted": predictions,
        "Error": errors.values,
        "MAPE": mape.values,
    }
)

print("\n=== SAMPLE PREDICTIONS ===")
print(results_df.head(10))

# Persist the table next to the other prediction files.
output_dir = "predictions"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

results_df.to_csv("predictions/simple_xgboost_results.csv")
print("\nResults saved to: predictions/simple_xgboost_results.csv")