In [1]:
# Data processing
# ==============================================================================
import numpy as np
import pandas as pd

# Plots
# ==============================================================================
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
import plotly.express as px
plt.style.use('fivethirtyeight')
plt.rcParams['lines.linewidth'] = 1.5
%matplotlib inline

# Modelling and Forecasting
# ==============================================================================
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.ForecasterAutoregMultiOutput import ForecasterAutoregMultiOutput
from skforecast.model_selection import grid_search_forecaster
from skforecast.model_selection import backtesting_forecaster

from joblib import dump, load

# Configuration
# ==============================================================================
import warnings
warnings.filterwarnings('ignore')

In [2]:
dx = dx = pd.read_csv(r'/home/nkem/Documents/PhD_Research/allN11Oct2022.csv')
dx['incidentdate'] = pd.to_datetime(dx['incidentdate'])
td = dx.copy()
dk = td.groupby([pd.Grouper(key='incidentdate', freq='M')])['estimatedqty'].agg(['sum','size'])
dk = dk.reset_index()
dk.rename(columns={"sum":"estimatedqty", "size":"spillno"}, inplace=True)

df = dk[["incidentdate","spillno"]]
df = df.set_index("incidentdate")
df = df.drop(index="2022-10-31")

df= df.reset_index()
df["month"] = df["incidentdate"].dt.month
df["year"] = df["incidentdate"].dt.year
dt = df[["incidentdate","spillno","month", "year"]]
dt = dt.set_index("incidentdate")

In [3]:
end_train = '2021-03-31'
start_val = '2021-04-30'
end_validation = '2022-03-31'
start_test = '2022-04-30'

data_train = dt.loc[: end_train, :]
data_val   = dt.loc[start_val:end_validation, :]
data_test  = dt.loc[start_test:, :]

print(f"Train dates      : {data_train.index.min()} --- {data_train.index.max()}")
print(f"Validation dates : {data_val.index.min()} --- {data_val.index.max()}")
print(f"Test dates       : {data_test.index.min()} --- {data_test.index.max()}")

Train dates      : 2005-01-31 00:00:00 --- 2021-03-31 00:00:00
Validation dates : 2021-04-30 00:00:00 --- 2022-03-31 00:00:00
Test dates       : 2022-04-30 00:00:00 --- 2022-09-30 00:00:00


In [4]:
data = dt.copy()

In [5]:
# Select exogenous variables, including those generated by one hot encoding.
exog_variables = [column for column in data.columns
                      if column.startswith(('year', 'month'))]
#exog_variables.extend(['estimatedqty'])
#print(exog_variables)

In [6]:
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score,mean_absolute_percentage_error

# Metrics
metrics = ['mean_squared_error', 'mean_absolute_error', 'mean_absolute_percentage_error']
from sklearn.ensemble import RandomForestRegressor
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.ForecasterAutoregDirect import ForecasterAutoregDirect
from skforecast.model_selection import grid_search_forecaster
from skforecast.model_selection import  random_search_forecaster
from skforecast.model_selection import  bayesian_search_forecaster
from sklearn.metrics import mean_squared_error

In [23]:
# Create forecaster
# ==============================================================================
forecaster = ForecasterAutoreg(
                regressor = LGBMRegressor(random_state=123),
                lags = 24
             )
#Lags used as predictors
lags_grid = [1,2,3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
forecaster

ForecasterAutoreg 
Regressor: LGBMRegressor(random_state=123) 
Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
Transformer for y: None 
Transformer for exog: None 
Window size: 24 
Included exogenous: False 
Type of exogenous variable: None 
Exogenous variables names: None 
Training range: None 
Training index type: None 
Training index frequency: None 
Regressor parameters: {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.1, 'max_depth': -1, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': 123, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': 'warn', 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0} 
Creation date: 2022-12-13 10:03:29 
Last fit date: None 
Skforecast version: 0.5.0 
Python version: 3.10.6 

In [24]:
# Regressor hyperparameters search space
def search_space(trial):
    search_space  = {'n_estimators'     : trial.suggest_int('n_estimators', 10, 5000),
                     'max_depth'        : trial.suggest_int('max_depth', 1, 100),
                     'learning_rate'    : trial.suggest_float('learning_rate', 0.001, 0.1),
                     'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
                    'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
                    'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
                    'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
                    'num_leaves' : trial.suggest_int('num_leaves', 1, 1000),
                    'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
                    } 
    return search_space

results, frozen_trial = bayesian_search_forecaster(
                            forecaster            = forecaster,
                            y                     = data.loc[:end_validation, 'spillno'],  #data_train = data.loc[: end_train, :]
                            exog                  = data.loc[:end_validation, exog_variables],
                            lags_grid             = lags_grid,
                            search_space          = search_space,
                            steps                 = 6,
                            metric                = metrics,
                            refit                 = True,
                            initial_train_size    = len(data.loc[:end_train]),   
                            fixed_train_size      = False,
                            n_trials              = 10,
                            random_state          = 123,
                            return_best           = False,
                            verbose               = False,
                            engine                = 'optuna',
                            kwargs_create_study   = {},
                            kwargs_study_optimize = {}
                        )

results

Number of models compared: 120,
         10 bayesian search in each lag configuration.


loop lags_grid: 100%|█████████████████████████████████████| 12/12 [00:54<00:00,  4.55s/it]


Unnamed: 0,lags,params,mean_squared_error,mean_absolute_error,mean_absolute_percentage_error,n_estimators,max_depth,learning_rate,reg_alpha,reg_lambda,colsample_bytree,subsample,num_leaves,min_child_samples
9,[1],"{'n_estimators': 2915, 'max_depth': 21, 'learn...",49.401271,6.058492,0.253865,2915.0,21.0,0.072058,0.032805,0.471559,0.6,0.5,662.0,15.0
11,"[1, 2]","{'n_estimators': 4249, 'max_depth': 73, 'learn...",53.079062,6.341195,0.288627,4249.0,73.0,0.061491,0.775842,0.019581,0.6,0.8,624.0,35.0
1,[1],"{'n_estimators': 4249, 'max_depth': 73, 'learn...",59.250935,6.279003,0.259402,4249.0,73.0,0.061491,0.775842,0.019581,0.6,0.8,624.0,35.0
21,"[1, 2, 3]","{'n_estimators': 4249, 'max_depth': 73, 'learn...",60.120549,6.404949,0.265478,4249.0,73.0,0.061491,0.775842,0.019581,0.6,0.8,624.0,35.0
29,"[1, 2, 3]","{'n_estimators': 2915, 'max_depth': 21, 'learn...",87.176290,7.046878,0.338363,2915.0,21.0,0.072058,0.032805,0.471559,0.6,0.5,662.0,15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]","{'n_estimators': 3377, 'max_depth': 85, 'learn...",1250.233235,34.788425,1.628003,3377.0,85.0,0.009236,1.134309,0.009433,0.6,0.7,555.0,117.0
114,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]","{'n_estimators': 4627, 'max_depth': 85, 'learn...",1250.233235,34.788425,1.628003,4627.0,85.0,0.036382,0.001494,0.016560,0.5,0.8,131.0,97.0
115,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]","{'n_estimators': 3311, 'max_depth': 85, 'learn...",1250.233235,34.788425,1.628003,3311.0,85.0,0.055772,2.617038,0.034622,0.6,0.5,902.0,296.0
116,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]","{'n_estimators': 1295, 'max_depth': 57, 'learn...",1250.233235,34.788425,1.628003,1295.0,57.0,0.080890,0.037799,0.840025,0.6,0.7,754.0,223.0


In [25]:
# Backtesting
# ==============================================================================
metric, predictions = backtesting_forecaster(
                            forecaster         = forecaster,
                            y                  = data['spillno'],
                            exog               = data[exog_variables],
                            initial_train_size = len(data.loc[:end_validation]),
                            fixed_train_size   = False,
                            steps              = 6,
                            refit              = True,
                            metric             = metrics,
                            verbose            = False
                      )

print(f"Backtest error: {metric}")

Backtest error: [484.11411038981555, 19.318060549594243, 0.5557008743680044]


In [26]:
# Predictions
# ==============================================================================
predictions = forecaster.predict(steps=6)


Exception: This Forecaster instance is not fitted yet. Call `fit` with appropriate arguments before using predict.