In [None]:
# Data processing
# ==============================================================================
import numpy as np
import pandas as pd

# Plots
# ==============================================================================
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
import plotly.express as px
plt.style.use('fivethirtyeight')
plt.rcParams['lines.linewidth'] = 1.5
%matplotlib inline

# Modelling and Forecasting
# ==============================================================================
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.ForecasterAutoregMultiOutput import ForecasterAutoregMultiOutput
from skforecast.model_selection import grid_search_forecaster
from skforecast.model_selection import backtesting_forecaster

from joblib import dump, load

# Configuration
# ==============================================================================
import warnings
warnings.filterwarnings('ignore')
%config Completer.use_jedi = False

In [None]:
dx = pd.read_csv(r'/home/nkem/Documents/PhD_Research/dlam.csv')
dx['incidentdate'] = pd.to_datetime(dx['incidentdate'])
#df = dx[["incidentdate","spillno"]]
data = dx.copy()
data = data.set_index("incidentdate")

In [None]:
# split the data into a train dataframe and X_test and y_test dataframes, where the number of samples for test is equal to
# the number of periods the user wants to predict
end_train = '2021-01-31'
end_validation = '2021-07-31'

data_train = data.loc[: end_train, :]
data_val   = data.loc[end_train:end_validation, :]
data_test  = data.loc[end_validation:, :]

In [None]:
# Create forecaster
# ==============================================================================
forecaster = ForecasterAutoreg(
                regressor = XGBRegressor(random_state=123),
                lags = 24
                )

forecaster

ForecasterAutoreg 
Regressor: XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, gamma=None,
             gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
             max_leaves=None, min_child_weight=None, missing=nan,
             monotone_constraints=None, n_estimators=100, n_jobs=None,
             num_parallel_tree=None, predictor=None, random_state=123,
             reg_alpha=None, reg_lambda=None, ...) 
Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
Window size: 24 
Included exogenous: False 
Type of exogenous variable: None 
Exogenous variables names: None 
Training range: None 
Training index type: None 

In [62]:
# Grid search of hyperparameters and lags
# ==============================================================================
# Regressor hyperparameters
param_grid = {
    'n_estimators': [100, 1000],
    'max_depth': [3, 5, 10],
    'learning_rate': [0.01, 0.1]
    }

# Lags used as predictors
lags_grid = [1, 2, 3, [1, 2, 3, 4, 12, 24, 48, 71, 72, 73]]


results_grid = grid_search_forecaster(
        forecaster         = forecaster,
        y                  = data.loc[:end_validation, 'spillno'], # Train and validation data
        param_grid         = param_grid,
        lags_grid          = lags_grid,
        steps              = 3,
        refit              = False,
        metric             = 'mean_absolute_error',
        initial_train_size = int(len(data_train)), # Model is trained with trainign data
        fixed_train_size   = False,
        return_best        = True,
        verbose            = False
        )

Number of models compared: 48


loop lags_grid: 100%|███████████████████████████████████████| 4/4 [01:55<00:00, 28.84s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [1] 
  Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
  Backtesting metric: 5.37440808614095



In [63]:
# Backtesting
# ==============================================================================
metric, predictions = backtesting_forecaster(
    forecaster = forecaster,
    y          = data['spillno'],
    initial_train_size = len(data.loc[:end_validation]),
    fixed_train_size   = False,
    steps      = 3,
    refit      = False,
    metric     = 'mean_absolute_error',
    verbose    = False # Change to True to see detailed information
    )

print(f"Backtest error: {metric}")

Backtest error: 6.522022438049317


In [64]:
predictions

Unnamed: 0,pred
1,17.455532
2,17.455532
3,17.455532
1,13.754084
2,7.091336


In [65]:
data

Unnamed: 0_level_0,estimatedqty,spillno,year,month
incidentdate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2005-01-31,100.05700,3,2005,January
2005-02-28,0.03000,1,2005,February
2005-03-31,3.06000,2,2005,March
2005-04-30,820.30040,32,2005,April
2005-05-31,59.00000,2,2005,May
...,...,...,...,...
2021-08-31,205.20000,14,2021,August
2021-09-30,13177.78000,21,2021,September
2021-10-31,3888.33140,15,2021,October
2021-11-30,129.51400,17,2021,November


In [66]:
# Store categorical variables as category type
# ==============================================================================
#data['estimatedqty'] = data['estimatedty'].astype('category')
data['month']   = data['month'].astype('category')
data['year'] = data['year'].astype('category')
data['year'] = data['year'].astype('category')
#data['day'] = data['day'].astype('category')

In [67]:
# One hot encoding
# ==============================================================================
data = pd.get_dummies(data, columns=['year', 'month'])
data.head(3)

Unnamed: 0_level_0,estimatedqty,spillno,year_2005,year_2006,year_2007,year_2008,year_2009,year_2010,year_2011,year_2012,...,month_December,month_February,month_January,month_July,month_June,month_March,month_May,month_November,month_October,month_September
incidentdate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-01-31,100.057,3,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2005-02-28,0.03,1,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2005-03-31,3.06,2,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [68]:
# Select exogenous variables, including those generated by one hot encoding.
exog_variables = [column for column in data.columns
                      if column.startswith(('year', 'month'))]
#exog_variables.extend(['estimatedqty'])
print(exog_variables)

['year_2005', 'year_2006', 'year_2007', 'year_2008', 'year_2009', 'year_2010', 'year_2011', 'year_2012', 'year_2013', 'year_2014', 'year_2015', 'year_2016', 'year_2017', 'year_2018', 'year_2019', 'year_2020', 'year_2021', 'month_April', 'month_August', 'month_December', 'month_February', 'month_January', 'month_July', 'month_June', 'month_March', 'month_May', 'month_November', 'month_October', 'month_September']


In [69]:
# Since data has been transformed, the train, val and test split is repeated.
data_train = data.loc[: end_train, :]
data_val   = data.loc[end_train:end_validation, :]
data_test  = data.loc[end_validation:, :]

In [70]:
# Create forecaster
# ==============================================================================
forecaster = ForecasterAutoreg(
                regressor = XGBRegressor(random_state=123),
                lags = 3
                )

In [71]:
# Grid search of hyperparameters and lags
# ==============================================================================
# Regressor hyperparameters
param_grid = {
    'n_estimators': [100, 1000],
    'max_depth': [3, 5, 10],
    'learning_rate': [0.01, 0.1]
    }

# Lags used as predictors
lags_grid = [1, 2, 3, [1, 2, 3, 4, 12, 24, 48, 71, 72, 73]]

results_grid = grid_search_forecaster(
                        forecaster         = forecaster,
                        y                  = data.loc[:end_validation, 'spillno'],
                        exog               = data.loc[:end_validation, exog_variables],
                        param_grid         = param_grid,
                        lags_grid          = lags_grid,
                        steps              = 36,
                        refit              = False,
                        metric             = 'mean_absolute_error',
                        initial_train_size = int(len(data_train)),
                        fixed_train_size   = False,
                        return_best        = True,
                        verbose            = False
                        )

Number of models compared: 48


loop lags_grid: 100%|███████████████████████████████████████| 4/4 [01:55<00:00, 28.81s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [1] 
  Parameters: {'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 100}
  Backtesting metric: 3.268744468688965



In [72]:
# Backtesting
# ==============================================================================
metric, predictions = backtesting_forecaster(
    forecaster = forecaster,
    y          = data['spillno'],
    initial_train_size = len(data.loc[:end_validation]),
    fixed_train_size   = False,
    steps      = 3,
    refit      = False,
    metric     = 'mean_absolute_error',
    verbose    = False # Change to True to see detailed information
    )

print(f"Backtest error: {metric}")

Backtest error: 9.468829345703124


In [73]:
predictions

Unnamed: 0,pred
1,16.443964
2,13.186886
3,4.363023
1,13.186886
2,4.363023


In [78]:
from skforecast.model_selection import random_search_forecaster

ImportError: cannot import name 'random_search_forecaster' from 'skforecast.model_selection' (/home/nkem/.local/lib/python3.8/site-packages/skforecast/model_selection/__init__.py)

In [77]:
# Grid search of hyperparameters and lags
# ==============================================================================
# Regressor hyperparameters
param_grid = {
    'n_estimators': range(10, 1000, 10),
    'max_depth': range(1, 10),
    'learning_rate': [0.01, 0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1]

    }

# Lags used as predictors
lags_grid = range(1, 24)

results_grid = grid_search_forecaster(
                        forecaster         = forecaster,
                        y                  = data.loc[:end_validation, 'spillno'],
                        exog               = data.loc[:end_validation, exog_variables],
                        param_grid         = param_grid,
                        lags_grid          = lags_grid,
                        steps              = 3,
                        refit              = False,
                        metric             = 'mean_absolute_error',
                        initial_train_size = int(len(data_train)),
                        fixed_train_size   = False,
                        return_best        = True,
                        verbose            = False
                        )


TypeError: __init__() got an unexpected keyword argument 'forecaster'