# PyCaret is an open-source, low-code machine learning library in Python that automates machine learning workflows.

In [23]:
# Importing some of the required packages
from pycaret.time_series import *
import pandas as pd
import numpy as np

In [24]:
# Loading the cleaned dataset. Cleaning was done in a previous notebook
dx = pd.read_csv(r'/home/nkem/Documents/PhD_Research/allN11Oct2022.csv')
dx.head()

Unnamed: 0,incidentdate,year,month,day,company,contaminant,spillareahabitat,cause,estimatedqty
0,2005-01-05,2005,1,5,MPN,cr,of,ome,0.0568
1,2005-01-08,2005,1,8,MPN,cr,of,eqf,0.0002
2,2005-01-31,2005,1,31,NAOC,cr,la,cor,100.0
3,2005-02-08,2005,2,8,MPN,cr,of,eqf,0.03
4,2005-03-08,2005,3,8,MPN,cr,of,eqf,3.0


In [25]:
# Changing the incidentdate column to pandas datetime
dx['incidentdate'] = pd.to_datetime(dx['incidentdate'])
td = dx.copy()

# Downsampling from daily timeframe to the monthly timeframe
dk = td.groupby([pd.Grouper(key='incidentdate', freq='M')])['estimatedqty'].agg(['sum','size'])
dk = dk.reset_index()

# Renaming the columns
dk.rename(columns={"sum":"estimatedqty", "size":"spillno"}, inplace=True)

# Filtering out the estimatedqty column and dropping the last incomplete entry
df = dk[["incidentdate","spillno"]]
df = df.set_index("incidentdate")
df = df.drop(index="2022-10-31")

# Feauture enginneering - creating month and year column from the incidentdate column
df= df.reset_index()
df["month"] = df["incidentdate"].dt.month
df["year"] = df["incidentdate"].dt.year
dt = df[["incidentdate","spillno","month", "year"]]
dt = dt.set_index("incidentdate")
dt.tail()

Unnamed: 0_level_0,spillno,month,year
incidentdate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-05-31,22,5,2022
2022-06-30,48,6,2022
2022-07-31,20,7,2022
2022-08-31,60,8,2022
2022-09-30,38,9,2022


In [29]:
data = dt.copy()
# data split into train & test sets
end_train = '2022-03-31'
start_test = '2022-04-30'
end_test = '2022-06-30'

data_train = dt.loc[: end_train, :]
data_test  = dt.loc[start_test: end_test]

print(f"Train dates      : {data_train.index.min()} --- {data_train.index.max()}")
print(f"Test dates       : {data_test.index.min()} --- {data_test.index.max()}")

Train dates      : 2005-01-31 00:00:00 --- 2022-03-31 00:00:00
Test dates       : 2022-04-30 00:00:00 --- 2022-06-30 00:00:00


In [31]:
# Select exogenous variables, including those generated by one hot encoding.
exog_variables = [column for column in dt.columns
                      if column.startswith(('year', 'month'))]
#exog_variables.extend(['estimatedqty'])
#print(exog_variables)

In [35]:
# For datasets with exogenous variables, a Pycaret experiment demands we explicitly specify the target variable. The Pycaret package helps to divide datasets into training and test datasets so there will not be need for the division above.

target = "spillno"
exog_vars = ['year', 'month']
include = [target] + exog_vars
data = data[include]
data.tail()

Unnamed: 0_level_0,spillno,year,month
incidentdate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-05-31,22,2022,5
2022-06-30,48,2022,6
2022-07-31,20,2022,7
2022-08-31,60,2022,8
2022-09-30,38,2022,9


In [37]:
# Forecasting horizon is 3 months
FH= 3
metric = "rmse"

In [38]:
# Global figure settings for notebook
fig_kwargs = {"renderer": "notebook", "width": 1000, "height": 600}

In [39]:
# Pycaret experiment design
 
exp_auto = TSForecastingExperiment()

# enforce_exogenous=False --> Use multivariate forecasting when model supports it, else use univariate forecasting
exp_auto.setup(data=data, target=target, fh=FH, enforce_exogenous=False,fig_kwargs=fig_kwargs, session_id=24,use_gpu=True)

Unnamed: 0,Description,Value
0,session_id,24
1,Target,spillno
2,Approach,Univariate
3,Exogenous Variables,Present
4,Original data shape,"(213, 3)"
5,Transformed data shape,"(213, 3)"
6,Transformed train set shape,"(210, 3)"
7,Transformed test set shape,"(3, 3)"
8,Rows with missing values,0.0%
9,Fold Generator,ExpandingWindowSplitter


[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with

<pycaret.time_series.forecasting.oop.TSForecastingExperiment at 0x7f97d5a122d0>