In [1]:
import os
from joblib import dump, load
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as plt
from tscv import GapRollForward
from tqdm.notebook import tqdm

In [6]:
data = pd.read_csv('../data/intra_day/SA_preprocessed.csv')
data['year'] = data['dt'].str[:4]
fitdata = data[data['year'] == '2022']
testdata = data[data['year'] == '2023']
fitdata.shape, testdata.shape

(74880, 23)

In [3]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from patsy import dmatrix

# Create basis functions for the smooth terms
knots_wtdtemp = np.linspace(min(fitdata['WtdTemp']), max(fitdata['WtdTemp']))
knots_year = np.linspace(0, max(fitdata['Year']))

def make_X(dataframe):
    basis_dsttime = dmatrix("cc(DSTTime, df=13)", {"DSTTime": dataframe['DSTTime']})
    basis_wtdtemp = dmatrix("bs(WtdTemp, knots=knots_wtdtemp)", {"WtdTemp": dataframe['WtdTemp']})
    basis_year = dmatrix("bs(Year, knots=knots_year)", {"Year": dataframe['Year']})
    return np.column_stack((basis_dsttime, basis_wtdtemp, basis_year))

# Fit the model
model = sm.GLM(fitdata['Demand'], make_X(fitdata))
result = model.fit()

result.summary().tables[0]

0,1,2,3
Dep. Variable:,Demand,No. Observations:,17520.0
Model:,GLM,Df Residuals:,17404.0
Model Family:,Gaussian,Df Model:,115.0
Link Function:,Identity,Scale:,42448.0
Method:,IRLS,Log-Likelihood:,-118150.0
Date:,"Wed, 06 Sep 2023",Deviance:,738850000.0
Time:,14:01:55,Pearson chi2:,739000000.0
No. Iterations:,3,Pseudo R-squ. (CS):,0.9128
Covariance Type:,nonrobust,,


In [4]:
testdata.loc[84558, 'DSTTime'] = 0
testdata.loc[testdata['WtdTemp'] <= testdata['WtdTemp'].min(), 'WtdTemp'] = 6.7
testdata.loc[testdata['WtdTemp'] >= testdata['WtdTemp'].max(), 'WtdTemp'] = 38.3
pred2022 = result.predict(make_X(fitdata))
true2022 = fitdata['Demand']
pred2023 = result.predict(make_X(testdata))
true2023 = testdata['Demand']
def rsq(t, p): return 1 - np.sum((t - p)**2) / np.sum((t - np.mean(t))**2)
def rmse(true, pred): return np.sqrt(np.mean((true - pred)**2))
def mape(true, pred): return np.mean(np.abs((true - pred) / true))
def mae(true, pred): return np.mean(np.abs(true - pred))
def allmetrics(t, p, title):
    print(title)
    print('MAPE:', mape(t, p))
    print('R^2:', rsq(t, p))
    print('MAE:', mae(t, p))
    print('RMSE:', rmse(t, p))
allmetrics(true2022, pred2022, 'gam train')
allmetrics(true2023, pred2023, 'gam test')

gam train
MAPE: 0.1504096627176814
R^2: 0.7106383330855464
MAE: 153.10242820069263
RMSE: 205.35792582810714
gam test
MAPE: 0.27295909154558196
R^2: 0.4604802762026262
MAE: 249.84260618943233
RMSE: 320.0655893082919
