# Time Series Analysis with SARIMAX


Here we analyze 4 years of hourly energy consumption to find trends in energy consumption around hour of the day, day of the week, season of the year, etc. and also to examine factors like outside temperature and solar installations. The goal is to build a model that predicts the energy consumption given parameters like day of the week, time of the day, season, holiday, local weather, solar installation, etc.

The energy consumption values can also be expected to depend on their previous lagged values, because the energy consumption of a region shouldn’t be expected to change much in the next few hours except for any unexpected or unfortunate events. So we will add the lagged values of energy consumption as the X parameters and check if we can predict better using the past values (in addition to the variables that we had already added).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import r2_score

from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller,kpss
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import acf
import statsmodels.api as sm
from pmdarima.arima import auto_arima
from statsmodels.tsa.statespace.sarimax import SARIMAX

from green_city.utils import span
from green_city.utils import datetime2index, index2datetime
from green_city.plotting import plot_decomposition

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation / unknown-argument warnings
# emitted by the modelling libraries - consider narrowing the filter.
warnings.filterwarnings('ignore')

# Random seed constant; not referenced in the visible part of this notebook.
RSEED = 42

In [None]:
## MLFLOW ##
# Optional experiment tracking: point MLflow at the tracking server and
# experiment named in the project configuration. If the package or the
# configuration is unavailable the notebook carries on without tracking.
try:
    import mlflow
    from green_city.mlflow_config import get_mlflow_config

    flow_conf = get_mlflow_config()
    tracking_uri = flow_conf["TRACKING_URI"]
    mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment(flow_conf["EXPERIMENT_NAME"])
except Exception as err:
    # Best effort on purpose, but catch `Exception` rather than using a bare
    # `except:` so KeyboardInterrupt / SystemExit still propagate, and report
    # why tracking is disabled instead of hiding the cause.
    print("mlflow ui not active")
    print(f"  reason: {type(err).__name__}: {err}")

In [None]:
# ## DB CONNECTION ##
# from sqlalchemy import create_engine
# from decouple import Config, RepositoryEnv

# config = Config(RepositoryEnv("../.db_credentials"))

# db_connection_credentials = {
#     "database": config('POSTGRES_DB'),
#     "user": config('POSTGRES_USER'),
#     "password": config('POSTGRES_PASSWORD'),
#     "host": config('POSTGRES_HOST'),
#     "port": config('POSTGRES_PORT'),
# }
# DB_STRING = "postgresql://{user}:{password}@{host}:{port}/{database}".format(**db_connection_credentials)
# db = create_engine(DB_STRING)

In [None]:
# Which building's data is analysed and which column is the forecast target.
building_nr = 5
column_to_predict = "net_load_kW"

# Notebook-wide parameters that stay fixed across experiments; they are
# meant to be logged to the mlflow server with every run.
global_params = {}
global_params["building nr"] = building_nr
global_params["predicted feature"] = column_to_predict
global_params["resolution"] = "daily"

In [None]:
# Load data
# Parse the 'datetime' column while reading and use it as the index.
# The previous `.astype({'datetime': 'datetime64'})` relied on a unit-less
# dtype, which recent pandas releases reject.
df = pd.read_csv(
    f"../data/preprocessed/Building_{building_nr}.csv",
    parse_dates=['datetime'],
    index_col='datetime',
)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Target series, plus a chronological split of the target column that keeps
# the final 365 * 24 rows (the last year of hourly records) as test data.
y = df['net_load_kW']
split_pos = len(df) - 365*24
target_frame = df[['net_load_kW']]
df_train, df_test = target_frame.iloc[:split_pos], target_frame.iloc[split_pos:]

In [None]:
# Plot train and test data
fig, ax = plt.subplots(figsize=(12,4))
# Plot the Series rather than one-column DataFrames: a DataFrame plot labels
# its lines with the column name, so `label=` was ignored and both legend
# entries read 'net_load_kW'.
df_train['net_load_kW'].plot(ax=ax, label='Train data')
df_test['net_load_kW'].plot(ax=ax, label='Test data')
plt.legend()
plt.show()

In [None]:
# Chronological split of the target series.
holdout_start = len(y) - 365*24
y_to_train, y_to_test = y.iloc[:holdout_start], y.iloc[holdout_start:]  # last year for testing

In [None]:
# Draw the train and test parts of the target on one shared axis.
fig, ax = plt.subplots(figsize=(12,4))
for series in (y_to_train, y_to_test):
    series.plot(ax=ax)
plt.show()

### Seasonal decomposition (one year period)

In [None]:
# Additive decomposition of the training series with a one-year period
# (24 * 365 hourly observations per cycle).
# NOTE(review): `extrapolate_trend=True` is passed as a boolean; confirm
# against the statsmodels docs whether an int or 'freq' was intended.
decomposition_options = dict(
    model='additive',
    two_sided=True,
    extrapolate_trend=True,
    period=24*365,
)
decomp = seasonal_decompose(x=y_to_train, **decomposition_options)

plot_decomposition(decomp)

### Multiple seasonal decomposition with MSTL

In [None]:
# mstl = MSTL(y_to_train, periods=(24, 24*7, 24*365), stl_kwargs={"seasonal_deg": 0})
# res = mstl.fit() 

In [None]:
# # Start with the plot from the results object `res`
# plt.rc("figure", figsize=(10, 14))
# plt.rc("font", size=13)
# fig = res.plot()

### Stationarity test on first difference of train data

In [None]:
# stationarity_test(y_to_train.diff().dropna())

### ACF and PACF plots

In [None]:
# ACF (top) and PACF (bottom) of the training series, lags 1..50.
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(12,8))
for draw_correlogram, axis in zip((plot_acf, plot_pacf), ax):
    fig = draw_correlogram(y_to_train, lags=50, zero=False, ax=axis)
plt.show()

In [None]:
# ACF (top) and PACF (bottom) of the first-differenced training series.
first_difference = y_to_train.diff().dropna()
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(12,8))
for draw_correlogram, axis in zip((plot_acf, plot_pacf), ax):
    fig = draw_correlogram(first_difference, lags=50, zero=False, ax=axis)
plt.show()

In [None]:
# ACF (top) and PACF (bottom) after a first difference followed by a
# lag-24 (daily) seasonal difference.
seasonally_differenced = y_to_train.diff().dropna().diff(24).dropna()
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(12,8))
for draw_correlogram, axis in zip((plot_acf, plot_pacf), ax):
    fig = draw_correlogram(seasonally_differenced, lags=50, zero=False, ax=axis)
plt.show()

### Add fourier terms

In [None]:
# Yearly, weekly and daily seasonality as exogenous variables for SARIMAX model
# The helper 'date' column only exists to build the hourly PeriodIndex and is
# dropped again straight away.
exog = pd.DataFrame({'date': y.index})
exog = exog.set_index(pd.PeriodIndex(exog['date'], freq='H')).drop(columns=['date'])

# (column prefix, position inside the cycle, cycle length).
# NOTE(review): every column keeps the historical '365' suffix, also for the
# weekly and hourly terms; the names are left unchanged on purpose.
seasonal_cycles = (
    ('year', exog.index.dayofyear, 365),
    ('week', exog.index.dayofweek, 7),
    ('hour', exog.index.hour, 24),
)
for prefix, position, cycle_length in seasonal_cycles:
    # First and second harmonic of each cycle.
    for harmonic, suffix in ((1, ''), (2, '_2')):
        angle = 2 * harmonic * np.pi * position / cycle_length
        exog[f'{prefix}_sin365{suffix}'] = np.sin(angle)
        exog[f'{prefix}_cos365{suffix}'] = np.cos(angle)


### Auto_ARIMA to determine inputs for SARIMAX

Since we represent the daily (hour-of-day), weekly and yearly seasonality as Fourier terms, we fit the auto_arima model on only the first three months of the data.

In [None]:
# Restrict the order search to the first three months of data.
fit_window = slice('2008-01-02', '2008-03-31')
y_to_train = y.loc[fit_window]
exog_to_train = exog.loc[fit_window]

# NOTE(review): `njobs` is forwarded as written; pmdarima's documented
# parallelism argument appears to be `n_jobs` - confirm against the docs.
# NOTE(review): newer pmdarima releases name the regressor argument `X`
# instead of `exogenous` - confirm for the installed version.
search_options = dict(
    start_p=1, start_q=1,
    max_p=2, max_q=2, d=1,
    m=24, exogenous=exog_to_train,
    seasonal=True, stationary=False,
    information_criterion='aic',
    stepwise=True, njobs=-1, trace=True,
    error_action='ignore',
    suppress_warnings=False,
)
model1 = auto_arima(y=y_to_train, **search_options)

Running auto_arima, the best model that was found was:

```python
Best model:  ARIMA(6,1,1)(0,0,2)[24] intercept
```

However for a similar dataset, [this notebook](https://nbviewer.org/github/pratha19/Springboard_capstone_project_1/blob/master/SDGE_energy_ML.ipynb#6.3) found the best model as:

```python
Best model:  ARIMA(1,1,2)(0,0,2)[24] intercept
```

We will try both models for the SARIMAX model.

## SARIMAX

### SARIMAX (6,1,1)(0,0,2,24) model on train data

In [None]:
# Fit model on train data
# Hold out the final 365 * 24 rows (the last year) for testing.
split_pos = len(y) - 365*24
y_to_train = y.iloc[:split_pos]
y_to_test = y.iloc[split_pos:]

# Seasonality as exogenous variables
exog_to_train = exog.iloc[:split_pos]
exog_to_test = exog.iloc[split_pos:]

# statsmodels' SARIMAX takes its regressors through `exog`; the former
# `exogenous=` keyword is not a SARIMAX parameter, so the Fourier terms were
# never part of the model. The regressors are re-labelled with the endog
# index because statsmodels requires both inputs to share the same index
# (exog carries a PeriodIndex, the target a DatetimeIndex).
model_train = SARIMAX(
    df_train['net_load_kW'],
    order=(6, 1, 1),
    seasonal_order=(0, 0, 2, 24),
    exog=exog_to_train.set_axis(df_train.index, axis=0),
)

In [None]:
# Estimate the parameters of `model_train`; `results` holds the fitted model.
results = model_train.fit()

In [None]:
# Show the summary table of the fitted model.
results.summary()

In [None]:
# Residual diagnostics of the fitted model; the return value is bound to `_`,
# presumably to keep the notebook from echoing the figure object.
_ = results.plot_diagnostics(figsize=(12, 7))

In [None]:
# In-sample predictions of the fitted model next to the observed training load.
y_train = df_train['net_load_kW']
y_pred_train = results.predict()

# Compare 24-step rolling means so the hourly noise does not hide the overall fit.
fig, ax = plt.subplots(figsize=(12,4))
for series in (y_pred_train, y_train):
    series.rolling(24).mean().plot(ax=ax)

# Error metrics on the training period.
r2 = r2_score(y_train, y_pred_train)
mse = mean_squared_error(y_train, y_pred_train)
mae = mean_absolute_error(y_train, y_pred_train)
print(f"Mean Absolute Error (SARIMAX) = {mae.round(3)}")
print(f"Mean Squared Error (SARIMAX) = {mse.round(3)}")
print(f"R2 score (SARIMAX) = {r2.round(3)}")

In [None]:
# Examine the prediction for a few days
first_days = slice('2008-01-02', '2008-01-06')
fig, ax = plt.subplots(figsize=(12,4))
for series in (y_pred_train, y_train):
    series.loc[first_days].plot(ax=ax)
plt.legend()

### SARIMAX (6,1,1)(0,0,2,24) forecast for 8 randomly chosen dates

In [None]:
# Forecast origins: each entry is the last observed hour before a 24-hour
# forecast.
days_to_forecast = pd.to_datetime(['2011-09-01 23:00:00', '2011-10-20 23:00:00',
                                   '2011-01-09 06:00:00', '2011-10-22 21:00:00',
                                   '2011-06-21 03:00:00', '2011-07-08 02:00:00',
                                   '2011-08-19 00:00:00', '2011-04-15 07:00:00'])

predictions = []
for day in days_to_forecast:
    # Training window: the 24 months ending at the forecast origin (inclusive).
    # Forecast horizon: the 24 rows that follow the origin.
    # Boolean masks on df.index select the same rows from the target and from
    # exog regardless of their differing index types.
    in_window = (df.index >= day + pd.DateOffset(months=-24)) & (df.index <= day)
    in_horizon = (df.index > day) & (df.index <= day + pd.DateOffset(hours=24))

    y_to_train = df['net_load_kW'].loc[in_window]
    # statsmodels requires endog and exog to share one index.
    exog_to_train = exog.loc[in_window].set_axis(y_to_train.index, axis=0)
    exog_future = exog.loc[in_horizon].to_numpy()

    # `exog` is the SARIMAX keyword for regressors; the former `exogenous=`
    # is not a SARIMAX parameter, so the Fourier terms were never used.
    model = SARIMAX(y_to_train, order=(6, 1, 1), seasonal_order=(0, 0, 2, 24), exog=exog_to_train)
    model_fit = model.fit()
    # A model fitted with regressors needs their future values to forecast.
    output = model_fit.forecast(steps=24, exog=exog_future)
    predictions.append(output)

In [None]:

for day_to_plot, forecast_origin in enumerate(days_to_forecast):

    # Starting the MLFlow run
    #r = mlflow.start_run(run_name="sarimax_(6,1,1)(0,0,2,24)")
    #print("run-uuid:", r.info.run_uuid)

    # Forecast and the observed load for the 24 hours after the origin.
    y_pred = predictions[day_to_plot]
    horizon_start = forecast_origin + pd.DateOffset(hours=1)
    horizon_end = forecast_origin + pd.DateOffset(hours=24)
    y_test = df_test['net_load_kW'].loc[horizon_start:horizon_end]

    fig, ax = plt.subplots(figsize=(12,4))
    y_pred.plot()
    y_test.plot()
    plt.legend()
    plt.show()

    # Error metrics for this forecast window.
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    print(f"Mean Absolute Error (SARIMAX) = {mae.round(3)}")
    print(f"Mean Squared Error (SARIMAX) = {mse.round(3)}")
    print(f"R2 score (SARIMAX) = {r2.round(3)}")

    ##########
    # MLFLOW #
    ##########
    # for k, v in global_params.items():
    #     mlflow.log_param(k, v)
    #     mlflow.log_param("model", "sarimax")
    #     mlflow.log_param("datetime", days_to_forecast[day_to_plot])
    #     mlflow.log_param("feature", "net_load_kW")
    #     mlflow.log_metric("mse", mse)
    #     mlflow.log_metric("mae", mae)
    #     mlflow.log_metric("r2_score", r2)
    # mlflow.end_run()

### SARIMAX(2, 1, 1)(1, 0, 1, 24) model on train data

In [None]:
# Fit model on train data
# Hold out the final 365 * 24 rows (the last year) for testing.
split_pos = len(y) - 365*24
y_to_train = y.iloc[:split_pos]
y_to_test = y.iloc[split_pos:]

# Seasonality as exogenous variables
exog_to_train = exog.iloc[:split_pos]
exog_to_test = exog.iloc[split_pos:]

# statsmodels' SARIMAX takes its regressors through `exog`; the former
# `exogenous=` keyword is not a SARIMAX parameter, so the Fourier terms were
# never part of the model. The regressors are re-labelled with the endog
# index because statsmodels requires both inputs to share the same index
# (exog carries a PeriodIndex, the target a DatetimeIndex).
model_train2 = SARIMAX(
    df_train['net_load_kW'],
    order=(2, 1, 1),
    seasonal_order=(1, 0, 1, 24),
    exog=exog_to_train.set_axis(df_train.index, axis=0),
)

In [None]:
# Estimate the parameters of `model_train2`; `results2` holds the fitted model.
results2 = model_train2.fit()

In [None]:
# Show the summary table of the second fitted model.
results2.summary()

In [None]:
# Residual diagnostics of the second fitted model; the return value is bound
# to `_`, presumably to keep the notebook from echoing the figure object.
_ = results2.plot_diagnostics(figsize=(12, 7))

In [None]:
# In-sample predictions of the second model next to the observed training load.
y_train = df_train['net_load_kW']
y_pred_train = results2.predict()

# Compare 24-step rolling means so the hourly noise does not hide the overall fit.
fig, ax = plt.subplots(figsize=(12,4))
for series in (y_pred_train, y_train):
    series.rolling(24).mean().plot(ax=ax)

# Error metrics on the training period.
r2 = r2_score(y_train, y_pred_train)
mse = mean_squared_error(y_train, y_pred_train)
mae = mean_absolute_error(y_train, y_pred_train)
print(f"Mean Absolute Error (SARIMAX) = {mae.round(3)}")
print(f"Mean Squared Error (SARIMAX) = {mse.round(3)}")
print(f"R2 score (SARIMAX) = {r2.round(3)}")

In [None]:
# Examine the prediction for a few days
first_days = slice('2008-01-02', '2008-01-06')
fig, ax = plt.subplots(figsize=(12,4))
for series in (y_pred_train, y_train):
    series.loc[first_days].plot(ax=ax)

### SARIMAX(2,1,1)(1,0,1,24) forecast for 8 randomly chosen dates

In [None]:
# Forecast origins: each entry is the last observed hour before a 24-hour
# forecast.
days_to_forecast = pd.to_datetime(['2011-09-01 23:00:00', '2011-10-20 23:00:00',
                                   '2011-01-09 06:00:00', '2011-10-22 21:00:00',
                                   '2011-06-21 03:00:00', '2011-07-08 02:00:00',
                                   '2011-08-19 00:00:00', '2011-04-15 07:00:00'])

predictions = []
for day in days_to_forecast:
    # Training window: the 24 months ending at the forecast origin (inclusive).
    # Forecast horizon: the 24 rows that follow the origin.
    # Boolean masks on df.index select the same rows from the target and from
    # exog regardless of their differing index types.
    in_window = (df.index >= day + pd.DateOffset(months=-24)) & (df.index <= day)
    in_horizon = (df.index > day) & (df.index <= day + pd.DateOffset(hours=24))

    y_to_train = df['net_load_kW'].loc[in_window]
    # statsmodels requires endog and exog to share one index.
    exog_to_train = exog.loc[in_window].set_axis(y_to_train.index, axis=0)
    exog_future = exog.loc[in_horizon].to_numpy()

    # `exog` is the SARIMAX keyword for regressors; the former `exogenous=`
    # is not a SARIMAX parameter, so the Fourier terms were never used.
    model = SARIMAX(y_to_train, order=(2, 1, 1), seasonal_order=(1, 0, 1, 24), exog=exog_to_train)
    model_fit = model.fit()
    # A model fitted with regressors needs their future values to forecast.
    output = model_fit.forecast(steps=24, exog=exog_future)
    predictions.append(output)

In [None]:
#forecasts = pd.DataFrame(columns=['runid','id','prediction'])

for day_to_plot, forecast_origin in enumerate(days_to_forecast):

    # Starting the MLFlow run
    #r = mlflow.start_run(run_name="sarimax_(2,1,1)(1,0,1,24)")
    #print("run-uuid:", r.info.run_uuid)

    # Forecast and the observed load for the 24 hours after the origin.
    y_pred = predictions[day_to_plot]
    horizon_start = forecast_origin + pd.DateOffset(hours=1)
    horizon_end = forecast_origin + pd.DateOffset(hours=24)
    y_test = df_test['net_load_kW'].loc[horizon_start:horizon_end]

    fig, ax = plt.subplots(figsize=(12,4))
    y_pred.plot()
    y_test.plot()
    plt.legend()
    plt.show()

    # Error metrics for this forecast window.
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    print(f"Mean Absolute Error (SARIMAX) = {mae.round(3)}")
    print(f"Mean Squared Error (SARIMAX) = {mse.round(3)}")
    print(f"R2 score (SARIMAX) = {r2.round(3)}")

    ##########
    # MLFLOW #
    ##########
    # for k, v in global_params.items():
    #     mlflow.log_param(k, v)
    #     mlflow.log_param("model", "sarimax")
    #     mlflow.log_param("datetime", days_to_forecast[day_to_plot])
    #     mlflow.log_param("feature", "net_load_kW")
    #     mlflow.log_metric("mse", mse)
    #     mlflow.log_metric("mae", mae)
    #     mlflow.log_metric("r2_score", r2)
    # mlflow.end_run()

    ##########
    # SQL DB #
    ##########
    # start_date = days_to_forecast[day_to_plot]+pd.DateOffset(hours=1)
    # end_date = days_to_forecast[day_to_plot]+pd.DateOffset(hours=24)
    # predict_dates = pd.date_range(start=start_date, end=end_date, freq='H')

    # forecast = pd.DataFrame(columns=['runid','id','prediction'])
    # forecast['prediction'] = y_pred
    # forecast['runid'] = r.info.run_uuid
    # forecast['id'] = np.array(datetime2index(predict_dates))
    # forecasts = pd.concat([forecasts, forecast], axis=0)

#forecasts.to_csv('sarimax_(2,1,1)(1,0,1,24).csv')