# Modeling

## Univariate ARIMA
From the EDA, the ARIMA order was chosen as (p, d, q) = (2, 1, 1).

In [60]:
import pandas as pd
import numpy as np
import os
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [61]:
# Locate the input CSV: the working directory path with 'modeling' swapped
# for 'data_preprocessing'.
# NOTE(review): str.replace swaps EVERY occurrence of 'modeling' in the path,
# and leaves the path unchanged if there is none — confirm the directory layout.
path = os.getcwd()
path = path.replace('modeling', 'data_preprocessing')
files = os.listdir(path)
# Keep only names that really end in '.csv' and sort them, so the file that
# gets loaded does not depend on the arbitrary ordering of os.listdir().
csv_files = sorted(name for name in files if name.lower().endswith('.csv'))
if not csv_files:
    # Fail with a clear message instead of a NameError on an undefined csv_path.
    raise FileNotFoundError(f"No .csv file found in {path!r}")
# As before, when several CSV files are present the last one wins
# (now: last in sorted order).
csv_path = os.path.join(path, csv_files[-1])
data = pd.read_csv(csv_path)
# Parse the 'date' column into datetimes so it can be used as a time index.
data['date'] = pd.to_datetime(data['date'])

In [62]:
# Univariate series: the first column becomes the index and the second
# column is kept as the single 'target' column.
index_col, target_col = data.columns[:2]
uv_data = (
    data[[index_col, target_col]]
    .set_index(index_col)
    .rename(columns={target_col: 'target'})
)
uv_data.head(3)

Unnamed: 0_level_0,target
date,Unnamed: 1_level_1
2000-01-01,41.0
2000-02-01,41.0
2000-03-01,45.0


In [63]:
import warnings

# Silence every warning raised from this point on in the notebook.
warnings.filterwarnings("ignore")

# Non-seasonal ARIMA order (p, d, q)
p, d, q = 2, 1, 1
# Seasonal order (P, D, Q) and seasonal period s
P, D, Q, s = 0, 0, 0, 12


In [64]:
# Hold out the last `test_split` rows for evaluation; everything before
# them is used for fitting.
test_split = 12
train_rows = slice(None, -test_split)
test_rows = slice(-test_split, None)
X_train_uvARIMA = uv_data.iloc[train_rows]
X_test_uvARIMA = uv_data.iloc[test_rows]

In [65]:
# Model building
# Take the orders from the parameters cell (p, d, q / P, D, Q, s) instead of
# repeating the literals, so changing that cell actually changes the model.
ARIMA_model = SARIMAX(X_train_uvARIMA, order=(p, d, q), seasonal_order=(P, D, Q, s))
ARIMA_fit = ARIMA_model.fit(disp=False)

In [66]:
# Show the fit report: coefficient table plus residual diagnostics.
arima_fit_report = ARIMA_fit.summary()
arima_fit_report

0,1,2,3
Dep. Variable:,target,No. Observations:,264.0
Model:,"SARIMAX(2, 1, 1)",Log Likelihood,-729.649
Date:,"Thu, 01 Jun 2023",AIC,1467.298
Time:,20:41:01,BIC,1481.587
Sample:,01-01-2000,HQIC,1473.04
,- 12-01-2021,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ar.L1,1.3504,0.046,29.116,0.000,1.259,1.441
ar.L2,-0.4332,0.036,-12.108,0.000,-0.503,-0.363
ma.L1,-0.9525,0.043,-22.060,0.000,-1.037,-0.868
sigma2,15.0122,0.429,34.988,0.000,14.171,15.853

0,1,2,3
Ljung-Box (L1) (Q):,0.05,Jarque-Bera (JB):,5430.91
Prob(Q):,0.82,Prob(JB):,0.0
Heteroskedasticity (H):,3.86,Skew:,-3.0
Prob(H) (two-sided):,0.0,Kurtosis:,24.44


In [97]:
# Forecast the hold-out horizon and put the point forecasts next to the
# actual values, aligned on the date index.
forecast = ARIMA_fit.get_forecast(steps=test_split)
forecasted_values = forecast.predicted_mean
forecast_frame = forecasted_values.to_frame()
predictions = pd.concat([forecast_frame, X_test_uvARIMA], axis='columns')
predictions

Unnamed: 0,predicted_mean,target
2022-01-01,90.004843,93.0
2022-02-01,88.12606,93.0
2022-03-01,86.88638,97.0
2022-04-01,86.026158,97.0
2022-05-01,85.401514,100.0
2022-06-01,84.930622,100.0
2022-07-01,84.565309,97.0
2022-08-01,84.27597,93.0
2022-09-01,84.043491,90.0
2022-10-01,83.854885,90.0


In [68]:
# Evaluation metrics: MAPE and directional accuracy (DA)
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Both inputs are converted to NumPy arrays; each absolute error is
    divided by the matching actual value, averaged, and scaled by 100.
    Actual values equal to zero are not special-cased.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_errors = np.abs((actual - predicted) / actual)
    return np.mean(relative_errors) * 100

def directional_accuracy(y_true, y_pred):
    """Return the share (in percent) of steps whose direction was forecast correctly.

    For every pair of consecutive observations the sign of the change in
    ``y_true`` is compared with the sign of the change in ``y_pred``; the
    result is the percentage of steps where the two signs are equal. A flat
    step (sign 0) only matches another flat step.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and forecast values in time order. Lists, NumPy arrays and
        pandas Series are all accepted; values are compared by position, so
        differing Series indexes do not matter.

    Returns
    -------
    float
        Directional accuracy in percent, or NaN when fewer than two
        observations are given (there is no step to compare).

    Raises
    ------
    ValueError
        If the two inputs do not have the same number of observations.
    """
    # Work on plain 1-D arrays: no reliance on `.values`, and no index
    # alignment errors when two Series carry different labels.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {y_true.size} and {y_pred.size}"
        )
    if y_true.size < 2:
        # No consecutive pair exists, so the accuracy is undefined.
        return float('nan')
    true_direction = np.sign(np.diff(y_true))
    pred_direction = np.sign(np.diff(y_pred))
    return np.mean(true_direction == pred_direction) * 100

In [103]:
# Peek at the first four point forecasts.
predictions['predicted_mean'].head(4)

2022-01-01    90.004843
2022-02-01    88.126060
2022-03-01    86.886380
2022-04-01    86.026158
Freq: MS, Name: predicted_mean, dtype: float64

In [114]:
# Directional accuracy over the first n rows of the forecast table.
n = 12
actual_head = predictions['target'].iloc[:n]
forecast_head = predictions['predicted_mean'].iloc[:n]
directional_accuracy(actual_head, forecast_head)

27.27272727272727

In [73]:
# Sanity check: request only the first two out-of-sample steps.
two_step_forecast = ARIMA_fit.get_forecast(steps=2)
two_step_forecast.predicted_mean

2022-01-01    90.004843
2022-02-01    88.126060
Freq: MS, Name: predicted_mean, dtype: float64

In [80]:
# Same two months requested by date through predict().
check_start, check_end = '2022-01-01', '2022-02-01'
ARIMA_fit.predict(start=check_start, end=check_end)

2022-01-01    90.004843
2022-02-01    88.126060
Freq: MS, Name: predicted_mean, dtype: float64