# SARIMAX

[![nbviewer](https://raw.githubusercontent.com/jupyter/design/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.org/github/gautamnaik1994/SalesForecasting_ML_CaseStudy/blob/main/notebooks/modelling/02.SARIMAX.ipynb?flush_cache=true)

In [106]:
import pandas as pd
import numpy as np
import duckdb as db
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import optuna
import warnings

warnings.filterwarnings('ignore')

from IPython.display import display, Markdown
from statsforecast import StatsForecast
from statsforecast.models import AutoARIMA, MSTL
# mape
from sklearn.metrics import mean_absolute_percentage_error

# pd.set_option('plotting.backend', 'plotly')
# pio.renderers.default = "notebook_connected"
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [2]:
# Load the processed inputs; the paths are relative to the notebook's working directory.
# `df` is only a placeholder here: it is reassigned from train_region_code_agg a few cells below.
df = pd.read_parquet("../../data/processed/train_enhanced.parquet")
# NOTE(review): train_agg and holiday_df are not referenced by any later cell visible in this notebook.
train_agg = pd.read_parquet("../../data/processed/train_agg.parquet")
# Daily aggregates per region; this is the frame the forecasting cells are built from.
train_region_code_agg = pd.read_parquet("../../data/processed/train_region_code_agg.parquet")
holiday_df= pd.read_csv("../../data/processed/holidays.csv")

In [3]:
# Preview the per-region daily aggregates that the modelling frame is derived from.
train_region_code_agg

Unnamed: 0,Date,Region_Code,Total_Sales,Avg_Sales,Total_Orders,Avg_Orders,Num_Stores,Holiday,Total_Discounts
0,2018-01-01,R4,2286812,45736,2914,58,50,1,50
1,2018-01-01,R2,4436859,42256,5644,54,105,1,105
2,2018-01-01,R3,3527439,41017,4599,53,86,1,86
3,2018-01-01,R1,5094374,41084,6509,52,124,1,124
4,2018-01-02,R4,2545119,50902,3057,61,50,0,50
...,...,...,...,...,...,...,...,...,...
2059,2019-05-30,R4,1966320,39326,2829,57,50,0,4
2060,2019-05-31,R2,4351299,41441,6411,61,105,1,11
2061,2019-05-31,R4,1909319,38186,2746,55,50,1,1
2062,2019-05-31,R1,5900798,47587,9433,76,124,1,18


In [107]:
# Keep only the series key, the date, the target and the two columns later passed
# to the forecaster as exogenous inputs (Holiday, Total_Discounts).
# .copy() makes this an independent frame: a column is added to it in the next cell,
# and without the copy that assignment would write to a slice of train_region_code_agg
# (pandas' SettingWithCopyWarning, which the global warnings filter hides).
df = train_region_code_agg[["Region_Code", "Date", "Total_Sales", "Holiday", "Total_Discounts"]].copy()

In [108]:
# Number the distinct dates 1..N in sorted order; the train/test split below is
# expressed as a fraction of this running index.
date_mapping = {date: idx for idx, date in enumerate(sorted(df['Date'].unique()), start=1)}

# Build the modelling frame in one non-mutating chain. Assigning df['idx'] in place
# would write to a slice of train_region_code_agg (chained assignment), whereas
# .assign() returns a new frame.
# The rename produces the unique_id / ds / y column names used when the frame is
# handed to StatsForecast further down.
df = (
    df
    .assign(idx=df['Date'].map(date_mapping))
    .rename(columns={"Total_Sales": "y", "Region_Code": "unique_id", "Date": "ds"})
    .sort_values(by='ds')
)
df

Unnamed: 0,unique_id,ds,y,Holiday,Total_Discounts,idx
0,R4,2018-01-01,2286812,1,50,1
1,R2,2018-01-01,4436859,1,105,1
2,R3,2018-01-01,3527439,1,86,1
3,R1,2018-01-01,5094374,1,124,1
4,R4,2018-01-02,2545119,0,50,2
...,...,...,...,...,...,...
2059,R4,2019-05-30,1966320,0,4,515
2061,R4,2019-05-31,1909319,1,1,516
2062,R1,2019-05-31,5900798,1,18,516
2060,R2,2019-05-31,4351299,1,11,516


In [109]:
# Chronological hold-out: the first `threshold` share of the date index is used
# for fitting, everything after it for evaluation.
threshold = 0.8
cutoff_idx = df['idx'].max() * threshold

in_train = df['idx'] <= cutoff_idx
in_test = df['idx'] > cutoff_idx

train = df[in_train]
test = df[in_test]

In [110]:
# Inspect the training split (rows whose idx is at or below the cutoff).
train

Unnamed: 0,unique_id,ds,y,Holiday,Total_Discounts,idx
0,R4,2018-01-01,2286812,1,50,1
1,R2,2018-01-01,4436859,1,105,1
2,R3,2018-01-01,3527439,1,86,1
3,R1,2018-01-01,5094374,1,124,1
4,R4,2018-01-02,2545119,0,50,2
...,...,...,...,...,...,...
1640,R3,2019-02-15,3300873,0,73,411
1644,R2,2019-02-16,4485144,0,93,412
1645,R4,2019-02-16,2120472,0,49,412
1646,R1,2019-02-16,6426930,0,108,412


In [111]:
# Inspect the hold-out split (rows whose idx is above the cutoff).
test

Unnamed: 0,unique_id,ds,y,Holiday,Total_Discounts,idx
1648,R3,2019-02-17,4253736,0,83,413
1649,R1,2019-02-17,6858420,0,120,413
1650,R4,2019-02-17,2341383,0,49,413
1651,R2,2019-02-17,4888986,0,99,413
1652,R3,2019-02-18,3948027,0,78,414
...,...,...,...,...,...,...
2059,R4,2019-05-30,1966320,0,4,515
2061,R4,2019-05-31,1909319,1,1,516
2062,R1,2019-05-31,5900798,1,18,516
2060,R2,2019-05-31,4351299,1,11,516


In [112]:
# Forecast horizon = number of date steps between the last training day and the
# last hold-out day.
last_train_idx = train['idx'].max()
last_test_idx = test['idx'].max()
prediction_window = last_test_idx - last_train_idx
prediction_window

104

In [113]:
# The helper index has served its purpose (split + horizon); remove it so it is
# not carried into the frames given to the forecaster.
train = train.drop(columns='idx')
test = test.drop(columns='idx')

In [137]:
# Candidate models compared in this notebook.
# NOTE(review): the series is daily (freq="D" when the forecaster is built), so a
# season_length of 12 looks like a monthly-data default; confirm it is intended
# (7 would correspond to a weekly cycle).
arima_model = AutoARIMA(season_length=12, approximation=True)
mstl_model = MSTL(season_length=[7, 12, 30])
models = [arima_model, mstl_model]

In [138]:
# A single forecaster object drives every model in `models`; freq="D" declares
# the daily spacing of the ds column.
fcst = StatsForecast(
    models=models,
    freq="D",
    n_jobs=-1,
)

In [139]:
# Fit the configured models on the training frame. The trailing semicolon keeps
# the notebook from echoing the returned object.
fcst.fit(train);



In [140]:
# Hold-out frame after dropping idx; every column except y is passed to predict() as X_df in the next cell.
test

Unnamed: 0,unique_id,ds,y,Holiday,Total_Discounts
1648,R3,2019-02-17,4253736,0,83
1649,R1,2019-02-17,6858420,0,120
1650,R4,2019-02-17,2341383,0,49
1651,R2,2019-02-17,4888986,0,99
1652,R3,2019-02-18,3948027,0,78
...,...,...,...,...,...
2059,R4,2019-05-30,1966320,0,4
2061,R4,2019-05-31,1909319,1,1
2062,R1,2019-05-31,5900798,1,18
2060,R2,2019-05-31,4351299,1,11


In [141]:
# Exogenous inputs for the forecast horizon: the hold-out frame without its target.
future_exog = test.drop(columns='y')

# Forecast `prediction_window` steps ahead and request the 95% interval columns.
forecast = fcst.predict(
    h=prediction_window,
    level=[95],
    X_df=future_exog,
)

In [142]:
# Point forecasts plus the lo-95 / hi-95 interval columns for each model.
forecast

Unnamed: 0,unique_id,ds,AutoARIMA,AutoARIMA-lo-95,AutoARIMA-hi-95,MSTL,MSTL-lo-95,MSTL-hi-95
0,R1,2019-02-17,6666489.000,4.967450e+06,8365528.000,7467893.500,5.981623e+06,8954164.00
1,R1,2019-02-18,6439762.000,4.580992e+06,8298532.500,6061009.000,4.365292e+06,7756726.00
2,R1,2019-02-19,4717097.500,2.828473e+06,6605722.500,4690329.000,2.808332e+06,6572325.50
3,R1,2019-02-20,5539322.500,3.644876e+06,7433769.500,5017822.500,2.966392e+06,7069252.50
4,R1,2019-02-21,5380298.000,3.484707e+06,7275888.500,5001071.500,2.793172e+06,7208971.00
...,...,...,...,...,...,...,...,...
411,R4,2019-05-27,2244195.000,1.554406e+06,2933983.750,1733764.125,-9.883518e+05,4455880.00
412,R4,2019-05-28,2234609.750,1.544821e+06,2924398.500,1615796.000,-1.119552e+06,4351144.00
413,R4,2019-05-29,2052491.625,1.362703e+06,2742280.250,1741516.375,-1.007000e+06,4490033.00
414,R4,2019-05-30,1860788.375,1.171000e+06,2550577.000,1427047.500,-1.334575e+06,4188669.75


In [143]:
# Left-join the forecasts onto the hold-out rows so each actual y sits next to
# the predictions for the same series and date.
merged = test.merge(forecast, how='left', on=['unique_id', 'ds'])

# Mean absolute percentage error of each model, pooled over all series and dates.
for label, column in (("ARIMA", "AutoARIMA"), ("MSTL", "MSTL")):
    print(f"MAPE using {label}: ", mean_absolute_percentage_error(merged['y'], merged[column]))

MAPE using ARIMA:  0.1742272055815889
MAPE using MSTL:  0.1886102340077545


In [144]:
def plot(train, test, forecast, unique_id, model_name, level=95):
    """Show history, hold-out actuals, forecast and prediction band for one series.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with ``unique_id``, ``ds`` and ``y`` columns.
    forecast : pandas.DataFrame
        Frame with ``unique_id``, ``ds``, a column named ``model_name`` and the
        matching ``{model_name}-lo-{level}`` / ``{model_name}-hi-{level}`` columns.
    unique_id
        Value of ``unique_id`` selecting the series to draw.
    model_name : str
        Name of the forecast column to plot (e.g. ``"AutoARIMA"``).
    level
        Interval level embedded in the bound column names. Defaults to 95,
        which matches the columns used before this parameter existed.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.

    Raises
    ------
    KeyError
        If ``forecast`` lacks the point-forecast or bound columns for
        ``model_name`` / ``level``.
    """
    lower_col = f"{model_name}-lo-{level}"
    upper_col = f"{model_name}-hi-{level}"

    # Fail early with a readable message instead of a bare KeyError halfway
    # through building the figure.
    missing = [col for col in (model_name, lower_col, upper_col) if col not in forecast.columns]
    if missing:
        raise KeyError(f"forecast is missing column(s) {missing} for model {model_name!r}")

    filtered_train = train[train['unique_id'] == unique_id]
    filtered_test = test[test['unique_id'] == unique_id]
    filtered_forecast = forecast[forecast['unique_id'] == unique_id]

    fig = go.Figure()

    fig.add_trace(go.Scatter(x=filtered_train['ds'], y=filtered_train['y'], mode='lines', name='Train'))
    fig.add_trace(go.Scatter(x=filtered_test['ds'], y=filtered_test['y'], mode='lines', name='Test'))
    fig.add_trace(go.Scatter(x=filtered_forecast['ds'], y=filtered_forecast[model_name], mode='lines', name='Forecast'))

    # Upper bound first, drawn with zero line width: it only serves as the edge
    # the next trace fills towards.
    fig.add_trace(go.Scatter(
        x=filtered_forecast['ds'],
        y=filtered_forecast[upper_col],
        mode='lines',
        name='Upper Bound',
        line=dict(width=0),
        showlegend=False
    ))

    # Lower bound; fill='tonexty' shades the area between this trace and the
    # previous one (the upper bound), producing the interval band.
    fig.add_trace(go.Scatter(
        x=filtered_forecast['ds'],
        y=filtered_forecast[lower_col],
        mode='lines',
        name='Lower Bound',
        fill='tonexty',
        fillcolor='rgba(0, 100, 80, 0.2)',
        line=dict(width=0),
        showlegend=False
    ))

    # Include the model name so figures for different models of the same series
    # can be told apart.
    fig.update_layout(
        title=f'Total Sales Forecast for {unique_id} ({model_name})',
        xaxis_title='Date',
        yaxis_title='Total Sales',
    )
    fig.show()

# One AutoARIMA figure per region, in the same order as before.
for region_code in ("R1", "R2", "R3", "R4"):
    plot(train, test, forecast, region_code, "AutoARIMA")

In [136]:
# One MSTL figure per region, in the same order as before.
for region_code in ("R1", "R2", "R3", "R4"):
    plot(train, test, forecast, region_code, "MSTL")