# Forecast

## Setup

In [16]:
import sys
sys.path.append("..")

# Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import scipy as sp

from statsforecast import StatsForecast
from statsforecast.models import AutoARIMA, ETS

from joblib import Parallel, delayed
from itertools import product
from tqdm import tqdm
from pathlib import Path

# Library settings
pd.options.display.max_columns = 999
plt.rcParams["figure.figsize"] = (16, 4)


  from tqdm.autonotebook import tqdm


## Prepare data

In [None]:
sales = pd.read_csv("../data/sales_train_evaluation.csv")
sales.head()


In [None]:
calendar = pd.read_csv("../data/calendar.csv", parse_dates=["date"])
calendar.head()

In [None]:
sell_prices = pd.read_csv("../data/sell_prices.csv")
sell_prices.head()

In [None]:
def prepare_data(sales, calendar, sell_prices):
    hierarchy_df = sales.iloc[:, 1:6].copy()
    df = sales.drop(columns=["id", "dept_id", "cat_id", "state_id"])
    df = df.reset_index().rename(columns={"index":"unique_id"})
    df_long = pd.melt(df, id_vars=df.columns[:3], var_name="d", value_name="sales")
    df_long = df_long.merge(calendar[["d", "wm_yr_wk"]], on="d", how="left")
    df_long = df_long.merge(sell_prices, on=["item_id", "store_id", "wm_yr_wk"], how="left")
    df_long["dollar_sales"] = df_long["sales"] * df_long["sell_price"]
    df_long.loc[df_long.sales == 0, "dollar_sales"] = 0
    df_long["d"] = df_long["d"].str[2:].astype("int")
    df = pd.pivot_table(df_long, values=["sales", "dollar_sales"], index=["d"], columns=["unique_id"])
    sales_arr = df["sales"].to_numpy().T
    dollar_sales_arr = df["dollar_sales"].to_numpy().T

    filepath = Path("../data/processed/bottom_sales_arr.npy")
    np.save(filepath, sales_arr)
    print(f"Data written to {filepath}")
    
    filepath = Path("../data/processed/bottom_dollar_sales_arr.npy")
    np.save(filepath, dollar_sales_arr)
    print(f"Data written to {filepath}")

    filepath = Path("../data/processed/hierarchy.csv")
    hierarchy_df.to_csv(filepath, index=False)
    print(f"Data written to {filepath}")
    
    return sales_arr, dollar_sales_arr, hierarchy_df

In [None]:
bottom_sales_arr, bottom_dollar_sales_arr, hierarchy_df = prepare_data(sales, calendar, sell_prices)

## Build S matrix

In [None]:
hierarchy_df = pd.read_csv("../data/processed/hierarchy.csv")
hierarchy_df.head()

In [None]:
agg_levels = (
    tuple(["TOTAL"]),
    tuple(['state_id']),
    tuple(['store_id']),
    tuple(['cat_id']),
    tuple(['dept_id']),
    tuple(['state_id', 'cat_id']),
    tuple(['state_id', 'dept_id']),
    tuple(['store_id', 'cat_id']),
    tuple(['store_id', 'dept_id']),
    tuple(['item_id']),
    tuple(['item_id', 'state_id']),
    tuple(['item_id', 'store_id']),
)

In [None]:
def generate_Smatrix(hierarchy_df, agg_levels, sparse=True):
    arr = np.array
    eye = np.eye
    stack = np.row_stack
    
    if sparse:
        arr = sp.sparse.csr_array
        eye = sp.sparse.eye
        stack = sp.sparse.vstack

    def build_row(hierarchy_df, level, comb):
        row = (hierarchy_df.loc[:, level] == comb).apply(lambda x: all(x), axis=1).astype("int16")
        return row
    
    top_row = np.ones(len(hierarchy_df))
    S_rows = [top_row]

    for level in agg_levels[1:-1]:
        print(level)
        combinations = hierarchy_df.loc[:, level].drop_duplicates().to_numpy()
        rows = Parallel(n_jobs=8)(
            delayed(build_row)(hierarchy_df, level, comb) for comb in combinations
        )
        S_rows.extend(rows)

    S_top = arr(S_rows)
    S_bottom = eye(len(hierarchy_df))
    S = stack([S_top, S_bottom])
    
    filepath = Path("../data/processed/S.npz")
    sp.sparse.save_npz(filepath, S)
    print(f"Data written to {filepath}")
    return S

In [None]:
S = generate_Smatrix(hierarchy_df, agg_levels)

## Generate all the timeseries from the hierarchy

In [None]:
bottom_sales_arr = np.load("../data/processed/bottom_sales_arr.npy")
bottom_dollar_sales_arr = np.load("../data/processed/bottom_dollar_sales_arr.npy")
S = sp.sparse.load_npz("../data/processed/S.npz")

In [None]:
def generate_Ymatrix(S, bottom_sales_arr, bottom_dollar_sales_arr):
    sales_arr = S @ bottom_sales_arr
    dollar_sales_arr = S @ bottom_dollar_sales_arr

    filepath = Path("../data/processed/sales_arr.npy")
    np.save(filepath, sales_arr)
    print(f"Data written to {filepath}")

    filepath = Path("../data/processed/dollar_sales_arr.npy")
    np.save(filepath, dollar_sales_arr)
    print(f"Data written to {filepath}")

    return sales_arr, dollar_sales_arr

In [None]:
sales_arr, dollar_sales_arr = generate_Ymatrix(S, bottom_sales_arr, bottom_dollar_sales_arr)

## Create panel data

In [3]:
sales_arr = np.load("../data/processed/sales_arr.npy")
calendar = pd.read_csv("../data/calendar.csv", parse_dates=["date"])

In [4]:
def generate_panel_df(sales_arr, calendar):
    y_df_list = []
    for i in tqdm(range(sales_arr.shape[0])):
        y = sales_arr[i, :]
        y_df = pd.DataFrame(y, index=range(len(y)), columns=["y"])
        y_df = pd.merge(y_df, calendar["date"], left_index=True, right_index=True, how="left")
        y_df.insert(0, "unique_id", i)
        y_df_list.append(y_df)
    panel_df = pd.concat(y_df_list)
    panel_df = panel_df.rename(columns={"date": "ds"})

    filepath = Path("../data/processed/panel_df.parquet")
    panel_df.to_parquet(filepath)
    print(f"Data written to {filepath}")

    return panel_df

In [5]:
panel_df = generate_panel_df(sales_arr, calendar)

100%|██████████| 42840/42840 [00:35<00:00, 1219.13it/s]


Data written to ../data/processed/panel_df.parquet


In [6]:
panel_df.head()

Unnamed: 0,unique_id,y,ds
0,0,32631.0,2011-01-29
1,0,31749.0,2011-01-30
2,0,23783.0,2011-01-31
3,0,25412.0,2011-02-01
4,0,19146.0,2011-02-02


## Forecast

In [7]:
panel_df = pd.read_parquet("../data/processed/panel_df.parquet")

In [9]:
train_df = panel_df[panel_df.ds <= "2016-04-24"]
test_df = panel_df[panel_df.ds > "2016-04-24"]

In [11]:
train_df

Unnamed: 0,unique_id,y,ds
0,0,32631.0,2011-01-29
1,0,31749.0,2011-01-30
2,0,23783.0,2011-01-31
3,0,25412.0,2011-02-01
4,0,19146.0,2011-02-02
...,...,...,...
1908,42839,0.0,2016-04-20
1909,42839,0.0,2016-04-21
1910,42839,0.0,2016-04-22
1911,42839,0.0,2016-04-23


In [13]:
test_df

Unnamed: 0,unique_id,y,ds
1913,0,38793.0,2016-04-25
1914,0,35487.0,2016-04-26
1915,0,34445.0,2016-04-27
1916,0,34732.0,2016-04-28
1917,0,42896.0,2016-04-29
...,...,...,...
1936,42839,0.0,2016-05-18
1937,42839,2.0,2016-05-19
1938,42839,2.0,2016-05-20
1939,42839,5.0,2016-05-21


In [14]:
def generate_forecasts(panel_df, models):
    models = [
        ETS(season_length=7, model='ZZA'),
        AutoARIMA()
    ]

    model = StatsForecast(
        df=panel_df, 
        models=models,
        freq='D', 
        n_jobs=-1,
    )

    fcst_df = model.forecast(28).reset_index()

    filepath = "../fcst/fcst_df.csv"
    fcst_df.to_csv(filepath)
    print(f"Data written to {filepath}")
    return fcst_df

In [17]:
models = [
    ETS(season_length=7, model='ZZA'),
    AutoARIMA()
]

fcst_df = generate_forecasts(train_df, models)
fcst_df.reset_index()

In [None]:
i = 0
test_df.loc[test_df.unique_id == i].set_index("ds")["y"].plot()
fcst_df.loc[fcst_df.unique_id == i].set_index("ds")["ETS"].plot()
fcst_df.loc[fcst_df.unique_id == i].set_index("ds")["AutoARIMA"].plot()