## Data Processing and Visualization

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from prophet import Prophet
from prophet.diagnostics import cross_validation, performance_metrics

In [None]:
# Load the raw CSV inputs. Date columns are renamed to "ds" and the sales
# target to "y", the names the Prophet models below are fitted on.
train_df = pd.read_csv("./train.csv").rename(columns={"date": "ds", "sales": "y"})
oil_df = pd.read_csv("oil.csv").rename(columns={"date": "ds"})
test_df = pd.read_csv("test.csv").rename(columns={"date": "ds"})

# Store metadata, plus a lookup keyed by store number:
# store_nbr -> {other column name: value}.
stores_df = pd.read_csv("stores.csv")
stores_dict = stores_df.set_index("store_nbr").to_dict("index")

holidays_df = pd.read_csv("holidays_events.csv")

### Interpolate oil_df

In [None]:
# One row per calendar day from the first training date to the last test
# date; days missing from the oil file get NaN from the left merge.
blank_oil_df = pd.DataFrame({"ds":pd.date_range(train_df["ds"].min(), test_df["ds"].max()).astype("str")})
oil_df = blank_oil_df.merge(oil_df, how="left", on="ds")
# "nearest" interpolation only fills gaps lying between two observations, so
# NaNs before the first or after the last observation survive it. Back-fill
# and then forward-fill those edges from the closest observation; this
# replaces the former hard-coded patch of row 0 (93.14), which covered only a
# single leading row and no trailing ones.
# NOTE(review): confirm 93.14 was simply the first observed price, in which
# case row 0 is unchanged by this edit.
oil_df["dcoilwtico"] = oil_df["dcoilwtico"].interpolate("nearest").bfill().ffill()

### Process Holidays DF

In [None]:
# Rows of type "Transfer" are handled separately so their "Traslado " prefix
# can be stripped (presumably so a moved holiday shares its name with the
# regular occurrence -- confirm against the dataset description).
is_transfer = holidays_df["type"] == "Transfer"
# Exclude Transfer rows here: any Transfer row with transferred == False would
# otherwise be selected by both filters and appear twice after the concat,
# once with and once without the prefix.
nth_df = holidays_df[holidays_df["transferred"].eq(False) & ~is_transfer]
# Copy before assigning so the write does not target a slice of holidays_df.
th_df = holidays_df[is_transfer].copy()
th_df["description"] = th_df["description"].str.removeprefix("Traslado ")
all_holidays_df = pd.concat([nth_df, th_df], axis=0)[["date", "locale_name", "description"]]
all_holidays_df = all_holidays_df.rename(columns={"date": "ds", "description": "holiday"})
# Window columns read by Prophet's holidays frame (0 days before, 1 day after).
all_holidays_df["lower_window"] = 0
all_holidays_df["upper_window"] = 1

### Add Additional Regressors

In [None]:
# Attach the daily oil price (joined on date) and each store's cluster id
# (joined on store number) to both the training and the test rows.
store_clusters = stores_df[["store_nbr", "cluster"]]
train_df = (
    train_df
    .merge(oil_df, how="left", on="ds")
    .merge(store_clusters, how="left", on="store_nbr")
)
test_df = (
    test_df
    .merge(oil_df, how="left", on="ds")
    .merge(store_clusters, how="left", on="store_nbr")
)

In [None]:
# Number of dated rows available for every (store, family) pair.
days_counts = train_df.groupby(["store_nbr", "family"])["ds"].count()

# Distinct stores and product families present in the training data.
stores = train_df["store_nbr"].unique()
families = train_df["family"].unique()

# Row count per distinct onpromotion value, and how many such values exist.
promotion_counts = train_df[["id", "onpromotion"]].groupby("onpromotion").count()
num_promotions = len(promotion_counts)

# Distinct values of the store metadata columns.
states = stores_df["state"].unique()
cities = stores_df["city"].unique()
types = stores_df["type"].unique()

# Sizes of the store hierarchy: cities per state, stores per city / cluster.
cities_per_state = stores_df[["state", "city"]].drop_duplicates().groupby("state").count()
stores_per_city = stores_df[["city", "store_nbr"]].drop_duplicates().groupby("city").count()
stores_per_cluster = stores_df[["cluster", "store_nbr"]].drop_duplicates().groupby("cluster").count()

Subset the data as necessary for implementation and debugging.

In [None]:
# Keep only rows whose store number is below 4, in both splits.
train_df = train_df.loc[train_df["store_nbr"].lt(4)]
test_df = test_df.loc[test_df["store_nbr"].lt(4)]

### Prophet: one model per store, family

In [None]:
def msle(preds_df):
    """Return the mean squared logarithmic error of a forecast frame.

    Args:
        preds_df: Frame with the observed values in column "y" and the
            predictions in column "yhat".

    Returns:
        The mean over all rows of ``(log(1 + y) - log(1 + yhat)) ** 2``.
    """
    log_actual = np.log(1 + preds_df["y"].values)
    log_forecast = np.log(1 + preds_df["yhat"].values)
    squared_gap = (log_actual - log_forecast) ** 2
    return np.mean(squared_gap)

In [None]:
def fit_bottom_up(key_cols, train_df, stores_dict, all_holidays_df):
    """Fit one Prophet model per distinct combination of ``key_cols``.

    Args:
        key_cols: Column name(s) of ``train_df`` defining the groups.
        train_df: Training rows containing ``key_cols`` plus "store_nbr",
            "ds", "y", "onpromotion" and "dcoilwtico".
        stores_dict: Mapping ``store_nbr -> dict`` with at least the keys
            "state" and "city".
        all_holidays_df: Holiday rows with columns "ds", "holiday",
            "locale_name", "lower_window" and "upper_window".

    Returns:
        A DataFrame with one row per group: the ``key_cols`` columns followed
        by a last column (labelled ``0``) holding the fitted model.
    """
    group_cols = [key_cols] if isinstance(key_cols, str) else list(key_cols)

    def fit(x_df):
        # Holidays relevant to this group: rows whose locale is "Ecuador"
        # plus the state and city of every store in the group.
        locales = {"Ecuador"}
        for snbr in x_df["store_nbr"].drop_duplicates():
            locales.add(stores_dict[snbr]["state"])
            locales.add(stores_dict[snbr]["city"])
        h_df = all_holidays_df.loc[
            all_holidays_df["locale_name"].isin(list(locales)),
            ["ds", "holiday", "lower_window", "upper_window"],
        ]

        # Collapse the group to one row per date: sales and promotions are
        # summed, the oil price is taken from the first row of the day.
        daily_df = (
            x_df.groupby("ds")
            .agg({"y": "sum", "onpromotion": "sum", "dcoilwtico": "first"})
            .reset_index()
        )

        model = Prophet(uncertainty_samples=0, holidays=h_df)
        model.add_regressor("onpromotion")
        model.add_regressor("dcoilwtico")
        model.fit(daily_df)
        return model

    # Iterate the groups explicitly instead of groupby(...).apply(fit): fit()
    # reads "store_nbr", which may itself be a grouping column, and handing
    # grouping columns to an apply callback is deprecated in recent pandas.
    rows = []
    for key, group_df in train_df.groupby(group_cols):
        key = key if isinstance(key, tuple) else (key,)
        rows.append((*key, fit(group_df)))
    return pd.DataFrame(rows, columns=[*group_cols, 0])

def predict_bottom_up(test_df, models_df):
    """Forecast every group of ``test_df`` with its matching fitted model.

    Args:
        test_df: Rows to forecast, containing the key columns of
            ``models_df`` plus "ds", "onpromotion" and "dcoilwtico".
        models_df: Frame whose last column holds fitted models and whose
            preceding columns are the group keys (the layout returned by
            ``fit_bottom_up``).

    Returns:
        The per-group ``model.predict`` outputs stacked into one DataFrame:
        the key columns, a ``level_<n>`` column with each forecast's own row
        index, then the forecast columns.

    Raises:
        KeyError: If a group of ``test_df`` has no model in ``models_df``.
    """
    key_cols = models_df.columns[:-1].to_list()

    # Index the models by key tuple once. This replaces a per-group boolean
    # scan whose mask was built on a fresh RangeIndex and therefore only
    # aligned with models_df when that frame also had a default index.
    models = {}
    for row in models_df.itertuples(index=False, name=None):
        models.setdefault(tuple(row[:-1]), row[-1])  # first match wins

    keys, forecasts = [], []
    # Explicit iteration: the key values are needed per group, and handing
    # grouping columns to a groupby.apply callback is deprecated in pandas.
    for key, group_df in test_df.groupby(key_cols):
        key = key if isinstance(key, tuple) else (key,)
        if key not in models:
            raise KeyError(f"no fitted model for {dict(zip(key_cols, key))}")
        daily_df = (
            group_df.groupby("ds")
            .agg({"onpromotion": "sum", "dcoilwtico": "first"})
            .reset_index()
        )
        keys.append(key)
        forecasts.append(models[key].predict(daily_df))

    if not forecasts:
        return pd.DataFrame(columns=key_cols)
    # A single key column is passed as scalars so it yields one index level.
    concat_keys = keys if len(key_cols) > 1 else [k[0] for k in keys]
    return pd.concat(forecasts, keys=concat_keys, names=key_cols).reset_index()

def all_cross_validation(key_cols, train_df, stores_dict, all_holidays_df):
    """Cross-validate one Prophet model per group and return a single score.

    For every group of ``key_cols`` a model is fitted and passed to Prophet's
    ``cross_validation``; negative forecasts are clipped to zero and the MSLE
    is computed per cutoff. The per-cutoff MSLEs are averaged across groups,
    square-rooted, and finally averaged over cutoffs.

    Args:
        key_cols: Column name(s) of ``train_df`` defining the groups.
        train_df: Training rows containing ``key_cols`` plus "store_nbr",
            "ds", "y", "onpromotion" and "dcoilwtico".
        stores_dict: Mapping ``store_nbr -> dict`` with at least the keys
            "state" and "city".
        all_holidays_df: Holiday rows with columns "ds", "holiday",
            "locale_name", "lower_window" and "upper_window".

    Returns:
        The mean over cutoffs of the root of the group-averaged MSLE.
    """
    def cv(x_df):
        # Holidays relevant to this group: rows whose locale is "Ecuador"
        # plus the state and city of every store in the group.
        locales = {"Ecuador"}
        for snbr in x_df["store_nbr"].drop_duplicates():
            locales.add(stores_dict[snbr]["state"])
            locales.add(stores_dict[snbr]["city"])
        h_df = all_holidays_df.loc[
            all_holidays_df["locale_name"].isin(list(locales)),
            ["ds", "holiday", "lower_window", "upper_window"],
        ]

        # One row per date: sums for sales/promotions, first oil price.
        daily_df = (
            x_df.groupby("ds")
            .agg({"y": "sum", "onpromotion": "sum", "dcoilwtico": "first"})
            .reset_index()
        )

        model = Prophet(uncertainty_samples=0, holidays=h_df)
        model.add_regressor("onpromotion")
        model.add_regressor("dcoilwtico")
        model.fit(daily_df)
        cv_df = cross_validation(model, initial='1460 days', period='56 days', horizon='16 days')
        # msle takes log(1 + yhat), so negative forecasts are floored at 0.
        cv_df["yhat"] = cv_df["yhat"].clip(lower=0)
        return pd.DataFrame(
            [(cutoff, msle(fold_df)) for cutoff, fold_df in cv_df.groupby("cutoff")],
            columns=["cutoff", "msle"],
        )

    # Iterate the groups explicitly instead of groupby(...).apply(cv): cv()
    # reads "store_nbr", which may itself be a grouping column, and handing
    # grouping columns to an apply callback is deprecated in recent pandas.
    fold_frames = [cv(group_df) for _, group_df in train_df.groupby(key_cols)]
    msles_df = pd.concat(fold_frames, ignore_index=True)
    return np.mean(np.sqrt(msles_df.groupby("cutoff")["msle"].agg("mean").values))


## Down-Aggregation

In [None]:
def get_proportions(key_cols, next_cols, x_df):
    """Compute each finer-level series' daily share of its parent total.

    Args:
        key_cols: Columns identifying the parent (coarser) level.
        next_cols: Additional columns identifying the finer level.
        x_df: Rows with ``key_cols``, ``next_cols``, "ds", "y",
            "onpromotion" and "dcoilwtico".

    Returns:
        One row per (key_cols, next_cols, ds) with "onpromotion" summed,
        the first "dcoilwtico", and "prop" = y / parent total of y for that
        date. Where the parent total is <= 0 the summed y itself is kept.
    """
    fine_cols = key_cols + next_cols + ["ds"]
    coarse_cols = key_cols + ["ds"]

    fine_df = (
        x_df.groupby(fine_cols)
        .agg({"y": "sum", "onpromotion": "sum", "dcoilwtico": "first"})
        .reset_index()
    )
    totals_df = (
        fine_df.groupby(coarse_cols)
        .agg({"y": "sum"})
        .reset_index()
        .rename(columns={"y": "agg_y"})
    )
    fine_df = fine_df.merge(totals_df, how="left", on=coarse_cols)

    # where() keeps y when the condition holds and takes the ratio otherwise.
    share = fine_df["y"] / fine_df["agg_y"]
    fine_df["prop"] = fine_df["y"].where(fine_df["agg_y"] <= 0, share)
    return fine_df.drop(columns=["y", "agg_y"])

class VARModel():
    """Pooled autoregression of proportion series with an oil-price term.

    All series identified by ``next_cols`` share one coefficient vector: a
    day's value is regressed on an intercept, the series' own ``lag`` previous
    values (oldest first) and that day's oil price.
    """

    def __init__(self, lag, next_cols):
        self.lag = lag
        self.next_cols = next_cols

    def fit(self, x_df, lmbda):
        """Estimate ``self.beta`` from long-format proportion data.

        Args:
            x_df: Rows with "ds", "prop", "dcoilwtico" and ``next_cols``;
                each (ds, next_cols) combination must occur once (required
                by ``pivot``) and each date must carry a single oil price.
            lmbda: Ridge penalty forwarded to ``lin_reg``.

        Raises:
            ValueError: If there are not more dates than ``lag``, or the
                number of distinct oil rows differs from the number of dates.
        """
        # Wide layout: one row per date, one column per series. The dates stay
        # in the index so that only numeric values reach the design matrix.
        px_df = x_df.pivot(columns=self.next_cols, index="ds", values="prop").sort_index()
        oil_df = x_df[["ds", "dcoilwtico"]].drop_duplicates().sort_values("ds")
        n_rows, n_series = px_df.shape
        n_samples = n_rows - self.lag
        if n_samples <= 0:
            raise ValueError("need more dates than `lag` to fit the model")
        if oil_df.shape[0] != n_rows:
            raise ValueError("expected exactly one oil price per date")

        # Column l holds, for every (date, series) sample, the value observed
        # lag - l days before the target date.
        design_cols = []
        for l in range(self.lag):
            dc = px_df.iloc[l:(l + n_samples), :].values.flatten()
            design_cols.append(dc)
        # flatten() is row-major (date first, then series), so each target
        # day's oil price is repeated once per series to line up with it.
        design_cols.append(np.repeat(oil_df["dcoilwtico"].values[self.lag:], n_series))
        X = np.stack(design_cols, axis=1)
        y = px_df.iloc[self.lag:, :].values.flatten()
        self.beta = lin_reg(X, y, lmbda)

    def predict(self, agg_x_df, days):
        """Roll the fitted recursion forward ``days`` steps for one series.

        NOTE(review): the original body did not pin down the input schema.
        This version assumes ``agg_x_df`` holds one series with the columns
        ``fit`` consumes ("ds", "prop", "dcoilwtico") and keeps the oil price
        at its last observed value over the horizon -- confirm both.

        Args:
            agg_x_df: History of a single series (see note above).
            days: Number of future daily steps to forecast.

        Returns:
            DataFrame with "ds" (the ``days`` dates following the last
            history date) and "yhat" (the recursive forecasts).

        Raises:
            ValueError: If the history has fewer than ``lag`` rows.
        """
        history = agg_x_df.sort_values("ds")
        window = history["prop"].to_numpy(dtype=float)[-self.lag:]
        if len(window) < self.lag:
            raise ValueError("history must contain at least `lag` rows")
        oil = float(history["dcoilwtico"].iloc[-1])

        out = []
        for _ in range(days):
            # Same layout as the fitted design: intercept, lags, oil price.
            features = np.concatenate([np.ones((1, )), window, [oil]])
            y = float(features @ self.beta)
            out.append(y)
            # Slide the window: drop the oldest value, append the forecast.
            window = np.concatenate([window[1:], np.array([y])])

        last_day = pd.to_datetime(history["ds"].iloc[-1])
        ds = pd.date_range(start=last_day + pd.Timedelta(days=1), periods=days, freq="D")
        preds_df = pd.DataFrame({"ds": ds, "yhat": out})
        return preds_df


def lin_reg(X, y, lmbda):
    """Solve a ridge-penalised least-squares problem with an intercept.

    A column of ones is prepended to ``X`` and the normal equations
    ``(A.T @ A + lmbda * I) beta = A.T @ y`` are solved for ``beta``. The
    penalty is applied to every coefficient, the intercept included.

    Args:
        X: 2-D design matrix of shape (samples, features).
        y: Target vector with one entry per sample.
        lmbda: Ridge penalty weight; 0 gives ordinary least squares.

    Returns:
        Coefficient vector of length ``features + 1``, intercept first.
    """
    intercept = np.ones((X.shape[0], 1))
    design = np.hstack([intercept, X])
    gram = design.T @ design
    penalty = lmbda * np.eye(design.shape[1])
    return np.linalg.solve(gram + penalty, design.T @ y)

In [None]:
# Fit the proportion model on the family shares within a single cluster.
# NOTE(review): confirm the chosen cluster still has rows after the debugging
# subset of train_df applied earlier in the notebook.
cluster = 3
key_cols = ["cluster"]
next_cols = ["family"]
lag = 5
prop_df = get_proportions(key_cols, next_cols, train_df)
# drop() targets row labels by default, so the column must be named through
# `columns=`; the bare drop("cluster") raised KeyError.
x_df = prop_df[prop_df["cluster"] == cluster].drop(columns="cluster")

model = VARModel(lag, next_cols)
model.fit(x_df, lmbda=0)


In [None]:
# Grouping levels passed to get_proportions in the next cell.
# NOTE(review): "family" appears in both lists, so get_proportions would
# group by a duplicated column name -- presumably next_cols should name a
# different, finer level; confirm the intended hierarchy.
key_cols = ["cluster", "family"]
next_cols = ["family"]

In [None]:
# Recompute the proportions for the currently selected grouping levels.
prop_df = get_proportions(key_cols=key_cols, next_cols=next_cols, x_df=train_df)

In [None]:
# Bottom-up run: fit one model per (store, family) series, then forecast the
# test rows with the model matching each series.
key_cols = ["store_nbr", "family"]
models_df = fit_bottom_up(
    key_cols=key_cols,
    train_df=train_df,
    stores_dict=stores_dict,
    all_holidays_df=all_holidays_df,
)
preds_df = predict_bottom_up(test_df=test_df, models_df=models_df)

In [None]:
# Cross-validated error of the per-series models for the current key_cols.
err = all_cross_validation(
    key_cols=key_cols,
    train_df=train_df,
    stores_dict=stores_dict,
    all_holidays_df=all_holidays_df,
)

In [None]:
# Display the cross-validation score stored in `err` as the cell output.
err