## Data Processing and Visualization

In [1]:
from datetime import date, datetime, timedelta
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from prophet import Prophet
from prophet.diagnostics import cross_validation, performance_metrics

  from .autonotebook import tqdm as notebook_tqdm
Importing plotly failed. Interactive plots will not work.


In [2]:
train_df = pd.read_csv("./train.csv")
train_df = train_df.rename({"date":"ds", "sales":"y"}, axis=1)

oil_df = pd.read_csv("oil.csv")
oil_df = oil_df.rename({"date":"ds"}, axis=1)

stores_df = pd.read_csv("stores.csv")
stores_dict = stores_df.set_index("store_nbr").to_dict("index")
holidays_df = pd.read_csv("holidays_events.csv")
test_df = pd.read_csv("test.csv")
test_df = test_df.rename({"date":"ds"}, axis=1)

### Interpolate oil_df

In [3]:
blank_oil_df = pd.DataFrame({"ds":pd.date_range(train_df["ds"].min(), test_df["ds"].max()).astype("str")})
oil_df = blank_oil_df.merge(oil_df, how="left", on="ds")
oil_df["dcoilwtico"] = oil_df["dcoilwtico"].interpolate("nearest")
oil_df.iloc[0, 1] = 93.14

### Process Holidays DF

In [4]:
nth_df = holidays_df[holidays_df["transferred"] == False]
th_df = holidays_df[holidays_df["type"] == "Transfer"]
th_df["description"] = th_df["description"].str.removeprefix("Traslado ")
all_holidays_df = pd.concat([nth_df, th_df], axis=0)[["date", "locale_name", "description"]]
all_holidays_df = all_holidays_df.rename({"date":"ds", "description":"holiday"}, axis=1)
all_holidays_df["lower_window"] = 0
all_holidays_df["upper_window"] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  th_df["description"] = th_df["description"].str.removeprefix("Traslado ")


### Add Additional Regressors

In [5]:
train_df = train_df.merge(oil_df, how="left", on="ds")
test_df = test_df.merge(oil_df, how="left", on="ds")

# info cols
train_df = train_df.merge(stores_df[["store_nbr", "cluster"]], how="left", on="store_nbr")
test_df = test_df.merge(stores_df[["store_nbr", "cluster"]], how="left", on="store_nbr")

In [6]:
days_counts = train_df.groupby(["store_nbr", "family"])["ds"].agg("count")

stores = pd.unique(train_df["store_nbr"])
families = pd.unique(train_df["family"])

promotion_counts = train_df[["id", "onpromotion"]].groupby("onpromotion").agg("count")
num_promotions = promotion_counts.shape[0]

states = pd.unique(stores_df["state"])
cities = pd.unique(stores_df["city"])
types = pd.unique(stores_df["type"])
cities_per_state = stores_df[["state", "city"]].drop_duplicates().groupby("state").agg("count")
stores_per_city = stores_df[["city", "store_nbr"]].drop_duplicates().groupby("city").agg("count")
stores_per_cluster = stores_df[["cluster", "store_nbr"]].drop_duplicates().groupby("cluster").agg("count")

subset as necessary for implementation and debugging

In [7]:
#train_df = train_df[train_df["store_nbr"] < 4]
#test_df = test_df[test_df["store_nbr"] < 4]

### Prophet: one model per store, family

In [8]:
def msle(preds_df):
    return np.mean((np.log(1 + preds_df["y"].values) - np.log(1 + preds_df["yhat"].values))**2)

In [9]:
def fit_bottom_up(key_cols, train_df, stores_dict, all_holidays_df):
    def fit(x_df):
        store_nbrs = x_df["store_nbr"].drop_duplicates()
        states = [stores_dict[snbr]["state"] for snbr in store_nbrs]
        cities = [stores_dict[snbr]["city"] for snbr in store_nbrs]
        filter = (all_holidays_df["locale_name"] == "Ecuador")
        for s in states:
            filter = filter | (all_holidays_df["locale_name"] == s)
        for c in cities:
            filter = filter | (all_holidays_df["locale_name"] == c)
        h_df = all_holidays_df[filter]
        h_df = h_df[["ds", "holiday", "lower_window", "upper_window"]]

        x_df = x_df.groupby("ds").agg({"y":"sum", "onpromotion":"sum", "dcoilwtico":"first"}).reset_index()

        model = Prophet(uncertainty_samples=0, holidays=h_df)
        model.add_regressor("onpromotion")
        model.add_regressor("dcoilwtico")

        model.fit(x_df)
        return model
    
    return train_df.groupby(key_cols).apply(fit).reset_index()

def predict_bottom_up(test_df, models_df):
    key_cols = models_df.columns[:-1].to_list()
    def predict(x_df):
        filter = pd.Series(models_df.shape[0] * [True])
        for k in key_cols:
            v = x_df[k].iloc[0]
            filter = filter & (models_df[k] == v)
        model = models_df[filter].iloc[0, -1]

        x_df = x_df.groupby("ds").agg({"onpromotion":"sum", "dcoilwtico":"first"}).reset_index()

        return model.predict(x_df)

    return test_df.groupby(key_cols).apply(predict).reset_index()

def all_cross_validation(key_cols, train_df, stores_dict, all_holidays_df):
    def cv(x_df):
        store_nbrs = x_df["store_nbr"].drop_duplicates()
        states = [stores_dict[snbr]["state"] for snbr in store_nbrs]
        cities = [stores_dict[snbr]["city"] for snbr in store_nbrs]
        filter = (all_holidays_df["locale_name"] == "Ecuador")
        for s in states:
            filter = filter | (all_holidays_df["locale_name"] == s)
        for c in cities:
            filter = filter | (all_holidays_df["locale_name"] == c)
        h_df = all_holidays_df[filter]
        h_df = h_df[["ds", "holiday", "lower_window", "upper_window"]]

        x_df = x_df.groupby("ds").agg({"y":"sum", "onpromotion":"sum", "dcoilwtico":"first"}).reset_index()


        model = Prophet(uncertainty_samples=0, holidays=h_df)
        model.add_regressor("onpromotion")
        model.add_regressor("dcoilwtico")
        model.fit(x_df)
        cv_df = cross_validation(model, initial='1460 days', period='56 days', horizon='16 days')
        cv_df["yhat"] = cv_df["yhat"].clip(lower=0)
        return cv_df.groupby("cutoff").apply(msle).reset_index()

    msles_df = train_df.groupby(key_cols).apply(cv).reset_index().rename({0:"msle"}, axis=1)
    return np.mean(np.sqrt(msles_df.groupby("cutoff")["msle"].agg("mean").values))


## Down-Aggregation

In [29]:
def get_proportions(key_cols, support_cols, x_df):
    x_df = x_df.groupby(key_cols + support_cols + ["ds"]).agg({"y":"sum", "onpromotion":"sum", "dcoilwtico":"first"}).reset_index()
    agg_x_df = x_df.groupby(key_cols + ["ds"]).agg({"y":"sum"}).reset_index().rename({"y":"agg_y"}, axis=1)
    x_df = x_df.merge(agg_x_df, how="left", on=key_cols + ["ds"])
    x_df["prop"] = x_df.loc[:, ["y"]].where(x_df["agg_y"] <= 0, x_df["y"] / x_df["agg_y"], axis=0)
    return x_df.drop(["y", "agg_y"], axis=1)

class VARModel():
    def __init__(self, lag, support_cols):
        self.lag = lag
        self.support_cols = support_cols
    
    def fit(self, x_df, lmbda):
        px_df = x_df.pivot(columns=self.support_cols, index="ds", values="prop").sort_index()
        oil_df = x_df[["ds", "dcoilwtico"]].drop_duplicates().sort_values("ds")
        # need to use loop to build design matrix
        design_cols = []
        for l in range(self.lag):
            dc = px_df.iloc[l:(px_df.shape[0] - self.lag + l), :].values.flatten()
            design_cols.append(dc)
        design_cols.append(np.repeat(oil_df["dcoilwtico"].values[(self.lag):], px_df.shape[1]))
        X = np.stack(design_cols, axis=1)
        y = px_df.iloc[self.lag:, :].values.flatten()
        self.beta = lin_reg(X, y, lmbda)

        self.px = px_df.values[-self.lag:, :].T
        self.d = datetime.strptime(px_df.index[-1], "%Y-%m-%d").date() + timedelta(days=1)
        self.support = px_df.columns
    
    def predict(self, test_oil_df, days):
        test_oil_df = test_oil_df[["ds", "dcoilwtico"]].drop_duplicates().set_index("ds").sort_index()
        ox = np.full((self.px.shape[0], 1), test_oil_df.loc[self.d.strftime("%Y-%m-%d"), "dcoilwtico"])
        bx = np.ones((self.px.shape[0], 1))
        x = np.concatenate([bx, self.px, ox], axis=1)
        d0 = self.d
        out = []
        for i in range(days):
            if i > 0:
                self.px = np.concatenate([self.px[:, 1:], y[:, np.newaxis]], axis=1)
                self.d = self.d + timedelta(days=1)
                ox = np.full((self.px.shape[0], 1), test_oil_df.loc[self.d.strftime("%Y-%m-%d"), "dcoilwtico"])
                bx = np.ones((self.px.shape[0], 1))
                x = np.concatenate([bx, self.px, ox], axis=1)
                
            y = x @ self.beta
            out.append(y)
        ds = pd.date_range(start=d0.strftime("%Y-%m-%d"), periods=days, freq="D", inclusive="left").repeat(self.px.shape[0])
        pdf_dict = {"ds":ds}
        if len(self.support_cols) == 1:
            pdf_dict[self.support_cols[0]] = np.tile(self.support.to_numpy(), (days, ))
        else:
            for j, sc in enumerate(self.support_cols):
                pdf_dict[sc] = np.tile(np.array([self.support[i][j] for i in range(len(self.support))]), (days, ))
            pass
        pdf_dict["yhat"] = np.concatenate(out)
        preds_df = pd.DataFrame(pdf_dict)
        return preds_df


def lin_reg(X, y, lmbda):
    X = np.concatenate([np.ones((X.shape[0], 1)), X], axis=1)
    beta = np.linalg.solve(X.T @ X + lmbda * np.eye(X.shape[1]), X.T @ y)
    return beta

In [30]:
def fit_VAR(key_cols, support_cols, train_df):
    prop_df = get_proportions(key_cols, support_cols, train_df)

    def fit(x_df):
        model = VARModel(lag=6, support_cols=support_cols)
        model.fit(x_df, 0)
        return model
    return prop_df.groupby(key_cols).apply(fit).reset_index()


def predict_VAR(key_cols, test_df, var_models_df):
    def predict(x_df):
        days = x_df["ds"].drop_duplicates().shape[0]
        filter = pd.Series(var_models_df.shape[0] * [True])
        for k in key_cols:
            v = x_df[k].iloc[0]
            filter = filter & (var_models_df[k] == v)
        model = var_models_df[filter].iloc[0, -1]
        return model.predict(test_df, days)

    return test_df.groupby(key_cols).apply(predict).reset_index()

In [35]:
key_cols = ["cluster"]
support_cols = ["store_nbr", "family"]

In [36]:
# cluster = 4
# prop_df = get_proportions(key_cols, support_cols, train_df)
# x_df = prop_df[prop_df["cluster"] == cluster]

# model = VARModel(lag=6, support_cols=support_cols)
# model.fit(x_df, 0)
# model.predict(test_df, 16)

In [37]:
var_models_df = fit_VAR(key_cols, support_cols, train_df)
predict_VAR(key_cols, test_df, var_models_df)

  return prop_df.groupby(key_cols).apply(fit).reset_index()
  return test_df.groupby(key_cols).apply(predict).reset_index()


Unnamed: 0,cluster,level_1,ds,store_nbr,family,yhat
0,1,0,2017-08-16,24,AUTOMOTIVE,0.000162
1,1,1,2017-08-16,24,BABY CARE,0.000117
2,1,2,2017-08-16,24,BEAUTY,0.000182
3,1,3,2017-08-16,24,BEVERAGES,0.080710
4,1,4,2017-08-16,24,BOOKS,0.000117
...,...,...,...,...,...,...
28507,17,523,2017-08-31,51,POULTRY,0.033805
28508,17,524,2017-08-31,51,PREPARED FOODS,0.006683
28509,17,525,2017-08-31,51,PRODUCE,0.231753
28510,17,526,2017-08-31,51,SCHOOL AND OFFICE SUPPLIES,0.001161
