## Data Processing and Visualization

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from prophet import Prophet
from prophet.diagnostics import cross_validation, performance_metrics

  from .autonotebook import tqdm as notebook_tqdm
Importing plotly failed. Interactive plots will not work.


In [2]:
train_df = pd.read_csv("./train.csv")
train_df = train_df.rename({"date":"ds", "sales":"y"}, axis=1)

oil_df = pd.read_csv("oil.csv")
oil_df = oil_df.rename({"date":"ds"}, axis=1)

stores_df = pd.read_csv("stores.csv")
stores_dict = stores_df.set_index("store_nbr").to_dict("index")
holidays_df = pd.read_csv("holidays_events.csv")
test_df = pd.read_csv("test.csv")
test_df = test_df.rename({"date":"ds"}, axis=1)

### Interpolate oil_df

In [3]:
blank_oil_df = pd.DataFrame({"ds":pd.date_range(train_df["ds"].min(), test_df["ds"].max()).astype("str")})
oil_df = blank_oil_df.merge(oil_df, how="left", on="ds")
oil_df["dcoilwtico"] = oil_df["dcoilwtico"].interpolate("nearest")
oil_df.iloc[0, 1] = 93.14

### Process Holidays DF

In [4]:
nth_df = holidays_df[holidays_df["transferred"] == False]
th_df = holidays_df[holidays_df["type"] == "Transfer"]
th_df["description"] = th_df["description"].str.removeprefix("Traslado ")
all_holidays_df = pd.concat([nth_df, th_df], axis=0)[["date", "locale_name", "description"]]
all_holidays_df = all_holidays_df.rename({"date":"ds", "description":"holiday"}, axis=1)
all_holidays_df["lower_window"] = 0
all_holidays_df["upper_window"] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  th_df["description"] = th_df["description"].str.removeprefix("Traslado ")


### Add Additional Regressors

In [5]:
train_df = train_df.merge(oil_df, how="left", on="ds")
test_df = test_df.merge(oil_df, how="left", on="ds")

In [None]:
days_counts = train_df.groupby(["store_nbr", "family"])["ds"].agg("count")

stores = pd.unique(train_df["store_nbr"])
families = pd.unique(train_df["family"])

promotion_counts = train_df[["id", "onpromotion"]].groupby("onpromotion").agg("count")
num_promotions = promotion_counts.shape[0]

states = pd.unique(stores_df["state"])
cities = pd.unique(stores_df["city"])
types = pd.unique(stores_df["type"])
cities_per_state = stores_df[["state", "city"]].drop_duplicates().groupby("state").agg("count")
stores_per_city = stores_df[["city", "store_nbr"]].drop_duplicates().groupby("city").agg("count")
stores_per_cluster = stores_df[["cluster", "store_nbr"]].drop_duplicates().groupby("cluster").agg("count")

subset as necessary for implementation and debugging

In [6]:
train_df = train_df[train_df["store_nbr"] < 4]
test_df = test_df[test_df["store_nbr"] < 4]

### Prophet: one model per store, family

In [7]:
def msle(preds_df):
    return np.mean((np.log(1 + preds_df["y"].values) - np.log(1 + preds_df["yhat"].values))**2)

In [19]:
def fit_bottom_up(train_df, stores_dict, all_holidays_df):
    def fit(x_df):
        store_nbr = x_df["store_nbr"].iloc[0]
        state = stores_dict[store_nbr]["state"]
        city = stores_dict[store_nbr]["city"]
        h_df = all_holidays_df[(all_holidays_df["locale_name"] == "Ecuador") | (all_holidays_df["locale_name"] == state) | (all_holidays_df["locale_name"] == city)]
        h_df = h_df[["ds", "holiday", "lower_window", "upper_window"]]

        model = Prophet(uncertainty_samples=0, holidays=h_df)
        model.add_regressor("onpromotion")
        model.add_regressor("dcoilwtico")

        model.fit(x_df)
        return model
    
    return train_df.groupby(["store_nbr", "family"]).apply(fit).reset_index()

def predict_bottom_up(test_df, models_df):
    def predict(x_df):
        store_nbr = x_df["store_nbr"].iloc[0]
        family = x_df["family"].iloc[0]
        model = models_df[(models_df["store_nbr"] == store_nbr) & (models_df["family"] == family)].iloc[0, 2]
        dates_df = x_df[["ds", "onpromotion", "dcoilwtico"]].reset_index(drop=True)
        return model.predict(dates_df)

    return test_df.groupby(["store_nbr", "family"]).apply(predict).reset_index()

def all_cross_validation(train_df, stores_dict, all_holidays_df):
    def cv(x_df):
        store_nbr = x_df["store_nbr"].iloc[0]
        state = stores_dict[store_nbr]["state"]
        city = stores_dict[store_nbr]["city"]
        h_df = all_holidays_df[(all_holidays_df["locale_name"] == "Ecuador") | (all_holidays_df["locale_name"] == state) | (all_holidays_df["locale_name"] == city)]
        h_df = h_df[["ds", "holiday", "lower_window", "upper_window"]]

        model = Prophet(uncertainty_samples=0, holidays=h_df)
        model.add_regressor("onpromotion")
        model.add_regressor("dcoilwtico")
        model.fit(x_df)
        cv_df = cross_validation(model, initial='1460 days', period='56 days', horizon='16 days')
        cv_df["yhat"] = cv_df["yhat"].clip(lower=0)
        return cv_df.groupby("cutoff").apply(msle).reset_index()

    msles_df = train_df.groupby(["store_nbr", "family"]).apply(cv).reset_index().rename({0:"msle"}, axis=1)
    return np.mean(np.sqrt(msles_df.groupby("cutoff")["msle"].agg("mean").values))


In [None]:
models_df = fit_bottom_up(train_df, stores_dict, all_holidays_df)
preds_df = predict_bottom_up(test_df, models_df)

In [10]:
err = all_cross_validation(train_df, stores_dict, all_holidays_df)

00:38:01 - cmdstanpy - INFO - Chain [1] start processing
00:38:02 - cmdstanpy - INFO - Chain [1] done processing
  0%|          | 0/4 [00:00<?, ?it/s]00:38:04 - cmdstanpy - INFO - Chain [1] start processing
00:38:04 - cmdstanpy - INFO - Chain [1] done processing
 25%|██▌       | 1/4 [00:02<00:06,  2.03s/it]00:38:06 - cmdstanpy - INFO - Chain [1] start processing
00:38:06 - cmdstanpy - INFO - Chain [1] done processing
 50%|█████     | 2/4 [00:04<00:04,  2.01s/it]00:38:07 - cmdstanpy - INFO - Chain [1] start processing
00:38:08 - cmdstanpy - INFO - Chain [1] done processing
 75%|███████▌  | 3/4 [00:05<00:01,  1.92s/it]00:38:09 - cmdstanpy - INFO - Chain [1] start processing
00:38:10 - cmdstanpy - INFO - Chain [1] done processing
100%|██████████| 4/4 [00:07<00:00,  1.95s/it]
  return cv_df.groupby("cutoff").apply(msle).reset_index()
100%|██████████| 4/4 [00:01<00:00,  2.83it/s]
  return cv_df.groupby("cutoff").apply(msle).reset_index()
00:38:13 - cmdstanpy - INFO - Chain [1] start process

In [22]:
err

0.45440711347415136