In [None]:
# Print the version of the CUDA compiler (nvcc) available on this runtime.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Mon_Oct_12_20:09:46_PDT_2020
Cuda compilation tools, release 11.1, V11.1.105
Build cuda_11.1.TC455_06.29190527_0


In [1]:
### For Colab, install dependencies.

!pip install mxnet-cu110
!pip install gluonts
!pip install fredapi
!pip install stats-can
!pip install --upgrade scikit-learn

Collecting mxnet-cu110
  Downloading mxnet_cu110-1.8.0.post0-py2.py3-none-manylinux2014_x86_64.whl (323.5 MB)
[K     |████████████████████████████████| 323.5 MB 221 bytes/s 
[?25hCollecting graphviz<0.9.0,>=0.8.1
  Downloading graphviz-0.8.4-py2.py3-none-any.whl (16 kB)
Installing collected packages: graphviz, mxnet-cu110
  Attempting uninstall: graphviz
    Found existing installation: graphviz 0.10.1
    Uninstalling graphviz-0.10.1:
      Successfully uninstalled graphviz-0.10.1
Successfully installed graphviz-0.8.4 mxnet-cu110-1.8.0.post0
Collecting gluonts
  Downloading gluonts-0.8.1-py3-none-any.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 5.1 MB/s 
Collecting pydantic~=1.1
  Downloading pydantic-1.8.2-cp37-cp37m-manylinux2014_x86_64.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 30.6 MB/s 
[?25hCollecting typing-extensions~=3.10.0.0
  Downloading typing_extensions-3.10.0.2-py3-none-any.whl (26 kB)
Installing collected packages: typing-exten

In [2]:
# Mount Google Drive at /content/drive so the project folder is reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Work from the project folder on the mounted Drive: later cells use relative
# paths such as './data_files/food_cpi.csv'. `!pwd` prints the directory to confirm.
%cd /content/drive/MyDrive/Colab Notebooks/foodprice-forecasting
!pwd

/content/drive/MyDrive/Colab Notebooks/foodprice-forecasting
/content/drive/MyDrive/Colab Notebooks/foodprice-forecasting


In [4]:
import pandas as pd
# Show floats with 3 decimals. The full option key is required: the bare
# 'precision' pattern matches more than one option on newer pandas releases
# and raises OptionError there.
pd.set_option('display.precision', 3)
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.lines import Line2D
from matplotlib.patches import Patch

import numpy as np
import pickle
import data

import importlib
importlib.reload(data)

from data import update_expl_data, update_target_data, food_categories, preprocess_expl
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

In [5]:
"""
Set sample rate. In this notebook, all data will be resampled at the chosen frequency.
'MS' : Monthly (Month Start)
'W' : Weekly
'D' : Daily
"""

# Chosen sampling frequency and the number of periods that make up one year at that frequency.
frequency = 'MS'
year_period = dict(MS=12, W=52, D=365)
one_year = year_period[frequency]

# Folder for this run's outputs.
output_path = "/content/drive/MyDrive/Colab Notebooks/foodprice-forecasting/output/nbeats_202110"

## Load Data Using APIs

In [6]:
"""
Load food CPI data from January 1986 to the most recently available data.
"""

# Refresh the food CPI table, then resample it to the notebook frequency:
# mean within each period, gaps filled by interpolation.
foodprice_df = (
    update_target_data(food_categories, './data_files/food_cpi.csv')
    .resample(frequency)
    .mean()
    .interpolate()
)
foodprice_df

Unnamed: 0_level_0,Bakery and cereal products (excluding baby food),Dairy products and eggs,"Fish, seafood and other marine products",Food purchased from restaurants,Food,"Fruit, fruit preparations and nuts",Meat,Other food products and non-alcoholic beverages,Vegetables and vegetable preparations
REF_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1986-01-01,69.3,70.9,60.6,59.1,67.3,76.0,65.1,77.5,76.0
1986-02-01,70.3,70.8,61.3,59.1,66.9,77.6,64.2,78.1,68.4
1986-03-01,70.6,71.1,61.3,59.3,67.0,79.2,64.2,78.6,66.2
1986-04-01,71.3,71.0,61.4,59.7,67.7,82.2,63.6,79.5,71.1
1986-05-01,71.2,71.4,61.9,59.9,68.2,83.5,64.0,79.8,75.3
...,...,...,...,...,...,...,...,...,...
2021-04-01,156.2,146.1,145.1,163.2,155.4,141.9,173.5,140.5,151.1
2021-05-01,157.8,146.6,147.6,163.5,156.6,143.9,175.4,141.6,153.8
2021-06-01,157.7,145.3,146.2,163.9,156.8,144.5,176.7,142.2,153.4
2021-07-01,157.9,146.4,146.6,165.2,157.6,141.7,180.9,141.9,154.8


In [7]:
"""
Load exogenous/auxiliary explanatory variables from FRED: https://fred.stlouisfed.org/
These data sources reflect various economic factors that may improve forecasts. 
Please visit the FRED website to learn more about these series, and to find others
that may be useful for food CPI forecasting. 
"""

# FRED series identifiers of the explanatory variables.
data_sources = [
    "DEXCAUS",
    "DCOILWTICO",
    "WILL5000IND",
    "VXOCLS",
    "CUSR0000SAF112",
    "CUSR0000SAF113",
    "CPIFABSL",
    "UNRATE",
    "FEDFUNDS",
    "IRLTLT01CAM156N",
    "LRUNTTTTCAM156S",
    "CPALCY01CAM661N",
    "CPGRLE01CAM657N",
    "QCAR368BIS",
]

# Refresh and preprocess the series, then resample to the notebook frequency
# (mean within each period, gaps filled by interpolation).
expl_df = preprocess_expl(
    update_expl_data(data_sources, './data_files/expl_vars.csv')
)
expl_df_monthly = (
    expl_df
    .resample(frequency)
    .mean()
    .interpolate()
)
expl_df_monthly



Unnamed: 0,DEXCAUS,DCOILWTICO,WILL5000IND,VXOCLS,CUSR0000SAF112,CUSR0000SAF113,CPIFABSL,UNRATE,FEDFUNDS,IRLTLT01CAM156N,LRUNTTTTCAM156S,CPALCY01CAM661N,CPGRLE01CAM657N,QCAR368BIS
1986-01-01,1.392,11.130,5.530,24.700,102.500,110.400,107.500,6.700,8.140,10.042,10.000,52.285,0.188,8.758
1986-02-01,1.392,11.130,5.530,24.700,102.000,105.300,107.400,7.200,7.860,9.967,10.000,52.526,0.718,9.852
1986-03-01,1.392,11.130,5.530,24.700,101.900,105.900,107.600,7.200,7.480,9.402,9.900,52.846,0.758,10.946
1986-04-01,1.392,11.130,5.530,24.700,101.100,107.800,107.800,7.100,6.990,8.848,9.800,53.006,0.515,12.040
1986-05-01,1.377,13.800,5.540,21.920,101.200,110.100,108.200,7.200,6.850,8.932,9.700,53.326,0.840,12.410
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-06-01,1.222,71.378,211.169,14.940,282.224,316.319,275.897,5.733,0.082,1.360,7.658,110.923,0.303,10.759
2021-07-01,1.253,72.463,216.534,15.375,285.300,315.210,277.394,5.383,0.087,1.250,7.292,111.484,0.219,10.759
2021-08-01,1.259,67.943,220.541,15.080,289.882,315.585,279.340,5.009,0.085,1.226,7.004,111.777,0.176,10.759
2021-09-01,1.267,71.522,221.021,17.492,293.126,316.626,280.608,4.800,0.080,1.263,6.900,111.777,0.176,10.759


In [8]:
# Put CPI targets and explanatory variables side by side, keeping only the
# dates on which every column has a value.
combined_df = (
    pd.concat([foodprice_df, expl_df_monthly], axis="columns")
    .dropna(axis="index")
)
combined_df

Unnamed: 0_level_0,Bakery and cereal products (excluding baby food),Dairy products and eggs,"Fish, seafood and other marine products",Food purchased from restaurants,Food,"Fruit, fruit preparations and nuts",Meat,Other food products and non-alcoholic beverages,Vegetables and vegetable preparations,DEXCAUS,DCOILWTICO,WILL5000IND,VXOCLS,CUSR0000SAF112,CUSR0000SAF113,CPIFABSL,UNRATE,FEDFUNDS,IRLTLT01CAM156N,LRUNTTTTCAM156S,CPALCY01CAM661N,CPGRLE01CAM657N,QCAR368BIS
REF_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1986-01-01,69.3,70.9,60.6,59.1,67.3,76.0,65.1,77.5,76.0,1.392,11.130,5.530,24.700,102.500,110.400,107.500,6.700,8.140,10.042,10.000,52.285,0.188,8.758
1986-02-01,70.3,70.8,61.3,59.1,66.9,77.6,64.2,78.1,68.4,1.392,11.130,5.530,24.700,102.000,105.300,107.400,7.200,7.860,9.967,10.000,52.526,0.718,9.852
1986-03-01,70.6,71.1,61.3,59.3,67.0,79.2,64.2,78.6,66.2,1.392,11.130,5.530,24.700,101.900,105.900,107.600,7.200,7.480,9.402,9.900,52.846,0.758,10.946
1986-04-01,71.3,71.0,61.4,59.7,67.7,82.2,63.6,79.5,71.1,1.392,11.130,5.530,24.700,101.100,107.800,107.800,7.100,6.990,8.848,9.800,53.006,0.515,12.040
1986-05-01,71.2,71.4,61.9,59.9,68.2,83.5,64.0,79.8,75.3,1.377,13.800,5.540,21.920,101.200,110.100,108.200,7.200,6.850,8.932,9.700,53.326,0.840,12.410
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-04-01,156.2,146.1,145.1,163.2,155.4,141.9,173.5,140.5,151.1,1.249,61.642,205.603,16.385,272.096,314.461,272.500,5.957,0.065,1.520,8.148,109.802,0.470,10.759
2021-05-01,157.8,146.6,147.6,163.5,156.6,143.9,175.4,141.6,153.8,1.213,65.201,206.708,18.407,277.204,315.636,274.106,5.848,0.070,1.471,8.025,110.363,0.386,10.759
2021-06-01,157.7,145.3,146.2,163.9,156.8,144.5,176.7,142.2,153.4,1.222,71.378,211.169,14.940,282.224,316.319,275.897,5.733,0.082,1.360,7.658,110.923,0.303,10.759
2021-07-01,157.9,146.4,146.6,165.2,157.6,141.7,180.9,141.9,154.8,1.253,72.463,216.534,15.375,285.300,315.210,277.394,5.383,0.087,1.250,7.292,111.484,0.219,10.759


In [9]:
# From here on `foodprice_df` is another name for `combined_df` (same object, no copy),
# so it contains the explanatory columns as well as the food CPI columns. Everything
# below that is derived from `foodprice_df` therefore covers all of those columns.
foodprice_df = combined_df  # being lazy for now...

# NBEATS Model and Experiments

## Data Splitting

For each candidate forecast, we should record any uncertainty/confidence metrics it provides, along with the evaluation metrics for that same model configuration over the test set. For example: when model configuration XYZ was used to forecast Meat prices over the test set (with that data not being used for training or validation!), what were its evaluation metrics on the withheld data? We should report this consistently for ALL EXPERIMENTS.

For all models, we will use the following "simulated" report dates. This is a form of cross-validation over time: we train a model on data up to each cutoff date, then produce and evaluate an 18-month forecast. We can then collect each model's validation metrics, average them across the cutoff dates, and use the result for model selection for the final forecast (or ensemble of forecasts!).

In [10]:
# Cutoff dates of the simulated reports; each one is used below to split the data
# into a training part (up to the date) and a validation part (after the date).
# NOTE(review): "2017-07-01" is absent from this otherwise yearly sequence — confirm that is intentional.
report_sim_dates = ["2015-07-01", "2016-07-01", "2018-07-01", "2019-07-01", "2020-07-01"]

In [11]:
# For every simulated report date, record which index entries are used for
# training (up to and including the date) and which for validation (strictly
# after the date, up to 18 months later).
sim_train_dates, sim_valid_dates = {}, {}

for date in report_sim_dates:
    horizon_end = pd.to_datetime(date) + pd.DateOffset(months=18)
    after_cutoff = foodprice_df.index > date
    within_horizon = foodprice_df.index <= horizon_end
    sim_train_dates[date] = foodprice_df.index[foodprice_df.index <= date]
    sim_valid_dates[date] = foodprice_df.index[after_cutoff & within_horizon]

## Fitting and Evaluating a Single NBEATS Model: Example Using All Food Prices

In [12]:
N = foodprice_df.shape[1]  # number of series (one per column)
T = foodprice_df.shape[0]  # number of time steps (one per row)
prediction_length = 18     # forecast horizon, in periods of `freq`
freq = "MS"
dataset = foodprice_df.T.values  # shape (N, T): one row per series

# Every series begins at the first row of the frame, so the start timestamp is
# taken from the index. A hard-coded date that differs from the first row would
# label every observation and forecast with the wrong dates.
start = pd.Timestamp(foodprice_df.index[0], freq=freq)

In [13]:
from gluonts.dataset.common import ListDataset

# Training set: each series with its last `prediction_length` observations
# held out, wrapped in the "target"/"start" record layout.
train_ds = ListDataset(
    [
        dict(target=series, start=start)
        for series in dataset[:, :-prediction_length]
    ],
    freq=freq,
)

  "Using `json`-module for json-handling. "


In [14]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`.

    Computed as the square root of `mean_squared_error(y_true, y_pred)`.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)

# Evaluation metrics by name; each value is a callable taking `y_true` and `y_pred`.
metrics = dict(
    r2_score=r2_score,
    mae=mean_absolute_error,
    mape=mean_absolute_percentage_error,
    mse=mean_squared_error,
    rmse=rmse,
)
def get_prophet_df(foodprice_df, food_category, dates):
    """Return one category's values on `dates` as a two-column frame.

    The column `food_category` of `foodprice_df` is restricted to `dates`,
    its index is turned into a regular column, and the columns 'REF_DATE'
    and `food_category` are renamed to 'ds' and 'y' respectively.
    """
    selected = foodprice_df[food_category][dates]
    column_names = {'REF_DATE': 'ds', food_category: 'y'}
    return selected.reset_index().rename(columns=column_names)

In [20]:
import mxnet as mx
from gluonts.model.n_beats import NBEATSEnsembleEstimator
from gluonts.mx import Trainer

# Train on the GPU when one is visible to MXNet; otherwise fall back to the CPU
# instead of failing when the context is created.
training_ctx = mx.gpu() if mx.context.num_gpus() > 0 else mx.cpu()

# The three `meta_*` arguments below are reduced settings used while testing.
estimator = NBEATSEnsembleEstimator(
    prediction_length=prediction_length,
    meta_bagging_size=3,  # TODO: change back to 10 after testing?
    # Context lengths are multiples of the forecast horizon.
    meta_context_length=[prediction_length * multiplier for multiplier in (3, 5, 7)],  # TODO: change back to (2,7)?
    meta_loss_function=['sMAPE'],  # TODO: change back to all three (sMAPE, MAPE, MASE)?
    num_stacks=30,
    widths=[512],
    freq=freq,
    # All other Trainer settings are left at their defaults.
    trainer=Trainer(ctx=training_ctx),
)

TRAINER:gluonts.mx.trainer._base.Trainer(add_default_callbacks=True, batch_size=None, callbacks=None, clip_gradient=10.0, ctx=mxnet.context.Context("gpu", 0), epochs=100, hybridize=True, init="xavier", learning_rate=0.001, learning_rate_decay_factor=0.5, minimum_learning_rate=5e-05, num_batches_per_epoch=50, patience=10, weight_decay=1e-08)


In [None]:
# Seed NumPy and MXNet before training so that repeated runs start from the same
# random state (as far as the backend allows).
SEED = 42
np.random.seed(SEED)
mx.random.seed(SEED)

# Fit the ensemble once on `train_ds`.
predictor = estimator.train(train_ds)

100%|██████████| 50/50 [00:04<00:00, 10.72it/s, epoch=1/100, avg_epoch_loss=2.47]
100%|██████████| 50/50 [00:03<00:00, 12.62it/s, epoch=2/100, avg_epoch_loss=1.45]
100%|██████████| 50/50 [00:04<00:00, 12.35it/s, epoch=3/100, avg_epoch_loss=1.38]
100%|██████████| 50/50 [00:04<00:00, 12.36it/s, epoch=4/100, avg_epoch_loss=1.3]
100%|██████████| 50/50 [00:03<00:00, 12.55it/s, epoch=5/100, avg_epoch_loss=1.12]
100%|██████████| 50/50 [00:03<00:00, 12.67it/s, epoch=6/100, avg_epoch_loss=1.15]
100%|██████████| 50/50 [00:03<00:00, 12.90it/s, epoch=7/100, avg_epoch_loss=1.03]
100%|██████████| 50/50 [00:03<00:00, 12.77it/s, epoch=8/100, avg_epoch_loss=1.13]
100%|██████████| 50/50 [00:04<00:00, 12.30it/s, epoch=9/100, avg_epoch_loss=1.15]
100%|██████████| 50/50 [00:03<00:00, 12.62it/s, epoch=10/100, avg_epoch_loss=1.12]
100%|██████████| 50/50 [00:03<00:00, 12.68it/s, epoch=11/100, avg_epoch_loss=1.07]
100%|██████████| 50/50 [00:03<00:00, 12.67it/s, epoch=12/100, avg_epoch_loss=0.958]
100%|████████

In [None]:
# Not used in this cell any more; the import is kept in case other cells rely on it.
from gluonts.evaluation import make_evaluation_predictions

dataset_df = foodprice_df.T  # rows: series, columns: dates

all_valid_metrics = {}  # report date -> {series name -> pd.Series of metric values}

# NOTE(review): `predictor` was fitted once, on `train_ds`, which withholds only the
# final `prediction_length` observations of each series. For the earlier report
# dates the validation window was therefore part of the training data, so those
# scores are in-sample. TODO: retrain per cutoff (estimator.train on the
# training-only series) if true out-of-sample scores are needed.

for report_sim_date in report_sim_dates:

    report_train_dates = sim_train_dates[report_sim_date]
    report_valid_dates = sim_valid_dates[report_sim_date]
    n_valid = len(report_valid_dates)
    all_fc_dates = list(report_train_dates) + list(report_valid_dates)

    # Give the predictor only the observations up to the cutoff, so its forecast
    # starts at the first validation date. This keeps forecast and actual values
    # aligned even when fewer than `prediction_length` validation months exist
    # (e.g. the most recent cutoff); the forecast is then cut to `n_valid` steps.
    train_ds_report = ListDataset(
        [{'target': x, 'start': start} for x in dataset_df[list(report_train_dates)].values],
        freq='MS'
    )
    forecasts = list(predictor.predict(train_ds_report))

    all_food_metrics = {}

    for target_index, forecast_entry in enumerate(forecasts):

        # Series name: forecasts come back in the column order of `foodprice_df`.
        foodprice_category = foodprice_df.columns[target_index]

        forecast_mean = forecast_entry.mean[:n_valid]
        actual_valid = foodprice_df.loc[report_valid_dates, foodprice_category]

        # plot actual
        fig, ax = plt.subplots(figsize=(8,3))
        ax.plot(all_fc_dates, foodprice_df.loc[all_fc_dates, foodprice_category])

        # plot forecast
        ax.plot(report_valid_dates, forecast_mean, color='green')

        ax.set_title(f"{foodprice_category}, {report_sim_date}")
        ax.grid(True)
        plt.show()
        plt.close(fig)  # one figure per series and date: release each after display

        fc_metrics = pd.Series({
            metric_name: metric_fn(y_true=actual_valid, y_pred=forecast_mean)
            for metric_name, metric_fn in metrics.items()
        })
        print(fc_metrics)

        all_food_metrics[foodprice_category] = fc_metrics

    all_valid_metrics[report_sim_date] = all_food_metrics

In [None]:
# One frame per report date: rows are series, columns are metrics.
valid_metrics_concat = {
    report_date: pd.DataFrame(valid_scores).T
    for report_date, valid_scores in all_valid_metrics.items()
}
if not valid_metrics_concat:
    raise ValueError("all_valid_metrics is empty; run the validation cell first.")

# Every frame shares the same rows and columns, so take the labels from the
# first one rather than from a loop variable left over after iteration.
reference_frame = next(iter(valid_metrics_concat.values()))
index = reference_frame.index
columns = reference_frame.columns

# Average each metric element-wise across report dates.
scores = [df.values for df in valid_metrics_concat.values()]
mean_scores = pd.DataFrame(np.mean(scores, axis=0), index=index, columns=columns)
mean_scores

## Fit Models Using All Data To Produce Final Forecast

## Predicted Change in CPI By Category

For the report, we usually express forecasts as the predicted percentage change for the coming year as a whole. We can do this by comparing the mean forecasted CPI for 2022 to the mean of the (known and predicted) values for 2021.