In [1]:
# https://sagemaker-examples.readthedocs.io/en/latest/introduction_to_amazon_algorithms/
#         deepar_synthetic/deepar_synthetic.html

In [2]:
import time
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
# import boto3
# import sagemaker
# from sagemaker import get_execution_role

In [3]:
from bokeh.plotting import figure, show, output_file, save
from bokeh.io import output_notebook
from bokeh.palettes import d3
# Direct Bokeh output to the notebook so that show() renders figures inline in the cell output.
output_notebook()

In [4]:
# Seed NumPy's global random generator so the np.random draws used to build the
# synthetic series are reproducible from one run to the next.
np.random.seed(1)

In [5]:
# generating synthetic data

In [6]:
# Sampling frequency of the series, given as a pandas offset alias ("H" = hourly);
# it is passed to pd.date_range when the series index is built.
freq = "H"
# Forecast horizon in data points; this many trailing points are removed from each
# series to form the training set.
prediction_length = 48
# NOTE(review): not used in the visible cells — presumably the number of past points
# the model is given as context; confirm against the training configuration.
context_length = 72

For this notebook, we will generate 200 noisy time series, each consisting of 400 data points and with a seasonality of 24 hours. In our dummy example, all time series start at the same time point `t0`. When preparing your own data, it is important to use the correct start point for each time series, because the model uses the time point as a frame of reference, which enables it to learn, for example, that weekdays behave differently from weekends.

In [7]:
# Start timestamp shared by every synthetic series.
t0 = "2016-01-01 00:00:00"
# Number of data points in each series.
data_length = 400
# Number of series to generate.
num_ts = 200
# Period of the sinusoidal seasonal component, in data points (24 hours at hourly frequency).
period = 24

In [8]:
# Build num_ts noisy sinusoidal series that all share the same DatetimeIndex.
# The time axis, the unit-amplitude seasonal shape and the index do not depend on the
# individual series, so they are computed once instead of on every iteration.
time_ticks = np.arange(data_length)
seasonal_shape = np.sin(time_ticks * (2 * np.pi) / period)
index = pd.date_range(start=t0, freq=freq, periods=data_length)

time_series = []
for k in range(num_ts):
    # Keep this draw order (level, amplitude, noise): the series produced for a given
    # seed depend on the sequence of calls to the global NumPy generator.
    level = 10 * np.random.rand()
    seas_amplitude = (0.1 + 0.3 * np.random.rand()) * level
    sig = 0.05 * level  # noise parameter (constant in time)
    source = level + seas_amplitude * seasonal_shape
    noise = sig * np.random.randn(data_length)
    data = source + noise
    time_series.append(pd.Series(data=data, index=index))

In [9]:
# Registry of Bokeh figures, keyed by a short label.
plots = {}

In [10]:
label = 'synthetic'

# `width`/`height` are used instead of `plot_width`/`plot_height`, which were
# removed in Bokeh 3.0.
plots[label] = figure(
    x_axis_type='datetime',
    width=960,
    height=400,
    title='DeepAR basic demo ({}).'.format(label))

plots[label].grid.grid_line_alpha = 0.3

plots[label].xaxis.axis_label = 'Date'
plots[label].yaxis.axis_label = 'Value'

# plot the first 10 time series, one color of the 10-color Category10 palette each
for index, ts in enumerate(time_series[:10]):
    plots[label].line(ts.index,
                      ts.values,
                      color=d3['Category10'][10][index],
                      legend_label='{:03d}'.format(index))

# uncomment the following two lines to save the plot as a standalone HTML file
# output_file('/home/developer/gcp/cbidmltsf/datasets/deepar_synthetic/{}.html'.format(label))
# save(plots[label])

# display the plot in the notebook
show(plots[label])

In this example, we will leave out the last section of each of the time series we just generated and use only the first part as training data. Because we will predict 48 data points, we remove the trailing 48 points from each time series to define the training set. The test set contains the full range of each time series.

In [11]:
# Training data: every series with its last prediction_length points held out
# (positional slice, so the start timestamp of each series is unchanged).
time_series_training = [ts.iloc[:-prediction_length] for ts in time_series]

In [12]:
# Compare the length of a full series with that of its training counterpart;
# the two should differ by prediction_length points.
len(time_series[0]), len(time_series_training[0])

(400, 352)

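The following utility functions convert `pandas.Series` objects into the JSON strings that DeepAR can consume (one JSON object per line). We will use these to write the training and test data to files; in this notebook the files are written to local disk rather than directly to S3.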

In [13]:
def series_to_obj(ts, cat=None):
    """Convert a pandas Series into a DeepAR-style record dictionary.

    Parameters
    ----------
    ts : pandas.Series
        Series to convert; it must contain at least one element, because its
        first index label is used as the start of the record.
    cat : optional
        Value stored under the "cat" key. The key is omitted when this is None.

    Returns
    -------
    dict
        ``{"start": str(first index label), "target": list of the values}``,
        plus ``"cat"`` when ``cat`` is given.
    """
    record = {"start": str(ts.index[0]), "target": ts.tolist()}
    if cat is None:
        return record
    record["cat"] = cat
    return record

In [14]:
def series_to_jsonline(ts, cat=None):
    """Serialize a series, and optionally its categories, to one JSON string.

    The record is built by series_to_obj(ts, cat) and encoded with json.dumps.
    No trailing newline is appended; the caller adds the line separator.
    """
    record = series_to_obj(ts, cat)
    return json.dumps(record)

In [15]:
# Text encoding applied to each JSON line before it is written to disk.
encoding = "utf-8"
# Output files for the training set and the test set (one JSON object per line).
# NOTE(review): these are absolute, machine-specific paths; open() does not create
# missing directories, so the target directory must already exist.
FILE_TRAIN = "/home/developer/gcp/cbidmltsf/datasets/deepar_synthetic/train.json"
FILE_TEST = "/home/developer/gcp/cbidmltsf/datasets/deepar_synthetic/test.json"

In [16]:
# Write the training series to FILE_TRAIN, one encoded JSON record per line.
with open(FILE_TRAIN, "wb") as f:
    for ts in time_series_training:
        line = series_to_jsonline(ts) + "\n"
        f.write(line.encode(encoding))

In [17]:
# Write the full-length series to FILE_TEST, one encoded JSON record per line.
with open(FILE_TEST, "wb") as f:
    for ts in time_series:
        line = series_to_jsonline(ts) + "\n"
        f.write(line.encode(encoding))

If you provide the test data channel, as we do in this example, DeepAR will also calculate accuracy metrics for the trained model on this test data set. This is done by predicting the last `prediction_length` points of each time series in the test set and comparing the predictions to the actual values of the time series. The computed error metrics will be included as part of the log output.

In [18]:
# NOTE(review): in the visible cells this is only used to enumerate the indices
# 1..num_test_windows — presumably the number of test windows built per series;
# confirm against the cells that follow.
num_test_windows = 4

In [19]:
# Print the 1-based window indices 1..num_test_windows, one per line.
for k in range(1, num_test_windows + 1):
    print(k)

1
2
3
4
