In [1]:
import os
import numpy as np
import pandas as pd
import pyarrow
import time
import json
import joblib

In [2]:
from datetime import datetime, timedelta

In [3]:
from sklearn.preprocessing import MinMaxScaler

In [4]:
import tensorflow as tf

In [5]:
tf.__version__

'2.4.1'

In [6]:
from bokeh.plotting import figure, show, output_file, save
from bokeh.io import output_notebook
output_notebook()

In [7]:
# continue with the next code cell to build and persist the time series dataframes
# (normalized after splitting) that will be used to create TFRecord-SLDB

# or, ignore the following cells and jump directly to import the already created
# ts_train.pkl, ts_eval.pkl, ts_test.pkl, scaler_train_eval.save, scaler_test.save files


In [8]:
# Pandas can retrieve data from Parquet by column name;
# the columns to read are declared row by row and then flattened into a single list
_mird_column_rows = (
    ['timestamp'],
    ['Van', 'Vbn', 'Vcn', 'Vav'],
    ['ia', 'ib', 'ic', 'iav'],
    ['kw', 'kvar', 'kwan', 'kwbn', 'kwcn', 'kvaran', 'kvarbn', 'kvarcn'],
    ['f', 'fp'],
    ['thdvan', 'thdvbn', 'thdvcn', 'thdia', 'thdib', 'thdic'],
    ['desbV', 'desbI'],
    ['kwhE', 'kwhR', 'kvarhDel', 'kvarhrec', 'kvarhq3', 'kvarhq4'],
)
mird_columns = [column for row in _mird_column_rows for column in row]

In [9]:
# root directory of the raw Parquet data (absolute local path; adjust it to the running environment)
SOURCE_PARQUET_PATH = '/home/developer/On_Premises/MIRD_ROOT/data/raw'

In [10]:
# time resolution of the data; used as a sub-directory name when building the Parquet paths
resolution = 'hourly'

In [11]:
# identifier of the device to analyze; used to locate its Parquet directory
device = 'CPE04115'

In [12]:
# list the entries stored under the Parquet directory of the device
# (each entry is expected to be named after a date)
path = '{}/{}/{}.parquet'.format(SOURCE_PARQUET_PATH, resolution, device)
available_dates = sorted(os.listdir(path))

# after sorting, the first entry is the earliest date and the last entry is the latest one
start_date = available_dates[0]
end_date = available_dates[-1]

availability_message = 'Data is available for {} dates of {}, from {} to {}.'
print(availability_message.format(len(available_dates), device, start_date, end_date))

Data is available for 1343 dates of CPE04115, from 2016-01-01 to 2019-11-07.


In [13]:
# now mark the selected date interval for the analysis
# data is lost for almost two months, starting 2018-08-10
# trim the data from 2016JAN to 2018JUL, for 31 complete months

In [14]:
# manually redefine the analysis interval (first and last date, both included)
start_date = '2016-01-01'
end_date = '2018-07-31'

In [15]:
# parse both ends of the interval into datetime objects
start_datetime, end_datetime = [datetime.strptime(day, '%Y-%m-%d')
                                for day in (start_date, end_date)]

In [16]:
# how long is the datetime range for the device?
# one datetime per day, with both the start and the end date included
n_days = (end_datetime - start_datetime).days + 1
datetime_range = [start_datetime + timedelta(days=offset) for offset in range(n_days)]

In [17]:
# report how many dates the analysis interval spans
required_message = 'Data is required for {} valid dates between {} and {}.'
print(required_message.format(len(datetime_range), start_date, end_date))

Data is required for 943 valid dates between 2016-01-01 and 2018-07-31.


In [18]:
# get a sorted list with the required dates ('YYYY-MM-DD' strings) to complete the interval
# (the loop variable is named `day` so that it does not shadow the imported `datetime` class,
# and the date is formatted explicitly instead of slicing the string representation)
required_dates = sorted(day.strftime('%Y-%m-%d') for day in datetime_range)

In [19]:
# is there any valid date missing in the acquired interval?
# (looking dates up in a set keeps the check linear in the number of dates)
available_dates_set = set(available_dates)
missing_dates = sorted(date for date in required_dates if date not in available_dates_set)
print('Found {} required dates missing in the available dataset.'.format(len(missing_dates)))

Found 0 required dates missing in the available dataset.


In [20]:
# start from an empty dataframe that already has the expected MIRD columns
base_df = pd.DataFrame(columns=mird_columns)

In [21]:
# read one Parquet partition per required date and collect the daily dataframes;
# concatenating them once at the end avoids re-copying the accumulated data on every
# iteration (DataFrame.append has also been removed in pandas 2.0)
daily_frames = []
for date in required_dates:
    path = '{}/{}/{}.parquet/{}'.format(SOURCE_PARQUET_PATH, resolution, device, date)
    daily_frames.append(pd.read_parquet(path,
                                        columns=mird_columns,
                                        engine='pyarrow'))

# when there is nothing to load, base_df is left as it was
if daily_frames:
    base_df = pd.concat(daily_frames, ignore_index=True)

# need to change timestamp column from string to datetime
base_df['timestamp'] = pd.to_datetime(base_df['timestamp'])
# sort the data on timestamp because the order might have been lost in the previous operations
base_df = base_df.sort_values(by=['timestamp'])
# re-index data on timestamp column
base_df = base_df.set_index('timestamp')

In [22]:
# verify the base dataframe
base_df

Unnamed: 0_level_0,Van,Vbn,Vcn,Vav,ia,ib,ic,iav,kw,kvar,...,thdib,thdic,desbV,desbI,kwhE,kwhR,kvarhDel,kvarhrec,kvarhq3,kvarhq4
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-01 00:00:00,7991.550000,7995.656667,7953.081667,13805.566667,102.378333,75.071367,86.018033,87.822500,2089.078333,165.990167,...,0.048200,0.060117,0.000200,0.016367,173.666667,0.0,13.333333,0.0,0.0,0.0
2016-01-01 01:00:00,7962.071667,7967.080000,7924.573333,13755.666667,96.282683,71.170117,81.418733,82.957167,1966.555000,147.476000,...,0.047983,0.060733,0.000200,0.015900,163.333333,0.0,12.166667,0.0,0.0,0.0
2016-01-01 02:00:00,7960.150000,7965.326667,7922.908333,13752.583333,90.951667,68.328600,76.391567,78.557283,1861.533333,139.629000,...,0.048767,0.061917,0.000200,0.015567,155.166667,0.0,11.833333,0.0,0.0,0.0
2016-01-01 03:00:00,7957.391667,7962.635000,7922.533333,13749.233333,84.878417,64.244167,74.124150,74.415567,1763.545000,122.566500,...,0.049867,0.062117,0.000200,0.014183,146.333333,0.0,10.000000,0.0,0.0,0.0
2016-01-01 04:00:00,7965.773333,7969.850000,7933.076667,13764.266667,79.699933,61.212633,70.431350,70.447983,1671.995000,110.991333,...,0.045950,0.059667,0.000183,0.013367,138.666667,0.0,9.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-07-31 19:00:00,7861.456667,7856.568333,7829.065000,13578.833333,134.919000,116.542500,124.206667,125.222500,2902.200000,487.758333,...,3.648333,4.278333,0.100000,7.746467,241.000000,0.0,40.000000,0.0,0.0,0.0
2018-07-31 20:00:00,7855.471667,7850.665000,7820.493333,13567.016667,134.807667,115.811500,125.180500,125.266667,2912.063333,404.165167,...,4.013333,4.640000,0.116667,7.768217,243.666667,0.0,33.666667,0.0,0.0,0.0
2018-07-31 21:00:00,7826.050000,7822.853333,7791.426667,13517.250000,139.743667,117.616333,128.172333,128.510667,2983.391667,360.991833,...,4.083333,4.708333,0.133333,8.853917,248.166667,0.0,30.000000,0.0,0.0,0.0
2018-07-31 22:00:00,7893.463333,7887.863333,7850.766667,13627.850000,132.072667,108.757667,119.247167,120.025667,2810.393333,319.759000,...,4.176667,4.943333,0.200000,10.045383,233.333333,0.0,26.333333,0.0,0.0,0.0


In [23]:
# save a version of the base dataframe in CSV format to test TFX components
# (to developer home path)

# base_df.to_csv('/home/developer/CPE04115_H.csv')

In [24]:
# upper limit for the y-axis of the plots:
# the 0.995 quantile of kw, enlarged by a slack factor
percentage = 0.995
slack = 1.25
ceil_kw = base_df['kw'].quantile(percentage) * slack

In [25]:
# plot the active power time series as it was read from the source
fig_kw = figure(
    x_axis_type='datetime',
    y_range=(0., ceil_kw),
    plot_width=960,
    plot_height=400,
    title='Active Power (hourly) for {}.'.format(device))

fig_kw.grid.grid_line_alpha = 0.3

fig_kw.xaxis.axis_label = 'Date'
# the kw column holds kilowatts, not watts
# (readings around 2000 for roughly 13.8 kV and 90 A per phase)
fig_kw.yaxis.axis_label = 'Active Power [kW]'

fig_kw.line(base_df.index, base_df.kw, color='#A6CEE3', legend_label='kw')

# uncomment the following two lines to save plot
# output_file('/home/developer/gcp/cbidmltsf/datasets/cfe/{}_H_kw.html'.format(device))
# save(fig_kw)

# uncomment the following line to display plot
show(fig_kw)

In [26]:
# there are no missing values
# identify outliers by Z-score

In [27]:
import numpy as np

In [28]:
from scipy.stats import zscore

In [29]:
# work on a copy so that the base dataframe keeps the original readings
preprocessed_df = base_df.copy()

In [30]:
# asymmetric Z-score limits: [lower, upper]
z_threshold = [-2.0, 2.6]
# compute the Z-scores once and reuse them for both tails
kw_zscores = zscore(base_df.kw)
low_outliers = list(kw_zscores < z_threshold[0])
high_outliers = list(kw_zscores > z_threshold[1])

In [31]:
# a reading is an outlier when it falls in either of the two tails
outliers_list = [is_low or is_high for is_low, is_high in zip(low_outliers, high_outliers)]

In [32]:
# report how many readings were flagged as outliers
outliers_message = 'Found {} outliers with absolute Z-score outside {} in {} lectures.'
print(outliers_message.format(sum(outliers_list), z_threshold, base_df.kw.count()))

Found 59 outliers with absolute Z-score outside [-2.0, 2.6] in 22629 lectures.


In [33]:
# mark the outliers as missing values (NaN) in the working copy;
# a single .loc call replaces the chained assignment (df.kw[mask] = ...),
# which may write to a temporary object instead of the dataframe
preprocessed_df.loc[np.asarray(outliers_list, dtype=bool), 'kw'] = np.nan

In [34]:
# plot the active power time series with the outliers removed (shown as gaps)
fig_kw = figure(
    x_axis_type='datetime',
    y_range=(0., ceil_kw),
    plot_width=960,
    plot_height=400,
    title='Active Power (hourly) for {}.'.format(device))

fig_kw.grid.grid_line_alpha = 0.3

fig_kw.xaxis.axis_label = 'Date'
# the kw column holds kilowatts, not watts
# (readings around 2000 for roughly 13.8 kV and 90 A per phase)
fig_kw.yaxis.axis_label = 'Active Power [kW]'

# x and y are taken from the same dataframe (it shares its index with base_df)
fig_kw.line(preprocessed_df.index, preprocessed_df.kw, color='#A6CEE3', legend_label='kw')

# uncomment the following two lines to save plot
# output_file('/home/developer/gcp/cbidmltsf/datasets/cfe/{}_H_kw.html'.format(device))
# save(fig_kw)

# uncomment the following line to display plot
show(fig_kw)

In [35]:
# simple correction of outliers
# update NaN values to
# the immediate last week value
# (or, maybe, the average of the last n week-values)

In [36]:
# index entries (timestamps) of the readings flagged as outliers, i.e. where kw has to be filled
dates_to_fill = preprocessed_df.index[outliers_list]

In [37]:
# traverse all dates with a NaN in the variable of interest (kw)
for date in dates_to_fill:
    # get the timestamp for a week before
    date_minus_one_week = date - timedelta(days=7)
    # there may be no reading one week before (first week of the series, or a gap in the data)
    if date_minus_one_week not in preprocessed_df.index:
        continue
    previous_week_kw = preprocessed_df.at[date_minus_one_week, 'kw']
    # a missing value is NaN, which is never `None`, so it has to be tested explicitly;
    # writing through .at updates the dataframe itself instead of a temporary row object
    if pd.notna(previous_week_kw):
        preprocessed_df.at[date, 'kw'] = previous_week_kw

In [38]:
# plot the active power time series after the outliers have been replaced
fig_kw = figure(
    x_axis_type='datetime',
    y_range=(0., ceil_kw),
    plot_width=960,
    plot_height=400,
    title='Active Power (hourly) for {}.'.format(device))

fig_kw.grid.grid_line_alpha = 0.3

fig_kw.xaxis.axis_label = 'Date'
# the kw column holds kilowatts, not watts
# (readings around 2000 for roughly 13.8 kV and 90 A per phase)
fig_kw.yaxis.axis_label = 'Active Power [kW]'

# x and y are taken from the same dataframe (it shares its index with base_df)
fig_kw.line(preprocessed_df.index, preprocessed_df.kw, color='#A6CEE3', legend_label='kw')

# uncomment the following two lines to save plot
# output_file('/home/developer/gcp/cbidmltsf/datasets/cfe/{}_H_kw.html'.format(device))
# save(fig_kw)

# uncomment the following line to display plot
show(fig_kw)

In [39]:
# now, save the resulting time series as it is the base for further work on forecasting
# save the time series only, not the complete dataframe, as other variables have not been preprocessed
# Pandas pickle or Parquet archive? (answer: use the format required to produce SLDBs)

In [40]:
# persist the preprocessed time series to Pandas pickle
# scale it first, for data security, because it will be persisted to the cloud

In [41]:
# take an independent copy of the kw column as the time series to work with
time_series_kw = preprocessed_df['kw'].copy()

In [42]:
time_series_kw.count()

22629

In [43]:
# split data set into train/eval (seen data) and test (unseen data)
# split at time series level to avoid data overlapping

# split first between seen and unseen to avoid data leakage
# (normalize seen data and unseen data after splitting)

In [44]:
# split the time series by timestamp index instead of by proportions
# review the time interval where the time series resides
time_series_kw.index[0], time_series_kw.index[-1]

(Timestamp('2016-01-01 00:00:00'), Timestamp('2018-07-31 23:00:00'))

In [45]:
# there are 31 months in the current time series;
# each stage is delimited by its first and last hourly timestamp

# 24 months for training dataset (seen data)
ts_train_start, ts_train_end = '2016-01-01 00:00:00', '2017-12-31 23:00:00'

# 4 months for evaluation dataset (seen data)
ts_eval_start, ts_eval_end = '2018-01-01 00:00:00', '2018-04-30 23:00:00'

# 3 months for test dataset (unseen data)
ts_test_start, ts_test_end = '2018-05-01 00:00:00', '2018-07-31 23:00:00'

In [46]:
# get the time series to build the datasets seen by the model (training, evaluation)
# (label-based slice: both the start and the end timestamp are included)
time_series_train_eval = time_series_kw.loc[ts_train_start:ts_eval_end]

In [47]:
# get the time series to build the dataset unseen by the model (test)
# (label-based slice: both the start and the end timestamp are included)
time_series_test = time_series_kw.loc[ts_test_start:ts_test_end]

In [48]:
# normalize data after seen/unseen split to avoid min/max leakage from test to train_eval

In [49]:
# scale datasets to improve neural networks performance
from sklearn.preprocessing import MinMaxScaler

In [50]:
# scaler persistence
import joblib

In [51]:
# get a scaler to normalize seen data (train and eval together) to the [0, 1] range
scaler_train_eval = MinMaxScaler(feature_range=(0, 1))

In [52]:
# build the scaled time series for seen data:
# the series is passed to the scaler as a single-column 2-D array
seen_values = time_series_train_eval.to_numpy().reshape(-1, 1)
time_series_kw_train_eval_scaled = scaler_train_eval.fit_transform(seen_values)

In [53]:
time_series_kw_train_eval_scaled.shape

(20421, 1)

In [54]:
# get a scaler to normalize unseen data to the [0, 1] range
# NOTE(review): this scaler is fitted on the test series only, so test values are mapped with a
# different min/max than the train/eval data -- confirm this is what the evaluation expects
scaler_test = MinMaxScaler(feature_range=(0, 1))

In [55]:
# build the scaled time series for unseen data:
# the series is passed to the scaler as a single-column 2-D array
unseen_values = time_series_test.to_numpy().reshape(-1, 1)
time_series_kw_test_scaled = scaler_test.fit_transform(unseen_values)

In [56]:
time_series_kw_test_scaled.shape

(2208, 1)

In [59]:
# or simply recover previously built identifier
# NOTE(review): the cell right below assigns `identifier` again with a fresh time-based suffix;
# when running the notebook top to bottom, run only one of the two cells
identifier = 'CPE04115_H_kw_20210526212214'

In [57]:
# build a string identifier for the time series and its directory inside timeseries/:
# device, first letter of the resolution in upper case, variable name, and
# a time-based suffix to manage different versions of the same time series
resolution_code = resolution[0].upper()
version_suffix = time.strftime('%Y%m%d%H%M%S')
identifier = '{}_{}_{}_{}'.format(device, resolution_code, 'kw', version_suffix)

In [58]:
# build the path of the directory that holds the persisted time series artifacts
# (absolute local path; the directory itself is not created by this statement)
time_series_folder = '/home/developer/gcp/cbidmltsf/timeseries/{}'.format(identifier)

In [59]:
# create the time series directory, reporting when it is already there
try:
    os.mkdir(time_series_folder)
except FileExistsError:
    print('Error: directory {} already exists.'.format(time_series_folder))
else:
    print('Directory {} was created.'.format(time_series_folder))

Error: directory /home/developer/gcp/cbidmltsf/timeseries/CPE04115_H_kw_20210526212214 already exists.


In [193]:
# path used to persist the fitted scaler_train_eval (seen data) inside the time series directory
scaler_train_eval_filename = '{}/scaler_train_eval.save'.format(time_series_folder)

In [194]:
# write the fitted scaler for seen data to disk with joblib
joblib.dump(scaler_train_eval, scaler_train_eval_filename)

['/home/developer/gcp/cbidmltsf/timeseries/CPE04115_H_kw_20210526212214/scaler_train_eval.save']

In [195]:
# path used to persist the fitted scaler_test (unseen data) inside the time series directory
scaler_test_filename = '{}/scaler_test.save'.format(time_series_folder)

In [196]:
# write the fitted scaler for unseen data to disk with joblib
joblib.dump(scaler_test, scaler_test_filename)

['/home/developer/gcp/cbidmltsf/timeseries/CPE04115_H_kw_20210526212214/scaler_test.save']

In [51]:
# the scaled time series are NumPy arrays, with only values for the variable
# the arrays do not contain timestamps
# need to build new Pandas time series from the scaled one
# to add timestamp before persisting it to disk

In [60]:
# rebuild a timestamp-indexed dataframe from the scaled array of seen data,
# reusing the index of the series the array was computed from
time_series_kw_train_eval_scaled_df = pd.DataFrame(
    time_series_kw_train_eval_scaled,
    index=time_series_train_eval.index,
    columns=['kw_scaled'],
)

In [61]:
time_series_kw_train_eval_scaled_df

Unnamed: 0_level_0,kw_scaled
timestamp,Unnamed: 1_level_1
2016-01-01 00:00:00,0.274317
2016-01-01 01:00:00,0.217363
2016-01-01 02:00:00,0.168545
2016-01-01 03:00:00,0.122996
2016-01-01 04:00:00,0.080440
...,...
2018-04-30 19:00:00,0.615922
2018-04-30 20:00:00,0.616713
2018-04-30 21:00:00,0.647489
2018-04-30 22:00:00,0.594716


In [62]:
# once the time series for train and eval have been normalized together
# separate them before being persisted

In [63]:
# rows of the training stage (label-based slice, both timestamps included)
time_series_kw_train_scaled_df = time_series_kw_train_eval_scaled_df.loc[ts_train_start:ts_train_end]

In [64]:
time_series_kw_train_scaled_df

Unnamed: 0_level_0,kw_scaled
timestamp,Unnamed: 1_level_1
2016-01-01 00:00:00,0.274317
2016-01-01 01:00:00,0.217363
2016-01-01 02:00:00,0.168545
2016-01-01 03:00:00,0.122996
2016-01-01 04:00:00,0.080440
...,...
2017-12-31 19:00:00,0.542273
2017-12-31 20:00:00,0.478005
2017-12-31 21:00:00,0.414886
2017-12-31 22:00:00,0.358717


In [65]:
# rows of the evaluation stage (label-based slice, both timestamps included)
time_series_kw_eval_scaled_df = time_series_kw_train_eval_scaled_df.loc[ts_eval_start:ts_eval_end]

In [66]:
time_series_kw_eval_scaled_df

Unnamed: 0_level_0,kw_scaled
timestamp,Unnamed: 1_level_1
2018-01-01 00:00:00,0.281576
2018-01-01 01:00:00,0.225610
2018-01-01 02:00:00,0.169079
2018-01-01 03:00:00,0.133864
2018-01-01 04:00:00,0.124268
...,...
2018-04-30 19:00:00,0.615922
2018-04-30 20:00:00,0.616713
2018-04-30 21:00:00,0.647489
2018-04-30 22:00:00,0.594716


In [67]:
# now build the time series dataframe for unseen data

In [68]:
# rebuild a timestamp-indexed dataframe from the scaled array of unseen data,
# reusing the index of the series the array was computed from
time_series_kw_test_scaled_df = pd.DataFrame(
    time_series_kw_test_scaled,
    index=time_series_test.index,
    columns=['kw_scaled'],
)

In [69]:
time_series_kw_test_scaled_df

Unnamed: 0_level_0,kw_scaled
timestamp,Unnamed: 1_level_1
2018-05-01 00:00:00,0.277562
2018-05-01 01:00:00,0.174138
2018-05-01 02:00:00,0.114769
2018-05-01 03:00:00,0.099625
2018-05-01 04:00:00,0.080639
...,...
2018-07-31 19:00:00,0.651798
2018-07-31 20:00:00,0.656658
2018-07-31 21:00:00,0.691807
2018-07-31 22:00:00,0.606559


In [209]:
# persist the scaled training time series as a Pandas pickle in the time series directory
ts_train_pickle_filename = '{}/ts_train.pkl'.format(time_series_folder)
time_series_kw_train_scaled_df.to_pickle(ts_train_pickle_filename)

In [210]:
# persist the scaled evaluation time series as a Pandas pickle in the time series directory
ts_eval_pickle_filename = '{}/ts_eval.pkl'.format(time_series_folder)
time_series_kw_eval_scaled_df.to_pickle(ts_eval_pickle_filename)

In [211]:
# persist the scaled test time series as a Pandas pickle in the time series directory
ts_test_pickle_filename = '{}/ts_test.pkl'.format(time_series_folder)
time_series_kw_test_scaled_df.to_pickle(ts_test_pickle_filename)

In [212]:
# verify the persisted time series dataframes

In [213]:
pd.read_pickle(ts_train_pickle_filename)

Unnamed: 0_level_0,kw_scaled
timestamp,Unnamed: 1_level_1
2016-01-01 00:00:00,0.274317
2016-01-01 01:00:00,0.217363
2016-01-01 02:00:00,0.168545
2016-01-01 03:00:00,0.122996
2016-01-01 04:00:00,0.080440
...,...
2017-12-31 19:00:00,0.542273
2017-12-31 20:00:00,0.478005
2017-12-31 21:00:00,0.414886
2017-12-31 22:00:00,0.358717


In [214]:
pd.read_pickle(ts_eval_pickle_filename)

Unnamed: 0_level_0,kw_scaled
timestamp,Unnamed: 1_level_1
2018-01-01 00:00:00,0.281576
2018-01-01 01:00:00,0.225610
2018-01-01 02:00:00,0.169079
2018-01-01 03:00:00,0.133864
2018-01-01 04:00:00,0.124268
...,...
2018-04-30 19:00:00,0.615922
2018-04-30 20:00:00,0.616713
2018-04-30 21:00:00,0.647489
2018-04-30 22:00:00,0.594716


In [215]:
pd.read_pickle(ts_test_pickle_filename)

Unnamed: 0_level_0,kw_scaled
timestamp,Unnamed: 1_level_1
2018-05-01 00:00:00,0.277562
2018-05-01 01:00:00,0.174138
2018-05-01 02:00:00,0.114769
2018-05-01 03:00:00,0.099625
2018-05-01 04:00:00,0.080639
...,...
2018-07-31 19:00:00,0.651798
2018-07-31 20:00:00,0.656658
2018-07-31 21:00:00,0.691807
2018-07-31 22:00:00,0.606559


### skip the following steps to avoid storing the time series again

In [71]:
import json

In [72]:
# get the start and end timestamps for model stages from the corresponding variables above

In [74]:
# time series specs have to be persisted as a JSON file;
# every value is taken from the variables defined earlier in the notebook,
# so the persisted specs cannot drift from the data that was actually split
ts = {
    'device': device,
    'resolution': resolution,
    'variable': 'kw',
    'train': {
        'start': ts_train_start,
        'end': ts_train_end
    },
    'eval': {
        'start': ts_eval_start,
        'end': ts_eval_end
    },
    'test': {
        'start': ts_test_start,
        'end': ts_test_end
    },
    'identifier': identifier
}


### skip the following step to avoid storing json file again

In [None]:
# persist time series specs for further use,
# as an indented JSON file inside the time series directory
json_filename = '{}/ts.json'.format(time_series_folder)
with open(json_filename, 'w') as outfile:
    outfile.write(json.dumps(ts, indent=4))

In [15]:
# once scaled time series for building train, eval, and test datasets have been persisted
# SLDB files can be produced loading the corresponding pickle files, if required

In [18]:
# files that already exist in the time series directory
# scaler_train_eval.save: scaler of the joint time series of data seen by the model
# scaler_test.save: scaler of the time series of data unseen by the model
# ts.json: description dictionary of the time series
# ts_train.pkl
# ts_eval.pkl
# ts_test.pkl: pickle files of the time series, normalized after splitting

In [19]:
# files that will be created in the SLDB directory:
# train.tfrecord
# eval.tfrecord
# test.tfrecord
# sldb.json

In [75]:
# a dictionary to configure and describe the SLDB, filled in key by key
# NOTE(review): the outputs recorded further below (embedding 168, no_targets 168, key 'ARTRFDC')
# were produced with values different from the ones in this cell -- confirm which ones are current
sldb = {'ts': 'CPE04115_H_kw_20210526212214'}
sldb['embedding'] = {'hourly': 96}
sldb['tau'] = {'hourly': 1}
sldb['no_targets'] = 24
# a boolean (binary) to state this SLDB is only functional to the transformer architecture
sldb['BSCTRFM'] = 1

In [10]:
# time series set was built and persisted in a different code
# SLDB construction begins here

In [77]:
# directory of the time series referenced by the SLDB configuration;
# the required time series are loaded from it
time_series_folder = '/home/developer/gcp/cbidmltsf/timeseries/{}'.format(sldb['ts'])

In [78]:
time_series_folder

'/home/developer/gcp/cbidmltsf/timeseries/CPE04115_H_kw_20210526212214'

In [83]:
# an empty dictionary that will hold one time series per model stage
ts = {}

In [84]:
# load scaled time series from pickle files

In [85]:
# scaled training time series, as persisted in the time series directory
ts['train'] = pd.read_pickle('{}/ts_train.pkl'.format(time_series_folder))

In [86]:
# scaled evaluation time series, as persisted in the time series directory
ts['eval'] = pd.read_pickle('{}/ts_eval.pkl'.format(time_series_folder))

In [87]:
# scaled test time series, as persisted in the time series directory
ts['test'] = pd.read_pickle('{}/ts_test.pkl'.format(time_series_folder))

In [80]:
# expand time series dataframe with six columns for sine-cosine positional encoding over hour, day, month
# ToDo: include weekday in positional encoding for later ablation
# ToDo: get the value of days_in_month according to the real date

In [81]:
# prepare sine-cosine positional encoding for the time series
hours_in_day = 24
days_in_month = 30
months_in_year = 12

In [88]:
# build positional encodings for time series in all stages
for stage in ['train', 'eval', 'test']:
    stage_index = ts[stage].index

    # (column suffix, position within the cycle, length of the cycle)
    # the day cycle uses the real number of days of each timestamp's month:
    # with a fixed length of 30, day 31 gets exactly the same encoding as day 1
    cycles = [
        ('hour', np.array(stage_index.hour), hours_in_day),
        ('day', np.array(stage_index.day), np.array(stage_index.days_in_month)),
        ('month', np.array(stage_index.month), months_in_year),
    ]

    # expand the time series dataframe with the sine and cosine component of every cycle,
    # in the order sin_hour, cos_hour, sin_day, cos_day, sin_month, cos_month
    for name, position, period in cycles:
        angle = 2*np.pi*position/period
        # cast the components to float32 and pass them to the dataframe as lists
        ts[stage]['sin_{}'.format(name)] = list(np.sin(angle).astype(np.float32))
        ts[stage]['cos_{}'.format(name)] = list(np.cos(angle).astype(np.float32))


In [89]:
# report results
for stage in ['train', 'eval', 'test']:
    # count the non-null readings of the series by column label; an integer key on the
    # label-indexed result of count() relies on a positional fallback deprecated by pandas
    print('{} lectures in {} time series from {} to {}'.format(ts[stage]['kw_scaled'].count(),
                                                               stage,
                                                               ts[stage].index[0],
                                                               ts[stage].index[-1]))

17542 lectures in train time series from 2016-01-01 00:00:00 to 2017-12-31 23:00:00
2879 lectures in eval time series from 2018-01-01 00:00:00 to 2018-04-30 23:00:00
2208 lectures in test time series from 2018-05-01 00:00:00 to 2018-07-31 23:00:00


In [90]:
# SLDB for transformer has the following features in each row:
# source tensor: kw_scaled, sin_hour, cos_hour, sin_day, cos_day, sin_month, cos_month (?, n_timesteps, 7)
# target tensor: kw_scaled, sin_hour, cos_hour, sin_day, cos_day, sin_month, cos_month (?, n_timesteps, 7)

In [91]:
# data structure to convert to TFRecords: list of NumPy arrays
# build all the possible sub-series of sldb['embedding']['hourly'] elements (the embedding dimension)

In [92]:
# m: the embedding dimension, i.e. the number of time steps of each source/target window
m = sldb['embedding']['hourly']

In [93]:
# start with an empty sub-dictionary for the SLDB statistics of every stage
sldb['stats'] = {}

In [25]:
# a dictionary to store row arrays lists for all the stages
results = dict()

# iterate on stages
for stage in ['train', 'eval', 'test']:

    # use a temporary list for each SLDB feature (column)
    source_arrays_list = list()
    target_arrays_list = list()
    target_pos_encoding_arrays_list = list()
    source_timestamp_arrays_list = list()
    target_timestamp_arrays_list = list()

    # convert the stage dataframe only once, instead of once per time window:
    # the values go to a NumPy array of shape [n_rows, n_features] (timestamp index discarded)
    stage_values = ts[stage].to_numpy()
    # and the timestamp index is passed as strings
    stage_timestamps = ts[stage].index.strftime("%Y-%m-%d %H:%M:%S")

    # remember each row_array spans m + 1 consecutive time steps
    # because the source tensor is row_array[:m]
    # and the target tensor is row_array[1:]
    for start_value in range(ts[stage].shape[0] - m):
        # start_value, end_value are the positions that define the time window sub-series
        end_value = start_value + m + 1

        # values and timestamps of the time window, shapes [m+1, n_features] and [m+1]
        row_array = stage_values[start_value: end_value]
        timestamp_array = stage_timestamps[start_value: end_value]

        # build SLDB columns from the row array and the timestamp array
        source_arrays_list.append(row_array[:m])                     # shape is [m, n_features]
        target_arrays_list.append(row_array[1:, :1])                 # shape is [m, 1]
        target_pos_encoding_arrays_list.append(row_array[1:, 1:])    # shape is [m, n_features - 1]
        source_timestamp_arrays_list.append(timestamp_array[:m])     # shape is [m]
        target_timestamp_arrays_list.append(timestamp_array[1:])     # shape is [m]

    # report stage completion
    print('Dataset created for {} stage with {} source rows.'.format(
        stage, len(source_arrays_list)))

    print('Dataset created for {} stage with {} target rows.'.format(
        stage, len(target_arrays_list)))

    print('Dataset created for {} stage with {} target pos encoding rows.'.format(
        stage, len(target_pos_encoding_arrays_list)))

    print('Dataset created for {} stage with {} total source timestamps.'.format(
        stage, len(source_timestamp_arrays_list)))

    print('Dataset created for {} stage with {} total target timestamps.'.format(
        stage, len(target_timestamp_arrays_list)))

    # given the nature of the autoregressive transformer
    # (target is source, shifted by one time step):
    # row_array[:m] is the source and row_array[1:] is the target

    # add a sub-dictionary for stage results
    results[stage] = dict()

    results[stage]['source'] = source_arrays_list
    results[stage]['target'] = target_arrays_list
    results[stage]['target_pos_encoding'] = target_pos_encoding_arrays_list
    results[stage]['source_timestamp'] = source_timestamp_arrays_list
    results[stage]['target_timestamp'] = target_timestamp_arrays_list

    # add a sub-dictionary for stage stats
    sldb['stats'][stage] = dict()

    # pass number of rows to SLDB statistics dictionary:
    # the number of rows in the source list (any SLDB column works to get the number of rows)
    sldb['stats'][stage]['n_rows'] = len(results[stage]['source'])

Dataset created for train stage with 17374 source rows.
Dataset created for train stage with 17374 target rows.
Dataset created for train stage with 17374 target pos encoding rows.
Dataset created for train stage with 17374 total source timestamps.
Dataset created for train stage with 17374 total target timestamps.
Dataset created for eval stage with 2711 source rows.
Dataset created for eval stage with 2711 target rows.
Dataset created for eval stage with 2711 target pos encoding rows.
Dataset created for eval stage with 2711 total source timestamps.
Dataset created for eval stage with 2711 total target timestamps.
Dataset created for test stage with 2040 source rows.
Dataset created for test stage with 2040 target rows.
Dataset created for test stage with 2040 target pos encoding rows.
Dataset created for test stage with 2040 total source timestamps.
Dataset created for test stage with 2040 total target timestamps.


In [26]:
sldb

{'ts': 'CPE04115_H_kw_20210526212214',
 'embedding': {'hourly': 168},
 'tau': {'hourly': 1},
 'no_targets': 168,
 'ARTRFDC': 1,
 'stats': {'train': {'n_rows': 17374},
  'eval': {'n_rows': 2711},
  'test': {'n_rows': 2040}}}

In [27]:
# verify some SLDB columns before serializing to TFRecord files
results['test']['source'][0].shape

(168, 7)

In [28]:
results['test']['target'][0].shape

(168, 1)

In [29]:
results['test']['target_pos_encoding'][0].shape

(168, 6)

In [30]:
results['test']['source_timestamp'][0].shape

(168,)

In [31]:
results['test']['target_timestamp'][0].shape

(168,)

In [32]:
# use list comprehension to encode the timestamp array (Index dtype) into a list of bytes
# [timestamp.encode() for timestamp in results['test']['timestamp'][0]]

In [33]:
# a function to encode float values for serialized examples
def _float_feature_from_list_of_values(list_of_values):
    """Wrap a list of floats / doubles in a tf.train.Feature that holds a FloatList."""
    float_list = tf.train.FloatList(value=list_of_values)
    return tf.train.Feature(float_list=float_list)

In [34]:
# a function to encode byte values for serialized examples
def _bytes_feature_from_list_of_values(list_of_values):
    """Wrap a list of strings / bytes in a tf.train.Feature that holds a BytesList."""
    bytes_list = tf.train.BytesList(value=list_of_values)
    return tf.train.Feature(bytes_list=bytes_list)

In [35]:
# a string with the basic specifications of the SLDB, as part of the SLDB identifier:
# a fixed 'ARTRFDC' tag plus the hourly embedding dimension, zero-padded to three digits
# NOTE(review): the tag is hard-coded here -- confirm it matches the architecture flag stored in sldb
sldb_specs = 'ARTRFDC_{:03d}'.format(sldb['embedding']['hourly'])
sldb_specs

'ARTRFDC_168'

In [36]:
# get the time-based identifier for the SLDB:
# the identifier of the time series followed by the SLDB specifications
sldb_identifier = '{}_{}'.format(sldb['ts'], sldb_specs)
sldb_identifier

'CPE04115_H_kw_20210526212214_ARTRFDC_168'

In [37]:
# directory (absolute local path) where the TFRecord files and sldb.json of this SLDB are written
sldb_dir = '/home/developer/gcp/cbidmltsf/sldbs/{}'.format(sldb_identifier)
sldb_dir

'/home/developer/gcp/cbidmltsf/sldbs/CPE04115_H_kw_20210526212214_ARTRFDC_168'

In [38]:
# create the SLDB directory, reporting when it is already there
try:
    os.mkdir(sldb_dir)
except FileExistsError:
    print('Error: directory {} already exists.'.format(sldb_dir))
else:
    print('Directory {} was created.'.format(sldb_dir))

Error: directory /home/developer/gcp/cbidmltsf/sldbs/CPE04115_H_kw_20210526212214_ARTRFDC_168 already exists.


In [39]:
# serialize every SLDB row of every stage into one TFRecord file per stage
for stage in ['train', 'eval', 'test']:
    N_ROWS = sldb['stats'][stage]['n_rows']
    filename = '{}/{}.tfrecord'.format(sldb_dir, stage)
    stage_results = results[stage]

    with tf.io.TFRecordWriter(filename) as writer:
        for row in range(N_ROWS):
            # use list comprehensions to encode the timestamp arrays (dtype=Index) into lists of bytes
            source_timestamps = [timestamp.encode()
                                 for timestamp in stage_results['source_timestamp'][row]]
            target_timestamps = [timestamp.encode()
                                 for timestamp in stage_results['target_timestamp'][row]]

            # individual feature definitions; numeric arrays are flattened to one dimension
            feature = {
                'source': _float_feature_from_list_of_values(
                    stage_results['source'][row].flatten()),
                'target': _float_feature_from_list_of_values(
                    stage_results['target'][row].flatten()),
                'target_pos_encoding': _float_feature_from_list_of_values(
                    stage_results['target_pos_encoding'][row].flatten()),
                'source_timestamp': _bytes_feature_from_list_of_values(source_timestamps),
                'target_timestamp': _bytes_feature_from_list_of_values(target_timestamps),
            }

            # features within the example
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            serialized_example = example.SerializeToString()
            writer.write(serialized_example)

In [40]:
# build a path for the json file that describes the SLDB, inside the SLDB directory
json_filename = '{}/sldb.json'.format(sldb_dir)

In [41]:
# persist the final, compact dictionary to JSON
# (the file handle is named `outfile`, so that the `filename` path string used by the
# TFRecord cell is not overwritten with a file object)
with open(json_filename, 'w') as outfile:
    json.dump(sldb, outfile, indent=4)

In [42]:
# do not forget to sync sldbs/ from local to GS after the previous operations!
# (caution: the -d flag deletes remote objects that do not exist in the local directory)
!gsutil rsync -d -r /home/developer/gcp/cbidmltsf/sldbs gs://cbidmltsf/sldbs

Building synchronization state...
Starting synchronization...
Copying file:///home/developer/gcp/cbidmltsf/sldbs/CPE04115_H_kw_20210526212214_ARTRFDC_168/eval.tfrecord [Content-Type=application/octet-stream]...
Copying file:///home/developer/gcp/cbidmltsf/sldbs/CPE04115_H_kw_20210526212214_ARTRFDC_168/sldb.json [Content-Type=application/json]...
Copying file:///home/developer/gcp/cbidmltsf/sldbs/CPE04115_H_kw_20210526212214_ARTRFDC_168/test.tfrecord [Content-Type=application/octet-stream]...
Copying file:///home/developer/gcp/cbidmltsf/sldbs/CPE04115_H_kw_20210526212214_ARTRFDC_168/train.tfrecord [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<ht