## This is the code to build SLDBs for the autoregressive transformer architecture.
## It supersedes the make_sldb.py script

In [1]:
# replace the variable 'labels' with 'targets', as the latter is more adequate for regression problems

In [2]:
import os
import numpy as np
import pandas as pd
import pyarrow
import json
import joblib

In [3]:
# scale datasets to improve neural networks performance
from sklearn.preprocessing import MinMaxScaler

In [4]:
from datetime import datetime, timedelta

In [5]:
import tensorflow as tf

In [6]:
# files in the time series directory
# scaler.save
# ts.json
# ts.pkl

In [7]:
# files in the SLDB directory:
# train.tfrecord
# eval.tfrecord
# test.tfrecord
# sldb.json

In [8]:
# a dictionary to configure the SLDB
# ToDo: transfer this dictionary to dplstm/configs/sldb_config.py

# modify the dictionary structure:
# no_targets must be the same for all components, then move it to an upper level
# remove components and use the same structure as in architecture_parameters

# ToDo: build all sldb dictionaries on the basis of list-type parameters,
#  by iterating on them to avoid comments on the non-used resolutions, like
#  m = [8, 8, 8], tau = [1, 24, 168], no_targets = [24] or
#  m = [256], tau = [1], no_targets = [24]


# SLDB configuration; later cells add a 'stats' sub-dictionary and persist the whole dictionary as sldb.json
sldb = {
    # identifier of the persisted time series; it is used as the folder name when loading ts.pkl
    'ts': 'CPE04115_H_kw_20201021084001',
    # window length (rows per source/target array), keyed by resolution
    'embedding': {
        'hourly': 168
    },
    # NOTE(review): not read by the cells shown here; presumably the sampling step per resolution - confirm
    'tau': {
        'hourly': 1
    },
    # NOTE(review): not read by the cells shown here; presumably the number of predicted steps - confirm
    'no_targets': 168
}

In [9]:
# time series was built and persisted in a different code
# SLDB construction begins here

In [11]:
# load the required time series
# NOTE(review): absolute, machine-specific path; consider a configurable base directory
time_series_folder = '/home/developer/gcp/cbidmltsf/timeseries/{}'.format(sldb['ts'])
pickle_filename = '{}/ts.pkl'.format(time_series_folder)
# unpickling can execute arbitrary code: only load ts.pkl files that come from a trusted source
ts_df = pd.read_pickle(pickle_filename)

In [12]:
# display the loaded time series (timestamp-indexed; pandas truncates the middle rows)
ts_df

Unnamed: 0_level_0,kw_scaled
timestamp,Unnamed: 1_level_1
2016-01-01 00:00:00,0.274317
2016-01-01 01:00:00,0.217363
2016-01-01 02:00:00,0.168545
2016-01-01 03:00:00,0.122996
2016-01-01 04:00:00,0.080440
...,...
2018-07-31 19:00:00,0.652287
2018-07-31 20:00:00,0.656872
2018-07-31 21:00:00,0.690028
2018-07-31 22:00:00,0.609612


In [13]:
# expand time series dataframe with six columns for sine-cosine positional encoding over hour, day, month

In [14]:
# prepare sine-cosine positional encoding for the time series
# each constant is the period used to map a calendar component onto a full sine/cosine cycle
hours_in_day = 24
# NOTE(review): day-of-month values go from 1 to 31, so with a period of 30
# day 31 gets exactly the same (sin, cos) pair as day 1 - confirm this aliasing is acceptable
days_in_month = 30
months_in_year = 12

In [15]:
# extract the hour, day-of-month, and month of every timestamp in the index as NumPy arrays
timestamp_hour = ts_df.index.hour.to_numpy()
timestamp_day = ts_df.index.day.to_numpy()
timestamp_month = ts_df.index.month.to_numpy()

In [16]:
# build the positional encoding components and cast them to float32
# first map every calendar component to an angle on its own cycle...
hour_angle = 2*np.pi*timestamp_hour/hours_in_day
day_angle = 2*np.pi*timestamp_day/days_in_month
month_angle = 2*np.pi*timestamp_month/months_in_year

# ...then project each angle with sine and cosine
sin_hour = np.sin(hour_angle).astype(np.float32)
cos_hour = np.cos(hour_angle).astype(np.float32)

sin_day = np.sin(day_angle).astype(np.float32)
cos_day = np.cos(day_angle).astype(np.float32)

sin_month = np.sin(month_angle).astype(np.float32)
cos_month = np.cos(month_angle).astype(np.float32)

In [17]:
# expand the time series dataframe with one column per positional encoding component
# (column order matters: it defines the feature order of every source/target array)
positional_encoding_columns = [
    ('sin_hour', sin_hour),
    ('cos_hour', cos_hour),
    ('sin_day', sin_day),
    ('cos_day', cos_day),
    ('sin_month', sin_month),
    ('cos_month', cos_month),
]

# pass every encoding array to the dataframe as a list
for column_name, component in positional_encoding_columns:
    ts_df[column_name] = list(component)

In [18]:
# review the first rows of the expanded time series dataframe
# (kw_scaled plus the six positional encoding columns)
ts_df.head()

Unnamed: 0_level_0,kw_scaled,sin_hour,cos_hour,sin_day,cos_day,sin_month,cos_month
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-01-01 00:00:00,0.274317,0.0,1.0,0.207912,0.978148,0.5,0.866025
2016-01-01 01:00:00,0.217363,0.258819,0.965926,0.207912,0.978148,0.5,0.866025
2016-01-01 02:00:00,0.168545,0.5,0.866025,0.207912,0.978148,0.5,0.866025
2016-01-01 03:00:00,0.122996,0.707107,0.707107,0.207912,0.978148,0.5,0.866025
2016-01-01 04:00:00,0.08044,0.866025,0.5,0.207912,0.978148,0.5,0.866025


In [19]:
# evaluation stage is not used for TPU-based training,
# however, evaluation dataset might be useful to get stats from CPU-based training
# one TFRecord file is written per stage listed here
stages = ['train', 'eval', 'test']

In [20]:
# split data set into train/eval/test at time series level
# to avoid data overlapping at SLDB level
# values are cumulative fractions of the rows: the first two are used below
# as the train/eval and eval/test boundaries
split = np.array([0.8, 0.9, 1.0])

In [21]:
# get the row positions that separate the train, eval, and test portions of the scaled time series
# use the number of rows (shape[0]): the dataframe has more than one column now, so count() returns a vector
# the built-in int replaces np.int, an alias of int that was deprecated in NumPy 1.20 and removed in 1.24
train_eval_limit = int(ts_df.shape[0]*split[0])
eval_test_limit = int(ts_df.shape[0]*split[1])

In [22]:
# time series portions, keyed by model stage ('train', 'eval', 'test')
ts = {}

In [23]:
# get the time series portion for train set: rows before the train/eval limit, selected by position
ts['train'] = ts_df.iloc[:train_eval_limit]
# count() returns one non-null count per column; read the first one by position with iloc,
# as integer keys on a label-indexed Series (count()[0]) are deprecated in recent pandas
print('{0} lectures in train time series from {1} to {2}'.format(ts['train'].count().iloc[0],
                                                                 ts['train'].index[0],
                                                                 ts['train'].index[-1]))

18103 lectures in train time series from 2016-01-01 00:00:00 to 2018-01-24 08:00:00


In [24]:
# get the time series portion for eval set: rows between both limits, selected by position
ts['eval'] = ts_df.iloc[train_eval_limit:eval_test_limit]
# count() returns one non-null count per column; read the first one by position with iloc,
# as integer keys on a label-indexed Series (count()[0]) are deprecated in recent pandas
print('{0} lectures in eval time series from {1} to {2}'.format(ts['eval'].count().iloc[0],
                                                                ts['eval'].index[0],
                                                                ts['eval'].index[-1]))

2263 lectures in eval time series from 2018-01-24 09:00:00 to 2018-04-28 16:00:00


In [25]:
# get the time series portion for test set: rows from the eval/test limit onwards, selected by position
ts['test'] = ts_df.iloc[eval_test_limit:]
# count() returns one non-null count per column; read the first one by position with iloc,
# as integer keys on a label-indexed Series (count()[0]) are deprecated in recent pandas
print('{} lectures in test time series from {} to {}'.format(ts['test'].count().iloc[0],
                                                             ts['test'].index[0],
                                                             ts['test'].index[-1]))

2263 lectures in test time series from 2018-04-28 17:00:00 to 2018-07-31 23:00:00


In [26]:
# start prototype for building SLDB for transformer

In [27]:
# start with the time series for the training set
# how many readings (rows) and columns does it have?
ts['train'].shape

(18103, 7)

In [28]:
# display the training portion of the time series (pandas truncates the middle rows)
ts['train']

Unnamed: 0_level_0,kw_scaled,sin_hour,cos_hour,sin_day,cos_day,sin_month,cos_month
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-01-01 00:00:00,0.274317,0.000000,1.000000e+00,0.207912,0.978148,0.5,0.866025
2016-01-01 01:00:00,0.217363,0.258819,9.659258e-01,0.207912,0.978148,0.5,0.866025
2016-01-01 02:00:00,0.168545,0.500000,8.660254e-01,0.207912,0.978148,0.5,0.866025
2016-01-01 03:00:00,0.122996,0.707107,7.071068e-01,0.207912,0.978148,0.5,0.866025
2016-01-01 04:00:00,0.080440,0.866025,5.000000e-01,0.207912,0.978148,0.5,0.866025
...,...,...,...,...,...,...,...
2018-01-24 04:00:00,0.073374,0.866025,5.000000e-01,-0.951057,0.309017,0.5,0.866025
2018-01-24 05:00:00,0.084031,0.965926,2.588190e-01,-0.951057,0.309017,0.5,0.866025
2018-01-24 06:00:00,0.180768,1.000000,6.123234e-17,-0.951057,0.309017,0.5,0.866025
2018-01-24 07:00:00,0.264623,0.965926,-2.588190e-01,-0.951057,0.309017,0.5,0.866025


In [29]:
# review this link to pass directly from NumPy arrays to TFRecord
# https://stackoverflow.com/questions/45427637/numpy-to-tfrecords-is-there-a-more-simple-way-to-handle-batch-inputs-from-tfrec/45428167#45428167

In [None]:
# SLDB for transformer has the following features in each row:
# source tensor: kw_scaled, sin_hour, cos_hour, sin_day, cos_day, sin_month, cos_month (?, 168, 7)
# target tensor: kw_scaled, sin_hour, cos_hour, sin_day, cos_day, sin_month, cos_month (?, 168, 7)

In [30]:
# data structure to convert to TFRecords: list of NumPy arrays

In [31]:
# build all the possible sub-series of sldb['embedding']['hourly'] elements (the embedding dimension)

In [36]:
# window length: number of consecutive rows in every source/target sub-series
m = sldb['embedding']['hourly']

In [49]:
# per-stage SLDB statistics are collected under the 'stats' key; start with an empty sub-dictionary
sldb['stats'] = {}

In [57]:
# a dictionary to store the source/target row arrays lists for all the stages
results = dict()

# iterate on the stages declared earlier, so this loop and the TFRecord writer use the same list
for stage in stages:

    # convert the stage dataframe to a NumPy array only once;
    # to_numpy() already discards the timestamp index, so no reset_index() is required
    stage_values = ts[stage].to_numpy()

    # number of complete windows of m consecutive rows (zero when the stage has fewer than m rows)
    n_windows = max(stage_values.shape[0] - m + 1, 0)

    # a temporary, full-size list with every time window sub-series, each of shape [m, features], e.g. [168, 7]
    # window k covers rows k to k + m - 1 of the stage
    # the windows are views on stage_values (no data is copied), so treat them as read-only
    row_arrays_list = [stage_values[start_value: start_value + m] for start_value in range(n_windows)]

    # given the nature of the autoregressive transformer
    # (target is source, shifted once to the right):
    # row 0 is the source and row 1 is the target
    # row 1 is the source and row 2 is the target
    # ...
    # row n-1 is the source and row n is the target
    results[stage] = {
        'source': row_arrays_list[:-1],   # from the first row to the one before the last
        'target': row_arrays_list[1:],    # from the second row to the last one
    }

    # both slices drop exactly one element, so source and target always have the same length;
    # use it as the number of rows of the stage (zero when fewer than two windows are available)
    n_rows = len(results[stage]['source'])

    # pass number of rows to the SLDB statistics dictionary
    sldb['stats'][stage] = {'n_rows': n_rows}

    # report stage completion
    print('Dataset created for {} stage with {} total rows.'.format(stage, n_rows))
    

Dataset created for train stage with 17935 total rows.
Dataset created for eval stage with 2095 total rows.
Dataset created for test stage with 2095 total rows.


In [71]:
# review the SLDB dictionary, which now includes the number of rows per stage
sldb

{'ts': 'CPE04115_H_kw_20201021084001',
 'embedding': {'hourly': 168},
 'tau': {'hourly': 1},
 'no_targets': 168,
 'stats': {'test': {'n_rows': 2095},
  'train': {'n_rows': 17935},
  'eval': {'n_rows': 2095}}}

In [59]:
# a function to encode float values for serialized examples
def _float_feature_from_list_of_values(list_of_values):
    """Wrap a sequence of floats / doubles in a tf.train.Feature.

    Args:
        list_of_values: the float values to encode; callers in this notebook
            pass a flattened NumPy array.

    Returns:
        A tf.train.Feature whose float_list holds the given values.
    """
    float_list = tf.train.FloatList(value=list_of_values)
    return tf.train.Feature(float_list=float_list)

In [66]:
# basic specifications of the SLDB, used as a suffix of the SLDB identifier:
# a fixed prefix followed by the window length, zero-padded to three digits
sldb_specs = 'ARTRFDC_' + format(sldb['embedding']['hourly'], '03d')

sldb_specs

'ARTRFDC_168'

In [67]:
# build the SLDB identifier: the time series identifier followed by the SLDB specifications
sldb_identifier = '_'.join([sldb['ts'], sldb_specs])
sldb_identifier

'CPE04115_H_kw_20201021084001_ARTRFDC_168'

In [69]:
# local directory where the TFRecord files and sldb.json of this SLDB are written
# NOTE(review): absolute, machine-specific path; consider a configurable base directory
sldb_dir = '/home/developer/gcp/cbidmltsf/sldbs/{}'.format(sldb_identifier)
sldb_dir

'/home/developer/gcp/cbidmltsf/sldbs/CPE04115_H_kw_20201021084001_ARTRFDC_168'

In [70]:
# create the SLDB directory; an already existing directory is reported, not treated as fatal
try:
    os.mkdir(sldb_dir)
except FileExistsError:
    print('Error: directory {} already exists.'.format(sldb_dir))
else:
    print('Directory {} was created.'.format(sldb_dir))

Directory /home/developer/gcp/cbidmltsf/sldbs/CPE04115_H_kw_20201021084001_ARTRFDC_168 was created.


In [77]:
# sanity check: shape of one source array, expected to be (m, number of columns)
results['train']['source'][0].shape

(168, 7)

In [78]:
# write one TFRecord file per stage, with one serialized example per source/target pair
for stage in stages:
    N_ROWS = sldb['stats'][stage]['n_rows']
    filename = '{}/{}.tfrecord'.format(sldb_dir, stage)

    # look up the stage lists once, outside the row loop
    stage_sources = results[stage]['source']
    stage_targets = results[stage]['target']

    with tf.io.TFRecordWriter(filename) as writer:
        for row in range(N_ROWS):
            # every array is flattened to one dimension before being encoded as a float list
            feature = {
                'source': _float_feature_from_list_of_values(stage_sources[row].flatten()),
                'target': _float_feature_from_list_of_values(stage_targets[row].flatten())
                # ToDo: persist source or target timestamps to be used during prediction
                # 'timestamp': _bytes_feature_from_list_of_values(tfrecords[stage]['timestamps'][row])
            }
            # wrap the features in an example, serialize it, and append it to the file
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())

In [79]:
# build the path of the JSON file that persists the SLDB dictionary, inside the SLDB directory
json_filename = '{}/sldb.json'.format(sldb_dir)

In [80]:
# persist the final, compact dictionary to JSON
# the handle is named json_file because it is an open file object, not a file name;
# this also avoids rebinding the 'filename' variable used when writing the TFRecord files
with open(json_filename, 'w') as json_file:
    json.dump(sldb, json_file, indent=4)

In [81]:
# do not forget to sync sldbs/ from local to GS after the previous operations!
# NOTE(review): with -d, rsync deletes objects under the destination that are not present in the
# local directory; make sure the local sldbs/ folder is complete before running this cell
!gsutil rsync -d -r /home/developer/gcp/cbidmltsf/sldbs gs://cbidmltsf/sldbs

Building synchronization state...
Starting synchronization...
Copying file:///home/developer/gcp/cbidmltsf/sldbs/CPE04115_H_kw_20201021084001_ARTRFDC_168/eval.tfrecord [Content-Type=application/octet-stream]...
Copying file:///home/developer/gcp/cbidmltsf/sldbs/CPE04115_H_kw_20201021084001_ARTRFDC_168/sldb.json [Content-Type=application/json]...
Copying file:///home/developer/gcp/cbidmltsf/sldbs/CPE04115_H_kw_20201021084001_ARTRFDC_168/test.tfrecord [Content-Type=application/octet-stream]...
Copying file:///home/developer/gcp/cbidmltsf/sldbs/CPE04115_H_kw_20201021084001_ARTRFDC_168/train.tfrecord [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<ht