In [1]:
# make similar SLDBs for DMSLSTM, EDSLSTM, and ARTRFDC architecture models
# from the same time series in the MIRD Parquet archive

# make sure the time series are normalized after being split for
# seen data (training and evaluation) and unseen data (test)
# example of similar SLDBs required:

# hourly, day-ahead

# DMSLSTM_TPU_006: balanced time-resolution [8, 8, 8] LSTM stacks,
#                  [[64, 128], [64, 128], [64, 128]],
#                  [512, 128, 24] dense layer (vector output)
#                  direct, 24-step ahead

# EDSLSTM_TPU_011: m=64, h=256, 'elu' encoder-3/decoder-2, batch_normalization, dense=[1]
#                  direct, 24-step ahead

# ARTRFDC_TPU_001: m=168, 2 decoder layers, 2 heads, ff=1024, dense=[1]
#                  iterative, forecast window up to 168-step ahead

In [2]:
import os
import numpy as np
import pandas as pd
import pyarrow
import time
import json
import joblib

In [3]:
import tensorflow as tf

In [4]:
# once the scaled time series for building train, eval, and test datasets have been persisted,
# SLDB files can be produced by loading the corresponding pickle files

In [5]:
# files that already exist in the time series directory
# scaler_train_eval.save: scaler of the joint time series of data seen by the model
# scaler_test.save: scaler of the time series of data unseen by the model
# ts.json: description dictionary of the time series
# ts_train.pkl
# ts_eval.pkl
# ts_test.pkl: pickle files of the time series, normalized after splitting

In [6]:
# get the time series for a given identifier
identifier = 'CPE04115_H_kw_20210526212214'

In [7]:
# then, build the time series directory
time_series_folder = '/home/developer/gcp/cbidmltsf/timeseries/{}'.format(identifier)

In [8]:
# a path to the JSON file that describes the time series
json_filename = '{}/ts.json'.format(time_series_folder)

In [9]:
# load the time series specs for further use
with open(json_filename, 'r') as input_file:
    ts_dict = json.load(input_file)

In [10]:
ts_dict

{'device': 'CPE04115',
 'resolution': 'hourly',
 'variable': 'kw_scaled',
 'train': {'start': '2016-01-01 00:00:00',
  'end': '2017-12-31 23:00:00',
  'num_lectures': 17542},
 'eval': {'start': '2018-01-01 00:00:00',
  'end': '2018-04-30 23:00:00',
  'num_lectures': 2879},
 'test': {'start': '2018-05-01 00:00:00',
  'end': '2018-07-31 23:00:00',
  'num_lectures': 2208},
 'identifier': 'CPE04115_H_kw_20210526212214'}

In [11]:
# a dictionary to manage the time series for the different model stages
ts = dict()

In [12]:
# define stages as string values
stages = ['train', 'eval', 'test']

In [13]:
# load scaled time series for stages from pickle files
# NOTE(review): pd.read_pickle unpickles arbitrary objects, which is unsafe for
# untrusted files; the ts_<stage>.pkl files are presumably produced by this
# project's own pipeline — confirm before pointing this at external data
for stage in stages:
    # one pickled object per stage, stored under ts['train'], ts['eval'], ts['test']
    ts[stage] = pd.read_pickle('{}/ts_{}.pkl'.format(time_series_folder, stage))

In [14]:
# confirm time series are loaded to dictionary
ts['train']['kw_scaled'].count(), ts['eval']['kw_scaled'].count(), ts['test']['kw_scaled'].count()

(17542, 2879, 2208)

### normalized time series loaded, proceed with SLDB construction

### building an SLDB for DMSLSTM architecture based on the following configuration dictionary

In [15]:
# configuration dictionary of the DMSLSTM SLDB; it is also persisted later as sldb.json
sldb = {
    # identifier of the source time series
    'ts': identifier,
    # embedding dimension (number of lagged values) per resolution
    'embedding': {
        'hourly': 8,
        'daily': 8,
        'weekly': 8
    },
    # lag, in rows of the hourly time series, between consecutive values per resolution
    'tau': {
        'hourly': 1,
        'daily': 24,
        'weekly': 168        
    },
    # number of consecutive target values per example
    # NOTE(review): the outputs recorded in this notebook (17487 hourly train rows,
    # SLDB identifier ending in '_048') correspond to a run with no_targets = 48,
    # not 24 — confirm the intended value and re-run all cells from a fresh kernel
    'no_targets': 24
}

In [16]:
# a function to one-hot encode a timestamp,
# used for single positional encoding in DMSLSTM
def one_hot_encode(timestamp):
    """
    One-hot encode the week-day and the day-hour of a timestamp.

    Input:
           timestamp: an object exposing weekday() (0 to 6) and hour (0 to 23)
    Output:
           a 7-element list encoding the week-day, and
           a 24-element list encoding the day-hour
    """
    # each row of an identity matrix is the one-hot vector of its own position
    weekday_one_hot = np.eye(7)[timestamp.weekday()]
    hour_one_hot = np.eye(24)[timestamp.hour]
    return list(weekday_one_hot), list(hour_one_hot)

In [17]:
# a function to build the features in the DMSLSTM SLDB
def make_features_targets_timestamps_ohvs(time_series, m, tau, n_targets):
    """
    Build the lagged feature vectors, the multi-step targets, the target timestamps
    and the one-hot positional vectors for one resolution of the DMSLSTM SLDB.

    Input:
           time_series: original time series (a pandas Series whose index entries
                        provide strftime(), weekday() and hour)
           m: embedding dimension (number of lagged values per feature vector)
           tau: lag, in rows, between consecutive values of a feature vector
           n_targets: number of consecutive targets to predict
    Output:
           features: list of feature vectors (each one a list of m values)
           targets: list of target vectors (each one a list of n_targets values)
           timestamps: list of lists of target timestamps, encoded as bytes
                       with format '%Y-%m-%d %H:%M:%S'
           oh_wds: list of one-hot vectors describing the weekday
                   of the first target timestamp
           oh_dhs: list of one-hot vectors describing the hour of the day
                   of the first target timestamp
    """
    # materialize values and timestamps once: slicing plain lists per row is equivalent
    # to slicing the pandas objects, and avoids formatting each timestamp n_targets times
    values = list(time_series)
    index = time_series.index
    # timestamps are stored as bytes for TFRecord persistence
    timestamps_as_bytes = [timestamp.strftime("%Y-%m-%d %H:%M:%S").encode() for timestamp in index]

    # lists to store feature vectors, targets, timestamps, and one-hot vectors
    features = []
    targets = []
    timestamps = []
    oh_wds = []
    oh_dhs = []
    # i is the position of the first target of each row: it needs m * tau rows
    # of history before it, and n_targets rows from it onwards
    for i in range(m * tau, time_series.shape[0] - n_targets + 1):
        # m values previous to the first target, taken every tau rows
        features.append(values[(i - m * tau):i:tau])
        # n_targets consecutive values, starting at the first target
        targets.append(values[i:(i + n_targets)])
        # timestamps of all the target values
        timestamps.append(timestamps_as_bytes[i:(i + n_targets)])
        # important, only the first timestamp of the targets is used
        # to build one-hot-encoded vectors for positional encoding
        oh_wd_vector, oh_dh_vector = one_hot_encode(index[i])
        # the one-hot-encode function already returns lists
        oh_wds.append(oh_wd_vector)
        oh_dhs.append(oh_dh_vector)

    return features, targets, timestamps, oh_wds, oh_dhs

In [18]:
# create a dictionary to temporarily store the following SLDBs:
# train (hourly, daily, weekly, targets, timestamps)
# eval (hourly, daily, weekly, targets, timestamps)
# test (hourly, daily, weekly, targets, timestamps)

In [19]:
# one empty, independent container per (stage, resolution) pair, to be filled
# with the features, targets, timestamps and one-hot vectors of that pair
sldb_full = {
    stage_key: {resolution_key: {} for resolution_key in ('hourly', 'daily', 'weekly')}
    for stage_key in ('train', 'eval', 'test')
}

In [20]:
# assign the variable to build the forecast over
# get it from the time series descriptor dictionary
variable = ts_dict['variable']
variable

'kw_scaled'

In [21]:
# a list to iterate on data resolutions
resolutions = [
    'hourly',
    'daily',
    'weekly'
]

### build the SLDB for DMSLSTM architecture

In [22]:
# keys under which the five lists returned by the builder are stored, in return order
builder_output_keys = ('features', 'targets', 'timestamps', 'oh_wds', 'oh_dhs')
for stage in stages:
    # train, eval, test
    for resolution in resolutions:
        # hourly, daily, weekly
        built_lists = make_features_targets_timestamps_ohvs(
            ts[stage][variable],
            sldb['embedding'][resolution],
            sldb['tau'][resolution],
            sldb['no_targets'])
        # store every returned list under its key, for this stage and resolution
        sldb_full[stage][resolution].update(zip(builder_output_keys, built_lists))

In [23]:
# verify that the target is stored as a list with number of elements equal to no_targets
len(sldb_full['test']['hourly']['targets'][0]) == sldb['no_targets']

True

In [24]:
# a list to iterate on the sldb items
items = ['features', 'targets', 'timestamps', 'oh_wds', 'oh_dhs']

In [25]:
# a dictionary to collect statistics, with one (initially empty)
# sub-dictionary per stage and resolution
sldb['stats'] = {
    stage_key: {resolution_key: {} for resolution_key in ('hourly', 'daily', 'weekly')}
    for stage_key in ('train', 'eval', 'test')
}

In [26]:
# report statistics on stages and resolutions of SLDBs
# and persist them to the sldb['stats'] level
for stage in stages:
    for resolution in resolutions:
        resolution_sldb = sldb_full[stage][resolution]
        resolution_stats = sldb['stats'][stage][resolution]
        # timestamps are persisted as bytes, as in b'YYYY-MM-DD HH:MM:SS',
        # but are required as strings, as in 'YYYY-MM-DD HH:MM:SS';
        # the first target timestamps of the first and of the last row delimit the dataset
        # and do not depend on the item, so they are decoded once per stage and resolution
        from_timestamp_str = resolution_sldb['timestamps'][0][0].decode()
        to_timestamp_str = resolution_sldb['timestamps'][-1][0].decode()
        for item in items:
            # fill the values in the stats sub-dictionary
            item_count = len(resolution_sldb[item])
            resolution_stats[item] = item_count
            # and log them
            print('{0} {3} / {1} / {2} from {4} to {5}'.format(item_count,
                                                               stage,
                                                               resolution,
                                                               item,
                                                               from_timestamp_str,
                                                               to_timestamp_str))
        # the covered period is stored once, after the item counts
        resolution_stats['from'] = from_timestamp_str
        resolution_stats['to'] = to_timestamp_str

17487 features / train / hourly from 2016-01-01 08:00:00 to 2017-12-30 00:00:00
17487 targets / train / hourly from 2016-01-01 08:00:00 to 2017-12-30 00:00:00
17487 timestamps / train / hourly from 2016-01-01 08:00:00 to 2017-12-30 00:00:00
17487 oh_wds / train / hourly from 2016-01-01 08:00:00 to 2017-12-30 00:00:00
17487 oh_dhs / train / hourly from 2016-01-01 08:00:00 to 2017-12-30 00:00:00
17303 features / train / daily from 2016-01-09 00:00:00 to 2017-12-30 00:00:00
17303 targets / train / daily from 2016-01-09 00:00:00 to 2017-12-30 00:00:00
17303 timestamps / train / daily from 2016-01-09 00:00:00 to 2017-12-30 00:00:00
17303 oh_wds / train / daily from 2016-01-09 00:00:00 to 2017-12-30 00:00:00
17303 oh_dhs / train / daily from 2016-01-09 00:00:00 to 2017-12-30 00:00:00
16151 features / train / weekly from 2016-02-26 00:00:00 to 2017-12-30 00:00:00
16151 targets / train / weekly from 2016-02-26 00:00:00 to 2017-12-30 00:00:00
16151 timestamps / train / weekly from 2016-02-26 00

In [27]:
# in train set, verify resolution-based datasets end in the same timestamp
print(sldb['stats']['train']['hourly']['to'] == sldb['stats']['train']['daily']['to'] == sldb['stats']['train']['weekly']['to'])

True


In [28]:
# in eval set, verify resolution-based datasets end in the same timestamp
print(sldb['stats']['eval']['hourly']['to'] == sldb['stats']['eval']['daily']['to'] == sldb['stats']['eval']['weekly']['to'])

True


In [29]:
# in test set, verify resolution-based datasets end in the same timestamp
print(sldb['stats']['test']['hourly']['to'] == sldb['stats']['test']['daily']['to'] == sldb['stats']['test']['weekly']['to'])

True


In [30]:
# get the number of rows in the smallest resolution-based dataset, for alignment purposes
for stage in stages:
    stage_stats = sldb['stats'][stage]
    rows_per_resolution = (stage_stats[resolution_key]['features'] for resolution_key in resolutions)
    stage_stats['trimmed_to_count'] = min(rows_per_resolution)
    print('Dataset on {} stage will be trimmed to {} rows.'.format(stage, stage_stats['trimmed_to_count']))

Dataset on train stage will be trimmed to 16151 rows.
Dataset on eval stage will be trimmed to 1488 rows.
Dataset on test stage will be trimmed to 817 rows.


In [31]:
# a new dictionary with final, trimmed data; for every stage, the entries
# hourly, daily, weekly, targets, timestamps, oh_wds, oh_dhs are to be added
tfrecords = {stage_key: {} for stage_key in ('train', 'eval', 'test')}

In [32]:
for stage in stages:
    # number of trailing rows to keep, so that all resolutions are aligned
    value_to_trim = sldb['stats'][stage]['trimmed_to_count']
    stage_sldb = sldb_full[stage]
    stage_records = tfrecords[stage]
    # feature vectors come from the temporary dataset of their own resolution
    for resolution_key in ('hourly', 'daily', 'weekly'):
        stage_records[resolution_key] = stage_sldb[resolution_key]['features'][-value_to_trim:]
    # targets, target timestamps, and first-target-timestamp-one-hot vectors
    # can be acquired from any resolution-based, temporary dataset; hourly is used
    for item_key in ('targets', 'timestamps', 'oh_wds', 'oh_dhs'):
        stage_records[item_key] = stage_sldb['hourly'][item_key][-value_to_trim:]

In [33]:
# a function to encode float values for serialized examples
def _float_feature_from_list_of_values(list_of_values):
    """Wraps a list of floats / doubles into a tf.train.Feature holding a float_list."""
    float_list = tf.train.FloatList(value=list_of_values)
    return tf.train.Feature(float_list=float_list)

In [34]:
# a function to encode byte values for serialized examples
def _bytes_feature_from_list_of_values(list_of_values):
    """Wraps a list of strings / bytes into a tf.train.Feature holding a bytes_list."""
    bytes_list = tf.train.BytesList(value=list_of_values)
    return tf.train.Feature(bytes_list=bytes_list)

In [35]:
# a string with the basic specifications of the SLDB, as part of the SLDB identifier:
# zero-padded (embedding, tau) pairs for hourly, daily and weekly resolutions,
# followed by the zero-padded number of targets
spec_template = '{:03d}{:03d}_' \
                '{:03d}{:03d}_' \
                '{:03d}{:03d}_' \
                '{:03d}'
spec_values = [sldb[spec_key][resolution_key]
               for resolution_key in ('hourly', 'daily', 'weekly')
               for spec_key in ('embedding', 'tau')]
sldb_specs = spec_template.format(*spec_values, sldb['no_targets'])

In [36]:
sldb_specs

'008001_008024_008168_048'

In [37]:
# build the identifier for the SLDB folder, with the time series identifier and the specs string
sldb_identifier = '{}_{}'.format(sldb['ts'], sldb_specs)
sldb_identifier

'CPE04115_H_kw_20210526212214_008001_008024_008168_048'

In [38]:
sldb_dir = '/home/developer/gcp/cbidmltsf/sldbs/{}'.format(sldb_identifier)

In [39]:
try:
    # makedirs also creates any missing parent directory (e.g. a fresh sldbs/ root)
    # and, like mkdir, raises FileExistsError when the SLDB directory itself exists
    os.makedirs(sldb_dir)
    print('Directory {} was created.'.format(sldb_dir))
except FileExistsError:
    # NOTE(review): execution is not stopped here, so the following cells
    # write their files into the already existing directory
    print('Error: directory {} already exists.'.format(sldb_dir))

Directory /home/developer/gcp/cbidmltsf/sldbs/CPE04115_H_kw_20210526212214_008001_008024_008168_048 was created.


### now persist SLDBs as TFRecords

In [40]:
for stage in stages:
    stage_records = tfrecords[stage]
    filename = '{}/{}.tfrecord'.format(sldb_dir, stage)
    with tf.io.TFRecordWriter(filename) as writer:
        # one serialized example per row of the trimmed dataset of this stage
        for row in range(sldb['stats'][stage]['trimmed_to_count']):
            # individual feature definitions of the example
            feature = {
                # the m-hourly vector of hourly lectures previous to the target
                'hourly': _float_feature_from_list_of_values(stage_records['hourly'][row]),
                # the m-daily vector of daily lectures previous to the target
                'daily': _float_feature_from_list_of_values(stage_records['daily'][row]),
                # the m-weekly vector of weekly lectures previous to the target
                'weekly': _float_feature_from_list_of_values(stage_records['weekly'][row]),
                # the no_targets vector of target lectures (no_target-steps-ahead)
                'target': _float_feature_from_list_of_values(stage_records['targets'][row]),
                # the 7d one hot vector of the weekday (from the first target lecture)
                'oh_wd': _float_feature_from_list_of_values(stage_records['oh_wds'][row]),
                # the 24d one hot vector of the dayhour (from the first target lecture)
                'oh_dh': _float_feature_from_list_of_values(stage_records['oh_dhs'][row]),
                # the no_targets vector of target timestamps (no_target-steps-ahead)
                'timestamp': _bytes_feature_from_list_of_values(stage_records['timestamps'][row])
            }
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())

In [41]:
# build a path for the json file
json_filename = '{}/sldb.json'.format(sldb_dir)

In [42]:
# persist the final, compact dictionary to JSON
# (the file handle gets its own name, so that it does not rebind the `filename`
# path string used when writing the TFRecord files to a file object)
with open(json_filename, 'w') as output_file:
    json.dump(sldb, output_file, indent=4)

### do not forget to sync sldbs/ from local to Google Storage after the previous operations!

In [43]:
# mirror the local sldbs/ directory to the bucket; -r recurses into subdirectories
# NOTE(review): with -d, gsutil rsync deletes the objects under gs://cbidmltsf/sldbs
# that are not found in the local directory — make sure the local copy is complete
# before running this cell
!gsutil rsync -d -r /home/developer/gcp/cbidmltsf/sldbs gs://cbidmltsf/sldbs

Building synchronization state...
Starting synchronization...
Copying file:///home/developer/gcp/cbidmltsf/sldbs/CPE04115_H_kw_20210526212214_008001_008024_008168_048/eval.tfrecord [Content-Type=application/octet-stream]...
Copying file:///home/developer/gcp/cbidmltsf/sldbs/CPE04115_H_kw_20210526212214_008001_008024_008168_048/sldb.json [Content-Type=application/json]...
Copying file:///home/developer/gcp/cbidmltsf/sldbs/CPE04115_H_kw_20210526212214_008001_008024_008168_048/test.tfrecord [Content-Type=application/octet-stream]...
Copying file:///home/developer/gcp/cbidmltsf/sldbs/CPE04115_H_kw_20210526212214_008001_008024_008168_048/train.tfrecord [Content-Type=application/octet-stream]...
| [4 files][ 27.4 MiB/ 27.4 MiB]                                                
Operation completed over 4 objects/27.4 MiB.                                     
