## This is the code to build SLDBs for DMSLSTM architecture.
## It supersedes the make_sldb.py script

In [1]:
# replace the variable 'labels' with 'targets', as the latter is more adequate for regression problems

In [2]:
import os
import numpy as np
import pandas as pd
import pyarrow
import json
import joblib

In [3]:
# scale datasets to improve neural networks performance
from sklearn.preprocessing import MinMaxScaler

In [4]:
from datetime import datetime, timedelta

In [5]:
import tensorflow as tf

In [6]:
# files in the time series directory
# scaler.save
# ts.json
# ts.pkl

In [7]:
# files in the SLDB directory:
# train.tfrecord
# eval.tfrecord
# test.tfrecord
# sldb.json

In [6]:
# a dictionary to configure the SLDB
# ToDo: transfer this dictionary to dplstm/configs/sldb_config.py

# modify the dictionary structure:
# no_targets must be the same for all components, then move it to an upper level
# remove components and use the same structure as in architecture_parameters
# 'ts' is the identifier of the persisted time series (it names the folder the series is loaded from).
# Each entry under 'components' configures one resolution, with the parameter meanings documented in
# make_features_targets_timestamps_ohvs: m = embedding dimension, tau = lag, no_targets = targets per example.
sldb = {
    'ts': 'CPE04115_H_kw_20201021084001',
    'components': {
        'hourly': {
            'm': 8,
            'tau': 1,  # consecutive samples
            'no_targets': 24
        },
        'daily': {
            'm': 8,
            'tau': 24,  # presumably one day between samples, assuming an hourly series - confirm
            'no_targets': 24
        },
        'weekly': {
            'm': 4,
            'tau': 168,  # presumably one week (7 * 24) between samples, assuming an hourly series - confirm
            'no_targets': 24
        }
    }
}

In [7]:
# time series was built and persisted in a different code
# SLDB construction begins here

In [8]:
# load the required time series from the folder named after sldb['ts']
# NOTE(review): pd.read_pickle unpickles arbitrary objects; only point it at trusted files
time_series_folder = '/home/developer/gcp/cbidmltsf/timeseries/{}'.format(sldb['ts'])
pickle_filename = '{}/ts.pkl'.format(time_series_folder)
# ts_df is later sliced by row position and indexed by column name,
# so it is presumably a pandas DataFrame with a datetime index - confirm against the code that persisted it
ts_df = pd.read_pickle(pickle_filename)

In [9]:
# evaluation stage is not used for TPU-based training,
# however, evaluation dataset might be useful to get stats from CPU-based training
stages = ['train', 'eval', 'test']

In [10]:
# split data set into train/eval/test at time series level
# to avoid data overlapping at SLDB level
# values are cumulative fractions of the series: the first two are used as the
# train/eval and eval/test cut points, which yields 80% train, 10% eval, 10% test
split = np.array([0.8, 0.9, 1.0])

In [11]:
# get the row positions of the train/eval and eval/test thresholds in the time series
# (np.int was removed from NumPy, and int() on a whole Series is deprecated in pandas,
# so take the non-null count of the first column explicitly and truncate with the builtin int)
train_eval_limit = int(ts_df.count().iloc[0] * split[0])
eval_test_limit = int(ts_df.count().iloc[0] * split[1])

In [12]:
# a dictionary to manage the time series for the different model stages
ts = dict()

In [13]:
# get the time series portion for train set: every row before the train/eval threshold
ts['train'] = ts_df[:train_eval_limit]
# count() is indexed by column name, so select its first entry by position with .iloc
# (a plain integer key on a label-indexed Series is a deprecated positional fallback)
print('{0} lectures in train time series from {1} to {2}'.format(ts['train'].count().iloc[0],
                                                                 ts['train'].index[0],
                                                                 ts['train'].index[-1]))

18103 lectures in train time series from 2016-01-01 00:00:00 to 2018-01-24 08:00:00


In [14]:
# get the time series portion for eval set: rows between the two thresholds
ts['eval'] = ts_df[train_eval_limit:eval_test_limit]
# count() is indexed by column name, so select its first entry by position with .iloc
# (a plain integer key on a label-indexed Series is a deprecated positional fallback)
print('{0} lectures in eval time series from {1} to {2}'.format(ts['eval'].count().iloc[0],
                                                                ts['eval'].index[0],
                                                                ts['eval'].index[-1]))

2263 lectures in eval time series from 2018-01-24 09:00:00 to 2018-04-28 16:00:00


In [15]:
# get the time series portion for test set: every row from the eval/test threshold onwards
ts['test'] = ts_df[eval_test_limit:]
# count() is indexed by column name, so select its first entry by position with .iloc
# (a plain integer key on a label-indexed Series is a deprecated positional fallback)
print('{} lectures in test time series from {} to {}'.format(ts['test'].count().iloc[0],
                                                             ts['test'].index[0],
                                                             ts['test'].index[-1]))

2263 lectures in test time series from 2018-04-28 17:00:00 to 2018-07-31 23:00:00


In [16]:
# a function to one-hot encode a timestamp
def one_hot_encode(timestamp):
    """
    One-hot encode the week-day and the day-hour of a timestamp.

    Input:
           timestamp: an object exposing weekday() and an hour attribute
    Output:
           a 7-element list with a 1. at the week-day position, and
           a 24-element list with a 1. at the hour position
    """
    def _unit_vector(length, hot_position):
        # all zeros, except for a single 1. at the requested position
        vector = np.zeros(length)
        vector[hot_position] = 1.
        return list(vector)

    return _unit_vector(7, timestamp.weekday()), _unit_vector(24, timestamp.hour)

In [19]:
def make_features_targets_timestamps_ohvs(time_series, m, tau, n_targets):
    """
    Slide over a time series and build one example per valid position.

    Input:
           time_series: original time series (sliced by position with .iloc / .index)
           m: embedding dimension (number of lagged values per feature vector)
           tau: lag (step, in rows, between consecutive feature values)
           n_targets: number of targets to predict
    Output:
           features: list of feature vectors, each with the values found every tau rows
                     in the m * tau rows that precede the first target
           targets: list of target vectors, each with n_targets consecutive values
           timestamps: list of lists with the target timestamps,
                       formatted as 'YYYY-MM-DD HH:MM:SS' and encoded to bytes
           oh_wds: list of one-hot vectors describing the weekday of the first target timestamp
           oh_dhs: list of one-hot vectors describing the hour of the day of the first target timestamp
    """
    features, targets, timestamps = [], [], []
    oh_wds, oh_dhs = [], []

    # every example needs m * tau rows of history and n_targets rows of future
    history_span = m * tau
    end_of_starts = time_series.shape[0] - n_targets + 1

    for first_target in range(history_span, end_of_starts):
        after_last_target = first_target + n_targets

        # lagged values that precede the first target
        lagged_values = time_series.iloc[(first_target - history_span):first_target:tau]
        features.append(list(lagged_values))

        # the consecutive values to be predicted
        targets.append(list(time_series.iloc[first_target:after_last_target]))

        # timestamps are extracted as bytes for TFRecord persistence
        target_stamps = list(time_series.index[first_target:after_last_target])
        timestamps.append([stamp.strftime("%Y-%m-%d %H:%M:%S").encode() for stamp in target_stamps])

        # one-hot vectors are built from the first target timestamp only;
        # one_hot_encode already returns lists
        weekday_vector, hour_vector = one_hot_encode(target_stamps[0])
        oh_wds.append(weekday_vector)
        oh_dhs.append(hour_vector)

    return features, targets, timestamps, oh_wds, oh_dhs

In [20]:
# create a dictionary to temporarily store the following SLDBs:
# train (hourly, daily, weekly; each with features, targets, timestamps, oh_wds, oh_dhs)
# eval (same structure; not used for TPU-based training, but useful for stats from CPU-based training)
# test (same structure)

In [21]:
# one independent, empty dictionary per stage and resolution,
# to be filled with the features, targets, timestamps and one-hot vectors
sldb_full = {
    stage_name: {resolution_name: {} for resolution_name in ('hourly', 'daily', 'weekly')}
    for stage_name in ('train', 'eval', 'test')
}

In [22]:
# assign the variable to build the forecast over
variable = 'kw_scaled'

In [23]:
# BUILD ALL THE SLDBs!!!
# for every stage (train, eval, test) and every component (hourly, daily, weekly),
# build the examples from that stage's slice of the selected variable
sldb_item_names = ('features', 'targets', 'timestamps', 'oh_wds', 'oh_dhs')
for stage in stages:
    for component_key in sldb['components'].keys():
        component_config = sldb['components'][component_key]
        component_items = make_features_targets_timestamps_ohvs(
            ts[stage][variable],
            component_config['m'],
            component_config['tau'],
            component_config['no_targets'])
        # the function returns its five lists in the same order as sldb_item_names
        for item_name, item_values in zip(sldb_item_names, component_items):
            sldb_full[stage][component_key][item_name] = item_values

In [26]:
# verify that the target is stored as a no_targets-element list
sldb_full['test']['weekly']['targets'][0]

[0.6525345336504569,
 0.6731292174438517,
 0.69301192311566,
 0.7302509354881197,
 0.7897185400962217,
 0.7017191288920568]

In [27]:
# a list with forecasting resolutions (the component names, in configuration order)
resolutions = list(sldb['components'])

In [28]:
items = ['features', 'targets', 'timestamps', 'oh_wds', 'oh_dhs']

In [29]:
# a dictionary to collect statistics: one empty sub-dictionary per stage and resolution
sldb['stats'] = {
    stage_name: {resolution_name: {} for resolution_name in ('hourly', 'daily', 'weekly')}
    for stage_name in ('train', 'eval', 'test')
}

In [30]:
# report statistics on stages and resolutions of SLDBs
# and persist them to the sldb['stats'] level
for stage in stages:
    for resolution in resolutions:
        # timestamps are persisted as bytes, as in b'YYYY-MM-DD HH:MM:SS'
        # but are required as strings, as in 'YYYY-MM-DD HH:MM:SS'
        # 'from' / 'to' are the first target timestamp of the first / last example;
        # they depend only on stage and resolution, so decode them once here
        # instead of once per item
        from_timestamp_str = sldb_full[stage][resolution]['timestamps'][0][0].decode()
        to_timestamp_str = sldb_full[stage][resolution]['timestamps'][-1][0].decode()
        sldb['stats'][stage][resolution]['from'] = from_timestamp_str
        sldb['stats'][stage][resolution]['to'] = to_timestamp_str
        for item in items:
            # fill the values in the stats sub-dictionary
            item_count = len(sldb_full[stage][resolution][item])
            sldb['stats'][stage][resolution][item] = item_count
            # and log them
            print('{0} {3} / {1} / {2} from {4} to {5}'.format(item_count,
                                                               stage,
                                                               resolution,
                                                               item,
                                                               from_timestamp_str,
                                                               to_timestamp_str))

18090 features / train / hourly from 2016-01-01 08:00:00 to 2018-01-24 03:00:00
18090 targets / train / hourly from 2016-01-01 08:00:00 to 2018-01-24 03:00:00
18090 timestamps / train / hourly from 2016-01-01 08:00:00 to 2018-01-24 03:00:00
18090 oh_wds / train / hourly from 2016-01-01 08:00:00 to 2018-01-24 03:00:00
18090 oh_dhs / train / hourly from 2016-01-01 08:00:00 to 2018-01-24 03:00:00
17906 features / train / daily from 2016-01-09 00:00:00 to 2018-01-24 03:00:00
17906 targets / train / daily from 2016-01-09 00:00:00 to 2018-01-24 03:00:00
17906 timestamps / train / daily from 2016-01-09 00:00:00 to 2018-01-24 03:00:00
17906 oh_wds / train / daily from 2016-01-09 00:00:00 to 2018-01-24 03:00:00
17906 oh_dhs / train / daily from 2016-01-09 00:00:00 to 2018-01-24 03:00:00
17426 features / train / weekly from 2016-01-29 00:00:00 to 2018-01-24 03:00:00
17426 targets / train / weekly from 2016-01-29 00:00:00 to 2018-01-24 03:00:00
17426 timestamps / train / weekly from 2016-01-29 00

In [31]:
# in train set, verify that the hourly, daily and weekly datasets end in the same timestamp
train_stats = sldb['stats']['train']
print(train_stats['hourly']['to'] == train_stats['daily']['to'] == train_stats['weekly']['to'])

True


In [32]:
# in eval set, verify that the hourly, daily and weekly datasets end in the same timestamp
eval_stats = sldb['stats']['eval']
print(eval_stats['hourly']['to'] == eval_stats['daily']['to'] == eval_stats['weekly']['to'])

True


In [33]:
# in test set, verify that the hourly, daily and weekly datasets end in the same timestamp
test_stats = sldb['stats']['test']
print(test_stats['hourly']['to'] == test_stats['daily']['to'] == test_stats['weekly']['to'])

True


In [34]:
# for alignment purposes, record per stage the number of rows
# of the smallest resolution-based dataset
for stage in stages:
    stage_stats = sldb['stats'][stage]
    stage_stats['trimmed_to_count'] = min(stage_stats[resolution]['features'] for resolution in resolutions)
    print('Dataset on {} stage was trimmed to {} rows.'.format(stage, stage_stats['trimmed_to_count']))

Dataset on train stage was trimmed to 17426 rows.
Dataset on eval stage was trimmed to 1586 rows.
Dataset on test stage was trimmed to 1586 rows.


In [35]:
# a new dictionary with final, trimmed data: one empty dictionary per stage,
# hourly, daily, weekly, targets, timestamps, oh_wds, oh_dhs to be added
tfrecords = {stage_name: {} for stage_name in ('train', 'eval', 'test')}

In [36]:
# keep only the last 'trimmed_to_count' rows of every list, so that all lists
# within a stage have the same number of rows
for stage in stages:
    # isolate this value, just for readability
    value_to_trim = sldb['stats'][stage]['trimmed_to_count']
    stage_sldbs = sldb_full[stage]
    # feature vectors come from each resolution-based, temporary dataset
    for resolution_name in ('hourly', 'daily', 'weekly'):
        tfrecords[stage][resolution_name] = stage_sldbs[resolution_name]['features'][-value_to_trim:]
    # targets, timestamps and one-hot vectors are taken from the hourly dataset
    for item_name in ('targets', 'timestamps', 'oh_wds', 'oh_dhs'):
        tfrecords[stage][item_name] = stage_sldbs['hourly'][item_name][-value_to_trim:]

In [37]:
# verify again specs for the contents in tfrecords dictionary
tfrecords['test']['targets'][0]

[0.6525345336504569,
 0.6731292174438517,
 0.69301192311566,
 0.7302509354881197,
 0.7897185400962217,
 0.7017191288920568]

In [38]:
# a function to encode float values for serialized examples
def _float_feature_from_list_of_values(list_of_values):
    """Wraps a list of floats / doubles in a tf.train.Feature holding a float_list."""
    float_list = tf.train.FloatList(value=list_of_values)
    return tf.train.Feature(float_list=float_list)

In [39]:
# a function to encode byte values for serialized examples
def _bytes_feature_from_list_of_values(list_of_values):
    """Wraps a list of strings / bytes in a tf.train.Feature holding a bytes_list."""
    bytes_list = tf.train.BytesList(value=list_of_values)
    return tf.train.Feature(bytes_list=bytes_list)

In [40]:
# ToDo: pass one-hot vectors as _int_features and decode when reading dataset???

In [41]:
# a string with the basic specifications of the SLDB, as part of the SLDB identifier:
# one 'm tau no_targets' triplet per component (three zero-padded digits each),
# in hourly, daily, weekly order, joined by underscores
sldb_specs = '_'.join(
    '{:03d}{:03d}{:03d}'.format(
        sldb['components'][component_name]['m'],
        sldb['components'][component_name]['tau'],
        sldb['components'][component_name]['no_targets'])
    for component_name in ('hourly', 'daily', 'weekly')
)

In [42]:
# build a time-based identifier for the SLDB
sldb_identifier = '{}_{}'.format(sldb['ts'], sldb_specs)
sldb_identifier

'CPE04115_H_kw_20201021084001_008001006_008024006_004168006'

In [43]:
sldb_dir = '/home/developer/gcp/cbidmltsf/sldbs/{}'.format(sldb_identifier)

In [44]:
# create the SLDB directory; an already existing directory is reported, not raised
try:
    os.mkdir(sldb_dir)
except FileExistsError:
    print('Error: directory {} already exists.'.format(sldb_dir))
else:
    print('Directory {} was created.'.format(sldb_dir))

Directory /home/developer/gcp/cbidmltsf/sldbs/CPE04115_H_kw_20201021084001_008001006_008024006_004168006 was created.


In [45]:
# now persist SLDBs as TFRecords, one file per stage
# pairs of (feature name in the serialized example, key of the source list in tfrecords[stage])
float_feature_sources = (
    ('hourly', 'hourly'),
    ('daily', 'daily'),
    ('weekly', 'weekly'),
    ('target', 'targets'),
    ('oh_wd', 'oh_wds'),
    ('oh_dh', 'oh_dhs'),
)
for stage in stages:
    N_ROWS = sldb['stats'][stage]['trimmed_to_count']
    filename = '{}/{}.tfrecord'.format(sldb_dir, stage)
    with tf.io.TFRecordWriter(filename) as writer:
        # one serialized tf.train.Example per row of the trimmed dataset
        for row in range(N_ROWS):
            feature = {
                name: _float_feature_from_list_of_values(tfrecords[stage][source][row])
                for name, source in float_feature_sources
            }
            # target timestamps are stored as a list of bytes values
            feature['timestamp'] = _bytes_feature_from_list_of_values(tfrecords[stage]['timestamps'][row])
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            serialized_example = example.SerializeToString()
            writer.write(serialized_example)

In [46]:
# build a path for the json file
json_filename = '{}/sldb.json'.format(sldb_dir)

In [47]:
# persist the final, compact dictionary (configuration and statistics) to JSON
with open(json_filename, 'w') as json_file:
    json_file.write(json.dumps(sldb, indent=4))

In [48]:
# do not forget to sync sldbs/ from local to GS after the previous operations!