In [1]:
# replace the variable 'labels' with 'targets', as the latter is more appropriate for regression problems

In [2]:
import os
import numpy as np
import pandas as pd
import pyarrow
import json
import joblib

In [3]:
# scale datasets to improve neural networks performance
from sklearn.preprocessing import MinMaxScaler

In [4]:
from datetime import datetime, timedelta

In [5]:
import tensorflow as tf

In [6]:
# files in the time series directory
# scaler.save
# ts.json
# ts.pkl

In [7]:
# files in the SLDB directory:
# train.tfrecord
# eval.tfrecord
# test.tfrecord
# sldb.json

In [8]:
# a dictionary to configure the SLDB
# ToDo: transfer this dictionary to dplstm/configs/sldb_config.py

# modify the dictionary structure:
# no_targets must be the same for all components, then move it to an upper level
# remove components and use the same structure as in architecture_parameters

# ToDo: build all sldb dictionaries on the basis of list-type parameters,
#  by iterating on them to avoid comments on the non-used resolutions, like
#  m = [8, 8, 8], tau = [1, 24, 168], no_targets = [24] or
#  m = [256], tau = [1], no_targets = [24]
# SLDB configuration; these values drive how the supervised-learning windows are built
sldb = {
    # identifier of the source time series; also the name of the folder its pickle is loaded from
    'ts': 'CPE04115_H_kw_20201021084001',
    # embedding dimension (m) per resolution: number of values in each feature vector
    'embedding': {
        'hourly': 256
    },
    # lag (tau) per resolution: step, in time series rows, between consecutive feature values
    'tau': {
        'hourly': 1
    },
    # number of consecutive target values to predict for each feature vector
    'no_targets': 24
}

In [9]:
# the time series was built and persisted by a different piece of code
# SLDB construction begins here

In [10]:
# load the required time series
# NOTE(review): the base folder is a hard-coded absolute path; it must exist on the machine running this notebook
time_series_folder = '/home/developer/gcp/cbidmltsf/timeseries/{}'.format(sldb['ts'])
pickle_filename = '{}/ts.pkl'.format(time_series_folder)
# NOTE(review): unpickling can execute arbitrary code; only load pickles produced by a trusted pipeline
ts_df = pd.read_pickle(pickle_filename)

In [11]:
# evaluation stage is not used for TPU-based training,
# however, evaluation dataset might be useful to get stats from CPU-based training
# the stage names are used as keys of the per-stage dictionaries built in this notebook
stages = ['train', 'eval', 'test']

In [12]:
# split data set into train/eval/test at time series level
# to avoid data overlapping at SLDB level
# values are cumulative fractions of the time series rows:
# train ends at split[0], eval ends at split[1], and the remaining rows form the test set
split = np.array([0.8, 0.9, 1.0])

In [13]:
# get the row positions in the time series that mark the train/eval and eval/test boundaries
# - the built-in int replaces np.int, which was only an alias of it and was removed in NumPy 1.24
# - the count is read as a scalar with .iloc[0], because calling int() on a
#   single-element Series is deprecated in recent pandas versions
# NOTE: count() excludes missing values; the count of the first column is used
train_eval_limit = int(ts_df.count().iloc[0] * split[0])
eval_test_limit = int(ts_df.count().iloc[0] * split[1])

In [14]:
# per-stage container for the time series slices (one entry per model stage)
ts = {}

In [15]:
# get the time series portion for train set: every row before the train/eval boundary
ts['train'] = ts_df[:train_eval_limit]
# .iloc[0] reads the count of the first column by position; plain [0] on the label-indexed
# Series returned by count() relies on a positional fallback deprecated in recent pandas versions
print('{0} lectures in train time series from {1} to {2}'.format(ts['train'].count().iloc[0],
                                                                 ts['train'].index[0],
                                                                 ts['train'].index[-1]))

18103 lectures in train time series from 2016-01-01 00:00:00 to 2018-01-24 08:00:00


In [16]:
# get the time series portion for eval set: rows between the train/eval and eval/test boundaries
ts['eval'] = ts_df[train_eval_limit:eval_test_limit]
# .iloc[0] reads the count of the first column by position; plain [0] on the label-indexed
# Series returned by count() relies on a positional fallback deprecated in recent pandas versions
print('{0} lectures in eval time series from {1} to {2}'.format(ts['eval'].count().iloc[0],
                                                                ts['eval'].index[0],
                                                                ts['eval'].index[-1]))

2263 lectures in eval time series from 2018-01-24 09:00:00 to 2018-04-28 16:00:00


In [17]:
# get the time series portion for test set: every row from the eval/test boundary onwards
ts['test'] = ts_df[eval_test_limit:]
# .iloc[0] reads the count of the first column by position; plain [0] on the label-indexed
# Series returned by count() relies on a positional fallback deprecated in recent pandas versions
print('{} lectures in test time series from {} to {}'.format(ts['test'].count().iloc[0],
                                                             ts['test'].index[0],
                                                             ts['test'].index[-1]))

2263 lectures in test time series from 2018-04-28 17:00:00 to 2018-07-31 23:00:00


In [18]:
# a function to one-hot encode a timestamp
def one_hot_encode(timestamp):
    """
    One-hot encode the week-day and the day-hour of a timestamp.

    Input:
           timestamp: object providing weekday() and hour, used as positions
                      in a 7-element and in a 24-element vector, respectively
    Output:
           a 7-element list encoding the week-day, and a 24-element list encoding the day-hour;
           each list holds 1. at the encoded position and 0. elsewhere
    """
    # row k of an identity matrix is the one-hot vector for position k
    weekday_vector = np.eye(7)[timestamp.weekday()]
    hour_vector = np.eye(24)[timestamp.hour]
    return list(weekday_vector), list(hour_vector)

In [19]:
def make_features_targets_timestamps_ohvs(time_series, m, tau, n_targets):
    """
    Slide a window over a time series to build supervised-learning rows.

    Input:
           time_series: series to window; it is read through .iloc, .index and .shape,
                        and its index entries must be timestamp-like (strftime is called on them)
           m: embedding dimension (number of values in each feature vector)
           tau: lag (step, in rows, between consecutive values of a feature vector)
           n_targets: number of consecutive targets to predict per row
    Output:
           features: list of feature vectors (lists)
           targets: list of target vectors (lists of n_targets values)
           timestamps: list with the target timestamps of each row, encoded as bytes
                       with the '%Y-%m-%d %H:%M:%S' format
           oh_wds: list of one-hot vectors describing the weekday of the first target timestamp
           oh_dhs: list of one-hot vectors describing the hour of the day of the first target timestamp
    """
    features, targets, timestamps, oh_wds, oh_dhs = [], [], [], [], []
    # number of rows spanned by one feature vector; also the first usable target position
    span = m * tau
    # the last row must still leave room for n_targets values
    stop_position = time_series.shape[0] - n_targets + 1
    for first_target in range(span, stop_position):
        after_last_target = first_target + n_targets
        # past values, one every tau rows, ending right before the first target
        features.append(list(time_series.iloc[first_target - span:first_target:tau]))
        # the n_targets values that follow the feature window
        targets.append(list(time_series.iloc[first_target:after_last_target]))
        target_stamps = list(time_series.index[first_target:after_last_target])
        # timestamps are stored as bytes for TFRecord persistence
        timestamps.append([stamp.strftime("%Y-%m-%d %H:%M:%S").encode() for stamp in target_stamps])
        # one-hot vectors describe the first target timestamp only;
        # the timestamp itself is passed, not the list
        weekday_vector, hour_vector = one_hot_encode(target_stamps[0])
        oh_wds.append(weekday_vector)
        oh_dhs.append(hour_vector)

    return features, targets, timestamps, oh_wds, oh_dhs

In [20]:
# create a dictionary to temporarily store the SLDBs built for every stage:
# train, eval, and test (each one holding features, targets, timestamps, and one-hot vectors)
# only the hourly resolution is currently built (daily and weekly are not)
# the eval(uation) dataset is not used for TPU-based training, but it is still built here

In [21]:
# one empty container per stage and resolution, to be filled when the SLDBs are built
sldb_full = {stage: {'hourly': {}} for stage in ('train', 'eval', 'test')}

In [22]:
# assign the variable to build the forecast over
# get it from the extracted time series dataframe
# (the first column of the dataframe is used)
variable = ts_df.columns[0]
# display the selected column name as the cell output
variable

'kw_scaled'

In [23]:
# a list to iterate on data resolutions
# every entry is used as a key of sldb['embedding'], sldb['tau'], and of each stage in sldb_full
resolutions = [
    'hourly'
]

In [24]:
# BUILD ALL THE SLDBs!!!
# every stage (train, eval, test) and every resolution gets its own set of
# features, targets, timestamps, and one-hot vectors
for stage in stages:
    for resolution in resolutions:
        # the builder returns its five lists in exactly this order
        sldb_full[stage][resolution].update(
            zip(
                ('features', 'targets', 'timestamps', 'oh_wds', 'oh_dhs'),
                make_features_targets_timestamps_ohvs(
                    ts[stage][variable],
                    sldb['embedding'][resolution],
                    sldb['tau'][resolution],
                    sldb['no_targets'],
                ),
            )
        )

In [25]:
# verify that the target is stored as a no_targets-element list
# (only the first row of the test stage is checked)
len(sldb_full['test']['hourly']['targets'][0]) == sldb['no_targets']

True

In [26]:
# a list to iterate on the sldb items
# these are the keys stored for every stage and resolution in sldb_full
items = ['features', 'targets', 'timestamps', 'oh_wds', 'oh_dhs']

In [27]:
# a dictionary to collect statistics, with one empty entry per stage and resolution
sldb['stats'] = {stage: {'hourly': {}} for stage in ('train', 'eval', 'test')}

In [28]:
# report statistics on stages and resolutions of SLDBs
# and persist them to the sldb['stats'] level
for stage in stages:
    for resolution in resolutions:
        # timestamps are persisted as bytes, as in b'YYYY-MM-DD HH:MM:SS'
        # but are required as strings, as in 'YYYY-MM-DD HH:MM:SS'
        # 'from' and 'to' are the first target timestamps of the first and of the last row;
        # they do not depend on the item, so they are computed once per stage and resolution
        from_timestamp_str = sldb_full[stage][resolution]['timestamps'][0][0].decode()
        to_timestamp_str = sldb_full[stage][resolution]['timestamps'][-1][0].decode()
        sldb['stats'][stage][resolution]['from'] = from_timestamp_str
        sldb['stats'][stage][resolution]['to'] = to_timestamp_str
        for item in items:
            # fill the values in the stats sub-dictionary
            item_count = len(sldb_full[stage][resolution][item])
            sldb['stats'][stage][resolution][item] = item_count
            # and log them
            print('{0} {3} / {1} / {2} from {4} to {5}'.format(item_count,
                                                               stage,
                                                               resolution,
                                                               item,
                                                               from_timestamp_str,
                                                               to_timestamp_str))

17824 features / train / hourly from 2016-01-11 16:00:00 to 2018-01-23 09:00:00
17824 targets / train / hourly from 2016-01-11 16:00:00 to 2018-01-23 09:00:00
17824 timestamps / train / hourly from 2016-01-11 16:00:00 to 2018-01-23 09:00:00
17824 oh_wds / train / hourly from 2016-01-11 16:00:00 to 2018-01-23 09:00:00
17824 oh_dhs / train / hourly from 2016-01-11 16:00:00 to 2018-01-23 09:00:00
1984 features / eval / hourly from 2018-02-04 01:00:00 to 2018-04-27 17:00:00
1984 targets / eval / hourly from 2018-02-04 01:00:00 to 2018-04-27 17:00:00
1984 timestamps / eval / hourly from 2018-02-04 01:00:00 to 2018-04-27 17:00:00
1984 oh_wds / eval / hourly from 2018-02-04 01:00:00 to 2018-04-27 17:00:00
1984 oh_dhs / eval / hourly from 2018-02-04 01:00:00 to 2018-04-27 17:00:00
1984 features / test / hourly from 2018-05-09 09:00:00 to 2018-07-31 00:00:00
1984 targets / test / hourly from 2018-05-09 09:00:00 to 2018-07-31 00:00:00
1984 timestamps / test / hourly from 2018-05-09 09:00:00 to 2

In [29]:
# for every stage, the smallest feature count among resolutions is the row count
# that all resolution-based datasets are aligned to
for stage in stages:
    sldb['stats'][stage]['trimmed_to_count'] = min(
        sldb['stats'][stage][resolution]['features'] for resolution in resolutions
    )
    print(
        'Dataset on {} stage was trimmed to {} rows.'.format(
            stage,
            sldb['stats'][stage]['trimmed_to_count'],
        )
    )

Dataset on train stage was trimmed to 17824 rows.
Dataset on eval stage was trimmed to 1984 rows.
Dataset on test stage was trimmed to 1984 rows.


In [30]:
# a new dictionary with final, trimmed data; every stage starts empty and
# gets hourly, targets, timestamps, oh_wds, oh_dhs entries added later
tfrecords = {stage: {} for stage in ('train', 'eval', 'test')}

In [31]:
for stage in stages:
    # isolate this value, just for readability
    value_to_trim = sldb['stats'][stage]['trimmed_to_count']
    # targets, timestamps, and one-hot vectors can be acquired from any resolution-based,
    # temporary dataset (hourly, daily, weekly); the hourly one is used here
    hourly_sldb = sldb_full[stage]['hourly']
    # pairs of (key in the hourly SLDB, key in the tfrecords dictionary)
    # ToDo: find out the adequate way to persist timestamps (string?, bytes?);
    #  in the meantime, they are kept in the dictionary but not persisted to tfrecord files
    for source_key, tfrecord_key in (('features', 'hourly'),
                                     ('targets', 'targets'),
                                     ('timestamps', 'timestamps'),
                                     ('oh_wds', 'oh_wds'),
                                     ('oh_dhs', 'oh_dhs')):
        rows = hourly_sldb[source_key]
        # keep the last value_to_trim rows; the start position is computed explicitly because
        # rows[-value_to_trim:] returns every row, instead of none, when value_to_trim is 0
        tfrecords[stage][tfrecord_key] = rows[max(len(rows) - value_to_trim, 0):]

In [32]:
# verify again specs for the contents in tfrecords dictionary
# (displays the targets of the first row of the test stage)
tfrecords['test']['targets'][0]

[0.4169542211238255,
 0.46936634721910186,
 0.5373257822849928,
 0.5879978617414416,
 0.6495006856372554,
 0.656068083392084,
 0.5982212167930769,
 0.6293971815273058,
 0.6244791868419628,
 0.5759515637952541,
 0.5945040557186796,
 0.5904979198462939,
 0.5677866699721876,
 0.49785631832162225,
 0.36367672009730656,
 0.26492093866451816,
 0.17351968205024915,
 0.12694360730416743,
 0.11225160175709092,
 0.1099111383127902,
 0.11352371065333111,
 0.20790690827955427,
 0.2614323233418814,
 0.3075946915407082]

In [40]:
# get datasets from tfrecords dictionary and persist them as NumPy .npy files
# to test transformer operations on a Jupyter notebook

In [44]:
# convert the list of train feature vectors into a NumPy array, one row per feature vector
train_hourly = np.array(tfrecords['train']['hourly'])

In [45]:
# display the array dimensions as the cell output
train_hourly.shape

(17824, 256)

In [46]:
# destination file for the train features array
# NOTE(review): the other files saved by this notebook use the '256_to_24_' prefix;
# confirm whether the shorter '256_' prefix is intended here
path = '/home/developer/gcp/cbidmltsf/sldbs/256_train_hourly.npy'

In [47]:
# persist the train features array as a NumPy .npy file;
# np.save accepts the destination path directly and writes the file in binary mode
np.save(path, train_hourly)

In [48]:
# repeat operation for all required datasets

In [49]:
# persist the train targets as a NumPy .npy file, then report the array dimensions
path = '/home/developer/gcp/cbidmltsf/sldbs/256_to_24_train_targets.npy'
train_targets = np.array(tfrecords['train']['targets'])
# np.save accepts the destination path directly and writes the file in binary mode
np.save(path, train_targets)

print(train_targets.shape)

(17824, 24)


In [50]:
# persist the eval features as a NumPy .npy file, then report the array dimensions
path = '/home/developer/gcp/cbidmltsf/sldbs/256_to_24_eval_hourly.npy'
eval_hourly = np.array(tfrecords['eval']['hourly'])
# np.save accepts the destination path directly and writes the file in binary mode
np.save(path, eval_hourly)

print(eval_hourly.shape)

(1984, 256)


In [51]:
# persist the eval targets as a NumPy .npy file, then report the array dimensions
path = '/home/developer/gcp/cbidmltsf/sldbs/256_to_24_eval_targets.npy'
eval_targets = np.array(tfrecords['eval']['targets'])
# np.save accepts the destination path directly and writes the file in binary mode
np.save(path, eval_targets)

print(eval_targets.shape)

(1984, 24)


In [52]:
# persist the test features as a NumPy .npy file, then report the array dimensions
path = '/home/developer/gcp/cbidmltsf/sldbs/256_to_24_test_hourly.npy'
test_hourly = np.array(tfrecords['test']['hourly'])
# np.save accepts the destination path directly and writes the file in binary mode
np.save(path, test_hourly)

print(test_hourly.shape)

(1984, 256)


In [53]:
# persist the test targets as a NumPy .npy file, then report the array dimensions
path = '/home/developer/gcp/cbidmltsf/sldbs/256_to_24_test_targets.npy'
test_targets = np.array(tfrecords['test']['targets'])
# np.save accepts the destination path directly and writes the file in binary mode
np.save(path, test_targets)

print(test_targets.shape)

(1984, 24)
