In [1]:
import os

In [2]:
import json

In [3]:
import pandas as pd

In [4]:
import numpy as np

In [5]:
import tensorflow as tf

In [6]:
tf.__version__

'2.4.1'

In [7]:
# define an identifier string to access the preprocessed time series
identifier = 'CPE04115_H_kw_20210526212214'

In [8]:
# build the time series directory
time_series_folder = '/home/developer/gcp/cbidmltsf/timeseries/{}'.format(identifier)

In [15]:
stages = ['train', 'eval', 'test']

In [16]:
# pass time series for ML stages to a dict
ts = dict()

In [17]:
# load the pickled time series of every ML stage into the ts dictionary
# NOTE(review): read_pickle executes arbitrary code on load; only use it on files produced by this project
for stage in stages:
    pickle_path = '{}/ts_{}.pkl'.format(time_series_folder, stage)
    ts[stage] = pd.read_pickle(pickle_path)

In [18]:
# confirm the time series are now in the dictionary
# count() reports the non-null values of the scaled load column
for stage in stages:
    n_rows = ts[stage]['kw_scaled'].count()
    print('{} time series loaded with {} rows'.format(stage, n_rows))

train time series loaded with 17542 rows
eval time series loaded with 2879 rows
test time series loaded with 2208 rows


In [19]:
# a dictionary to configure and describe the SLDB
# 'embedding' is the encoder window length and 'no_targets' the number of targets,
# both expressed in rows of the hourly time series
# 'BSCTRFM' is a boolean (binary) flag stating that this SLDB is only functional
# to the transformer architecture

sldb = {
    # reuse the identifier defined above instead of repeating the literal,
    # so the SLDB always describes the time series that was actually loaded
    'ts': identifier,
    'embedding': {
        'hourly': 168
    },
    'tau': {
        'hourly': 1
    },
    'no_targets': 168,
    'BSCTRFM': 1
}

### building positional encodings

#### our template

In [20]:
hours_in_day = 24
days_in_week = 7
weeks_in_year = 53


def _cyclical_encoding(values, period):
    """Return the (sin, cos) components of `values` placed on a circle of length `period`.

    Both components are computed in float64 and then cast to float32.
    """
    angle = 2 * np.pi * np.asarray(values, dtype=np.float64) / period
    return np.sin(angle).astype(np.float32), np.cos(angle).astype(np.float32)


def _iso_week_of_year(index):
    """Return the ISO week number (1 to 53) of every timestamp in a DatetimeIndex."""
    if hasattr(index, 'isocalendar'):
        # DatetimeIndex.week is deprecated since pandas 1.1 and removed in pandas 2.0;
        # isocalendar().week returns the same ISO week numbers
        return index.isocalendar().week.to_numpy(dtype=np.float64, na_value=np.nan)
    # older pandas releases only offer the .week attribute
    return np.asarray(index.week, dtype=np.float64)


# build positional encodings for time series in all stages
for stage in stages:
    stage_index = ts[stage].index

    # hour of the day: 0-23
    sin_hour_day, cos_hour_day = _cyclical_encoding(stage_index.hour, hours_in_day)

    # day of the week: 0-6 (Monday is 0), vectorized equivalent of timestamp.weekday()
    sin_day_week, cos_day_week = _cyclical_encoding(stage_index.dayofweek, days_in_week)

    # ISO week values go from 1 to 53, adjust them to 0-52
    sin_week_year, cos_week_year = _cyclical_encoding(_iso_week_of_year(stage_index) - 1,
                                                      weeks_in_year)

    # now expand the time series dataframe with positional encoding components
    # arrays are assigned by position, so they line up with the rows of the dataframe
    ts[stage]['sin_hour_day'] = sin_hour_day
    ts[stage]['cos_hour_day'] = cos_hour_day
    ts[stage]['sin_day_week'] = sin_day_week
    ts[stage]['cos_day_week'] = cos_day_week
    ts[stage]['sin_week_year'] = sin_week_year
    ts[stage]['cos_week_year'] = cos_week_year

In [21]:
# report results
for stage in ['train', 'eval', 'test']:
    # count() returns a Series labelled by column name; take its first entry
    # explicitly by position with iloc, as integer keys on a labelled Series
    # are deprecated in recent pandas releases
    print('{} lectures in {} time series from {} to {}'.format(ts[stage].count().iloc[0],
                                                               stage,
                                                               ts[stage].index[0],
                                                               ts[stage].index[-1]))

17542 lectures in train time series from 2016-01-01 00:00:00 to 2017-12-31 23:00:00
2879 lectures in eval time series from 2018-01-01 00:00:00 to 2018-04-30 23:00:00
2208 lectures in test time series from 2018-05-01 00:00:00 to 2018-07-31 23:00:00


In [22]:
ts['train']

Unnamed: 0_level_0,kw_scaled,sin_hour_day,cos_hour_day,sin_day_week,cos_day_week,sin_week_year,cos_week_year
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-01-01 00:00:00,0.274317,0.000000,1.000000,-0.433884,-0.900969,-0.118273,0.992981
2016-01-01 01:00:00,0.217363,0.258819,0.965926,-0.433884,-0.900969,-0.118273,0.992981
2016-01-01 02:00:00,0.168545,0.500000,0.866025,-0.433884,-0.900969,-0.118273,0.992981
2016-01-01 03:00:00,0.122996,0.707107,0.707107,-0.433884,-0.900969,-0.118273,0.992981
2016-01-01 04:00:00,0.080440,0.866025,0.500000,-0.433884,-0.900969,-0.118273,0.992981
...,...,...,...,...,...,...,...
2017-12-31 19:00:00,0.542273,-0.965926,0.258819,-0.781832,0.623490,-0.234886,0.972023
2017-12-31 20:00:00,0.478005,-0.866025,0.500000,-0.781832,0.623490,-0.234886,0.972023
2017-12-31 21:00:00,0.414886,-0.707107,0.707107,-0.781832,0.623490,-0.234886,0.972023
2017-12-31 22:00:00,0.358717,-0.500000,0.866025,-0.781832,0.623490,-0.234886,0.972023


### review how the SLDB rows are built from time series (values and pos encodings)

In [23]:
m, t = sldb['embedding']['hourly'], sldb['no_targets']
m, t

(168, 168)

In [24]:
stage = 'train'

In [25]:
start_index = 0

In [26]:
end_index = start_index + (m+t)

In [27]:
sub_series = ts[stage][start_index:end_index]
sub_series

Unnamed: 0_level_0,kw_scaled,sin_hour_day,cos_hour_day,sin_day_week,cos_day_week,sin_week_year,cos_week_year
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-01-01 00:00:00,0.274317,0.000000,1.000000,-0.433884,-0.900969,-0.118273,0.992981
2016-01-01 01:00:00,0.217363,0.258819,0.965926,-0.433884,-0.900969,-0.118273,0.992981
2016-01-01 02:00:00,0.168545,0.500000,0.866025,-0.433884,-0.900969,-0.118273,0.992981
2016-01-01 03:00:00,0.122996,0.707107,0.707107,-0.433884,-0.900969,-0.118273,0.992981
2016-01-01 04:00:00,0.080440,0.866025,0.500000,-0.433884,-0.900969,-0.118273,0.992981
...,...,...,...,...,...,...,...
2016-01-14 19:00:00,0.661054,-0.965926,0.258819,0.433884,-0.900969,0.118273,0.992981
2016-01-14 20:00:00,0.633207,-0.866025,0.500000,0.433884,-0.900969,0.118273,0.992981
2016-01-14 21:00:00,0.599071,-0.707107,0.707107,0.433884,-0.900969,0.118273,0.992981
2016-01-14 22:00:00,0.495604,-0.500000,0.866025,0.433884,-0.900969,0.118273,0.992981


In [28]:
# the encoder input
sub_series[:m]

Unnamed: 0_level_0,kw_scaled,sin_hour_day,cos_hour_day,sin_day_week,cos_day_week,sin_week_year,cos_week_year
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-01-01 00:00:00,0.274317,0.000000,1.000000,-0.433884,-0.900969,-0.118273,0.992981
2016-01-01 01:00:00,0.217363,0.258819,0.965926,-0.433884,-0.900969,-0.118273,0.992981
2016-01-01 02:00:00,0.168545,0.500000,0.866025,-0.433884,-0.900969,-0.118273,0.992981
2016-01-01 03:00:00,0.122996,0.707107,0.707107,-0.433884,-0.900969,-0.118273,0.992981
2016-01-01 04:00:00,0.080440,0.866025,0.500000,-0.433884,-0.900969,-0.118273,0.992981
...,...,...,...,...,...,...,...
2016-01-07 19:00:00,0.659295,-0.965926,0.258819,0.433884,-0.900969,0.000000,1.000000
2016-01-07 20:00:00,0.616653,-0.866025,0.500000,0.433884,-0.900969,0.000000,1.000000
2016-01-07 21:00:00,0.577855,-0.707107,0.707107,0.433884,-0.900969,0.000000,1.000000
2016-01-07 22:00:00,0.468685,-0.500000,0.866025,0.433884,-0.900969,0.000000,1.000000


In [29]:
# the decoder input
# it does not need to be defined in terms of t instead of m:
# the sub-series already holds m + t rows (t was used to define end_index),
# so the slice [m-1:-1] always has t rows, i.e. the target shifted one step back
sub_series[m-1:-1]

Unnamed: 0_level_0,kw_scaled,sin_hour_day,cos_hour_day,sin_day_week,cos_day_week,sin_week_year,cos_week_year
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-01-07 23:00:00,0.317259,-0.258819,9.659258e-01,0.433884,-0.900969,0.000000,1.000000
2016-01-08 00:00:00,0.188947,0.000000,1.000000e+00,-0.433884,-0.900969,0.000000,1.000000
2016-01-08 01:00:00,0.114106,0.258819,9.659258e-01,-0.433884,-0.900969,0.000000,1.000000
2016-01-08 02:00:00,0.070324,0.500000,8.660254e-01,-0.433884,-0.900969,0.000000,1.000000
2016-01-08 03:00:00,0.065130,0.707107,7.071068e-01,-0.433884,-0.900969,0.000000,1.000000
...,...,...,...,...,...,...,...
2016-01-14 18:00:00,0.487971,-1.000000,-1.836970e-16,0.433884,-0.900969,0.118273,0.992981
2016-01-14 19:00:00,0.661054,-0.965926,2.588190e-01,0.433884,-0.900969,0.118273,0.992981
2016-01-14 20:00:00,0.633207,-0.866025,5.000000e-01,0.433884,-0.900969,0.118273,0.992981
2016-01-14 21:00:00,0.599071,-0.707107,7.071068e-01,0.433884,-0.900969,0.118273,0.992981


In [30]:
# the target
sub_series[m:]

Unnamed: 0_level_0,kw_scaled,sin_hour_day,cos_hour_day,sin_day_week,cos_day_week,sin_week_year,cos_week_year
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-01-08 00:00:00,0.188947,0.000000,1.000000,-0.433884,-0.900969,0.000000,1.000000
2016-01-08 01:00:00,0.114106,0.258819,0.965926,-0.433884,-0.900969,0.000000,1.000000
2016-01-08 02:00:00,0.070324,0.500000,0.866025,-0.433884,-0.900969,0.000000,1.000000
2016-01-08 03:00:00,0.065130,0.707107,0.707107,-0.433884,-0.900969,0.000000,1.000000
2016-01-08 04:00:00,0.053664,0.866025,0.500000,-0.433884,-0.900969,0.000000,1.000000
...,...,...,...,...,...,...,...
2016-01-14 19:00:00,0.661054,-0.965926,0.258819,0.433884,-0.900969,0.118273,0.992981
2016-01-14 20:00:00,0.633207,-0.866025,0.500000,0.433884,-0.900969,0.118273,0.992981
2016-01-14 21:00:00,0.599071,-0.707107,0.707107,0.433884,-0.900969,0.118273,0.992981
2016-01-14 22:00:00,0.495604,-0.500000,0.866025,0.433884,-0.900969,0.118273,0.992981


In [31]:
# then the SLDB is consistent with the Transformer definition

In [32]:
# initialize a sub-dictionary for SLDB stats
sldb['stats'] = dict()

### remove timestamps from SLDB, they are available at inference via time series

In [49]:
# a dictionary to store row arrays lists for all the stages
results = dict()

# pass the embedding dimension of the encoder
m = sldb['embedding']['hourly']

# pass the embedding dimension of the decoder (it is also the number of targets)
t = sldb['no_targets']

# make sure the stats sub-dictionary exists, so this cell does not depend on
# another cell having initialized it
sldb.setdefault('stats', dict())

# iterate on stages
for stage in ['train', 'eval', 'test']:

    # pass the whole stage time series to a NumPy array only once;
    # to_numpy() does not include the timestamp index, which is deliberately
    # left out of the SLDB (timestamps are available at inference via time series)
    stage_array = ts[stage].to_numpy()

    # use a temporary list for each SLDB feature (column)
    encoder_input_arrays_list = list()
    decoder_input_arrays_list = list()
    target_arrays_list = list()

    # slide a window to form a row array with m + t elements of the time series
    # row_array[:m] is the encoder input, shape (m, 7)
    # row_array[m-1:-1] is the decoder input, shape (t, 7)
    # row_array[m:, :1] is the target, shape (t, 1); it keeps only the first column
    # the range is empty when the time series has fewer than m + t rows
    for start_index in range(stage_array.shape[0] - (m+t) + 1):
        # start_index, end_index are the positions in the stage array that define
        # the sub-series in the sliding window
        end_index = start_index + (m+t)
        # the window is a view on stage_array (no copy is made),
        # so downstream code must not modify it in place
        row_array = stage_array[start_index:end_index]

        # build SLDB columns from the row array
        encoder_input_arrays_list.append(row_array[:m])            # shape is [m, 7]
        decoder_input_arrays_list.append(row_array[m-1:-1])        # shape is [t, 7]
        target_arrays_list.append(row_array[m:, :1])               # shape is [t, 1]

    # report stage completion, one line per SLDB column
    for column_label, arrays_list in (('encoder input', encoder_input_arrays_list),
                                      ('decoder input', decoder_input_arrays_list),
                                      ('target', target_arrays_list)):
        print('Dataset created for {} stage with {} {} rows.'
              .format(stage, len(arrays_list), column_label))

    # add a sub-dictionary for stage results
    results[stage] = {
        'encoder_input': encoder_input_arrays_list,
        'decoder_input': decoder_input_arrays_list,
        'target': target_arrays_list
    }

    # add a sub-dictionary for stage stats and pass the number of rows to it
    # (any SLDB column works to get the number of rows)
    sldb['stats'][stage] = {'n_rows': len(results[stage]['encoder_input'])}

Dataset created for train stage with 17207 encoder input rows.
Dataset created for train stage with 17207 decoder input rows.
Dataset created for train stage with 17207 target rows.
Dataset created for eval stage with 2544 encoder input rows.
Dataset created for eval stage with 2544 decoder input rows.
Dataset created for eval stage with 2544 target rows.
Dataset created for test stage with 1873 encoder input rows.
Dataset created for test stage with 1873 decoder input rows.
Dataset created for test stage with 1873 target rows.


In [50]:
# ToDo: code to calculate sldb[stats]

In [51]:
# the final sldb dictionary
sldb

{'ts': 'CPE04115_H_kw_20210526212214',
 'embedding': {'hourly': 168},
 'tau': {'hourly': 1},
 'no_targets': 168,
 'BSCTRFM': 1,
 'stats': {'train': {'n_rows': 17207},
  'eval': {'n_rows': 2544},
  'test': {'n_rows': 1873}}}

In [52]:
# verify some SLDB columns before serializing to TFRecord files
results['test']['encoder_input'][0].shape

(168, 7)

In [53]:
results['test']['decoder_input'][0].shape

(168, 7)

In [54]:
results['test']['target'][0].shape

(168, 1)

In [55]:
# results['test']['encoder_input_timestamps'][0].shape

In [56]:
# results['test']['target_timestamps'][0].shape

In [57]:
# results['test']['encoder_input_timestamps'][0][0], results['test']['encoder_input_timestamps'][0][-1]

In [58]:
# results['test']['target_timestamps'][0][0], results['test']['target_timestamps'][0][-1]

In [59]:
# use list comprehension to encode the timestamp array (Index dtype) into a list of bytes
# [timestamp.encode() for timestamp in results['test']['timestamp'][0]]

In [60]:
# a function to encode float values for serialized examples
def _float_feature_from_list_of_values(list_of_values):
    """Wrap a list of floats / doubles in a tf.train.Feature holding a float_list."""
    float_list = tf.train.FloatList(value=list_of_values)
    return tf.train.Feature(float_list=float_list)

In [61]:
# a function to encode byte values for serialized examples
def _bytes_feature_from_list_of_values(list_of_values):
    """Wrap a list of strings / bytes in a tf.train.Feature holding a bytes_list."""
    bytes_list = tf.train.BytesList(value=list_of_values)
    return tf.train.Feature(bytes_list=bytes_list)

In [62]:
# a string with the basic specifications of the SLDB, as part of the SLDB identifier
sldb_specs = 'BSCTRFM_{:03d}_{:03d}'.format(sldb['embedding']['hourly'], sldb['no_targets'])
sldb_specs

'BSCTRFM_168_168'

In [63]:
# get the time-based identifier for the SLDB
sldb_identifier = '{}_{}'.format(sldb['ts'], sldb_specs)
sldb_identifier

'CPE04115_H_kw_20210526212214_BSCTRFM_168_168'

In [64]:
sldb_dir = '/home/developer/gcp/cbidmltsf/sldbs/{}'.format(sldb_identifier)
sldb_dir

'/home/developer/gcp/cbidmltsf/sldbs/CPE04115_H_kw_20210526212214_BSCTRFM_168_168'

In [65]:
# create the SLDB directory; makedirs also creates any missing parent folder,
# where mkdir would raise FileNotFoundError
try:
    os.makedirs(sldb_dir)
    print('Directory {} was created.'.format(sldb_dir))
except FileExistsError:
    # an existing directory is only reported; the following cells write into it
    print('Error: directory {} already exists.'.format(sldb_dir))

Directory /home/developer/gcp/cbidmltsf/sldbs/CPE04115_H_kw_20210526212214_BSCTRFM_168_168 was created.


In [66]:
# serialize every SLDB row of each stage as a tf.train.Example in <sldb_dir>/<stage>.tfrecord
# timestamps are not part of the SLDB, so only the three float columns are written
for stage in stages:
    N_ROWS = sldb['stats'][stage]['n_rows']
    filename = '{}/{}.tfrecord'.format(sldb_dir, stage)
    stage_columns = results[stage]

    with tf.io.TFRecordWriter(filename) as writer:
        for row in range(N_ROWS):
            # each column is flattened to a 1-D sequence of floats before encoding
            feature = {
                column: _float_feature_from_list_of_values(stage_columns[column][row].flatten())
                for column in ('encoder_input', 'decoder_input', 'target')
            }
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())

In [67]:
json_filename = '{}/sldb.json'.format(sldb_dir)

In [68]:
# persist the SLDB description next to the TFRecord files
# the handle gets its own name, so the `filename` path string is not overwritten by a file object
with open(json_filename, 'w') as json_file:
    json.dump(sldb, json_file, indent=4)

In [69]:
# do not forget to sync sldbs/ from local to GS after the previous operations!
# NOTE(review): per the gsutil documentation, -d deletes objects under the destination
# that are not found in the local folder and -r recurses into sub-directories;
# confirm the local sldbs/ folder is complete before running this cell
!gsutil rsync -d -r /home/developer/gcp/cbidmltsf/sldbs gs://cbidmltsf/sldbs

Building synchronization state...
Starting synchronization...
Copying file:///home/developer/gcp/cbidmltsf/sldbs/CPE04115_H_kw_20210526212214_BSCTRFM_168_168/eval.tfrecord [Content-Type=application/octet-stream]...
Copying file:///home/developer/gcp/cbidmltsf/sldbs/CPE04115_H_kw_20210526212214_BSCTRFM_168_168/sldb.json [Content-Type=application/json]...
Copying file:///home/developer/gcp/cbidmltsf/sldbs/CPE04115_H_kw_20210526212214_BSCTRFM_168_168/test.tfrecord [Content-Type=application/octet-stream]...
Copying file:///home/developer/gcp/cbidmltsf/sldbs/CPE04115_H_kw_20210526212214_BSCTRFM_168_168/train.tfrecord [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `compo