In [1]:
import os

In [2]:
import json

In [3]:
import numpy as np
import pandas as pd

In [4]:
import tensorflow as tf

In [5]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [6]:
from sklearn.externals import joblib



In [7]:
# a function to encode float values for serialized examples
def _float_feature_from_list_of_values(list_of_values):
    """Returns a float_list from a list of floats / doubles."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=list_of_values))

In [8]:
data_folder = '/home/developer/gcp/cbidmltsf/datasets/electricity'

In [9]:
# constant values for positional encodings
hours_in_day = 24
days_in_week = 7
days_in_month = 30
days_in_year = 365
# weeks_of_year and month_of_year become redundant when using days_of_year, do not evaluate them
# weeks_in_year = 52
# months_in_year = 12

In [10]:
# split the time series in seen (train, eval) and unseen (test) data
# according to academic papers:

# 243 days on seen data, 7 days on unseen data 

# seen data:      '2014-01-01 00:00:00' to '2014-08-31 23:00:00', 243*24 = 5832 lectures

# train/eval split is 0.9/0.1, then

# train data:     '2014-01-01 00:00:00' to '2014-08-07 15:00:00', 5248 lectures
# eval data:      '2014-08-07 15:00:00' to '2014-08-31 23:00:00', 584 lectures

# unseen data:    '2014-09-01 00:00:00' to '2014-09-07 23:00:00', 7*24 = 168 lectures

# 243 weeks for seen data, 1 week for unseen data
no_lectures_seen_data = 243*24 # 5832

# seen data is divided as 90% for training and 10% for evaluation
train_eval_limit = 0.9

train_interval_end = int(no_lectures_seen_data*train_eval_limit) # 5248

In [11]:
# build sub-series to be persisted as serialized training examples

# dimensionality of the encoder input
m = 168

# dimensionality of the decoder output 
t = 168

span = m + t

In [12]:
# columns to be included in the SLDB
sldb_columns = [
    'date',
    'token_id',
    'kw_scaled',
    'sin_hour_day',
    'cos_hour_day',
    'sin_day_week',
    'cos_day_week',
    'sin_day_month',
    'cos_day_month',
    'sin_day_year',
    'cos_day_year'    
]

In [13]:
# a dictionary to store sub-series for training examples
sub_series = {
    'train': [],
    'eval': [],
    'test': []
}

In [14]:
# a dictionary to store sub-series for training examples
examples = {
    'train': [],
    'eval': [],
    'test': []
}

In [15]:
sldb = {
    'ts': 'LD2011-2014_MT320-MT330',
    'embedding': {
        'hourly': 168
    },
    'tau': {
        'hourly': 1
    },
    'no_targets': 168,
    'BSCTRFM': 1
}

In [16]:
sldb

{'ts': 'LD2011-2014_MT320-MT330',
 'embedding': {'hourly': 168},
 'tau': {'hourly': 1},
 'no_targets': 168,
 'BSCTRFM': 1}

In [17]:
# a string with the basic specifications of the SLDB, as part of the SLDB identifier
sldb_specs = 'BSCTRFM_{:03d}_{:03d}'.format(sldb['embedding']['hourly'], sldb['no_targets'])
sldb_specs

'BSCTRFM_168_168'

In [18]:
# get the time-based identifier for the SLDB
sldb_identifier = '{}_{}'.format(sldb['ts'], sldb_specs)
sldb_identifier

'LD2011-2014_MT320-MT330_BSCTRFM_168_168'

In [19]:
sldb_dir = '/home/developer/gcp/cbidmltsf/sldbs/{}'.format(sldb_identifier)
sldb_dir

'/home/developer/gcp/cbidmltsf/sldbs/LD2011-2014_MT320-MT330_BSCTRFM_168_168'

In [20]:
try:
    os.mkdir(sldb_dir)
    print('Directory {} was created.'.format(sldb_dir))
except FileExistsError:
    print('Error: directory {} already exists.'.format(sldb_dir))

Error: directory /home/developer/gcp/cbidmltsf/sldbs/LD2011-2014_MT320-MT330_BSCTRFM_168_168 already exists.


In [21]:
scalers_dir = '{}/scalers'.format(sldb_dir)
scalers_dir

'/home/developer/gcp/cbidmltsf/sldbs/LD2011-2014_MT320-MT330_BSCTRFM_168_168/scalers'

In [22]:
try:
    os.mkdir(scalers_dir)
    print('Directory {} was created.'.format(scalers_dir))
except FileExistsError:
    print('Error: directory {} already exists.'.format(scalers_dir))

Error: directory /home/developer/gcp/cbidmltsf/sldbs/LD2011-2014_MT320-MT330_BSCTRFM_168_168/scalers already exists.


In [23]:
encoder_input_columns = [
    'kw_scaled',
    'sin_hour_day',
    'cos_hour_day',
    'sin_day_week',
    'cos_day_week',
    'sin_day_month',
    'cos_day_month',
    'sin_day_year',
    'cos_day_year'
]

In [24]:
# both the encoder input and the decoder input use the same columns from the source sub_series dataframe
decoder_input_columns = encoder_input_columns

In [25]:
target_columns = ['kw_scaled']

In [26]:
id_columns = ['token_id']

In [27]:
output = pd.read_pickle('{}/hourly_electricity_complete.pkl'.format(data_folder))

In [28]:
# filter to match range used by other academic papers
filtered_output = output[(output['days_from_start'] >= 1096) & (output['days_from_start'] < 1346)].copy()

In [29]:
# a dictionary to manage data per individual customer_id
data = dict()

In [30]:
# a dictionary to manage a MinMaxScaler per individual customer_id
min_max = dict()
# a dictionary to manage a StandardScaler per individual customer_id
standard = dict()

In [31]:
start, end = 320, 330

In [32]:
token_ids = [token_id for token_id in np.arange(start, end + 1)]
token_ids

[320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330]

In [33]:
customer_ids = ['MT_{:03d}'.format(token_id) for token_id in token_ids]
customer_ids

['MT_320',
 'MT_321',
 'MT_322',
 'MT_323',
 'MT_324',
 'MT_325',
 'MT_326',
 'MT_327',
 'MT_328',
 'MT_329',
 'MT_330']

### code elements for the main iterative cycle: do not run the following cells but the unified one!

In [None]:
token_id = 320

In [76]:
# get the customer identifier
customer_id = 'MT_{:03d}'.format(token_id)
customer_id

'MT_322'

In [77]:
# a temporary dataframe with data per customer_id to build the sub-series/examples
data_df = filtered_output[filtered_output['token_id'] == token_id].copy()

In [78]:
data_df

Unnamed: 0,power_usage,token_id,date,hours_from_start,days_from_start,hour_of_day,day_of_week,day_of_month,day_of_year,week_of_year,month_of_year
9146673,36.461278,322,2014-01-01 00:00:00,26304.0,1096,0,2,1,1,1,1
9146674,37.733183,322,2014-01-01 01:00:00,26305.0,1096,1,2,1,1,1,1
9146675,37.591860,322,2014-01-01 02:00:00,26306.0,1096,2,2,1,1,1,1
9146676,38.863765,322,2014-01-01 03:00:00,26307.0,1096,3,2,1,1,1,1
9146677,39.005088,322,2014-01-01 04:00:00,26308.0,1096,4,2,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...
9152668,149.802148,322,2014-09-07 19:00:00,32299.0,1345,19,6,7,250,36,9
9152669,154.465800,322,2014-09-07 20:00:00,32300.0,1345,20,6,7,250,36,9
9152670,144.290560,322,2014-09-07 21:00:00,32301.0,1345,21,6,7,250,36,9
9152671,108.959864,322,2014-09-07 22:00:00,32302.0,1345,22,6,7,250,36,9


In [79]:
# expand with positional encodings
data_df['sin_hour_day'] = np.sin(2*np.pi*data_df.hour_of_day/hours_in_day)
data_df['cos_hour_day'] = np.cos(2*np.pi*data_df.hour_of_day/hours_in_day)
data_df['sin_day_week'] = np.sin(2*np.pi*data_df.day_of_week/days_in_week)
data_df['cos_day_week'] = np.cos(2*np.pi*data_df.day_of_week/days_in_week)
data_df['sin_day_month'] = np.sin(2*np.pi*data_df.day_of_month/days_in_month)
data_df['cos_day_month'] = np.cos(2*np.pi*data_df.day_of_month/days_in_month)
data_df['sin_day_year'] = np.sin(2*np.pi*data_df.day_of_year/days_in_year)
data_df['cos_day_year'] = np.cos(2*np.pi*data_df.day_of_year/days_in_year)

In [80]:
# get a series for the power usage variable on the training dataset, to fit the scaler
lectures_train_data = data_df['power_usage'][:train_interval_end]

In [81]:
# fit a scaler only on train data
# it is required to pass the power usage time series to a (?, 1) NumPy array
lectures_train_data_array = np.array(lectures_train_data).reshape(-1, 1)

In [82]:
# get MinMaxScaler on train data, store it in a dictionary
min_max_scaler = MinMaxScaler()
min_max = min_max_scaler.fit(lectures_train_data_array)

In [83]:
# persist the scaler
scaler_filename = '{}/{}_min_max.save'.format(scalers_dir, customer_id)
scaler_filename

'/home/developer/gcp/cbidmltsf/sldbs/LD2011-2014_MT320-MT330_BSCTRFM_168_168/scalers/MT_322_min_max.save'

In [84]:
joblib.dump(min_max, scaler_filename) 

['/home/developer/gcp/cbidmltsf/sldbs/LD2011-2014_MT320-MT330_BSCTRFM_168_168/scalers/MT_322_min_max.save']

In [85]:
# get an array from the variable time series (seen and unseen)
all_data_variable_array = np.array(data_df.power_usage).reshape(-1, 1)

In [86]:
# apply the scaler over all data (seen and unseen)
# rescale, and squeeze to drop the extra dimension, then assign to the new column kw_scaled
data_df['kw_scaled'] = np.squeeze(min_max.transform(all_data_variable_array))

In [87]:
# do not use Pandas dataframes as sub-series list contents because it overrides memory!

In [88]:
# get an iterable with all the possible sub-series for training examples
for starting_point in np.arange(train_interval_end - span + 1):
    
    sub_series_df = data_df[sldb_columns][starting_point:starting_point + span]
    
    encoder_input_df = sub_series_df[encoder_input_columns][:m]
    decoder_input_df = sub_series_df[decoder_input_columns][m-1:m-1+t]
    target_df = sub_series_df[target_columns][m:m+t]
    id_df = sub_series_df[id_columns][:1]
    
    encoder_input_list = encoder_input_df.reset_index(drop=True).to_numpy().flatten().tolist()
    decoder_input_list = decoder_input_df.reset_index(drop=True).to_numpy().flatten().tolist()
    target_list = target_df.reset_index(drop=True).to_numpy().flatten().tolist()
    id_list = id_df.reset_index(drop=True).to_numpy().flatten().tolist()

    examples['train'].append(
        {
            'encoder_input': encoder_input_list,
            'decoder_input': decoder_input_list,
            'target': target_list,
            'id': id_list,
        }
    ) 

In [89]:
print('The number of sub-series/examples in {} dataset is {}'.\
      format('train', len(examples['train'])))

The number of sub-series/examples in train dataset is 14739


In [90]:
# get an iterable with all the possible sub-series for evaluation examples
for starting_point in np.arange(train_interval_end, no_lectures_seen_data - span + 1):
    
    sub_series_df = data_df[sldb_columns][starting_point:starting_point + span]
    
    encoder_input_df = sub_series_df[encoder_input_columns][:m]
    decoder_input_df = sub_series_df[decoder_input_columns][m-1:m-1+t]
    target_df = sub_series_df[target_columns][m:m+t]
    id_df = sub_series_df[id_columns][:1]
    
    encoder_input_list = encoder_input_df.reset_index(drop=True).to_numpy().flatten().tolist()
    decoder_input_list = decoder_input_df.reset_index(drop=True).to_numpy().flatten().tolist()
    target_list = target_df.reset_index(drop=True).to_numpy().flatten().tolist()
    id_list = id_df.reset_index(drop=True).to_numpy().flatten().tolist()

    examples['eval'].append(
        {
            'encoder_input': encoder_input_list,
            'decoder_input': decoder_input_list,
            'target': target_list,
            'id': id_list,
        }
    ) 

In [91]:
print('The number of sub-series/examples in {} dataset is {}'.\
      format('eval', len(examples['eval'])))

The number of sub-series/examples in eval dataset is 747


In [92]:
# remember that conditional range of test dataset overlaps with evaluation dataset
# for this experiment design# get an iterable with all the possible sub-series for test examples
for starting_point in no_lectures_seen_data - span + 1 + np.arange(168):
    
    sub_series_df = data_df[sldb_columns][starting_point:starting_point + span]
    
    encoder_input_df = sub_series_df[encoder_input_columns][:m]
    decoder_input_df = sub_series_df[decoder_input_columns][m-1:m-1+t]
    target_df = sub_series_df[target_columns][m:m+t]
    id_df = sub_series_df[id_columns][:1]
    
    encoder_input_list = encoder_input_df.reset_index(drop=True).to_numpy().flatten().tolist()
    decoder_input_list = decoder_input_df.reset_index(drop=True).to_numpy().flatten().tolist()
    target_list = target_df.reset_index(drop=True).to_numpy().flatten().tolist()
    id_list = id_df.reset_index(drop=True).to_numpy().flatten().tolist()

    examples['test'].append(
        {
            'encoder_input': encoder_input_list,
            'decoder_input': decoder_input_list,
            'target': target_list,
            'id': id_list,
        }
    ) 

In [93]:
print('The number of sub-series/examples in {} dataset is {}'.\
      format('test', len(examples['test'])))

The number of sub-series/examples in test dataset is 504


### for SLDB generation, run this unified code!

In [105]:
for token_id in token_ids:

    # get the customer identifier
    customer_id = 'MT_{:03d}'.format(token_id)
    customer_id

    # a temporary dataframe with data per customer_id to build the sub-series/examples
    data_df = filtered_output[filtered_output['token_id'] == token_id].copy()

    # expand with positional encodings
    data_df['sin_hour_day'] = np.sin(2*np.pi*data_df.hour_of_day/hours_in_day)
    data_df['cos_hour_day'] = np.cos(2*np.pi*data_df.hour_of_day/hours_in_day)
    data_df['sin_day_week'] = np.sin(2*np.pi*data_df.day_of_week/days_in_week)
    data_df['cos_day_week'] = np.cos(2*np.pi*data_df.day_of_week/days_in_week)
    data_df['sin_day_month'] = np.sin(2*np.pi*data_df.day_of_month/days_in_month)
    data_df['cos_day_month'] = np.cos(2*np.pi*data_df.day_of_month/days_in_month)
    data_df['sin_day_year'] = np.sin(2*np.pi*data_df.day_of_year/days_in_year)
    data_df['cos_day_year'] = np.cos(2*np.pi*data_df.day_of_year/days_in_year)

    # get a series for the power usage variable on the training dataset, to fit the scaler
    lectures_train_data = data_df['power_usage'][:train_interval_end]

    # fit a scaler only on train data
    # it is required to pass the power usage time series to a (?, 1) NumPy array
    lectures_train_data_array = np.array(lectures_train_data).reshape(-1, 1)

    # get MinMaxScaler on train data, store it in a dictionary
    min_max_scaler = MinMaxScaler()
    min_max = min_max_scaler.fit(lectures_train_data_array)

    # persist the scaler
    scaler_filename = '{}/{}_min_max.save'.format(scalers_dir, customer_id)
    joblib.dump(min_max, scaler_filename)
    
    # get an array from the variable time series (seen and unseen)
    all_data_variable_array = np.array(data_df.power_usage).reshape(-1, 1)

    # apply the scaler over all data (seen and unseen)
    # rescale, and squeeze to drop the extra dimension, then assign to the new column kw_scaled
    data_df['kw_scaled'] = np.squeeze(min_max.transform(all_data_variable_array))

    # get an iterable with all the possible sub-series for training examples
    for starting_point in np.arange(train_interval_end - span + 1):

        sub_series_df = data_df[sldb_columns][starting_point:starting_point + span]

        encoder_input_df = sub_series_df[encoder_input_columns][:m]
        decoder_input_df = sub_series_df[decoder_input_columns][m-1:m-1+t]
        target_df = sub_series_df[target_columns][m:m+t]
        id_df = sub_series_df[id_columns][:1]

        encoder_input_list = encoder_input_df.reset_index(drop=True).to_numpy().flatten().tolist()
        decoder_input_list = decoder_input_df.reset_index(drop=True).to_numpy().flatten().tolist()
        target_list = target_df.reset_index(drop=True).to_numpy().flatten().tolist()
        id_list = id_df.reset_index(drop=True).to_numpy().flatten().tolist()

        examples['train'].append(
            {
                'encoder_input': encoder_input_list,
                'decoder_input': decoder_input_list,
                'target': target_list,
                'id': id_list,
            }
        )
    
    print('{} processed. The number of examples in {} dataset is {}'.\
          format(customer_id, 'train', len(examples['train'])))
    

    # get an iterable with all the possible sub-series for evaluation examples
    for starting_point in np.arange(train_interval_end, no_lectures_seen_data - span + 1):

        sub_series_df = data_df[sldb_columns][starting_point:starting_point + span]

        encoder_input_df = sub_series_df[encoder_input_columns][:m]
        decoder_input_df = sub_series_df[decoder_input_columns][m-1:m-1+t]
        target_df = sub_series_df[target_columns][m:m+t]
        id_df = sub_series_df[id_columns][:1]

        encoder_input_list = encoder_input_df.reset_index(drop=True).to_numpy().flatten().tolist()
        decoder_input_list = decoder_input_df.reset_index(drop=True).to_numpy().flatten().tolist()
        target_list = target_df.reset_index(drop=True).to_numpy().flatten().tolist()
        id_list = id_df.reset_index(drop=True).to_numpy().flatten().tolist()

        examples['eval'].append(
            {
                'encoder_input': encoder_input_list,
                'decoder_input': decoder_input_list,
                'target': target_list,
                'id': id_list,
            }
        )
    
    print('{} processed. The number of examples in {} dataset is {}'.\
          format(customer_id, 'eval', len(examples['eval'])))
    

    # remember that conditional range of test dataset overlaps with evaluation dataset
    # for this experiment design# get an iterable with all the possible sub-series for test examples
    for starting_point in no_lectures_seen_data - span + 1 + np.arange(168):

        sub_series_df = data_df[sldb_columns][starting_point:starting_point + span]

        encoder_input_df = sub_series_df[encoder_input_columns][:m]
        decoder_input_df = sub_series_df[decoder_input_columns][m-1:m-1+t]
        target_df = sub_series_df[target_columns][m:m+t]
        id_df = sub_series_df[id_columns][:1]

        encoder_input_list = encoder_input_df.reset_index(drop=True).to_numpy().flatten().tolist()
        decoder_input_list = decoder_input_df.reset_index(drop=True).to_numpy().flatten().tolist()
        target_list = target_df.reset_index(drop=True).to_numpy().flatten().tolist()
        id_list = id_df.reset_index(drop=True).to_numpy().flatten().tolist()

        examples['test'].append(
            {
                'encoder_input': encoder_input_list,
                'decoder_input': decoder_input_list,
                'target': target_list,
                'id': id_list,
            }
        )
    
    print('{} processed. The number of examples in {} dataset is {}'.\
          format(customer_id, 'test', len(examples['test'])))


MT_320 processed. The number of examples in train dataset is 19652
MT_320 processed. The number of examples in eval dataset is 996
MT_320 processed. The number of examples in test dataset is 672


KeyboardInterrupt: 

In [95]:
# expand the sldb dictionary with final statistics

In [98]:
sldb['stats'] = {
    'train': {
        'n_rows': len(examples['train'])
    },
    'eval': {
        'n_rows': len(examples['eval'])
    },
    'test': {
        'n_rows': len(examples['test'])
    }
}

In [99]:
sldb

{'ts': 'LD2011-2014_MT320-MT330',
 'embedding': {'hourly': 168},
 'tau': {'hourly': 1},
 'no_targets': 168,
 'BSCTRFM': 1,
 'stats': {'train': {'n_rows': 14739},
  'eval': {'n_rows': 747},
  'test': {'n_rows': 504}}}

In [64]:
# serialize the sldb from the examples dictionary (keys are stages, values are lists of rows)

In [100]:
for stage in ['train', 'eval', 'test']:
    N_ROWS = sldb['stats'][stage]['n_rows']
    filename = '{}/{}.tfrecord'.format(sldb_dir, stage)

    with tf.io.TFRecordWriter(filename) as writer:
        for row in np.arange(N_ROWS):
            
            example = tf.train.Example(
                # features within the example
                features=tf.train.Features(
                    # individual feature definition
                    feature={'encoder_input':
                             _float_feature_from_list_of_values(
                                 examples[stage][row]['encoder_input']),
                             'decoder_input':
                             _float_feature_from_list_of_values(
                                 examples[stage][row]['decoder_input']),
                             'target':
                             _float_feature_from_list_of_values(
                                 examples[stage][row]['target']),
                             'id':
                             _float_feature_from_list_of_values(
                                 examples[stage][row]['id'])
                             }
                )
            )
            serialized_example = example.SerializeToString()
            writer.write(serialized_example)

In [101]:
json_filename = '{}/sldb.json'.format(sldb_dir)

In [102]:
with open(json_filename, 'w') as filename:
    json.dump(sldb, filename, indent=4)

### read TFRecord file into a Dataset and confirm the values in given rows against the source dataframe!!!

In [68]:
# do not forget to sync sldbs/ from local to GS after the previous operations!
!gsutil rsync -d -r /home/developer/gcp/cbidmltsf/sldbs gs://cbidmltsf/sldbs

Building synchronization state...
Starting synchronization...
Copying file:///home/developer/gcp/cbidmltsf/sldbs/LD2011-2014_MT320-MT321_BSCTRFM_168_168/eval.tfrecord [Content-Type=application/octet-stream]...
Copying file:///home/developer/gcp/cbidmltsf/sldbs/LD2011-2014_MT320-MT321_BSCTRFM_168_168/sldb.json [Content-Type=application/json]...
Copying file:///home/developer/gcp/cbidmltsf/sldbs/LD2011-2014_MT320-MT321_BSCTRFM_168_168/test.tfrecord [Content-Type=application/octet-stream]...
Copying file:///home/developer/gcp/cbidmltsf/sldbs/LD2011-2014_MT320-MT321_BSCTRFM_168_168/train.tfrecord [Content-Type=application/octet-stream]...
- [4 files][130.9 MiB/130.9 MiB]    4.2 MiB/s                                   
Operation completed over 4 objects/130.9 MiB.                                    


In [None]:
# ToDo: random sample the serialization of examples to TFRecord SLDB!!!