In [1]:
import numpy as np
import pandas as pd

In [2]:
import json

In [3]:
import os

In [4]:
import tensorflow as tf

In [5]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

import joblib

In [6]:
from bokeh.plotting import figure, show, output_file, save
from bokeh.io import output_notebook
from bokeh.layouts import row, gridplot, layout
from bokeh.palettes import d3
output_notebook()

In [7]:
# a function to encode float values for serialized examples
def _float_feature_from_list_of_values(list_of_values):
    """Returns a float_list from a list of floats / doubles."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=list_of_values))

In [8]:
# the main source is the electricity dataset LD2011-2014 from UCI

In [9]:
# it resides in
dataset_path = '/home/developer/gcp/cbidmltsf/datasets/electricity'

In [10]:
os.listdir(dataset_path)

['LD2011_2014.txt',
 'separated_preprocessed',
 'separated_raw',
 'hourly_electricity_complete.pkl',
 'hourly_electricity.csv',
 'LD2011_2014.txt.zip',
 'hourly_electricity_filtered_academic_papers.pkl']

In [11]:
# 'LD2011_2014.txt'                                          source from UCI
# 'LD2011_2014.txt.zip'                                      source from UCI, compressed
# 'hourly_electricity.csv'                                   complete dataset in CSV
# 'hourly_electricity_complete.pkl'                          complete dataset in Pandas
# 'hourly_electricity_filtered_academic_papers.pkl'          filtered dataset for benchmarking
# 'separated_raw/'                                           pickles per customer, raw data
# 'separated_preprocessed/'                                  pickles per customer, outliers removed

In [12]:
# a SLDB is produced from separated time series (raw or preprocessed)

# SLDB contents are:
# TFRecord files for training
# TFRecord files for evaluation (if eval required)
# time series pickles for testing

In [13]:
# constant values for positional encodings
hours_in_day = 24
days_in_week = 7
days_in_month = 30
days_in_year = 365

In [14]:
# a constant to make sin/cos functions from hours_from_start (the 'age' covariate)
total_hours = 32303

In [15]:
# split the time series in seen (train, eval) and unseen (test) data
# according to academic papers:

# 243 days on seen data, 7 days on unseen data 

# seen data:      '2014-01-01 00:00:00' to '2014-08-31 23:00:00', 243*24 = 5832 lectures

# train/eval split is 0.9/0.1, then

# train data:     '2014-01-01 00:00:00' to '2014-08-07 15:00:00', 5248 lectures
# eval data:      '2014-08-07 16:00:00' to '2014-08-31 23:00:00', 584 lectures

# unseen data:    '2014-09-01 00:00:00' to '2014-09-07 23:00:00', 7*24 = 168 lectures

# 243 weeks for seen data, 1 week for unseen data
no_lectures_seen_data = 243*24 # 5832

# seen data is divided as 90% for training and 10% for evaluation
train_eval_limit = 0.9

train_interval_end = int(no_lectures_seen_data*train_eval_limit) # 5248

In [16]:
# build sub-series to be persisted as serialized training examples

# dimensionality of the encoder input
m = 168

# dimensionality of the decoder output 
t = 168

In [17]:
# columns to be included in the SLDB

# use 7D encoder (age, hour-day, day-week)

sldb_columns = [
    'date',
    'token_id',
    'kw_scaled',
    'sin_hours_from_start',
    'cos_hours_from_start',
    'sin_hour_day',
    'cos_hour_day',
    'sin_day_week',
    'cos_day_week',
    # 'sin_day_month',
    # 'cos_day_month',
    # 'sin_day_year',
    # 'cos_day_year'    
]

In [20]:
sldb = {
    'ts': 'LD2011-2014_SEPARATED_FULL',
    'embedding': {
        'hourly': 168
    },
    'tau': {
        'hourly': 1
    },
    'no_targets': 168,
    'BSCTRFM': 1,
    'preprocessed': 0
}

In [21]:
sldb

{'ts': 'LD2011-2014_SEPARATED_FULL',
 'embedding': {'hourly': 168},
 'tau': {'hourly': 1},
 'no_targets': 168,
 'BSCTRFM': 1,
 'preprocessed': 0}

In [22]:
# a string with the basic specifications of the SLDB, as part of the SLDB identifier

# add the suffix '11D' to differentiate this SLDB from the original one, which is 9D

# add the suffix MMX to indicate the scaler used was MinMax
# add the suffix STD to indicate the scaler used was Standard

sldb_specs = 'BSCTRFM_{:03d}_{:03d}_07DB_MMX'.format(sldb['embedding']['hourly'], sldb['no_targets'])
sldb_specs

'BSCTRFM_168_168_07DB_MMX'

In [23]:
# get the time-based identifier for the SLDB
sldb_identifier = '{}_{}'.format(sldb['ts'], sldb_specs)
sldb_identifier

'LD2011-2014_SEPARATED_FULL_BSCTRFM_168_168_07DB_MMX'

In [24]:
sldb_dir = '/home/developer/gcp/cbidmltsf/sldbs/{}'.format(sldb_identifier)
sldb_dir

'/home/developer/gcp/cbidmltsf/sldbs/LD2011-2014_SEPARATED_FULL_BSCTRFM_168_168_07DB_MMX'

In [22]:
# make a directory for the complete SLDB
try:
    os.mkdir(sldb_dir)
    print('Directory {} was created.'.format(sldb_dir))
except FileExistsError:
    print('Error: directory {} already exists.'.format(sldb_dir))

Directory /home/developer/gcp/cbidmltsf/sldbs/LD2011-2014_SEPARATED_FULL_BSCTRFM_168_168_07DB_MMX was created.


In [23]:
# make a sub-directory for the training TFRecord files
try:
    os.mkdir('{}/train'.format(sldb_dir))
    print('Directory {}/train was created.'.format(sldb_dir))
except FileExistsError:
    print('Error: directory {}/train already exists.'.format(sldb_dir))

Directory /home/developer/gcp/cbidmltsf/sldbs/LD2011-2014_SEPARATED_FULL_BSCTRFM_168_168_07DB_MMX/train was created.


In [24]:
# make a sub-directory for the evaluation TFRecord files
try:
    os.mkdir('{}/eval'.format(sldb_dir))
    print('Directory {}/eval was created.'.format(sldb_dir))
except FileExistsError:
    print('Error: directory {}/eval already exists.'.format(sldb_dir))

Directory /home/developer/gcp/cbidmltsf/sldbs/LD2011-2014_SEPARATED_FULL_BSCTRFM_168_168_07DB_MMX/eval was created.


In [25]:
# make a sub-directory to persist time series pickles used for inference
try:
    os.mkdir('{}/test'.format(sldb_dir))
    print('Directory {}/test was created.'.format(sldb_dir))
except FileExistsError:
    print('Error: directory {}/test already exists.'.format(sldb_dir))

Directory /home/developer/gcp/cbidmltsf/sldbs/LD2011-2014_SEPARATED_FULL_BSCTRFM_168_168_07DB_MMX/test was created.


In [25]:
# get a path to the scalers sub-directory
scalers_dir = '{}/scalers'.format(sldb_dir)
scalers_dir

'/home/developer/gcp/cbidmltsf/sldbs/LD2011-2014_SEPARATED_FULL_BSCTRFM_168_168_07DB_MMX/scalers'

In [27]:
# make a sub-directory for the scalers
try:
    os.mkdir(scalers_dir)
    print('Directory {} was created.'.format(scalers_dir))
except FileExistsError:
    print('Error: directory {} already exists.'.format(scalers_dir))

Directory /home/developer/gcp/cbidmltsf/sldbs/LD2011-2014_SEPARATED_FULL_BSCTRFM_168_168_07DB_MMX/scalers was created.


In [26]:
encoder_input_columns = [
    'kw_scaled',
    'sin_hours_from_start',
    'cos_hours_from_start',
    'sin_hour_day',
    'cos_hour_day',
    'sin_day_week',
    'cos_day_week',
    # 'sin_day_month',
    # 'cos_day_month',
    # 'sin_day_year',
    # 'cos_day_year'
]

In [27]:
# both the encoder input and the decoder input use the same columns from the source sub_series dataframe
decoder_input_columns = encoder_input_columns

In [28]:
target_columns = ['kw_scaled']

In [29]:
id_columns = ['token_id']

In [30]:
# a dictionary to manage data per individual customer_id
data = dict()

In [31]:
# the number of cores available for training in Cloud TPU
num_cores = 8

In [32]:
start, end = 1, 370

In [33]:
token_ids = [token_id for token_id in np.arange(start, end + 1)]

In [34]:
customer_ids = ['MT_{:03d}'.format(token_id) for token_id in token_ids]

In [35]:
# are we training over raw data or preprocessed data?
state = 'raw'

In [36]:
for customer_id in customer_ids:
    customer_data_path = '{}/separated_{}/{}.pkl'.format(dataset_path, state, customer_id)
    data[customer_id] = pd.read_pickle(customer_data_path)

In [37]:
len(data.keys())

370

In [38]:
# exclude token ids from SLDB here

# 223 has no data!
excluded_token_ids = [223]

In [39]:
# there are 20 time series with a total number of lectures under the expected 6000
incomplete_time_series = [
    'MT_106', 'MT_107', 'MT_108', 'MT_109', 'MT_110', 'MT_111', 'MT_112',
    'MT_113', 'MT_115', 'MT_116', 'MT_117', 'MT_120', 'MT_121', 'MT_122',
    'MT_133', 'MT_160', 'MT_178', 'MT_181', 'MT_223', 'MT_337']

In [63]:
# for instance
customer_id = incomplete_time_series[0]
data[customer_id]

Unnamed: 0,power_usage,token_id,date,hours_from_start,days_from_start,hour_of_day,day_of_week,day_of_month,day_of_year,week_of_year,month_of_year
2668763,27.302944,106,2014-01-14 00:00:00,26616.0,1109,0,1,14,14,3,1
2668764,36.324786,106,2014-01-14 01:00:00,26617.0,1109,1,1,14,14,3,1
2668765,37.037037,106,2014-01-14 02:00:00,26618.0,1109,2,1,14,14,3,1
2668766,37.037037,106,2014-01-14 03:00:00,26619.0,1109,3,1,14,14,3,1
2668767,35.612536,106,2014-01-14 04:00:00,26620.0,1109,4,1,14,14,3,1
...,...,...,...,...,...,...,...,...,...,...,...
2674446,32.288699,106,2014-09-07 19:00:00,32299.0,1345,19,6,7,250,36,9
2674447,31.339031,106,2014-09-07 20:00:00,32300.0,1345,20,6,7,250,36,9
2674448,32.051282,106,2014-09-07 21:00:00,32301.0,1345,21,6,7,250,36,9
2674449,33.000950,106,2014-09-07 22:00:00,32302.0,1345,22,6,7,250,36,9


In [44]:
# locate the missing values by date

In [60]:
# first, the expected 6000 dates in a list
start_timestamp = pd.to_datetime('2014-01-01 00:00:00')
end_timestamp = pd.to_datetime('2014-09-07 23:00:00')

required_interval = pd.date_range(start=start_timestamp, end=end_timestamp, freq='H')

In [61]:
len(required_interval)

6000

In [79]:
missing_timestamps = list(set(required_interval) - set(data[customer_id]['date']))
missing_timestamps.sort()

In [82]:
len(missing_timestamps)

312

### for SLDB generation, run this unified code!

In [54]:
for token_id in [token_id for token_id in token_ids if token_id not in excluded_token_ids]:
    
    # initialize the examples dictionary for each customer
    examples = {
        'train': [],
        'eval': [],
        # test dataset is not passed to SLDB
        # 'test': []
    }

    # get the customer identifier
    customer_id = 'MT_{:03d}'.format(token_id)
    customer_id
    print('Started processing for {}'.format(customer_id))

    # a temporary dataframe with data per customer_id to build the sub-series/examples
    # data_df = filtered_output[filtered_output['token_id'] == token_id].copy()
    
    # use now a reference to the dataframe in the data dictionary 
    data_df = data[customer_id]

    # expand with positional encodings
    data_df['sin_hours_from_start'] = np.sin(2*np.pi*data_df.hours_from_start/total_hours)
    data_df['cos_hours_from_start'] = np.cos(2*np.pi*data_df.hours_from_start/total_hours)
    data_df['sin_hour_day'] = np.sin(2*np.pi*data_df.hour_of_day/hours_in_day)
    data_df['cos_hour_day'] = np.cos(2*np.pi*data_df.hour_of_day/hours_in_day)
    data_df['sin_day_week'] = np.sin(2*np.pi*data_df.day_of_week/days_in_week)
    data_df['cos_day_week'] = np.cos(2*np.pi*data_df.day_of_week/days_in_week)
    # data_df['sin_day_month'] = np.sin(2*np.pi*data_df.day_of_month/days_in_month)
    # data_df['cos_day_month'] = np.cos(2*np.pi*data_df.day_of_month/days_in_month)
    # data_df['sin_day_year'] = np.sin(2*np.pi*data_df.day_of_year/days_in_year)
    # data_df['cos_day_year'] = np.cos(2*np.pi*data_df.day_of_year/days_in_year)

    # get a series for the power usage variable on the training dataset, to fit the scaler
    lectures_train_data = data_df['power_usage'][:train_interval_end]

    # fit a scaler only on train data
    # it is required to pass the power usage time series to a (?, 1) NumPy array
    lectures_train_data_array = np.array(lectures_train_data).reshape(-1, 1)
    
    # use MinMax scaler or Standard scaler

    # get MinMaxScaler on train data, store it in a dictionary
    scaler_type = 'min_max'
    scaler = MinMaxScaler()
    fitted_scaler = scaler.fit(lectures_train_data_array)
    print('Scaler {} generated on training data for {}'.format(scaler_type, customer_id))

    # persist the scaler
    scaler_filename = '{}/{}_{}.save'.format(scalers_dir, scaler_type, customer_id)
    joblib.dump(fitted_scaler, scaler_filename)
    print('Scaler {} persisted for {}'.format(scaler_type, customer_id))
    
    '''
    # get Standard on train data, store it in a dictionary
    scaler_type = 'standard'
    scaler = StandardScaler()
    fitted_scaler = scaler.fit(lectures_train_data_array)
    print('Scaler {} generated on training data for {}'.format(scaler_type, customer_id))

    # persist the scaler
    scaler_filename = '{}/{}_{}.save'.format(scalers_dir, scaler_type, customer_id)
    joblib.dump(fitted_scaler, scaler_filename)
    print('Scaler {} persisted for {}'.format(scaler_type, customer_id))
    '''
        
    # get an array from the variable time series (seen and unseen)
    all_data_variable_array = np.array(data_df.power_usage).reshape(-1, 1)

    # apply the scaler over all data (seen and unseen)
    # rescale, and squeeze to drop the extra dimension, then assign to the new column kw_scaled
    data_df['kw_scaled'] = np.squeeze(fitted_scaler.transform(all_data_variable_array))
    
    # at this moment, the individual time series is ready to be window-rolled to produce
    # sub-series/examples to serialize
    
    # BSCTRFM inference process is not direct, but iterative, therefoer
    # no TFRecord SLDB is required for test dataset,
    
    # persist only the time series corresponding to the inference interval as test dataset
    test_time_series = data_df[sldb_columns][no_lectures_seen_data - (m + t) + 1:]
    
    # path to persist the time series dataframe corresponding to test dataset
    path = '{}/test/{}.pkl'.format(sldb_dir, customer_id)
    
    test_time_series.to_pickle(path)
    print('Test dataset persisted as a time series pickle for {}'.format(customer_id))
    
    
    # get an iterable with all the possible sub-series for training examples
    for starting_point in np.arange(train_interval_end - (m + t) + 1):

        sub_series_df = data_df[sldb_columns][starting_point:starting_point + (m + t)]

        encoder_input_df = sub_series_df[encoder_input_columns][:m]
        decoder_input_df = sub_series_df[decoder_input_columns][m-1:m-1+t]
        target_df = sub_series_df[target_columns][m:m+t]
        id_df = sub_series_df[id_columns][:1]

        encoder_input_list = encoder_input_df.reset_index(drop=True).to_numpy().flatten().tolist()
        decoder_input_list = decoder_input_df.reset_index(drop=True).to_numpy().flatten().tolist()
        target_list = target_df.reset_index(drop=True).to_numpy().flatten().tolist()
        id_list = id_df.reset_index(drop=True).to_numpy().flatten().tolist()

        examples['train'].append(
            {
                'encoder_input': encoder_input_list,
                'decoder_input': decoder_input_list,
                'target': target_list,
                'id': id_list,
            }
        )

    print('{} processed. The number of examples in {} dataset is {}'.\
          format(customer_id, 'train', len(examples['train'])))


    # ToDo: remove evaluation step from Cloud TPU training and use all seen data for training stage
    build_eval_set = True

    if build_eval_set:

        # get an iterable with all the possible sub-series for evaluation examples
        for starting_point in np.arange(train_interval_end, no_lectures_seen_data - (m + t) + 1):

            sub_series_df = data_df[sldb_columns][starting_point:starting_point + (m + t)]

            encoder_input_df = sub_series_df[encoder_input_columns][:m]
            decoder_input_df = sub_series_df[decoder_input_columns][m-1:m-1+t]
            target_df = sub_series_df[target_columns][m:m+t]
            id_df = sub_series_df[id_columns][:1]

            encoder_input_list = encoder_input_df.reset_index(drop=True).to_numpy().flatten().tolist()
            decoder_input_list = decoder_input_df.reset_index(drop=True).to_numpy().flatten().tolist()
            target_list = target_df.reset_index(drop=True).to_numpy().flatten().tolist()
            id_list = id_df.reset_index(drop=True).to_numpy().flatten().tolist()

            examples['eval'].append(
                {
                    'encoder_input': encoder_input_list,
                    'decoder_input': decoder_input_list,
                    'target': target_list,
                    'id': id_list,
                }
            )

        print('{} processed. The number of examples in {} dataset is {}'.\
              format(customer_id, 'eval', len(examples['eval'])))


    # DO NOT PRODUCE A TEST DATASET FOR SLDB, AS INFERENCE PROCESS IS NOT DIRECT
    # (IT IS ITERATIVE OVER UNSEEN DATA TIME SERIES)

    # on each customer dataset, adjust the number of examples to the number of training cores
    for stage in ['train', 'eval']:
        # how many examples/rows must be removed from examples[stage] to comply with the number of cores
        examples_to_remove = len(examples[stage])%num_cores

        # remove the last 'examples_to_remove' examples from the dataset
        for _ in np.arange(examples_to_remove):
            examples[stage].pop(-1)

        print('For {} cores in Cloud TPU, the number of {} examples for {} was adjusted to {}'.\
             format(num_cores, stage, customer_id, len(examples[stage])))


    # serialize the rows in examples['train'] and, if present, examples['eval']
    # to avoid excesive memory consumption

    # write a TFRecord file for each consumer_id/stage
    for stage in ['train', 'eval']:
        # N_ROWS = sldb['stats'][stage]['n_rows']
        N_ROWS = len(examples[stage])
        filename = '{}/{}/{}.tfrecord'.format(sldb_dir, stage, customer_id)

        with tf.io.TFRecordWriter(filename) as writer:
            for row in np.arange(N_ROWS):

                example = tf.train.Example(
                    # features within the example
                    features=tf.train.Features(
                        # individual feature definition
                        feature={'encoder_input':
                                 _float_feature_from_list_of_values(
                                     examples[stage][row]['encoder_input']),
                                 'decoder_input':
                                 _float_feature_from_list_of_values(
                                     examples[stage][row]['decoder_input']),
                                 'target':
                                 _float_feature_from_list_of_values(
                                     examples[stage][row]['target']),
                                 'id':
                                 _float_feature_from_list_of_values(
                                     examples[stage][row]['id'])
                                 }
                    )
                )
                serialized_example = example.SerializeToString()
                writer.write(serialized_example)

            # report TFRecord file as completed
            print('Persisted {} TFRecord file for {}'.format(stage, customer_id))

Started processing for MT_001
Scaler min_max generated on training data for MT_001
Scaler min_max persisted for MT_001
Test dataset persisted as a time series pickle for MT_001
MT_001 processed. The number of examples in train dataset is 4913
MT_001 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train examples for MT_001 was adjusted to 4912
For 8 cores in Cloud TPU, the number of eval examples for MT_001 was adjusted to 248
Persisted train TFRecord file for MT_001
Persisted eval TFRecord file for MT_001
Started processing for MT_002
Scaler min_max generated on training data for MT_002
Scaler min_max persisted for MT_002
Test dataset persisted as a time series pickle for MT_002
MT_002 processed. The number of examples in train dataset is 4913
MT_002 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train examples for MT_002 was adjusted to 4912
For 8 cores in Cloud TPU, the number of eval e

Persisted train TFRecord file for MT_015
Persisted eval TFRecord file for MT_015
Started processing for MT_016
Scaler min_max generated on training data for MT_016
Scaler min_max persisted for MT_016
Test dataset persisted as a time series pickle for MT_016
MT_016 processed. The number of examples in train dataset is 4913
MT_016 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train examples for MT_016 was adjusted to 4912
For 8 cores in Cloud TPU, the number of eval examples for MT_016 was adjusted to 248
Persisted train TFRecord file for MT_016
Persisted eval TFRecord file for MT_016
Started processing for MT_017
Scaler min_max generated on training data for MT_017
Scaler min_max persisted for MT_017
Test dataset persisted as a time series pickle for MT_017
MT_017 processed. The number of examples in train dataset is 4913
MT_017 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train exampl

Persisted train TFRecord file for MT_030
Persisted eval TFRecord file for MT_030
Started processing for MT_031
Scaler min_max generated on training data for MT_031
Scaler min_max persisted for MT_031
Test dataset persisted as a time series pickle for MT_031
MT_031 processed. The number of examples in train dataset is 4913
MT_031 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train examples for MT_031 was adjusted to 4912
For 8 cores in Cloud TPU, the number of eval examples for MT_031 was adjusted to 248
Persisted train TFRecord file for MT_031
Persisted eval TFRecord file for MT_031
Started processing for MT_032
Scaler min_max generated on training data for MT_032
Scaler min_max persisted for MT_032
Test dataset persisted as a time series pickle for MT_032
MT_032 processed. The number of examples in train dataset is 4913
MT_032 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train exampl

Persisted train TFRecord file for MT_045
Persisted eval TFRecord file for MT_045
Started processing for MT_046
Scaler min_max generated on training data for MT_046
Scaler min_max persisted for MT_046
Test dataset persisted as a time series pickle for MT_046
MT_046 processed. The number of examples in train dataset is 4913
MT_046 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train examples for MT_046 was adjusted to 4912
For 8 cores in Cloud TPU, the number of eval examples for MT_046 was adjusted to 248
Persisted train TFRecord file for MT_046
Persisted eval TFRecord file for MT_046
Started processing for MT_047
Scaler min_max generated on training data for MT_047
Scaler min_max persisted for MT_047
Test dataset persisted as a time series pickle for MT_047
MT_047 processed. The number of examples in train dataset is 4913
MT_047 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train exampl

Persisted train TFRecord file for MT_060
Persisted eval TFRecord file for MT_060
Started processing for MT_061
Scaler min_max generated on training data for MT_061
Scaler min_max persisted for MT_061
Test dataset persisted as a time series pickle for MT_061
MT_061 processed. The number of examples in train dataset is 4913
MT_061 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train examples for MT_061 was adjusted to 4912
For 8 cores in Cloud TPU, the number of eval examples for MT_061 was adjusted to 248
Persisted train TFRecord file for MT_061
Persisted eval TFRecord file for MT_061
Started processing for MT_062
Scaler min_max generated on training data for MT_062
Scaler min_max persisted for MT_062
Test dataset persisted as a time series pickle for MT_062
MT_062 processed. The number of examples in train dataset is 4913
MT_062 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train exampl

Persisted train TFRecord file for MT_075
Persisted eval TFRecord file for MT_075
Started processing for MT_076
Scaler min_max generated on training data for MT_076
Scaler min_max persisted for MT_076
Test dataset persisted as a time series pickle for MT_076
MT_076 processed. The number of examples in train dataset is 4913
MT_076 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train examples for MT_076 was adjusted to 4912
For 8 cores in Cloud TPU, the number of eval examples for MT_076 was adjusted to 248
Persisted train TFRecord file for MT_076
Persisted eval TFRecord file for MT_076
Started processing for MT_077
Scaler min_max generated on training data for MT_077
Scaler min_max persisted for MT_077
Test dataset persisted as a time series pickle for MT_077
MT_077 processed. The number of examples in train dataset is 4913
MT_077 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train exampl

Persisted train TFRecord file for MT_090
Persisted eval TFRecord file for MT_090
Started processing for MT_091
Scaler min_max generated on training data for MT_091
Scaler min_max persisted for MT_091
Test dataset persisted as a time series pickle for MT_091
MT_091 processed. The number of examples in train dataset is 4913
MT_091 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train examples for MT_091 was adjusted to 4912
For 8 cores in Cloud TPU, the number of eval examples for MT_091 was adjusted to 248
Persisted train TFRecord file for MT_091
Persisted eval TFRecord file for MT_091
Started processing for MT_092
Scaler min_max generated on training data for MT_092
Scaler min_max persisted for MT_092
Test dataset persisted as a time series pickle for MT_092
MT_092 processed. The number of examples in train dataset is 4913
MT_092 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train exampl

Persisted train TFRecord file for MT_105
Persisted eval TFRecord file for MT_105
Started processing for MT_106
Scaler min_max generated on training data for MT_106
Scaler min_max persisted for MT_106
Test dataset persisted as a time series pickle for MT_106
MT_106 processed. The number of examples in train dataset is 4913
MT_106 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train examples for MT_106 was adjusted to 4912
For 8 cores in Cloud TPU, the number of eval examples for MT_106 was adjusted to 248
Persisted train TFRecord file for MT_106
Persisted eval TFRecord file for MT_106
Started processing for MT_107
Scaler min_max generated on training data for MT_107
Scaler min_max persisted for MT_107
Test dataset persisted as a time series pickle for MT_107
MT_107 processed. The number of examples in train dataset is 4913
MT_107 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train exampl

Persisted train TFRecord file for MT_120
Persisted eval TFRecord file for MT_120
Started processing for MT_121
Scaler min_max generated on training data for MT_121
Scaler min_max persisted for MT_121
Test dataset persisted as a time series pickle for MT_121
MT_121 processed. The number of examples in train dataset is 4913
MT_121 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train examples for MT_121 was adjusted to 4912
For 8 cores in Cloud TPU, the number of eval examples for MT_121 was adjusted to 248
Persisted train TFRecord file for MT_121
Persisted eval TFRecord file for MT_121
Started processing for MT_122
Scaler min_max generated on training data for MT_122
Scaler min_max persisted for MT_122
Test dataset persisted as a time series pickle for MT_122
MT_122 processed. The number of examples in train dataset is 4913
MT_122 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train exampl

Persisted train TFRecord file for MT_135
Persisted eval TFRecord file for MT_135
Started processing for MT_136
Scaler min_max generated on training data for MT_136
Scaler min_max persisted for MT_136
Test dataset persisted as a time series pickle for MT_136
MT_136 processed. The number of examples in train dataset is 4913
MT_136 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train examples for MT_136 was adjusted to 4912
For 8 cores in Cloud TPU, the number of eval examples for MT_136 was adjusted to 248
Persisted train TFRecord file for MT_136
Persisted eval TFRecord file for MT_136
Started processing for MT_137
Scaler min_max generated on training data for MT_137
Scaler min_max persisted for MT_137
Test dataset persisted as a time series pickle for MT_137
MT_137 processed. The number of examples in train dataset is 4913
MT_137 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train exampl

Persisted train TFRecord file for MT_150
Persisted eval TFRecord file for MT_150
Started processing for MT_151
Scaler min_max generated on training data for MT_151
Scaler min_max persisted for MT_151
Test dataset persisted as a time series pickle for MT_151
MT_151 processed. The number of examples in train dataset is 4913
MT_151 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train examples for MT_151 was adjusted to 4912
For 8 cores in Cloud TPU, the number of eval examples for MT_151 was adjusted to 248
Persisted train TFRecord file for MT_151
Persisted eval TFRecord file for MT_151
Started processing for MT_152
Scaler min_max generated on training data for MT_152
Scaler min_max persisted for MT_152
Test dataset persisted as a time series pickle for MT_152
MT_152 processed. The number of examples in train dataset is 4913
MT_152 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train exampl

Persisted train TFRecord file for MT_165
Persisted eval TFRecord file for MT_165
Started processing for MT_166
Scaler min_max generated on training data for MT_166
Scaler min_max persisted for MT_166
Test dataset persisted as a time series pickle for MT_166
MT_166 processed. The number of examples in train dataset is 4913
MT_166 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train examples for MT_166 was adjusted to 4912
For 8 cores in Cloud TPU, the number of eval examples for MT_166 was adjusted to 248
Persisted train TFRecord file for MT_166
Persisted eval TFRecord file for MT_166
Started processing for MT_167
Scaler min_max generated on training data for MT_167
Scaler min_max persisted for MT_167
Test dataset persisted as a time series pickle for MT_167
MT_167 processed. The number of examples in train dataset is 4913
MT_167 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train exampl

Persisted train TFRecord file for MT_180
Persisted eval TFRecord file for MT_180
Started processing for MT_181
Scaler min_max generated on training data for MT_181
Scaler min_max persisted for MT_181
Test dataset persisted as a time series pickle for MT_181
MT_181 processed. The number of examples in train dataset is 4913
MT_181 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train examples for MT_181 was adjusted to 4912
For 8 cores in Cloud TPU, the number of eval examples for MT_181 was adjusted to 248
Persisted train TFRecord file for MT_181
Persisted eval TFRecord file for MT_181
Started processing for MT_182
Scaler min_max generated on training data for MT_182
Scaler min_max persisted for MT_182
Test dataset persisted as a time series pickle for MT_182
MT_182 processed. The number of examples in train dataset is 4913
MT_182 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train exampl

Persisted train TFRecord file for MT_195
Persisted eval TFRecord file for MT_195
Started processing for MT_196
Scaler min_max generated on training data for MT_196
Scaler min_max persisted for MT_196
Test dataset persisted as a time series pickle for MT_196
MT_196 processed. The number of examples in train dataset is 4913
MT_196 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train examples for MT_196 was adjusted to 4912
For 8 cores in Cloud TPU, the number of eval examples for MT_196 was adjusted to 248
Persisted train TFRecord file for MT_196
Persisted eval TFRecord file for MT_196
Started processing for MT_197
Scaler min_max generated on training data for MT_197
Scaler min_max persisted for MT_197
Test dataset persisted as a time series pickle for MT_197
MT_197 processed. The number of examples in train dataset is 4913
MT_197 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train exampl

Persisted train TFRecord file for MT_210
Persisted eval TFRecord file for MT_210
Started processing for MT_211
Scaler min_max generated on training data for MT_211
Scaler min_max persisted for MT_211
Test dataset persisted as a time series pickle for MT_211
MT_211 processed. The number of examples in train dataset is 4913
MT_211 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train examples for MT_211 was adjusted to 4912
For 8 cores in Cloud TPU, the number of eval examples for MT_211 was adjusted to 248
Persisted train TFRecord file for MT_211
Persisted eval TFRecord file for MT_211
Started processing for MT_212
Scaler min_max generated on training data for MT_212
Scaler min_max persisted for MT_212
Test dataset persisted as a time series pickle for MT_212
MT_212 processed. The number of examples in train dataset is 4913
MT_212 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train exampl

Persisted train TFRecord file for MT_226
Persisted eval TFRecord file for MT_226
Started processing for MT_227
Scaler min_max generated on training data for MT_227
Scaler min_max persisted for MT_227
Test dataset persisted as a time series pickle for MT_227
MT_227 processed. The number of examples in train dataset is 4913
MT_227 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train examples for MT_227 was adjusted to 4912
For 8 cores in Cloud TPU, the number of eval examples for MT_227 was adjusted to 248
Persisted train TFRecord file for MT_227
Persisted eval TFRecord file for MT_227
Started processing for MT_228
Scaler min_max generated on training data for MT_228
Scaler min_max persisted for MT_228
Test dataset persisted as a time series pickle for MT_228
MT_228 processed. The number of examples in train dataset is 4913
MT_228 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train exampl

Persisted train TFRecord file for MT_241
Persisted eval TFRecord file for MT_241
Started processing for MT_242
Scaler min_max generated on training data for MT_242
Scaler min_max persisted for MT_242
Test dataset persisted as a time series pickle for MT_242
MT_242 processed. The number of examples in train dataset is 4913
MT_242 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train examples for MT_242 was adjusted to 4912
For 8 cores in Cloud TPU, the number of eval examples for MT_242 was adjusted to 248
Persisted train TFRecord file for MT_242
Persisted eval TFRecord file for MT_242
Started processing for MT_243
Scaler min_max generated on training data for MT_243
Scaler min_max persisted for MT_243
Test dataset persisted as a time series pickle for MT_243
MT_243 processed. The number of examples in train dataset is 4913
MT_243 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train exampl

Persisted train TFRecord file for MT_256
Persisted eval TFRecord file for MT_256
Started processing for MT_257
Scaler min_max generated on training data for MT_257
Scaler min_max persisted for MT_257
Test dataset persisted as a time series pickle for MT_257
MT_257 processed. The number of examples in train dataset is 4913
MT_257 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train examples for MT_257 was adjusted to 4912
For 8 cores in Cloud TPU, the number of eval examples for MT_257 was adjusted to 248
Persisted train TFRecord file for MT_257
Persisted eval TFRecord file for MT_257
Started processing for MT_258
Scaler min_max generated on training data for MT_258
Scaler min_max persisted for MT_258
Test dataset persisted as a time series pickle for MT_258
MT_258 processed. The number of examples in train dataset is 4913
MT_258 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train exampl

Persisted train TFRecord file for MT_271
Persisted eval TFRecord file for MT_271
Started processing for MT_272
Scaler min_max generated on training data for MT_272
Scaler min_max persisted for MT_272
Test dataset persisted as a time series pickle for MT_272
MT_272 processed. The number of examples in train dataset is 4913
MT_272 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train examples for MT_272 was adjusted to 4912
For 8 cores in Cloud TPU, the number of eval examples for MT_272 was adjusted to 248
Persisted train TFRecord file for MT_272
Persisted eval TFRecord file for MT_272
Started processing for MT_273
Scaler min_max generated on training data for MT_273
Scaler min_max persisted for MT_273
Test dataset persisted as a time series pickle for MT_273
MT_273 processed. The number of examples in train dataset is 4913
MT_273 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train exampl

Persisted train TFRecord file for MT_286
Persisted eval TFRecord file for MT_286
Started processing for MT_287
Scaler min_max generated on training data for MT_287
Scaler min_max persisted for MT_287
Test dataset persisted as a time series pickle for MT_287
MT_287 processed. The number of examples in train dataset is 4913
MT_287 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train examples for MT_287 was adjusted to 4912
For 8 cores in Cloud TPU, the number of eval examples for MT_287 was adjusted to 248
Persisted train TFRecord file for MT_287
Persisted eval TFRecord file for MT_287
Started processing for MT_288
Scaler min_max generated on training data for MT_288
Scaler min_max persisted for MT_288
Test dataset persisted as a time series pickle for MT_288
MT_288 processed. The number of examples in train dataset is 4913
MT_288 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train exampl

Persisted train TFRecord file for MT_301
Persisted eval TFRecord file for MT_301
Started processing for MT_302
Scaler min_max generated on training data for MT_302
Scaler min_max persisted for MT_302
Test dataset persisted as a time series pickle for MT_302
MT_302 processed. The number of examples in train dataset is 4913
MT_302 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train examples for MT_302 was adjusted to 4912
For 8 cores in Cloud TPU, the number of eval examples for MT_302 was adjusted to 248
Persisted train TFRecord file for MT_302
Persisted eval TFRecord file for MT_302
Started processing for MT_303
Scaler min_max generated on training data for MT_303
Scaler min_max persisted for MT_303
Test dataset persisted as a time series pickle for MT_303
MT_303 processed. The number of examples in train dataset is 4913
MT_303 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train exampl

Persisted train TFRecord file for MT_316
Persisted eval TFRecord file for MT_316
Started processing for MT_317
Scaler min_max generated on training data for MT_317
Scaler min_max persisted for MT_317
Test dataset persisted as a time series pickle for MT_317
MT_317 processed. The number of examples in train dataset is 4913
MT_317 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train examples for MT_317 was adjusted to 4912
For 8 cores in Cloud TPU, the number of eval examples for MT_317 was adjusted to 248
Persisted train TFRecord file for MT_317
Persisted eval TFRecord file for MT_317
Started processing for MT_318
Scaler min_max generated on training data for MT_318
Scaler min_max persisted for MT_318
Test dataset persisted as a time series pickle for MT_318
MT_318 processed. The number of examples in train dataset is 4913
MT_318 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train exampl

Persisted train TFRecord file for MT_331
Persisted eval TFRecord file for MT_331
Started processing for MT_332
Scaler min_max generated on training data for MT_332
Scaler min_max persisted for MT_332
Test dataset persisted as a time series pickle for MT_332
MT_332 processed. The number of examples in train dataset is 4913
MT_332 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train examples for MT_332 was adjusted to 4912
For 8 cores in Cloud TPU, the number of eval examples for MT_332 was adjusted to 248
Persisted train TFRecord file for MT_332
Persisted eval TFRecord file for MT_332
Started processing for MT_333
Scaler min_max generated on training data for MT_333
Scaler min_max persisted for MT_333
Test dataset persisted as a time series pickle for MT_333
MT_333 processed. The number of examples in train dataset is 4913
MT_333 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train exampl

Persisted train TFRecord file for MT_346
Persisted eval TFRecord file for MT_346
Started processing for MT_347
Scaler min_max generated on training data for MT_347
Scaler min_max persisted for MT_347
Test dataset persisted as a time series pickle for MT_347
MT_347 processed. The number of examples in train dataset is 4913
MT_347 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train examples for MT_347 was adjusted to 4912
For 8 cores in Cloud TPU, the number of eval examples for MT_347 was adjusted to 248
Persisted train TFRecord file for MT_347
Persisted eval TFRecord file for MT_347
Started processing for MT_348
Scaler min_max generated on training data for MT_348
Scaler min_max persisted for MT_348
Test dataset persisted as a time series pickle for MT_348
MT_348 processed. The number of examples in train dataset is 4913
MT_348 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train exampl

Persisted train TFRecord file for MT_361
Persisted eval TFRecord file for MT_361
Started processing for MT_362
Scaler min_max generated on training data for MT_362
Scaler min_max persisted for MT_362
Test dataset persisted as a time series pickle for MT_362
MT_362 processed. The number of examples in train dataset is 4913
MT_362 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train examples for MT_362 was adjusted to 4912
For 8 cores in Cloud TPU, the number of eval examples for MT_362 was adjusted to 248
Persisted train TFRecord file for MT_362
Persisted eval TFRecord file for MT_362
Started processing for MT_363
Scaler min_max generated on training data for MT_363
Scaler min_max persisted for MT_363
Test dataset persisted as a time series pickle for MT_363
MT_363 processed. The number of examples in train dataset is 4913
MT_363 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train exampl

In [58]:
# expand the sldb dictionary with final statistics
sldb['stats'] = {
    'train': {
        'n_rows': 54032,
    },
    'eval': {
        'n_rows': 2728
    }
}

In [59]:
sldb

{'ts': 'LD2011-2014_SEPARATED_MT_320-MT_330',
 'embedding': {'hourly': 168},
 'tau': {'hourly': 1},
 'no_targets': 168,
 'BSCTRFM': 1,
 'preprocessed': 1,
 'stats': {'train': {'n_rows': 54032}, 'eval': {'n_rows': 2728}}}

In [60]:
json_filename = '{}/sldb.json'.format(sldb_dir)

In [61]:
with open(json_filename, 'w') as filename:
    json.dump(sldb, filename, indent=4)