In [1]:
import json

In [2]:
import numpy as np

In [3]:
import pandas as pd

In [4]:
import joblib

In [5]:
import tensorflow.compat.v1 as tf

In [6]:
# forecast model was saved in TensorFlow 1.15
# but, in order to make predictions locally, has to be loaded with TensorFlow 2
# therefore, get and test the appropriate function
from tensorflow.saved_model import load

In [7]:
from tensorboard.backend.event_processing import event_accumulator

In [8]:
from datetime import datetime

In [9]:
# identifier of the model to evaluate; hard-coded for this notebook session
# (presumably supplied via Abseil Flags in the production version - TODO confirm)
model_id = 'DMSLSTM_TPU_000'

In [10]:
# execution number of the model; hard-coded for this notebook session
# (presumably supplied via Abseil Flags in the production version - TODO confirm)
execution = 0

In [11]:
# model directory name: '<model_id>_<execution, zero-padded to two digits>'
model_dir = '{}_{:02d}'.format(model_id, execution)

In [12]:
# absolute root of the project tree; the model, sldb, scaler, parameters,
# stats and database paths in this notebook are all built from it
# NOTE(review): hard-coded, machine-specific path - consider making it configurable
PROJECT_ROOT = '/home/developer/gcp/cbidmltsf'

In [13]:
# start from a given (hard-coded) last saved model directory;
# the production version is expected to find it automatically
last_saved_model_suffix = 'export/exporter/1603508757'

In [14]:
# full path to the exported SavedModel of this model run
export_dir = '/'.join([PROJECT_ROOT, 'models', model_dir, last_saved_model_suffix])
# load the SavedModel using the 'serve' tag
tags = 'serve'
imported = load(export_dir=export_dir, tags=tags)

In [15]:
# the 'serving_default' signature of the imported model is used below
# as the prediction function
predict_fn = imported.signatures["serving_default"]

In [16]:
# identifier of the sldb (name of its directory under sldbs/); hard-coded here
# (presumably supplied via Abseil Flags in the production version - TODO confirm)
sldb_id = 'CPE04115_H_kw_20201021084001_008001001_008024001_004168001'

In [17]:
# directory of the selected sldb, under <PROJECT_ROOT>/sldbs/
data_dir = '{}/{}/{}'.format(PROJECT_ROOT, 'sldbs', sldb_id)

In [19]:
# get the ts_identifier from the json file in the sldb directory

In [21]:
# path to the sldb.json file inside the sldb directory
sldb_json_file = '{}/sldb.json'.format(data_dir)

In [22]:
# read the sldb json file into a dictionary
with open(sldb_json_file, 'r') as sldb_file:
    sldb_dict = json.load(sldb_file)

In [41]:
# time series identifier, taken from the 'ts' key of the sldb dictionary;
# it is used below to locate the scaler file
ts_identifier = sldb_dict['ts']

In [106]:
# name of the dataset to predict on (selects '<dataset>.tfrecord'); hard-coded here
# (presumably supplied via Abseil Flags in the production version - TODO confirm)
dataset = 'test'

In [31]:
# path to the tfrecord file of the selected dataset inside the sldb directory
dataset_path = '{}/{}.tfrecord'.format(data_dir, dataset)

In [32]:
dataset_path

'/home/developer/gcp/cbidmltsf/sldbs/CPE04115_H_kw_20201021084001_008001001_008024001_004168001/test.tfrecord'

In [33]:
# dataset of raw records read from the tfrecord file; it is iterated
# for prediction and later mapped through the parsing function
tfrecord_dataset = tf.data.TFRecordDataset(dataset_path)

In [34]:
# accumulator for the raw outputs of the prediction function
predictions_list = []

In [35]:
# run the prediction function on every record of the tfrecord dataset
predictions_list.extend(predict_fn(record) for record in tfrecord_dataset)

In [36]:
# keep only the scalar at position [0][0] of each 'forecast' output
predictions = [
    model_output['forecast'][0][0]
    for model_output in predictions_list
]

In [37]:
# convert the list of scalar predictions to a NumPy array
predictions_array = np.asarray(predictions)

In [38]:
len(predictions_array)

1591

In [39]:
predictions_array[:5]

array([0.65809375, 0.66343164, 0.6697949 , 0.7323097 , 0.7499685 ],
      dtype=float32)

In [42]:
# load the scaler persisted for this time series under timeseries/<ts_identifier>/
# (PROJECT_ROOT is already absolute, so no extra leading '/' is prepended)
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load scaler files that come from a trusted source
scaler = joblib.load('{}/{}/{}/scaler.save'.format(PROJECT_ROOT,
                                                   'timeseries',
                                                   ts_identifier))

In [43]:
# inverse-scale predictions; the array is reshaped to a single column
# before being passed to the scaler
predictions = scaler.inverse_transform(predictions_array.reshape(-1, 1))

In [44]:
# flatten the predictions to one dimension;
# unlike np.squeeze, np.ravel keeps a 1-D array even when there is a single
# prediction, so the later .tolist() always yields a list (never a bare float)
predictions = np.ravel(predictions)

In [45]:
# convert the final prediction values to a plain Python list for json serialization
prediction_values_list = predictions.tolist()

In [46]:
prediction_values_list[:10]

[2914.691162109375,
 2926.17431640625,
 2939.863525390625,
 3074.3505859375,
 3112.33984375,
 3034.739990234375,
 2734.530517578125,
 2470.22509765625,
 2239.215576171875,
 2104.2314453125]

In [49]:
# path to the sldb parameters json file of this model, under parameters/<model_id>/
parameters_json_file = '{}/{}/{}/sldb_parameters.json'.format(
    PROJECT_ROOT, 'parameters', model_id)

In [50]:
# recover the sldb parameters dictionary from the json file in parameters/,
# whose path is held in parameters_json_file
with open(parameters_json_file, 'r') as inputfile:
    sldb_parameters = json.load(inputfile)

In [51]:
# objective shapes used to reshape the parsed tensors, keyed by feature name;
# hourly, daily and weekly lengths come from the 'embedding' sldb parameters
_EXTRACTING_OBJECTIVE_SHAPES = {
    resolution: [sldb_parameters['embedding'][resolution], 1]
    for resolution in ('hourly', 'daily', 'weekly')
}
_EXTRACTING_OBJECTIVE_SHAPES.update({
    # ToDo: un-wire this! The number of targets is included in the hourly,
    #  daily and weekly components (no_targets); take it from any one of them
    'target': [1, 1],
    'oh_wd': [7, 1],  # Monday to Sunday
    'oh_dh': [24, 1],  # midnight to 23:00
    # ToDo: un-wire this! Same as for 'target': use the number of targets
    'timestamp': [1, 1],
})

In [52]:
# this code block will be imported as:
# from dplstm.data import _parse_dataset_function
# feature specification for parsing: float32 variable-length features
# for the numeric fields, plus a string variable-length feature for the timestamp
read_features = {
    feature_name: tf.io.VarLenFeature(dtype=tf.float32)
    for feature_name in ('hourly', 'daily', 'weekly', 'target', 'oh_wd', 'oh_dh')
}
read_features['timestamp'] = tf.io.VarLenFeature(dtype=tf.string)


def _parse_dataset_function(example_proto, objective_shapes, parse_timestamp):
    """
    Parse one serialized tf.Example into a (features, target) pair.

    example_proto: the serialized example, parsed with the module-level
        read_features specification.
    objective_shapes: dictionary mapping each feature name ('hourly', 'daily',
        'weekly', 'target', 'oh_wd', 'oh_dh', 'timestamp') to the shape its
        values are reshaped to.
    parse_timestamp: when true, the reshaped 'timestamp' is added to the
        returned feature dictionary. Do not parse the timestamp for training:
        strings are not supported in TPUs (TPUEstimator does not support
        string types).
        ToDo: code timestamps into features, as numbers, so they can be
        parsed to training.

    Returns a tuple (feature_dict, target[0]); the parsed dataset must have
    the shape {features}, target.
    """
    row = tf.io.parse_single_example(example_proto, read_features)

    # reshape the values of every parsed feature to its objective shape
    # (the timestamp is reshaped regardless of parse_timestamp)
    reshaped = {
        name: tf.reshape(row[name].values, objective_shapes[name])
        for name in ('hourly', 'daily', 'weekly', 'target', 'oh_wd', 'oh_dh', 'timestamp')
    }

    # model inputs only: the target is returned apart from the features
    feature_dict = {
        name: reshaped[name]
        for name in ('hourly', 'daily', 'weekly', 'oh_wd', 'oh_dh')
    }
    if parse_timestamp:
        feature_dict['timestamp'] = reshaped['timestamp']

    return feature_dict, reshaped['target'][0]

In [53]:
# the raw dataset (tfrecord_dataset) was previously acquired from the tfrecord file;
# map it through the parsing function, timestamps included,
# to build arrays for targets and timestamps
parsed_dataset = tfrecord_dataset.map(
    lambda example: _parse_dataset_function(example_proto=example,
                                            objective_shapes=_EXTRACTING_OBJECTIVE_SHAPES,
                                            parse_timestamp=True))

In [90]:
# accumulator for the timestamps of the parsed examples, as strings
string_timestamps_list = []

In [91]:
# accumulator for the targets of the parsed examples
targets_list = []

In [92]:
# collect the timestamp (as a Python string) and the target of every parsed example
for example_features, example_target in parsed_dataset:
    timestamp_value = np.asarray(example_features['timestamp'][0][0]).astype(str)
    string_timestamps_list.append(str(timestamp_value))
    targets_list.append(example_target[0])

In [93]:
string_timestamps_list[:5]

['2018-05-26 17:00:00',
 '2018-05-26 18:00:00',
 '2018-05-26 19:00:00',
 '2018-05-26 20:00:00',
 '2018-05-26 21:00:00']

In [94]:
targets_list[:5]

[<tf.Tensor: shape=(), dtype=float32, numpy=0.65253454>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.6731292>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.69301194>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.73025095>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.78971857>]

In [95]:
# convert the list of targets to a NumPy array
targets_array = np.asarray(targets_list)

In [96]:
# inverse-scale targets; derive the input from targets_list (not from
# targets_array itself) so that re-running this cell does not inverse-scale twice
targets_array = scaler.inverse_transform(np.asarray(targets_list).reshape(-1, 1))

In [97]:
# flatten the targets to one dimension;
# unlike np.squeeze, np.ravel keeps a 1-D array even when there is a single
# target, so the later .tolist() always yields a list (never a bare float)
targets_array = np.ravel(targets_array)

In [98]:
# convert the target values to a plain Python list for json serialization
target_values_list = targets_array.tolist()

In [99]:
target_values_list[:5]

[2902.731689453125,
 2947.03662109375,
 2989.81005859375,
 3069.921630859375,
 3197.853515625]

In [100]:
# gather timestamps, predictions and targets in a single results dictionary
prediction_results = dict(
    string_timestamps=string_timestamps_list,
    predictions=prediction_values_list,
    targets=target_values_list,
)

In [109]:
# path of the json file for these predictions, under stats/predictions/
output_filename = '{}/{}/{}/{}_on_{}_tfrecord.json'.format(
    PROJECT_ROOT, 'stats', 'predictions', model_dir, dataset)

In [110]:
# serialize the results dictionary to the json file
with open(output_filename, 'w') as predictions_file:
    json.dump(prediction_results, predictions_file, indent=4)

In [111]:
# so far the json file for stats/predictions/ is ready...

In [112]:
# now use the data in memory to produce the pickle file for database/predictions/

In [113]:
# column names of the predictions dataframe, in the order the row values are zipped
pred_df_columns = ['model_id', 'execution', 'dataset', 'string_timestamp', 'prediction', 'target']

In [115]:
# number of predictions in the dataset; used below as the length
# of the repeated constant columns of the dataframe
length = len(prediction_results['predictions'])

In [117]:
# constant columns: model_id, execution and dataset, each repeated length times
model_id_repeat_list, execution_repeat_list, dataset_repeat_list = (
    [constant_value] * length
    for constant_value in (model_id, execution, dataset)
)

In [121]:
# predictions dataframe: one row per timestamp, made of the repeated
# constants plus the timestamp, prediction and target of that row
prediction_rows = zip(
    model_id_repeat_list,
    execution_repeat_list,
    dataset_repeat_list,
    prediction_results['string_timestamps'],
    prediction_results['predictions'],
    prediction_results['targets'],
)
predictions_df = pd.DataFrame(list(prediction_rows), columns=pred_df_columns)

In [122]:
predictions_df

Unnamed: 0,model_id,execution,dataset,string_timestamp,prediction,target
0,DMSLSTM_TPU_000,0,test,2018-05-26 17:00:00,2914.691162,2902.731689
1,DMSLSTM_TPU_000,0,test,2018-05-26 18:00:00,2926.174316,2947.036621
2,DMSLSTM_TPU_000,0,test,2018-05-26 19:00:00,2939.863525,2989.810059
3,DMSLSTM_TPU_000,0,test,2018-05-26 20:00:00,3074.350586,3069.921631
4,DMSLSTM_TPU_000,0,test,2018-05-26 21:00:00,3112.339844,3197.853516
...,...,...,...,...,...,...
1586,DMSLSTM_TPU_000,0,test,2018-07-31 19:00:00,2929.029297,2902.200195
1587,DMSLSTM_TPU_000,0,test,2018-07-31 20:00:00,3022.406006,2912.063232
1588,DMSLSTM_TPU_000,0,test,2018-07-31 21:00:00,2983.091064,2983.391602
1589,DMSLSTM_TPU_000,0,test,2018-07-31 22:00:00,2779.133301,2810.393311


In [124]:
# path of the pickle file for these predictions, under database/predictions/
pickle_path = '{}/{}/{}/{}_{:02d}_on_{}_tfrecord.pkl'.format(
    PROJECT_ROOT, 'database', 'predictions', model_id, execution, dataset)

In [125]:
# write the predictions dataframe to its pickle file and report it
predictions_df.to_pickle(pickle_path)
print('Persisted Pandas dataframe for predictions of {}_{:02d} on {}.tfrecord'.format(
    model_id, execution, dataset))

Persisted Pandas dataframe for predictions of DMSLSTM_TPU_000_00 on test.tfrecord


In [126]:
# now get wall times from TensorBoard summaries for model training,
# and persist them to json files in stats/training_wall_times,
# and to pickle files in database/training_wall_times

In [128]:
def get_wall_time(path_to_logdir):
    """
    Compute the wall time of a model training from its TensorBoard logs.

    Receives a UNIX path to the TensorBoard logdir of a model training,
    loads its events and returns the difference between the wall times of
    the last and the first events recorded under the 'loss' tensor tag.
    The value is also printed, reported in seconds.
    """
    # only scalars and tensors are requested; a size of 0 retrieves all of them
    size_guidance = {
        event_accumulator.SCALARS: 0,
        event_accumulator.TENSORS: 0,
    }
    accumulator = event_accumulator.EventAccumulator(path_to_logdir,
                                                     size_guidance=size_guidance)
    # load the events from file
    accumulator.Reload()

    # wall time is end time - start time of the 'loss' events
    loss_events = accumulator.Tensors('loss')
    wall_time = loss_events[-1].wall_time - loss_events[0].wall_time
    print("Wall time for model in '{}' is {} seconds.".format(path_to_logdir, wall_time))
    return wall_time

In [132]:
# the model directory under models/ is used as the TensorBoard logdir
tensorboard_log_dir = '{}/{}/{}'.format(PROJECT_ROOT, 'models', model_dir)

In [134]:
# training wall time of the model, computed from its TensorBoard events
wall_time = get_wall_time(tensorboard_log_dir)

Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.


Wall time for model in '/home/developer/gcp/cbidmltsf/models/DMSLSTM_TPU_000_00' is 23.646011114120483 seconds.


In [135]:
# wrap the wall_time value in a dictionary for json serialization
wt_dictionary = dict(wall_time=wall_time)

In [140]:
# build the path of the json file for this model run under stats/training_wall_times/
# (note: this rebinds output_filename, used earlier for the predictions json file)
output_filename = '{}/{}/{}/{}.json'.format(PROJECT_ROOT, 'stats', 'training_wall_times', model_dir)

In [141]:
output_filename

'/home/developer/gcp/cbidmltsf/stats/training_wall_times/DMSLSTM_TPU_000_00.json'

In [142]:
# serialize the wall time dictionary to the json file
with open(output_filename, 'w') as wall_time_file:
    json.dump(wt_dictionary, wall_time_file, indent=4)

In [143]:
# column names of the (single-row) wall times dataframe
wt_df_columns = ['model_id', 'execution', 'wall_time']

In [146]:
# single-row dataframe holding the wall time of this model execution
wall_time_row = [model_id, execution, wt_dictionary['wall_time']]
wall_times_df = pd.DataFrame([wall_time_row], columns=wt_df_columns)

In [149]:
# path of the pickle file for this wall time, under database/training_wall_times/
pickle_path = '{}/{}/{}/{}_{:02d}.pkl'.format(
    PROJECT_ROOT, 'database', 'training_wall_times', model_id, execution)

In [151]:
# write the wall times dataframe to its pickle file and report it
wall_times_df.to_pickle(pickle_path)
print('Persisted Pandas dataframe for wall time of {}_{:02d}'.format(
    model_id, execution))

Persisted Pandas dataframe for wall time of DMSLSTM_TPU_000_00
