In [1]:
# prototype for new version of predict_and_plot.py
# now called make_predictions.py

In [2]:
import argparse
# replace it later with Abseil Flags

In [3]:
import os

In [4]:
import json

In [5]:
import numpy as np

In [6]:
import tensorflow.compat.v1 as tf
from tensorflow.contrib.predictor import from_saved_model

In [7]:
from pandas import to_datetime

In [9]:
from sklearn.externals import joblib

In [10]:
# required to get the last saved model in model_dir
from google.cloud import storage

In [11]:
# Cloud Storage handles shared by the cells below: the bucket name, the API
# client, and the bucket object used to list, download and upload blobs.
_BUCKET_NAME = 'cbidmltsf'
storage_client = storage.Client()
bucket = storage_client.get_bucket(_BUCKET_NAME)

In [12]:
# from dplstm.data import _parse_dataset_function
# define here the _parse_dataset_function, just for prototyping

In [13]:
# Parsing specification for the serialized tf.Example rows: every numeric
# feature is a variable-length float32 list, the timestamp is a string list.
read_features = {
    key: tf.io.VarLenFeature(dtype=tf.float32)
    for key in ('hourly', 'daily', 'weekly', 'target', 'oh_wd', 'oh_dh')
}
read_features['timestamp'] = tf.io.VarLenFeature(dtype=tf.string)

In [14]:
def _parse_dataset_function(example_proto, objective_shapes, parse_timestamp):
    """Parse one serialized tf.Example into a (features, target) pair.

    Args:
        example_proto: serialized tf.Example, parsed with the module-level
            ``read_features`` specification.
        objective_shapes: dictionary mapping every key of ``read_features`` to
            the shape its values are reshaped to.
        parse_timestamp: when truthy, the reshaped 'timestamp' tensor is added
            to the feature dictionary. Keep it off for training: the original
            notes state that TPUEstimator does not support string types.

    Returns:
        A tuple (feature_dict, target[0]). The dictionary holds 'hourly',
        'daily', 'weekly', 'oh_wd' and 'oh_dh' (plus 'timestamp' on request);
        the second item is the first row of the reshaped target.
    """
    parsed = tf.io.parse_single_example(example_proto, read_features)

    # all features are read as VarLen (sparse) values, so take .values and
    # reshape each one to its objective shape; the timestamp is always
    # reshaped, even when it is not returned
    dense = {}
    for key in ('hourly', 'daily', 'weekly', 'target', 'oh_wd', 'oh_dh', 'timestamp'):
        dense[key] = tf.reshape(parsed[key].values, objective_shapes[key])

    # the parsed dataset must have the structure ({features}, target)
    feature_dict = {key: dense[key] for key in ('hourly', 'daily', 'weekly', 'oh_wd', 'oh_dh')}

    # ToDo: code timestamps into features, as numbers,
    #  so they can also be passed to training
    if parse_timestamp:
        feature_dict['timestamp'] = dense['timestamp']

    return feature_dict, dense['target'][0]

In [15]:
# pass the model dir as flag... later

In [16]:
# the model identifier
MODEL_ID = 'DMSLSTM_TPU_70_00'

In [17]:
model_dir = 'gs://cbidmltsf/models/{}'.format(MODEL_ID)

In [19]:
model_dir

'gs://cbidmltsf/models/DMSLSTM_TPU_70_00'

In [20]:
# pass the data dir as flag... later

In [21]:
data_dir='gs://cbidmltsf/sldbs/CPE04015_desbI_H_2017-04-01_00:00:00_2018-02-28_23:00:00_H008001001_D008024001_W004168001'

In [22]:
# keep model_dir as used in the training script,
# but strip the 'gs://cbidmltsf/' prefix to build the saved model path,
# because the bucket name was already passed to the Storage API client

In [23]:
# blob-name prefix of the exported models: model_dir without the bucket URI prefix
model_dir_in_bucket = model_dir.replace('gs://cbidmltsf/', '')
saved_model_path = '{}/export/exporter'.format(model_dir_in_bucket)

In [24]:
saved_model_path

'models/DMSLSTM_TPU_70_00/export/exporter'

In [25]:
def list_files(bucket_folder, gcs_bucket=None):
    """List the names of the blobs stored under a folder of a GCS bucket.

    Args:
        bucket_folder: blob-name prefix (the folder path inside the bucket)
            passed to ``list_blobs``.
        gcs_bucket: optional bucket-like object exposing
            ``list_blobs(prefix=...)``. When omitted, the module-level
            ``bucket`` is used, exactly as before.

    Returns:
        A list with the names of the listed blobs whose name contains a '.'.
    """
    # fall back to the notebook-level bucket only when none is injected, so the
    # function can be reused (and tested) without relying on global state
    source_bucket = bucket if gcs_bucket is None else gcs_bucket
    blobs = source_bucket.list_blobs(prefix=bucket_folder)
    # names without a '.' are skipped (presumably folder placeholders)
    return [blob.name for blob in blobs if '.' in blob.name]

In [26]:
all_files = list_files(saved_model_path)

In [27]:
all_files

['models/DMSLSTM_TPU_70_00/export/exporter/1601003254/saved_model.pb',
 'models/DMSLSTM_TPU_70_00/export/exporter/1601003254/variables/variables.data-00000-of-00001',
 'models/DMSLSTM_TPU_70_00/export/exporter/1601003254/variables/variables.index']

In [28]:
# isolate the names of the subdirectories in export/exporter (one for each training process)
prefix_length = len(saved_model_path)

In [29]:
prefix_length

40

In [30]:
# Every blob name looks like '<saved_model_path>/<export id>/...': take the first
# path component after the prefix instead of slicing a fixed 10 characters.
saved_model_ids = {file[prefix_length + 1:].split('/', 1)[0] for file in all_files}
# keep only numeric ids, so any non-numeric sibling folder cannot win the comparison
saved_model_ids = {model_id for model_id in saved_model_ids if model_id.isdigit()}
if not saved_model_ids:
    raise ValueError('no saved model found under {}'.format(saved_model_path))
# compare numerically: a plain string sort is only right for ids of equal length
latest_saved_model_id = max(saved_model_ids, key=int)

In [31]:
latest_saved_model_id

'1601003254'

In [32]:
# full gs:// URI of the latest export: bucket name, export folder, export id
latest_dir_parts = (_BUCKET_NAME, saved_model_path, latest_saved_model_id)
_LATEST_SAVED_MODEL_DIR = 'gs://{0}/{1}/{2}'.format(*latest_dir_parts)

In [33]:
_LATEST_SAVED_MODEL_DIR

'gs://cbidmltsf/models/DMSLSTM_TPU_70_00/export/exporter/1601003254'

In [34]:
# load the latest exported SavedModel as a callable prediction function
predict_fn = from_saved_model(export_dir=_LATEST_SAVED_MODEL_DIR)

Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.loader.load or tf.compat.v1.saved_model.load. There will be a new function for importing SavedModels in Tensorflow 2.0.
INFO:tensorflow:Restoring parameters from gs://cbidmltsf/models/DMSLSTM_TPU_70_00/export/exporter/1601003254/variables/variables


In [35]:
# pass dataset as a flag, later...
dataset = 'test.tfrecord'

In [36]:
dataset_path = '{}/{}'.format(data_dir, dataset)

In [37]:
tfrecord_dataset = tf.data.TFRecordDataset(dataset_path)

In [38]:
mapped_dataset = tfrecord_dataset.map(lambda row: row)

In [39]:
# ToDo: find a way to perform two operations on map iterator:
#  first, get the prediction with predict_fn, and
#  second, get the target and the timestamp with a custom parsing
#  (in order to avoid double iteration)
iterator = tf.data.make_one_shot_iterator(mapped_dataset)

In [40]:
next_element = iterator.get_next()

In [41]:
# run the serialized examples one by one through the prediction function
# ToDo: refine this loop, it is extremely slow
#  (one session run plus one predictor call per example)
predictions_list = []
with tf.Session() as sess:
    try:
        while True:
            example = sess.run(next_element)
            predictions_list.append(predict_fn({'example_bytes': example}))
    except tf.errors.OutOfRangeError:
        # the one-shot iterator is exhausted: every example was consumed.
        # Any other error (failed read, failed prediction, Ctrl-C) now
        # propagates instead of silently truncating the predictions.
        pass

In [42]:
# each prediction stores its forecast as a nested array: keep the scalar at [0][0]
predictions = list(map(lambda result: result['forecast'][0][0], predictions_list))

In [43]:
# pass predictions to an array
predictions_array = np.asarray(predictions)

In [44]:
predictions_array

array([0.5441618 , 0.51966685, 0.4989715 , 0.30466068, 0.43263328,
       0.485386  , 0.63434696, 0.5707552 , 0.36966532, 0.39843956,
       0.23514184, 0.35869274, 0.299948  , 0.26620835, 0.28182584,
       0.27639094, 0.3767143 , 0.35652915, 0.3824992 , 0.26447755,
       0.46342632, 0.6080722 , 0.6079163 , 0.59445155, 0.54838866,
       0.57579786, 0.48318502, 0.3260118 , 0.17182802, 0.65886503,
       0.47180066, 0.6183016 , 0.5201679 , 0.36135674, 0.3050614 ,
       0.37105584, 0.25564414, 0.28394815, 0.25962022, 0.2327499 ,
       0.31193796, 0.36735025, 0.4288793 , 0.32161957, 0.43811166,
       0.24301349, 0.13896659, 0.39845747, 0.65929943, 0.56995124,
       0.4806725 , 0.39882636, 0.5090357 , 0.53697914, 0.3980365 ,
       0.537365  , 0.40750906, 0.41606393, 0.3672334 , 0.2501844 ,
       0.27875525, 0.31571215, 0.3384695 , 0.23450202, 0.39851338,
       0.56957823, 0.35865355, 0.49883685, 0.51750875, 0.63353014,
       0.65299577, 0.47594863, 0.5254238 , 0.45741257, 0.53655

In [45]:
# is it possible to load the scaler directly from Google Storage?
# unfortunately, not...
# scaler = joblib.load('scaler.save')

In [46]:
# blob name of the scaler: the data directory without the bucket URI, plus the file name
data_dir_in_bucket = data_dir.replace('gs://cbidmltsf/', '')
scaler_path = '{}/scaler.save'.format(data_dir_in_bucket)

In [47]:
blob = bucket.blob(scaler_path)

In [48]:
# temporarily download the scaler to VM's block storage, use the original name
# create the local scratch directory first: on a fresh VM it does not exist
# and the download would fail when opening the destination file
if not os.path.isdir('temp'):
    os.makedirs('temp')
blob.download_to_filename('temp/scaler.save')

In [49]:
scaler = joblib.load('temp/scaler.save')



In [50]:
# inverse-scale predictions
predictions = scaler.inverse_transform(predictions_array.reshape(-1, 1))

In [51]:
# drop only the trailing axis introduced by reshape(-1, 1) before inverse scaling;
# squeezing every size-1 axis would turn a single prediction into a 0-d array,
# and tolist() would then return a bare float instead of a list
# (assumes inverse_transform keeps the (n, 1) shape of its input)
predictions = np.squeeze(predictions, axis=1)

In [52]:
# pass final prediction values to list for json serialization
prediction_values_list = predictions.tolist()

In [53]:
# embedding dimensions of SLDB can be retrieved from sldb/sldb.json
# or from the stats/sldb_parameters.json (both in Google Storage)
# use the json file in stats/

In [60]:
# blob name of the SLDB parameters file: the model's folder under stats/ plus the file name
stats_dir_in_bucket = model_dir.replace('gs://cbidmltsf/models', 'stats')
sldb_json_path = '{}/sldb_parameters.json'.format(stats_dir_in_bucket)

In [61]:
sldb_json_path

'stats/DMSLSTM_TPU_70_00/sldb_parameters.json'

In [62]:
blob = bucket.blob(sldb_json_path)

In [63]:
# temporarily download the sldb json to VM's block storage, use the original name
# create the local scratch directory first: on a fresh VM it does not exist
# and the download would fail when opening the destination file
if not os.path.isdir('temp'):
    os.makedirs('temp')
blob.download_to_filename('temp/sldb_parameters.json')

In [64]:
# read the downloaded json file back and decode it into the sldb parameters dictionary
with open('temp/sldb_parameters.json', 'r') as filename:
    sldb_parameters = json.loads(filename.read())

In [67]:
# objective shapes used to reshape the parsed tensors, one entry per feature
embedding = sldb_parameters['embedding']
# the three time resolutions take their length from the SLDB embedding
_EXTRACTING_OBJECTIVE_SHAPES = {
    resolution: [embedding[resolution], 1]
    for resolution in ('hourly', 'daily', 'weekly')
}
# ToDo: un-wire 'target' and 'timestamp'! The number of targets is included in
#  the hourly, daily, and weekly features (take any one of them), e.g.
#  [sldb['components']['hourly']['no_targets'], 1]
_EXTRACTING_OBJECTIVE_SHAPES.update({
    'target': [1, 1],
    'oh_wd': [7, 1],  # Monday to Sunday
    'oh_dh': [24, 1],  # midnight to 23:00
    'timestamp': [1, 1],
})

In [69]:
# the tfrecord dataset was already opened for the predictions;
# map it again, this time parsing every row, to recover targets and timestamps
def _parse_with_timestamp(serialized_row):
    """Parse a serialized row with the extracting shapes, timestamp included."""
    return _parse_dataset_function(serialized_row, _EXTRACTING_OBJECTIVE_SHAPES, True)


parsed_dataset = tfrecord_dataset.map(_parse_with_timestamp)

In [70]:
iterator = tf.data.make_one_shot_iterator(parsed_dataset)
next_element = iterator.get_next()

In [71]:
# ToDo: merge the operations in this iterator in the previous use of the same iterator???
# _parse_dataset_function returns (feature_dict, target[0]), so for every element:
#   parsed_example[0] is the feature dictionary (timestamp included), and
#   parsed_example[1] is the first row of the target
timestamps_list = []
targets_list = []
with tf.Session() as sess:
    try:
        while True:
            parsed_example = sess.run(next_element)
            # get the timestamp, then the scalar value, then the string value,
            # then convert it to datetime
            # ToDo: isolate the string timestamp from parsed_example first,
            #  then apply decoding and conversion to datetime.
            #  This way a string timestamp list is obtained for use in json results file
            timestamps_list.append(to_datetime(parsed_example[0]['timestamp'][0][0].decode()))
            # get the target, then the scalar value
            targets_list.append(parsed_example[1][0])
    except tf.errors.OutOfRangeError:
        # the one-shot iterator is exhausted: every example was consumed.
        # Any other error (bad record, decoding failure, Ctrl-C) now propagates
        # instead of silently leaving the two lists incomplete.
        pass

In [73]:
# NumPy views of the collected values
targets = np.asarray(targets_list)
timestamps = np.asarray(timestamps_list)
# string form of every timestamp, to better persist the values
string_timestamps_list = list(map(str, timestamps_list))

In [74]:
# inverse-scale the targets with the same scaler used for the predictions
targets = scaler.inverse_transform(targets.reshape(-1, 1))
# drop only the trailing axis added by reshape(-1, 1): squeezing every size-1
# axis would turn a single target into a 0-d array and tolist() into a bare float
# (assumes inverse_transform keeps the (n, 1) shape of its input)
targets = np.squeeze(targets, axis=1)
# NumPy array to list for json serialization
target_values_list = targets.tolist()

In [75]:
# gather timestamps, predictions and targets in one dictionary for serialization
prediction_results = dict(
    string_timestamps=string_timestamps_list,
    predictions=prediction_values_list,
    targets=target_values_list,
)

In [76]:
# ToDo: replace this dictionary with a data frame? (Pandas?, csv?)

In [84]:
# local json file for the results, named after the dataset (dots become underscores)
dataset_tag = dataset.replace('.', '_')
output_file_name = 'temp/prediction_results_on_{}.json'.format(dataset_tag)

In [85]:
output_file_name

'temp/prediction_results_on_test_tfrecord.json'

In [87]:
# serialize the results dictionary first, then write the text to the local file
with open(output_file_name, 'w') as outfile:
    outfile.write(json.dumps(prediction_results, indent=4))

In [88]:
# now upload the resulting json file in /temp to gs://cbidmltsf/stats/MODEL_ID

In [91]:
# build a path to stats directory in Google Storage
# use the model dir as a base
stats_path = model_dir.replace('gs://cbidmltsf/models/', 'stats/')

In [92]:
stats_path

'stats/DMSLSTM_TPU_70_00'

In [99]:
# blob name of the results: the stats folder plus the local file name without 'temp/'
results_file_name = output_file_name.replace('temp/', '')
results_blob_id = '{}/{}'.format(stats_path, results_file_name)

In [100]:
results_blob_id

'stats/DMSLSTM_TPU_70_00/prediction_results_on_test_tfrecord.json'

In [101]:
# upload the prediction results json file from the VM's block storage
# to the bucket, under the blob name held in results_blob_id
results_blob = bucket.blob(results_blob_id)
results_blob.upload_from_filename(output_file_name)

In [102]:
# this prototype is now ready for coding the final script, right?