In [4]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

In [5]:
# Feature engineering: from time series to a supervised-learning database
#### current dataset is hourly-based

In [6]:
# define each experiment with a description (header) and components (body) dictionary
# NOT REQUIRED: THE DICTIONARY IS ALREADY PERSISTED AS JSON
# experiment = {
#     'description': { # useful to build the time series from the appropriate Parquet files
#         'equipment': 'CPE04015',
#         # 'variables': ['Van', 'Vbn', 'Vcn', 'ia', 'ib', 'ic', 'kw', 'kvar','desbI'],
#         'resolution': 'hourly',
#         'variables': ['desbI'],
#         'start': '2017-04-01 00:00:00',
#         'end': '2018-02-28 23:00:00'
#     },
#     'components': {
#        'hourly': {
#             'm': 24,
#             'tau': 1,
#             'no_targets': 1
#         },
#         'daily': {
#             'm': 7,
#             'tau': 24,
#             'no_targets': 1            
#         },
#         'weekly': {
#             'm': 4,
#             'tau': 168,
#             'no_targets': 1
#         }
#     }
#     
# }

In [7]:
import json

In [8]:
# experiment must be defined as a json file, then load it from disk
with open('/home/jupyter/gcp/cbidmltsf/data/json/experiment.json', 'r') as filename:
    experiment = json.load(filename)

In [9]:
experiment['description']

{'end': '2018-02-28 23:00:00',
 'equipment': 'CPE04015',
 'resolution': 'hourly',
 'start': '2017-04-01 00:00:00',
 'variables': ['desbI']}

In [10]:
experiment['components']

{'daily': {'m': 7, 'no_targets': 1, 'tau': 24},
 'hourly': {'m': 24, 'no_targets': 1, 'tau': 1},
 'weekly': {'m': 4, 'no_targets': 1, 'tau': 168}}

In [11]:
# tracing a working path
from os import listdir
from os.path import dirname, join
import sys

# pathlib library to manage paths as Posix path objects instead of as strings
from pathlib import Path

# Parquet support for Pandas
import pyarrow

# scale datasets to improve LSTM network performance
from sklearn.preprocessing import MinMaxScaler

# persistence for scaler
from sklearn.externals import joblib



In [12]:
# Anaconda Interactive Visualization
from bokeh.plotting import figure, show
from bokeh.plotting import output_file, save
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource
from bokeh.models import Span
output_notebook()

In [13]:
import os

In [14]:
# prepare the path to raw data
RAW_DATA_DIR = '{0}/gcp/cbidmltsf/data/raw'.format(os.getenv('HOME'))

In [15]:
RAW_DATA_DIR

'/home/jupyter/gcp/cbidmltsf/data/raw'

In [16]:
data_path = '{0}/{1}/{2}.parquet'.format(RAW_DATA_DIR,
                                         experiment['description']['resolution'],
                                         experiment['description']['equipment'])

In [17]:
data_path

'/home/jupyter/gcp/cbidmltsf/data/raw/hourly/CPE04015.parquet'

In [18]:
path_to_equipment_dir = Path(data_path)
path_to_equipment_dir

PosixPath('/home/jupyter/gcp/cbidmltsf/data/raw/hourly/CPE04015.parquet')

In [19]:
# a list with the initial Posix paths to Parquet files containing the required time series
# just to make sure they are there
[parquet_file for parquet_file in path_to_equipment_dir.rglob('*.parquet')][:5]

[PosixPath('/home/jupyter/gcp/cbidmltsf/data/raw/hourly/CPE04015.parquet/part-00081-4794e0f0-278e-45db-938b-4f17b96b00e7-c000.snappy.parquet'),
 PosixPath('/home/jupyter/gcp/cbidmltsf/data/raw/hourly/CPE04015.parquet/part-00038-4794e0f0-278e-45db-938b-4f17b96b00e7-c000.snappy.parquet'),
 PosixPath('/home/jupyter/gcp/cbidmltsf/data/raw/hourly/CPE04015.parquet/part-00106-4794e0f0-278e-45db-938b-4f17b96b00e7-c000.snappy.parquet'),
 PosixPath('/home/jupyter/gcp/cbidmltsf/data/raw/hourly/CPE04015.parquet/part-00024-4794e0f0-278e-45db-938b-4f17b96b00e7-c000.snappy.parquet'),
 PosixPath('/home/jupyter/gcp/cbidmltsf/data/raw/hourly/CPE04015.parquet/part-00000-4794e0f0-278e-45db-938b-4f17b96b00e7-c000.snappy.parquet')]

In [20]:
# how to read a set of parquet files by pointing at the folder containing them?
# start passing equipment id, time resolution, and a list of variables as criteria
# add date interval later...
def load_equipment(path, equipment, resolution, variables):
    # build the path to data
    data_path = '{0}/{1}/{2}.parquet'.format(path,
                                             resolution,
                                             equipment)
    # get the corresponding Posix path
    path_to_equipment_dir = Path(data_path)

    # rglob: recursively explores all subdirectories in main directory
    # and collect the selected resolution parquet files
    # timestamp must be always extracted!!!
    columns = ['timestamp'] + variables
    data = pd.concat(
        pd.read_parquet(parquet_file, columns=columns,
                        engine='pyarrow')
        for parquet_file in path_to_equipment_dir.rglob('*.parquet')
    )

    # need to change timestamp column from string to datetime
    data['timestamp'] = pd.to_datetime(data['timestamp'])
    # sort values by timestamp
    data = data.sort_values(by=['timestamp'])
    # re-index data on timestamp column
    data = data.set_index('timestamp')
    return data

In [21]:
# build equipment dataframe
equipment_df = load_equipment(RAW_DATA_DIR,
                              experiment['description']['equipment'],
                              experiment['description']['resolution'],
                              experiment['description']['variables'])

In [22]:
# the resulting Pandas dataframe contains the complete dataset
equipment_df

Unnamed: 0_level_0,desbI
timestamp,Unnamed: 1_level_1
2016-01-01 00:00:00,0.009983
2016-01-01 01:00:00,0.010467
2016-01-01 02:00:00,0.010833
2016-01-01 03:00:00,0.010783
2016-01-01 04:00:00,0.010267
...,...
2018-08-09 06:00:00,7.282050
2018-08-09 07:00:00,6.001550
2018-08-09 08:00:00,5.907033
2018-08-09 09:00:00,6.906083


In [23]:
# manually get a data set with adequate behavior for the experiment
# no regime change, no visible outlliers
# ToDo: use number of row or index value
# ci_df = equipment_df.iloc[8783:]['desbI']
# uncomment the following line to get a dataframe based on the time interval in the experiment dictionary

# date interval is defined and persisted as string, then change it to datetime
start = pd.to_datetime(experiment['description']['start'])
end = pd.to_datetime(experiment['description']['end'])
ci_df = equipment_df.loc[start:end]
# uncomment the following line to get a series
# ci_df = equipment_df.loc['2017-01-01 00:00:00':]['desbI']
ci_df

Unnamed: 0_level_0,desbI
timestamp,Unnamed: 1_level_1
2017-04-01 00:00:00,2.112517
2017-04-01 01:00:00,1.454733
2017-04-01 02:00:00,1.950333
2017-04-01 03:00:00,2.809450
2017-04-01 04:00:00,2.415650
...,...
2018-02-28 19:00:00,9.289533
2018-02-28 20:00:00,9.598533
2018-02-28 21:00:00,6.592383
2018-02-28 22:00:00,6.462183


In [24]:
# change the column data source to the final data set
source = ColumnDataSource(data=ci_df)

In [25]:
# how to get the required timestamp format from string?
str(source.data['timestamp'][0])[:19]

'2017-04-01T00:00:00'

In [26]:
# plot the desbI time series
desbI = figure(title='Current imbalance for {0}, from {1} to {2}'.format(experiment['description']['equipment'],
                                                                        str(source.data['timestamp'][0])[:19],
                                                                        str(source.data['timestamp'][-1])[:19]),
               background_fill_color='#E8DDCB',
               plot_width=800, plot_height=400, x_axis_type='datetime')

desbI.line(source.data['timestamp'], source.data['desbI'], line_color="#4444D9",
           line_width=1, alpha=0.7, legend="desbI")

desbI_threshold = Span(location=15.0, dimension='width', line_color='red', line_width=1)

desbI.add_layout(desbI_threshold)

desbI.legend.location = "center_right"
desbI.legend.background_fill_color = "darkgrey"
desbI.xaxis.axis_label = 'Timestamp'
desbI.yaxis.axis_label = 'Current Imbalance [%]'

show(desbI)



In [27]:
# produce a second Pandas dataframe for desbI scaled values
ci_df_scaled = ci_df.copy()

In [28]:
# Scale the time series to [0, 1]
scaler = MinMaxScaler(feature_range=(0, 1))
ci_df_scaled['desbI'] = scaler.fit_transform(np.array(ci_df['desbI']).reshape(-1, 1))
# ci_df_scaled

In [29]:
# ToDo: refine scaler name, add variables in variables list

In [30]:
# persist fitted scaler to disk
scaler_path = '../data/scalers/LSTM_scaler_desbI_{0}.save'.format(experiment['description']['equipment'])
joblib.dump(scaler, scaler_path)

['../data/scalers/LSTM_scaler_desbI_CPE04015.save']

In [31]:
# a ColumnDataSource object for plots over scaled dataframe
source_scaled = ColumnDataSource(data=ci_df_scaled)

In [32]:
# plot scaled active power, directly from NumPy array
ci_scaled_fig = figure(title='Scaled current imbalance for {0}, from {1} to {2}'.format(experiment['description']['equipment'],
                                                                                        str(source.data['timestamp'][0])[:19],
                                                                                        str(source.data['timestamp'][-1])[:19]),
                       background_fill_color='#E8DDCB',
                       plot_width=800, plot_height=400, x_axis_type='datetime')

ci_scaled_fig.line(source_scaled.data['timestamp'], source_scaled.data['desbI'], line_color='blue',
                      line_width=1, alpha=0.7, legend='Scaled Current Imbalance')

ci_scaled_fig.legend.location = "top_right"
ci_scaled_fig.legend.background_fill_color = "darkgrey"

ci_scaled_fig.xaxis.axis_label = 'Timestamp'
ci_scaled_fig.yaxis.axis_label = 'Scaled Current Imbalance [unit]'

show(ci_scaled_fig)



In [33]:
# process the time series to get the full experiment dictionary
# ToDo: should the experiment dictionary store information about train, eval, predict stages?

In [34]:
# get some values from the selected time series
ci_df_scaled['desbI'][:5]

timestamp
2017-04-01 00:00:00    0.131466
2017-04-01 01:00:00    0.090531
2017-04-01 02:00:00    0.121373
2017-04-01 03:00:00    0.174837
2017-04-01 04:00:00    0.150330
Name: desbI, dtype: float64

In [35]:
# timestamp is required for correct alignment!
# ToDo: add timestamp to either features or labels
# also arrange dataset split criteria for training stages into the experiment dictionary

In [36]:
# modify makeXyMulti to work only on lists until TFRecords are created
# avoid NumPy arrays

In [37]:
test_time_series = ci_df_scaled['desbI'][0:5]
test_time_series

timestamp
2017-04-01 00:00:00    0.131466
2017-04-01 01:00:00    0.090531
2017-04-01 02:00:00    0.121373
2017-04-01 03:00:00    0.174837
2017-04-01 04:00:00    0.150330
Name: desbI, dtype: float64

In [38]:
# split data set into train/validation/test at time series level
# to avoid data overlapping between train/eval/test SLDBs

In [39]:
# global data set is splitted into train, validation, and test data sets, initially at (0.7, 0.2, 0.1)
split = np.array([0.7, 0.9, 1.0])

In [40]:
# get indexes of ci_scaled time series for train, validation, and test thresholds
train_eval_limit = np.int(ci_df_scaled.count()*split[0])
eval_test_limit = np.int(ci_df_scaled.count()*split[1])
train_eval_limit, eval_test_limit

(5610, 7213)

In [41]:
# keep the train, eval, and test segments of time series in a dictionary

In [42]:
ts = dict()

In [43]:
# get the time series portion for train set
ts['train'] = ci_df_scaled[:train_eval_limit]
print('{0} lectures in train time series from {1} to {2}'.format(ts['train'].count()[0],
                                                                 ts['train'].index[0],
                                                                 ts['train'].index[-1]))

5610 lectures in train time series from 2017-04-01 00:00:00 to 2017-11-20 18:00:00


In [44]:
ts['train']

Unnamed: 0_level_0,desbI
timestamp,Unnamed: 1_level_1
2017-04-01 00:00:00,0.131466
2017-04-01 01:00:00,0.090531
2017-04-01 02:00:00,0.121373
2017-04-01 03:00:00,0.174837
2017-04-01 04:00:00,0.150330
...,...
2017-11-20 14:00:00,0.563751
2017-11-20 15:00:00,0.660668
2017-11-20 16:00:00,0.545700
2017-11-20 17:00:00,0.544180


In [45]:
# get the time series portion for eval set
ts['eval'] = ci_df_scaled[train_eval_limit:eval_test_limit]
print('{0} lectures in eval time series from {1} to {2}'.format(ts['eval'].count()[0],
                                                                ts['eval'].index[0],
                                                                ts['eval'].index[-1]))

1603 lectures in eval time series from 2017-11-20 19:00:00 to 2018-01-26 13:00:00


In [46]:
# get the time series portion for test set
ts['test'] = ci_df_scaled[eval_test_limit:]
print('{0} lectures in test time series from {1} to {2}'.format(ts['test'].count()[0],
                                                                ts['test'].index[0],
                                                                ts['test'].index[-1]))

802 lectures in test time series from 2018-01-26 14:00:00 to 2018-02-28 23:00:00


In [47]:
### Define training, validation, and test (based on timestamp interval)

In [48]:
def make_features_labels_timestamps(time_series, m, tau, n_targets):
    """
    Input: 
           time series: original time series
           m: embedding dimension
           tau: lag
    Output: 
           features: list of features
           labels: list of labels
           timestamps: list of target (label) timestamps
    """
    # a couple of empty lists to store feature vectors and targets
    features = []
    labels = []
    timestamps = []
    sequence = range(m*tau, time_series.shape[0] - n_targets + 1)
    for i in sequence:
        features.append(list(time_series.iloc[(i - m*tau):i:tau]))
        labels.append(list(time_series.iloc[i:(i + n_targets):1]))
        timestamps.append(list(time_series.index[i:(i + n_targets):1]))
      
    # uncomment the following line to return NumPy arrays instead of Python lists
    # features, labels, timestamps = np.array(features), np.array(labels), np.array(timestamps)
        
    return features, labels, timestamps

In [None]:
def make_features_labels_timestamps_ohvs(time_series, m, tau, n_targets):
    """
    Input: 
           time series: original time series
           m: embedding dimension
           tau: lag
    Output: 
           features: list of features
           labels: list of labels
           timestamps: list of target (label) timestamps
           oh_wds: list of one-hot vectors describing weekday of timestamp
           oh_dhs: list of one-hot vectors describing hour of the day of timestamp
    """
    # a couple of empty lists to store feature vectors and targets
    features = []
    labels = []
    timestamps = []
    oh_wds = []
    oh_dhs = []
    sequence = range(m*tau, time_series.shape[0] - n_targets + 1)
    for i in sequence:
        features.append(list(time_series.iloc[(i - m*tau):i:tau]))
        labels.append(list(time_series.iloc[i:(i + n_targets):1]))
        timestamps.append(list(time_series.index[i:(i + n_targets):1]))
      
    # uncomment the following line to return NumPy arrays instead of Python lists
    # features, labels, timestamps = np.array(features), np.array(labels), np.array(timestamps)
        
    return features, labels, timestamps

In [61]:
# create a dictionary to temporarily store the following SLDBs:
# train (hourly, daily, weekly, targets, timestamps)
# eval (hourly, daily, weekly, targets, timestamps)
# test (hourly, daily, weekly, targets, timestamps)

In [63]:
sldb = {
    'train': {
        'hourly': {},
        'daily': {},
        'weekly': {}
    },
    'eval': {
        'hourly': {},
        'daily': {},
        'weekly': {}
    },
    'test': {
        'hourly': {},
        'daily': {},
        'weekly': {}
    }
}

In [64]:
stages = ['train', 'eval', 'test']

In [66]:
variable = experiment['description']['variables'][0]
variable

'desbI'

In [67]:
ts['train'][variable]

timestamp
2017-04-01 00:00:00    0.131466
2017-04-01 01:00:00    0.090531
2017-04-01 02:00:00    0.121373
2017-04-01 03:00:00    0.174837
2017-04-01 04:00:00    0.150330
                         ...   
2017-11-20 14:00:00    0.563751
2017-11-20 15:00:00    0.660668
2017-11-20 16:00:00    0.545700
2017-11-20 17:00:00    0.544180
2017-11-20 18:00:00    0.510575
Name: desbI, Length: 5610, dtype: float64

In [68]:
# BUILD ALL THE SLDBs!!!
for stage in stages:
    for component_key in experiment['components'].keys():
        sldb[stage][component_key]['features'], sldb[stage][component_key]['labels'], sldb[stage][component_key]['timestamps'] = make_features_labels_timestamps(
            ts[stage][variable],
            experiment['components'][component_key]['m'],
            experiment['components'][component_key]['tau'],
            experiment['components'][component_key]['no_targets'])

In [76]:
intervals = ['hourly', 'daily', 'weekly']

In [77]:
items = ['features', 'labels', 'timestamps']

In [88]:
# define the structure of a useful dictionary
experiment['stats'] = {
    'train': {
        'hourly': {},
        'daily': {},
        'weekly': {},
    },
    'eval': {
        'hourly': {},
        'daily': {},
        'weekly': {}        
    },
    'test': {
        'hourly': {},
        'daily': {},
        'weekly': {}
    }
}

In [102]:
# report statistics on stages and intervals of SLDBs
# and persist them to the experiment['stats'] level
for stage in stages:
    for interval in intervals:
        for item in items:
            # fill the values in the stats sub-dictionary
            experiment['stats'][stage][interval][item] = len(sldb[stage][interval][item])
            experiment['stats'][stage][interval]['from'] = sldb[stage][interval]['timestamps'][0][0]
            experiment['stats'][stage][interval]['to'] = sldb[stage][interval]['timestamps'][-1][0]           
            # and log them
            print('{0} {3} / {1} / {2} from {4} to {5}'.format(len(sldb[stage][interval][item]),
                                                               stage,
                                                               interval,
                                                               item,
                                                               sldb[stage][interval]['timestamps'][0][0],
                                                               sldb[stage][interval]['timestamps'][-1][0]))

5586 features / train / hourly from 2017-04-02 00:00:00 to 2017-11-20 18:00:00
5586 labels / train / hourly from 2017-04-02 00:00:00 to 2017-11-20 18:00:00
5586 timestamps / train / hourly from 2017-04-02 00:00:00 to 2017-11-20 18:00:00
5442 features / train / daily from 2017-04-08 01:00:00 to 2017-11-20 18:00:00
5442 labels / train / daily from 2017-04-08 01:00:00 to 2017-11-20 18:00:00
5442 timestamps / train / daily from 2017-04-08 01:00:00 to 2017-11-20 18:00:00
4938 features / train / weekly from 2017-04-29 01:00:00 to 2017-11-20 18:00:00
4938 labels / train / weekly from 2017-04-29 01:00:00 to 2017-11-20 18:00:00
4938 timestamps / train / weekly from 2017-04-29 01:00:00 to 2017-11-20 18:00:00
1579 features / eval / hourly from 2017-11-21 19:00:00 to 2018-01-26 13:00:00
1579 labels / eval / hourly from 2017-11-21 19:00:00 to 2018-01-26 13:00:00
1579 timestamps / eval / hourly from 2017-11-21 19:00:00 to 2018-01-26 13:00:00
1435 features / eval / daily from 2017-11-27 19:00:00 to 2

In [119]:
# in train set, verify resolution-based datasets end in the same timestamp
experiment['stats']['train']['hourly']['to'] == experiment['stats']['train']['daily']['to'] == experiment['stats']['train']['weekly']['to']

True

In [120]:
# in eval set, verify resolution-based datasets end in the same timestamp
experiment['stats']['eval']['hourly']['to'] == experiment['stats']['eval']['daily']['to'] == experiment['stats']['eval']['weekly']['to']

True

In [121]:
# in test set, verify resolution-based datasets end in the same timestamp
experiment['stats']['test']['hourly']['to'] == experiment['stats']['test']['daily']['to'] == experiment['stats']['test']['weekly']['to']

True

In [132]:
# get the number of rows in the smaller resolution-based dataset, for alignment purposes
for stage in stages:
    experiment['stats'][stage]['trimmed_to'] = min([experiment['stats'][stage][interval]['features'] for interval in intervals])

In [143]:
experiment['stats']['train']['trimmed_to'], experiment['stats']['eval']['trimmed_to'], experiment['stats']['test']['trimmed_to']

(4938, 931, 130)

In [136]:
stages, intervals, items

(['train', 'eval', 'test'],
 ['hourly', 'daily', 'weekly'],
 ['features', 'labels', 'timestamps'])

In [138]:
# a new dictionary with final, trimmed data
tfrecords = {
    'train': {}, # hourly, daily, weekly, labels, timestamps to be added
    'eval': {}, # hourly, daily, weekly, labels, timestamps to be added
    'test': {}, # hourly, daily, weekly, labels, timestamps to be added
}

In [151]:
for stage in stages:
    # isolate this value, just for readability
    value_to_trim = experiment['stats'][stage]['trimmed_to']
    tfrecords[stage]['hourly'] = sldb[stage]['hourly']['features'][-value_to_trim:]
    tfrecords[stage]['daily'] = sldb[stage]['daily']['features'][-value_to_trim:]
    tfrecords[stage]['weekly'] = sldb[stage]['weekly']['features'][-value_to_trim:]
    # labels and timestamps can be acquired from any resolution-based, temporary dataset (hourly, daily, weekly)
    tfrecords[stage]['labels'] = sldb[stage]['hourly']['labels'][-value_to_trim:]
    tfrecords[stage]['timestamps'] = sldb[stage]['hourly']['timestamps'][-value_to_trim:]

In [66]:
# build and serialize examples

In [163]:
# get the first row (list) of features at hourly resolution to test the functions already built
list_of_features = tfrecords['train']['hourly'][0]

In [165]:
list_of_features

[0.23604218072742778,
 0.21965654258679004,
 0.20584419019288747,
 0.20633893315991397,
 0.23554847495739714,
 0.3069854180474351,
 0.3476736189981299,
 0.3996288909148803,
 0.5794383993146202,
 0.3218795669080224,
 0.28891537198551653,
 0.40595164380165893,
 0.33396706069780546,
 0.3624308578552633,
 0.6737797636642926,
 0.5972719644614821,
 0.3740391666329578,
 0.2004912164972405,
 0.3055240074802647,
 0.35227877365975996,
 0.5433854317384356,
 0.40607714463815825,
 0.3015619149560695,
 0.24656869303843745]

In [166]:
# starting this point, transfer code to JupyterLab to test it on TensorFlow 1.15 Enterprise

In [167]:
import tensorflow as tf

In [168]:
# modified to run on TF 2.X
sess = tf.compat.v1.InteractiveSession()

In [169]:
def _float_feature_from_list_of_values(list_of_values):
  """Returns a float_list from a list of floats / doubles."""
  return tf.train.Feature(float_list=tf.train.FloatList(value=list_of_values))

In [170]:
# the first features row in training set (24 lectures) as a float list
hourly = _float_feature_from_list_of_values(list_of_features)
hourly

float_list {
  value: 0.23604218661785126
  value: 0.2196565419435501
  value: 0.2058441936969757
  value: 0.20633892714977264
  value: 0.2355484813451767
  value: 0.30698540806770325
  value: 0.347673624753952
  value: 0.3996288776397705
  value: 0.5794383883476257
  value: 0.3218795657157898
  value: 0.28891536593437195
  value: 0.4059516489505768
  value: 0.33396705985069275
  value: 0.3624308705329895
  value: 0.6737797856330872
  value: 0.5972719788551331
  value: 0.3740391731262207
  value: 0.20049121975898743
  value: 0.3055240213871002
  value: 0.35227876901626587
  value: 0.5433854460716248
  value: 0.40607714653015137
  value: 0.3015619218349457
  value: 0.2465686947107315
}

In [171]:
# get the first row (list) of labels at hourly resolution to test the functions already built
list_of_labels = tfrecords['train']['labels'][0]

In [172]:
# the first label row in training set as a float list
target = _float_feature_from_list_of_values(list_of_labels)
target

float_list {
  value: 0.18826058506965637
}

In [193]:
# get the first row (list) of timestamps at hourly resolution to test the functions already built
timestamp_demo = tfrecords['train']['timestamps'][0][0]
timestamp_demo

Timestamp('2017-04-29 01:00:00')

In [188]:
# one-hot encode here and persist two extra features (_int or _byte?) to tfrecord files

In [189]:
# recover the one-hot encoding process

In [190]:
# from estimator/create_datasets_as_numpy.ipynb

In [191]:
# build timestamp-related features vectors:
# X_ts_weekday: one-hot encoder for weekday identification
# X_ts_hour: one-hot encoder for hour identification
# over train, val, and test target timestamps

In [195]:
def one_hot_encode(timestamp):
    fv_weekday = np.zeros(7)
    fv_hour = np.zeros(24)
    fv_weekday[timestamp.weekday()] = 1.
    fv_hour[timestamp.hour] = 1.
    return list(fv_weekday), list(fv_hour)

In [196]:
# 2017-04-29 was Saturday!
one_hot_encode(timestamp_demo)

([0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
 [0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0])

In [None]:
# the first timestamp row in training set as a string list
target = _float_feature_from_list_of_values(list_of_labels)
target

In [175]:
# create a TFRecord example for just the first row in the dataset
example = tf.train.Example(
    # features within the example
    features=tf.train.Features(
        # individual feature definition
        feature={'hourly': _float_feature_from_list_of_values(tfrecords['train']['hourly'][0]),
                 'daily': _float_feature_from_list_of_values(tfrecords['train']['daily'][0]),
                 'weekly': _float_feature_from_list_of_values(tfrecords['train']['weekly'][0]),
                 'target': _float_feature_from_list_of_values(tfrecords['train']['labels'][0]),
                 # timestamps to be incorporated later as _byte_feature???
                 # or passed as one-hot encoded vectors: hour of the day (0-23) / day of the week (0-6)
                 # 'timestamp': _float_feature_from_list_of_values(tfrecords['train']['timestamps'][0])
                }
    )
)

print(example)

features {
  feature {
    key: "daily"
    value {
      float_list {
        value: 0.21911409497261047
        value: 0.20516172051429749
        value: 0.23340561985969543
        value: 0.28076404333114624
        value: 0.2613321542739868
        value: 0.2176402360200882
        value: 0.23604218661785126
      }
    }
  }
  feature {
    key: "hourly"
    value {
      float_list {
        value: 0.23604218661785126
        value: 0.2196565419435501
        value: 0.2058441936969757
        value: 0.20633892714977264
        value: 0.2355484813451767
        value: 0.30698540806770325
        value: 0.347673624753952
        value: 0.3996288776397705
        value: 0.5794383883476257
        value: 0.3218795657157898
        value: 0.28891536593437195
        value: 0.4059516489505768
        value: 0.33396705985069275
        value: 0.3624308705329895
        value: 0.6737797856330872
        value: 0.5972719788551331
        value: 0.3740391731262207
        value: 0.20049121

In [179]:
# serialize the single example
example.SerializeToString()

b'\n\xcf\x01\nn\n\x06hourly\x12d\x12b\n`\x0b\xb5q>\xa5\xed`>\xd2\xc8R>\x83JS>\x9f3q>1-\x9d>G\x02\xb2>(\x9c\xcc>\x13V\x14?f\xcd\xa4>\xb7\xec\x93>\xe5\xd8\xcf>\xbb\xfd\xaa>\x8a\x90\xb9>\xd5|,?\xd1\xe6\x18?\x10\x82\xbf>\x92MM>\xa5m\x9c>\xe2]\xb4>O\x1b\x0b?X\xe9\xcf>Sf\x9a>\x81||>\n)\n\x05daily\x12 \x12\x1e\n\x1cr_`>\xea\x15R>\xe2\x01o>N\xc0\x8f>T\xcd\x85>\x15\xdd^>\x0b\xb5q>\n\x1e\n\x06weekly\x12\x14\x12\x12\n\x10\xf6\x9e\x06>rg\x97>^p\xa8>r_`>\n\x12\n\x06target\x12\x08\x12\x06\n\x04b\xc7@>'

In [180]:
# write the basic `tf.Example` to a file
with tf.io.TFRecordWriter('../data/tfrecord/first_row.tfrecord') as writer:
    serialized_example = example.SerializeToString()
    writer.write(serialized_example)

In [181]:
_float_feature_from_list_of_values# now read the dataset from TFRecord file using non-deprecated methods from tf.data module
first_row_train_raw_dataset = tf.compat.v1.data.TFRecordDataset('../data/tfrecord/first_row.tfrecord')
first_row_train_raw_dataset

<TFRecordDatasetV1 shapes: (), types: tf.string>

In [182]:
# first_row_train_raw_dataset.output_types
tf.compat.v1.data.get_output_types(first_row_train_raw_dataset)

tf.string

In [183]:
# first_row_train_raw_dataset.output_shapes
tf.compat.v1.data.get_output_shapes(first_row_train_raw_dataset)

TensorShape([])

In [184]:
# can I access to the binary, string-based, raw dataset using a one-shot iterator?
iterator = first_row_train_raw_dataset.make_one_shot_iterator()
next_element = iterator.get_next()

# there is only one row in the raw dataset
value = sess.run(next_element)
print(value)

Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_one_shot_iterator(dataset)`.
b'\n\xcf\x01\nn\n\x06hourly\x12d\x12b\n`\x0b\xb5q>\xa5\xed`>\xd2\xc8R>\x83JS>\x9f3q>1-\x9d>G\x02\xb2>(\x9c\xcc>\x13V\x14?f\xcd\xa4>\xb7\xec\x93>\xe5\xd8\xcf>\xbb\xfd\xaa>\x8a\x90\xb9>\xd5|,?\xd1\xe6\x18?\x10\x82\xbf>\x92MM>\xa5m\x9c>\xe2]\xb4>O\x1b\x0b?X\xe9\xcf>Sf\x9a>\x81||>\n)\n\x05daily\x12 \x12\x1e\n\x1cr_`>\xea\x15R>\xe2\x01o>N\xc0\x8f>T\xcd\x85>\x15\xdd^>\x0b\xb5q>\n\x1e\n\x06weekly\x12\x14\x12\x12\n\x10\xf6\x9e\x06>rg\x97>^p\xa8>r_`>\n\x12\n\x06target\x12\x08\x12\x06\n\x04b\xc7@>'


In [84]:
# persist examples as tfrecord files

In [86]:
# based on lstm_estimator_11_build_tfrecord_files.ipynb

In [185]:
experiment

{'components': {'daily': {'m': 7, 'no_targets': 1, 'tau': 24},
  'hourly': {'m': 24, 'no_targets': 1, 'tau': 1},
  'weekly': {'m': 4, 'no_targets': 1, 'tau': 168}},
 'description': {'end': '2018-02-28 23:00:00',
  'equipment': 'CPE04015',
  'resolution': 'hourly',
  'start': '2017-04-01 00:00:00',
  'variables': ['desbI']},
 'stats': {'eval': {'daily': {'features': 1435,
    'from': Timestamp('2017-11-27 19:00:00'),
    'labels': 1435,
    'timestamps': 1435,
    'to': Timestamp('2018-01-26 13:00:00')},
   'hourly': {'features': 1579,
    'from': Timestamp('2017-11-21 19:00:00'),
    'labels': 1579,
    'timestamps': 1579,
    'to': Timestamp('2018-01-26 13:00:00')},
   'to_trim': 931,
   'trimmed_to': 931,
   'weekly': {'features': 931,
    'from': Timestamp('2017-12-18 19:00:00'),
    'labels': 931,
    'timestamps': 931,
    'to': Timestamp('2018-01-26 13:00:00')}},
  'test': {'daily': {'features': 634,
    'from': Timestamp('2018-02-02 14:00:00'),
    'labels': 634,
    'timestamps

In [186]:
# persist the final, compact dictionary to JSON
with open('experiment.json', 'w') as filename:
    json.dump(experiment, filename, indent=4)

TypeError: Timestamp('2017-11-20 18:00:00') is not JSON serializable

In [None]:
# upload a folder with tfrecord and json files to Google Storage