In [1]:
import tensorflow as tf
import numpy as np

In [2]:
# load the training inputs and targets from the saved NumPy arrays
y_train = np.load('../data/arrays/y_train.npy')
Xadj_train = np.load('../data/arrays/Xadj_train.npy')

In [3]:
Xadj_train.shape, y_train.shape

((8995, 24, 1), (8995, 168))

In [4]:
# y_train stores 168 candidate targets per row (one per hour of the week);
# column n = 0 is the first-step-ahead target used by the first model
n = 0
ytarget_train = y_train[:, n]

In [5]:
# the input_fn's expect the labels as a column vector with one row per example
ytarget_train = np.reshape(ytarget_train, (len(ytarget_train), 1))

In [6]:
Xadj_train.shape, ytarget_train.shape

((8995, 24, 1), (8995, 1))

In [7]:
# ToDo: verify if labels can be declared as a Feature, as done with the feature vector
# later, now just run with the previous, working code
# def train_input_fn():
#     dataset = (
#         tf.data.Dataset.from_tensor_slices(
#             (
#                 {
#                     'adjacent_hours': tf.cast(Xadj_train, tf.float32),
#                     'labels': tf.cast(ytarget_train, tf.float32) 
#                 }
#             )
#         )
#     )
#     dataset = dataset.shuffle(buffer_size=9000).repeat(count=1).batch(batch_size=32)    
#     return dataset

In [8]:
def train_input_fn(batch_size=32, shuffle_buffer_size=None):
    """Build the shuffled, batched training dataset from the in-memory arrays.

    Args:
        batch_size: number of examples per batch (defaults to 32, the value
            that was previously hard-coded).
        shuffle_buffer_size: size of the shuffle buffer. When None (default)
            the number of rows in `Xadj_train` is used, so the whole dataset
            is shuffled regardless of how many rows it holds. The previous
            hard-coded value (9000) only covered datasets up to that size.

    Returns:
        A `tf.data.Dataset` yielding `({'adjacent_hours': features}, labels)`
        batches, both cast to `tf.float32`, for a single pass over the data.
    """
    if shuffle_buffer_size is None:
        # a buffer at least as large as the dataset gives a full shuffle
        shuffle_buffer_size = Xadj_train.shape[0]

    features = {'adjacent_hours': tf.cast(Xadj_train, tf.float32)}
    labels = tf.cast(ytarget_train, tf.float32)

    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    # repeat(count=1) keeps the pipeline to a single epoch
    dataset = (
        dataset
        .shuffle(buffer_size=shuffle_buffer_size)
        .repeat(count=1)
        .batch(batch_size=batch_size)
    )
    return dataset

In [9]:
train_input_fn().map(lambda features, labels: features['adjacent_hours'])

<MapDataset shapes: (?, 24, 1), types: tf.float32>

In [10]:
# assign the dataset to a variable, then try to save it in TFRecord file format
dataset = train_input_fn()

In [11]:
# dataset is ready to be saved into a TFRecord file
dataset

<BatchDataset shapes: ({adjacent_hours: (?, 24, 1)}, (?, 1)), types: ({adjacent_hours: tf.float32}, tf.float32)>

In [12]:
def _float_feature(value):
  """Returns a float_list from a float / double.

  Besides plain Python / NumPy scalars, a one-element NumPy array (such as a
  single row of a column-vector target array) is accepted; it is converted to
  a scalar explicitly instead of relying on implicit array-to-scalar
  conversion.

  Args:
    value: a scalar number or an array-like holding exactly one element.

  Returns:
    A `tf.train.Feature` whose `float_list` contains the single value.

  Raises:
    ValueError: if `value` holds more than one element.
  """
  # ndarray.item() only succeeds for exactly one element, keeping the
  # "single float" contract explicit
  scalar = float(np.asarray(value).item())
  return tf.train.Feature(float_list=tf.train.FloatList(value=[scalar]))

In [13]:
# tf.train.FloatList(value=...) expects a flat list holding the float values
# check how to turn a NumPy array into such a list

In [14]:
Xadj_train[0]

array([[0.58097166],
       [0.62753036],
       [0.66801619],
       [0.65384615],
       [0.67611336],
       [0.74089069],
       [0.68623482],
       [0.57692308],
       [0.55263158],
       [0.53036437],
       [0.58704453],
       [0.51214575],
       [0.46153846],
       [0.55668016],
       [0.62550607],
       [0.64979757],
       [0.63562753],
       [0.63562753],
       [0.50202429],
       [0.37854251],
       [0.36032389],
       [0.46153846],
       [0.47773279],
       [0.53238866]])

In [15]:
list_of_values = [lecture[0] for lecture in Xadj_train[0]]
list_of_values

[0.5809716599190284,
 0.6275303643724696,
 0.6680161943319839,
 0.6538461538461539,
 0.6761133603238868,
 0.7408906882591094,
 0.6862348178137652,
 0.576923076923077,
 0.5526315789473685,
 0.5303643724696356,
 0.5870445344129555,
 0.5121457489878541,
 0.4615384615384615,
 0.5566801619433198,
 0.625506072874494,
 0.6497975708502025,
 0.6356275303643725,
 0.6356275303643726,
 0.5020242914979758,
 0.3785425101214574,
 0.36032388663967607,
 0.4615384615384616,
 0.4777327935222673,
 0.5323886639676113]

In [16]:
def _float_feature_from_list_of_values(list_of_values):
  """Wraps a list of floats / doubles into a single float_list feature."""
  float_list = tf.train.FloatList(value=list_of_values)
  return tf.train.Feature(float_list=float_list)

In [17]:
adjacent_hours = _float_feature_from_list_of_values(list_of_values)
adjacent_hours

float_list {
  value: 0.5809716582298279
  value: 0.6275303363800049
  value: 0.6680161952972412
  value: 0.6538461446762085
  value: 0.6761133670806885
  value: 0.7408906817436218
  value: 0.6862348318099976
  value: 0.5769230723381042
  value: 0.5526315569877625
  value: 0.5303643941879272
  value: 0.5870445370674133
  value: 0.5121457576751709
  value: 0.4615384638309479
  value: 0.5566801428794861
  value: 0.6255060434341431
  value: 0.6497975587844849
  value: 0.6356275081634521
  value: 0.6356275081634521
  value: 0.5020242929458618
  value: 0.3785425126552582
  value: 0.36032387614250183
  value: 0.4615384638309479
  value: 0.4777328073978424
  value: 0.5323886871337891
}

In [18]:
_float_feature(ytarget_train[0])

float_list {
  value: 0.6336032152175903
}

In [19]:
# build a tf.train.Example holding only the first row of the training set:
# first describe each individual feature, then wrap them into the example
first_row_feature_map = {
    'adjacent_hours': _float_feature_from_list_of_values([hour[0] for hour in Xadj_train[0]]),
    'target': _float_feature(ytarget_train[0]),
}
example = tf.train.Example(features=tf.train.Features(feature=first_row_feature_map))

print(example)

features {
  feature {
    key: "adjacent_hours"
    value {
      float_list {
        value: 0.5809716582298279
        value: 0.6275303363800049
        value: 0.6680161952972412
        value: 0.6538461446762085
        value: 0.6761133670806885
        value: 0.7408906817436218
        value: 0.6862348318099976
        value: 0.5769230723381042
        value: 0.5526315569877625
        value: 0.5303643941879272
        value: 0.5870445370674133
        value: 0.5121457576751709
        value: 0.4615384638309479
        value: 0.5566801428794861
        value: 0.6255060434341431
        value: 0.6497975587844849
        value: 0.6356275081634521
        value: 0.6356275081634521
        value: 0.5020242929458618
        value: 0.3785425126552582
        value: 0.36032387614250183
        value: 0.4615384638309479
        value: 0.4777328073978424
        value: 0.5323886871337891
      }
    }
  }
  feature {
    key: "target"
    value {
      float_list {
        value: 0.6336032

In [20]:
example.SerializeToString()

b'\n\x8c\x01\nv\n\x0eadjacent_hours\x12d\x12b\n`\x8f\xba\x14?\xd4\xa5 ?\x1c\x03+?vb\'?\xc4\x15-?\x03\xab=?\x16\xad/?;\xb1\x13?Cy\r?\xf6\xc5\x07?\x8dH\x16?\xfc\x1b\x03?\xc5N\xec>\x97\x82\x0e?*! ?"Y&?|\xb8"?|\xb8"?\xaa\x84\x00?S\xd0\xc1>_|\xb8>\xc5N\xec>e\x99\xf4>\xa0J\x08?\n\x12\n\x06target\x12\x08\x12\x06\n\x04\xd23"?'

In [21]:
# persist the single-row `tf.Example` (first row of the adjacent hours training set)
# as a one-record TFRecord file
with tf.io.TFRecordWriter('../data/tfrecord/first_row.tfrecord') as record_writer:
    serialized_example = example.SerializeToString()
    record_writer.write(serialized_example)

In [22]:
# so far, so good... now try to read and recover the first row of the dataset
# use deprecated, but functional, code found in
# http://medium.com/mostly-ai/tensorflow-records-what-they-are-and-how-to-use-them-c46bc4bbb564

In [23]:
# read and print data:
# ToDo: the following line might need to be changed when running a script
# NOTE: the session is left open on purpose, later cells keep using `sess`
sess = tf.InteractiveSession()

# read TFRecord file
reader = tf.TFRecordReader()
filename_queue = tf.train.string_input_producer(['../data/tfrecord/first_row.tfrecord'])

_, serialized_example = reader.read(filename_queue)

# define features
read_features = {
    'adjacent_hours': tf.VarLenFeature(dtype=tf.float32),
    'target': tf.FixedLenFeature([], dtype=tf.float32)}

# extract features from serialized data
read_data = tf.parse_single_example(serialized=serialized_example,
                                    features=read_features)

# important! many tf.train functions use tf.train.QueueRunner,
# so we need to start it before we read; the coordinator lets us stop the
# background queue threads once the record has been read
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)

try:
    # evaluate every feature in ONE run call: each separate `tensor.eval()`
    # re-runs the reader and dequeues another record, so the printed features
    # could otherwise come from different rows
    read_values = sess.run(read_data)
finally:
    coord.request_stop()
    coord.join(threads)

# Print features
for name, feature_value in read_values.items():
    print('{}: {}'.format(name, feature_value))

Instructions for updating:
Queue-based input pipelines have been replaced by `tf.data`. Use `tf.data.TFRecordDataset`.
Instructions for updating:
Queue-based input pipelines have been replaced by `tf.data`. Use `tf.data.Dataset.from_tensor_slices(string_tensor).shuffle(tf.shape(input_tensor, out_type=tf.int64)[0]).repeat(num_epochs)`. If `shuffle=False`, omit the `.shuffle(...)`.
Instructions for updating:
Queue-based input pipelines have been replaced by `tf.data`. Use `tf.data.Dataset.from_tensor_slices(input_tensor).shuffle(tf.shape(input_tensor, out_type=tf.int64)[0]).repeat(num_epochs)`. If `shuffle=False`, omit the `.shuffle(...)`.
Instructions for updating:
Queue-based input pipelines have been replaced by `tf.data`. Use `tf.data.Dataset.from_tensors(tensor).repeat(num_epochs)`.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To constru

In [36]:
# re-open the TFRecord file through the tf.data module (non-deprecated API)
first_row_train_raw_dataset = tf.data.TFRecordDataset(
    filenames='../data/tfrecord/first_row.tfrecord'
)
first_row_train_raw_dataset

<TFRecordDataset shapes: (), types: tf.string>

In [37]:
first_row_train_raw_dataset.output_types

tf.string

In [38]:
first_row_train_raw_dataset.output_shapes

TensorShape([])

In [39]:
# can the raw (binary, string-typed) dataset be read through a one-shot iterator?
iterator = first_row_train_raw_dataset.make_one_shot_iterator()
next_element = iterator.get_next()

# the raw dataset holds only one row, so a single run call fetches it
value = sess.run(next_element)
print(value)

b'\n\x8c\x01\nv\n\x0eadjacent_hours\x12d\x12b\n`\x8f\xba\x14?\xd4\xa5 ?\x1c\x03+?vb\'?\xc4\x15-?\x03\xab=?\x16\xad/?;\xb1\x13?Cy\r?\xf6\xc5\x07?\x8dH\x16?\xfc\x1b\x03?\xc5N\xec>\x97\x82\x0e?*! ?"Y&?|\xb8"?|\xb8"?\xaa\x84\x00?S\xd0\xc1>_|\xb8>\xc5N\xec>e\x99\xf4>\xa0J\x08?\n\x12\n\x06target\x12\x08\x12\x06\n\x04\xd23"?'


In [40]:
# ToDo: put more than one row in the tf.Dataset before serializing it

In [41]:
# it is required to parse the recovered dataset, from string to original types and shapes

In [42]:
# parsing operator without deprecated tf.io methods has failed, review more tutorials tomorrow...

In [43]:
# http://towardsdatascience.com/how-to-use-dataset-in-tensorflow-c758ef9e4428

In [44]:
read_features

{'adjacent_hours': VarLenFeature(dtype=tf.float32),
 'target': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None)}

In [45]:
def _parse_dataset_function(example_proto):
  """Parse one serialized tf.Example using the `read_features` specification.

  Returns the dictionary of parsed features produced by
  `tf.io.parse_single_example`.
  """
  parsed_features = tf.io.parse_single_example(serialized=example_proto,
                                               features=read_features)
  return parsed_features

parsed_dataset = first_row_train_raw_dataset.map(_parse_dataset_function)
parsed_dataset

<MapDataset shapes: {adjacent_hours: (?,), target: ()}, types: {adjacent_hours: tf.float32, target: tf.float32}>

In [46]:
iterator = parsed_dataset.make_one_shot_iterator()
next_element = iterator.get_next()

# there is only one row in the raw dataset
value = sess.run(next_element)
print(value['adjacent_hours'].values, value['adjacent_hours'].values.shape)

[0.58097166 0.62753034 0.6680162  0.65384614 0.67611337 0.7408907
 0.68623483 0.5769231  0.55263156 0.5303644  0.58704454 0.51214576
 0.46153846 0.55668014 0.62550604 0.64979756 0.6356275  0.6356275
 0.5020243  0.3785425  0.36032388 0.46153846 0.4777328  0.5323887 ] (24,)


In [62]:
# TEST WITH DIFFERENT PARSING FUNCTIONS
def _parse_dataset_function(example_proto):
    """Parse one serialized tf.Example into a `(features, label)` pair.

    The record is decoded with the `read_features` specification. The
    'adjacent_hours' values are reshaped into a column vector and the
    'target' scalar into a one-element vector.
    """
    parsed = tf.io.parse_single_example(example_proto, read_features)
    # 'adjacent_hours' is declared as a VarLenFeature, so it comes back as a
    # SparseTensor; only its values are kept
    features = {
        'adjacent_hours': tf.reshape(parsed['adjacent_hours'].values, [-1, 1])
    }
    label = tf.reshape(parsed['target'], [1,])
    return (features, label)
    

parsed_dataset = first_row_train_raw_dataset.map(_parse_dataset_function)
parsed_dataset

<MapDataset shapes: ({adjacent_hours: (?, 1)}, (1,)), types: ({adjacent_hours: tf.float32}, tf.float32)>

In [63]:
iterator = parsed_dataset.make_one_shot_iterator()
next_element = iterator.get_next()

# there is only one row in the raw dataset
value = sess.run(next_element)
print(value)

({'adjacent_hours': array([[0.58097166],
       [0.62753034],
       [0.6680162 ],
       [0.65384614],
       [0.67611337],
       [0.7408907 ],
       [0.68623483],
       [0.5769231 ],
       [0.55263156],
       [0.5303644 ],
       [0.58704454],
       [0.51214576],
       [0.46153846],
       [0.55668014],
       [0.62550604],
       [0.64979756],
       [0.6356275 ],
       [0.6356275 ],
       [0.5020243 ],
       [0.3785425 ],
       [0.36032388],
       [0.46153846],
       [0.4777328 ],
       [0.5323887 ]], dtype=float32)}, array([0.6336032], dtype=float32))


In [49]:
parsed_dataset = first_row_train_raw_dataset.map(lambda example_proto: tf.io.parse_single_example(example_proto,
                                                                                 read_features))
parsed_dataset.output_shapes, parsed_dataset.output_types

({'adjacent_hours': TensorShape([Dimension(None)]), 'target': TensorShape([])},
 {'adjacent_hours': tf.float32, 'target': tf.float32})

In [50]:
iterator = parsed_dataset.make_one_shot_iterator()
next_element = iterator.get_next()

# there is only one row in the raw dataset
value = sess.run(next_element)
print(value['adjacent_hours'].values, value['adjacent_hours'].values.shape)

[0.58097166 0.62753034 0.6680162  0.65384614 0.67611337 0.7408907
 0.68623483 0.5769231  0.55263156 0.5303644  0.58704454 0.51214576
 0.46153846 0.55668014 0.62550604 0.64979756 0.6356275  0.6356275
 0.5020243  0.3785425  0.36032388 0.46153846 0.4777328  0.5323887 ] (24,)


In [51]:
parsed_dataset = first_row_train_raw_dataset.map(lambda example_proto: tf.io.parse_single_example(example_proto,
                                                                                                  read_features)['adjacent_hours'].values)

parsed_dataset.output_shapes, parsed_dataset.output_types

(TensorShape([Dimension(None)]), tf.float32)

In [52]:
iterator = parsed_dataset.make_one_shot_iterator()
next_element = iterator.get_next()

# there is only one row in the raw dataset
value = sess.run(next_element)
print(value, value.shape)
# print(value.reshape(-1, 1))

[0.58097166 0.62753034 0.6680162  0.65384614 0.67611337 0.7408907
 0.68623483 0.5769231  0.55263156 0.5303644  0.58704454 0.51214576
 0.46153846 0.55668014 0.62550604 0.64979756 0.6356275  0.6356275
 0.5020243  0.3785425  0.36032388 0.46153846 0.4777328  0.5323887 ] (24,)


In [63]:
parsed_array = first_row_train_raw_dataset.map(lambda example_proto: tf.io.parse_single_example(example_proto,
                                                                                                  read_features)['adjacent_hours'].values)


In [64]:
iterator = parsed_array.make_one_shot_iterator()
next_element = iterator.get_next()

# there is only one row in the raw dataset
value = sess.run(next_element)
print(value, value.shape)
# print(value.reshape(-1, 1))

[0.58097166 0.62753034 0.6680162  0.65384614 0.67611337 0.7408907
 0.68623483 0.5769231  0.55263156 0.5303644  0.58704454 0.51214576
 0.46153846 0.55668014 0.62550604 0.64979756 0.6356275  0.6356275
 0.5020243  0.3785425  0.36032388 0.46153846 0.4777328  0.5323887 ] (24,)


In [40]:
parsed_dataset.map(lambda features: features['target'])

TypeError: must be str, not int

In [42]:
iterator = parsed_dataset.make_one_shot_iterator()
next_element = iterator.get_next()

# there is only one row in the raw dataset
value = sess.run(next_element)
print(value['adjacent_hours'].values, value['adjacent_hours'].values.shape)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [43]:
parsed_array = value['adjacent_hours'].values

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [44]:
parsed_array.reshape(-1, 1)

NameError: name 'parsed_array' is not defined

In [37]:
parsed_dataset.output_types

{'adjacent_hours': tf.float32, 'target': tf.float32}

In [38]:
parsed_dataset.output_shapes

{'adjacent_hours': TensorShape([Dimension(None)]), 'target': TensorShape([])}

In [41]:
# so far, so good, now compare the previous result with the output of ...from_tensor_slices