In [1]:
# Learning the TensorFlow `tf.data` API

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [3]:
# Create dummy data: five feature pairs and one label per pair
array_a = np.arange(10).reshape(5, 2)  # rows [0 1], [2 3], ..., [8 9]
array_b = -np.arange(1, 6)             # -1, -2, ..., -5
print(array_a, array_b, sep='\n')

[[0 1]
 [2 3]
 [4 5]
 [6 7]
 [8 9]]
[-1 -2 -3 -4 -5]


In [4]:
# Define tensorflow dataset: every element pairs one row of array_a
# with the entry of array_b at the same position
dataset = tf.data.Dataset.from_tensor_slices((array_a, array_b))
for row_a, value_b in dataset.as_numpy_iterator():
    print(row_a, ' , ', value_b)

[0 1]  ,  -1
[2 3]  ,  -2
[4 5]  ,  -3
[6 7]  ,  -4
[8 9]  ,  -5


In [5]:
# Preprocessing: Apply Python functions
def tf_some_python_function(input_a, input_b):
    """Apply ``np.log1p`` to ``input_a`` through ``tf.py_function``.

    Intended to be passed to ``Dataset.map``.

    Args:
        input_a: Numeric tensor (or tensor-like value) to transform.
        input_b: Passed through unchanged.

    Returns:
        Tuple ``(log1p(input_a) as a float32 tensor, input_b)``.
    """
    input_a = tf.convert_to_tensor(input_a)
    static_shape = input_a.shape
    [input_a,] = tf.py_function(np.log1p, [input_a], [tf.float32])
    # Tensors returned by py_function carry no static shape. log1p is
    # element-wise, so the shape of the input still applies and is restored
    # here for downstream ops that need it.
    input_a.set_shape(static_shape)
    return input_a, input_b

mapped_dataset = dataset.map(tf_some_python_function)

# Show every transformed element next to its untouched second component
for mapped_a, value_b in mapped_dataset.as_numpy_iterator():
    print(mapped_a, ' , ', value_b)

[0.        0.6931472]  ,  -1
[1.0986123 1.3862944]  ,  -2
[1.609438  1.7917595]  ,  -3
[1.9459101 2.0794415]  ,  -4
[2.1972246 2.3025851]  ,  -5


In [6]:
# Batches
batch_size = 2
take_n_batch = 5


def _print_first_batches(batched, limit):
    """Print the index and each component of the first `limit` batches."""
    for index, components in enumerate(batched.take(limit)):
        print('batch', index)
        for component in components:
            print(component.numpy())


print('\nBatched data, keep partial:')
batched_dataset = dataset.batch(batch_size, drop_remainder=False)
_print_first_batches(batched_dataset, take_n_batch)

print('\nBatched data, keep partial, repeat:')
batched_dataset = dataset.batch(batch_size, drop_remainder=False).repeat()
_print_first_batches(batched_dataset, take_n_batch)


Batched data, keep partial:
batch 0
[[0 1]
 [2 3]]
[-1 -2]
batch 1
[[4 5]
 [6 7]]
[-3 -4]
batch 2
[[8 9]]
[-5]

Batched data, keep partial, repeat:
batch 0
[[0 1]
 [2 3]]
[-1 -2]
batch 1
[[4 5]
 [6 7]]
[-3 -4]
batch 2
[[8 9]]
[-5]
batch 3
[[0 1]
 [2 3]]
[-1 -2]
batch 4
[[4 5]
 [6 7]]
[-3 -4]


In [7]:
# Shuffle
# Keep the order shuffle -> batch -> repeat: shuffling first mixes individual
# examples rather than whole batches, and repeating last means one pass over
# the data finishes (partial batch included) before the next one starts.
shuffle_seed = 42  # fixed seed so the shuffled order is repeatable on re-run
shuffled = dataset.shuffle(buffer_size=100, seed=shuffle_seed)
shuffled = shuffled.batch(batch_size)
shuffled = shuffled.repeat()
for i, batch in enumerate(shuffled.take(take_n_batch)):
    print('batch', i)
    for arr in batch:
        print(arr.numpy())

batch 0
[[8 9]
 [2 3]]
[-5 -2]
batch 1
[[0 1]
 [6 7]]
[-1 -4]
batch 2
[[4 5]]
[-3]
batch 3
[[0 1]
 [4 5]]
[-1 -3]
batch 4
[[6 7]
 [8 9]]
[-4 -5]


In [8]:
# Timeseries forecasting

In [9]:
# Simple rolling window: consecutive, non-overlapping windows of 10 steps
timeseries = tf.data.Dataset.range(100000)

batches = timeseries.batch(10, drop_remainder=True)

for window in batches.take(5).as_numpy_iterator():
    print(window)

[0 1 2 3 4 5 6 7 8 9]
[10 11 12 13 14 15 16 17 18 19]
[20 21 22 23 24 25 26 27 28 29]
[30 31 32 33 34 35 36 37 38 39]
[40 41 42 43 44 45 46 47 48 49]


In [10]:
# Rolling window, window = feature data + data to be predicted
def label_next_5_steps(batch):
    """Split one window into ``(features, labels)``.

    The labels are the last 5 steps of ``batch``; the features are
    everything that comes before them.
    """
    horizon = 5
    inputs = batch[:-horizon]   # all steps except the last `horizon`
    targets = batch[-horizon:]  # the last `horizon` steps
    return inputs, targets

predict_5_steps = batches.map(label_next_5_steps)

# Show the first three (features => labels) pairs
for inputs, targets in predict_5_steps.take(3).as_numpy_iterator():
    print(inputs, " => ", targets)


[0 1 2 3 4]  =>  [5 6 7 8 9]
[10 11 12 13 14]  =>  [15 16 17 18 19]
[20 21 22 23 24]  =>  [25 26 27 28 29]


In [11]:
# Overlapping rolling window, window = feature data + data to be predicted.
# The labels of one example are the leading steps of the next feature window.
feature_length = 3
label_length = 2
# Labels are sliced out of a feature-sized batch, so they cannot be longer.
assert 0 < label_length <= feature_length, "label_length must be in (0, feature_length]"

features = timeseries.batch(feature_length, drop_remainder=True)
# drop_remainder=True here as well: 100000 is not a multiple of 3, so a
# trailing partial batch would otherwise yield a final label shorter than
# label_length.
labels = (
    timeseries.batch(feature_length, drop_remainder=True)
    .skip(1)
    .map(lambda next_window: next_window[:label_length])
)

# NOTE: the variable name is kept from the previous cell, although
# label_length (not 5) steps are predicted here.
predict_5_steps = tf.data.Dataset.zip((features, labels))

# Distinct loop names so the `features` / `labels` datasets are not overwritten.
for feature_window, label_window in predict_5_steps.take(3):
    print(feature_window.numpy(), " => ", label_window.numpy())

[0 1 2]  =>  [3 4]
[3 4 5]  =>  [6 7]
[6 7 8]  =>  [ 9 10]
