In [3]:
import tensorflow as tf
WINDOW_SIZE = 10


def _split_features_label(window):
    """Split one flattened window into (all but the last value, the last value)."""
    return window[:-1], window[-1:]


# Slide a window of WINDOW_SIZE + 1 consecutive integers over 0..99, moving
# one step at a time; incomplete windows at the end are dropped.
dataset = tf.data.Dataset.range(100).window(
    WINDOW_SIZE + 1, shift=1, drop_remainder=True
)

# Every window is itself a small dataset: batching it into a single element
# flattens it into one tensor, which is then split into features (x) and label (y).
dataset_flat = dataset.flat_map(
    lambda window: window.batch(WINDOW_SIZE + 1)
).map(_split_features_label)

# The features of one example form a plain rank-1 tensor, shape=(10,):
# there is no explicit feature axis yet.
next(iter(dataset_flat))[0]

<tf.Tensor: shape=(10,), dtype=int64, numpy=array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int64)>

In [116]:
# Both tf.reshape() and tf.expand_dims() change a tensor's shape. Here the ten
# values are regrouped into rows of two; -1 lets TensorFlow infer the number
# of rows.
first_features = next(iter(dataset_flat))[0]
tf.reshape(first_features, [-1, 2])

<tf.Tensor: shape=(5, 2), dtype=int64, numpy=
array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7],
       [8, 9]], dtype=int64)>

In [76]:
# For univariate data (one feature per timestep) tf.expand_dims() appends the
# missing feature axis: (10,) -> (10, 1). Once batched, this becomes the 3D
# (batch, timesteps, features) input suitable for tf.keras.layers.LSTM().
first_features = next(iter(dataset_flat))[0]
tf.expand_dims(first_features, axis=-1)

<tf.Tensor: shape=(10, 1), dtype=int64, numpy=
array([[0],
       [1],
       [2],
       [3],
       [4],
       [5],
       [6],
       [7],
       [8],
       [9]], dtype=int64)>

## Univariate data

In [108]:
# Apply the same reshaping systematically by building it into the model:
# a Lambda layer appends the feature axis, so the LSTM receives
# (batch, SEQ_LENGTH, 1) input.
SEQ_LENGTH = next(iter(dataset_flat))[0].shape[0]
model = tf.keras.Sequential([
    tf.keras.layers.Lambda(lambda x: tf.expand_dims(x, axis=-1), input_shape=[SEQ_LENGTH]),
    tf.keras.layers.LSTM(8),
    tf.keras.layers.Dense(1)
])
# This is a regression model (single linear output, MAE loss), so track MAE:
# classification accuracy is meaningless for continuous targets.
model.compile(loss="mae", optimizer="adam", metrics=["mae"])

In [109]:
# Print the layer output shapes and parameter counts of the current `model`.
# For the univariate model, LSTM(8) on 1 feature has
# 4 * (8 * (1 + 8) + 8) = 320 parameters and Dense(1) has 8 + 1 = 9.
model.summary()

Model: "sequential_25"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lambda_25 (Lambda)           (None, 10, 1)             0         
_________________________________________________________________
lstm_20 (LSTM)               (None, 8)                 320       
_________________________________________________________________
dense_24 (Dense)             (None, 1)                 9         
Total params: 329
Trainable params: 329
Non-trainable params: 0
_________________________________________________________________


## Multivariate data

In [86]:
# Regroup one window of 10 values into 5 timesteps of 2 features each, to
# stand in for a multivariate example.
first_features = next(iter(dataset_flat))[0]
sample = tf.reshape(first_features, [5, 2])

In [119]:
# Take one example to read off the sequence length and feature dimensionality.
SEQ_LENGTH = sample.shape[0]
FEATURE_DIMENSIONALITY = sample.shape[1]
model = tf.keras.Sequential([
    # Each example arrives as a flat vector of SEQ_LENGTH * FEATURE_DIMENSIONALITY
    # values. Declaring that length (instead of None) lets Keras reject
    # mis-sized inputs rather than letting the reshape silently regroup them
    # into a different number of samples.
    tf.keras.layers.Lambda(
        lambda x: tf.reshape(x, [-1, SEQ_LENGTH, FEATURE_DIMENSIONALITY]),
        input_shape=[SEQ_LENGTH * FEATURE_DIMENSIONALITY],
    ),
    # No input_shape here: only the first layer of a Sequential model uses it.
    tf.keras.layers.LSTM(8),
    tf.keras.layers.Dense(1)
])
# Regression setup (MAE loss), so report MAE instead of accuracy.
model.compile(loss="mae", optimizer="adam", metrics=["mae"])
model.summary()

Model: "sequential_31"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lambda_31 (Lambda)           (None, 5, 2)              0         
_________________________________________________________________
lstm_26 (LSTM)               (None, 8)                 352       
_________________________________________________________________
dense_30 (Dense)             (None, 1)                 9         
Total params: 361
Trainable params: 361
Non-trainable params: 0
_________________________________________________________________


## Multivariate data with unknown sequence length

In [15]:
# When the length of the dataset's time dimension is not known in advance,
# declare it as None in `input_shape`; only the feature count (2) is fixed.
model = tf.keras.Sequential()
model.add(tf.keras.layers.LSTM(32, input_shape=[None, 2]))
model.add(tf.keras.layers.Dense(1))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 32)                4480      
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 4,513
Trainable params: 4,513
Non-trainable params: 0
_________________________________________________________________


### If you don't know the number of timesteps in advance, create the correct shape just before constructing the Dataset. At that point you should know at least the feature dimensionality and the number of separate samples

Below is an example where the data is the range from 0 to 99, and we assume that:
- odd numbers come from the first user
- even numbers come from the second user
- each timestep of an example is represented by 2 features (how the numbers are paired is not important here)

...so we prepare a dataset with the correct shape to feed into the LSTM.

In [27]:
import numpy as np

# Note: array.reshape() returns a NumPy array, whereas tf.reshape() returns
# a tensor.

# Two users (samples), each timestep described by two features.
SAMPLES = 2
FEATURES = 2
DATA = range(100)

# Odd numbers belong to the first user, even numbers to the second.
user1 = [i for i in DATA if i % 2]
user2 = [i for i in DATA if not i % 2]

# Concatenate both users' values and let NumPy infer the time dimension (-1),
# giving an array of shape (SAMPLES, timesteps, FEATURES).
correct_shape_array = np.array(user1 + user2).reshape(SAMPLES, -1, FEATURES)
# One dummy label per sample, derived from SAMPLES so the two cannot drift apart.
dumb_labels = np.arange(SAMPLES).reshape(SAMPLES, 1)

print(correct_shape_array.shape)
print(correct_shape_array[0, 0:3, :])

(2, 25, 2)
[[ 1  3]
 [ 5  7]
 [ 9 11]]


In [28]:
# Compile the current `model` for regression (MAE as both loss and metric)
# and fit it directly on the NumPy arrays.
model.compile(optimizer="adam", loss="mae", metrics=["mae"])
model.fit(x=correct_shape_array, y=dumb_labels)

Train on 2 samples


<tensorflow.python.keras.callbacks.History at 0x14ddc899a08>

### To train from a `tf.data.Dataset`, batch it first (optionally shuffling as well)

In [24]:
# Wrap the (features, labels) arrays in a tf.data.Dataset; the arrays are
# sliced along their first (sample) axis.
multivariate_dataset = tf.data.Dataset.from_tensor_slices(
    (correct_shape_array, dumb_labels)
)

# The data keeps its structure: one element is a single sample, so the
# leading sample axis disappears and the features have shape=(25, 2).
first_example = next(iter(multivariate_dataset))
first_example

(<tf.Tensor: shape=(25, 2), dtype=int32, numpy=
 array([[ 1,  3],
        [ 5,  7],
        [ 9, 11],
        [13, 15],
        [17, 19],
        [21, 23],
        [25, 27],
        [29, 31],
        [33, 35],
        [37, 39],
        [41, 43],
        [45, 47],
        [49, 51],
        [53, 55],
        [57, 59],
        [61, 63],
        [65, 67],
        [69, 71],
        [73, 75],
        [77, 79],
        [81, 83],
        [85, 87],
        [89, 91],
        [93, 95],
        [97, 99]])>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0])>)

In [34]:
# Shuffle, batch (one sample per batch) and prefetch before handing the
# dataset to fit().
train_batches = (
    multivariate_dataset
    .shuffle(buffer_size=10)
    .batch(batch_size=1)
    .prefetch(buffer_size=1)
)
model.fit(train_batches)

Train for 2 steps


<tensorflow.python.keras.callbacks.History at 0x14de0c3ac48>

### Note about `.prefetch(1)`

From https://www.tensorflow.org/api_docs/python/tf/data/Dataset#prefetch:
```
Most dataset input pipelines should end with a call to prefetch.  
This allows later elements to be prepared while the current element  
is being processed. This often improves latency and throughput,  
at the cost of using additional memory to store prefetched elements.
```