In [110]:
import tensorflow as tf

def make_dataset():
    """Return the 3x3 demo dataset used throughout this notebook.

    The source tensor is sliced along its first dimension, so the dataset
    yields three elements: [1, 2, 3], [4, 5, 6] and [7, 8, 9].
    """
    rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    return tf.data.Dataset.from_tensor_slices(tf.constant(rows))

def print_batches(dataset):
    """Print every element of ``dataset`` under a '--BATCH--' header.

    Each element is followed by one blank line so consecutive elements are
    visually separated.
    """
    for element in dataset:
        print('--BATCH--')
        # end='\n\n' emits the element's own newline plus the blank separator.
        print(element, end='\n\n')
        
def first_batch(dataset):
    """Return the first element produced by ``dataset``."""
    # Limit the dataset to a single element before starting iteration.
    head = dataset.take(1)
    iterator = iter(head)
    return next(iterator)

def count_batches(dataset):
    """Count the elements of ``dataset`` by folding over all of them."""

    def _increment(running_total, _element):
        # The element itself is ignored; only the running count matters.
        return running_total + 1

    total = dataset.reduce(0, _increment)
    return total.numpy()

## Create from Tensor

In [5]:
# from_tensor_slices slices along the 1st dimension, so each row becomes one dataset element
t = tf.constant([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
dataset = tf.data.Dataset.from_tensor_slices(t)
dataset

<_TensorSliceDataset element_spec=TensorSpec(shape=(3,), dtype=tf.int32, name=None)>

## Iterate

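The 1st dimension is the sample dimension.  By default the dataset is __not batched__: each element is a single sample (one row), which behaves like a __default batch size of 1__ but without a batch dimension.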

A dataset is an __iterable of batches__ (it can be iterated more than once).  A batch is a subset of the original tensor in the row direction.

In [8]:
# Iterating the dataset yields one tensor per row of the source tensor.
dataset = make_dataset()
for row in dataset:
    print(row)

tf.Tensor([1 2 3], shape=(3,), dtype=int32)
tf.Tensor([4 5 6], shape=(3,), dtype=int32)
tf.Tensor([7 8 9], shape=(3,), dtype=int32)


## Batch Size

You can __override the default__.

If using a dataset, __don't specify batch_size in model.fit()__; the dataset already produces the batches.

In [14]:
# Group consecutive rows into batches of 2; the final batch holds the 1 leftover row.
unbatched = make_dataset()
dataset = unbatched.batch(2)
print_batches(dataset)

--BATCH--
tf.Tensor(
[[1 2 3]
 [4 5 6]], shape=(2, 3), dtype=int32)

--BATCH--
tf.Tensor([[7 8 9]], shape=(1, 3), dtype=int32)



In [37]:
# batch() can be chained, but the second call stacks whole batches produced by
# the first.  Here the first call yields batches of shape (2, 3) and (1, 3),
# which cannot be stacked into one tensor, so iteration fails.
dataset = make_dataset().batch(2).batch(3)
try:
    print_batches(dataset)
except tf.errors.InvalidArgumentError:
    # Catch only the batching failure instead of swallowing every exception
    # (a bare except would also hide KeyboardInterrupt and real bugs).
    print('this doesn\'t work')

this doesn't work


## Counting

You can use this to __count batches__.

In [107]:
# Fold over the dataset, adding 1 per element, to count how many it yields.
element_count = make_dataset().reduce(0, lambda total, _: total + 1)
element_count.numpy()

3

## Shuffling

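Notice that the __whole thing__ was shuffled even though the batch size was 1.  This is because buffer_size=3 is as large as the dataset; a smaller buffer only shuffles within a sliding window of that many elements.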

In [20]:
# buffer_size=3 covers all 3 elements of the demo dataset.
dataset = make_dataset()
dataset = dataset.shuffle(buffer_size=3, seed=42)
print_batches(dataset)

--BATCH--
tf.Tensor([1 2 3], shape=(3,), dtype=int32)

--BATCH--
tf.Tensor([7 8 9], shape=(3,), dtype=int32)

--BATCH--
tf.Tensor([4 5 6], shape=(3,), dtype=int32)



## Splitting/Slicing

The count accepted by these methods is in terms of __batches__ rather than rows.

In [23]:
dataset = make_dataset()
dataset = dataset.take(2)  # keep only the first 2 elements (head)
print_batches(dataset)

--BATCH--
tf.Tensor([1 2 3], shape=(3,), dtype=int32)

--BATCH--
tf.Tensor([4 5 6], shape=(3,), dtype=int32)



In [31]:
# take(2) runs before shuffle, so only the first 2 rows are shuffled; 7, 8, 9 never appears.
head = make_dataset().take(2)
dataset = head.shuffle(buffer_size=2, seed=42)
print_batches(dataset)

--BATCH--
tf.Tensor([4 5 6], shape=(3,), dtype=int32)

--BATCH--
tf.Tensor([1 2 3], shape=(3,), dtype=int32)



In [32]:
dataset = make_dataset()
dataset = dataset.skip(1)  # drop the first element, keep the rest (tail)
print_batches(dataset)

--BATCH--
tf.Tensor([4 5 6], shape=(3,), dtype=int32)

--BATCH--
tf.Tensor([7 8 9], shape=(3,), dtype=int32)



In [33]:
# Shuffle once, then split.  reshuffle_each_iteration=False keeps the shuffled
# order fixed across iterations; with the default (True), train and val are
# each drawn from a different shuffle and can end up sharing rows.
dataset = make_dataset().shuffle(buffer_size=3, seed=42, reshuffle_each_iteration=False)
train, val = dataset.take(2), dataset.skip(2)
print_batches(train)

--BATCH--
tf.Tensor([1 2 3], shape=(3,), dtype=int32)

--BATCH--
tf.Tensor([7 8 9], shape=(3,), dtype=int32)



## CPU

Unlike normal tensors, datasets __default to CPU__ even if you have a GPU available.  The model will eventually move it into GPU.

In [49]:
dataset = make_dataset()
sample = first_batch(dataset)
print(sample.device)  # device on which the element tensor is placed

/job:localhost/replica:0/task:0/device:CPU:0


## Prefetch

If you were using a dataset backed by something other than an in-memory tensor here (e.g. a CSV file), you would be able to use prefetch to have batches load into memory while the model is working on the current batch.

This is normally done at the __end of the pipeline__.

In [50]:
# tf.data.AUTOTUNE is a special value: the number of batches to prefetch is tuned automatically.
dataset = make_dataset()
dataset = dataset.prefetch(tf.data.AUTOTUNE)
print_batches(dataset)

--BATCH--
tf.Tensor([1 2 3], shape=(3,), dtype=int32)

--BATCH--
tf.Tensor([4 5 6], shape=(3,), dtype=int32)

--BATCH--
tf.Tensor([7 8 9], shape=(3,), dtype=int32)



## Map

Function runs on each batch and returns a new batch, which is generally a transformed version of the old batch.

In [53]:
def double(element):
    """Multiply every value in the element by 2."""
    return element * 2

dataset = make_dataset().map(double)
print_batches(dataset)

--BATCH--
tf.Tensor([2 4 6], shape=(3,), dtype=int32)

--BATCH--
tf.Tensor([ 8 10 12], shape=(3,), dtype=int32)

--BATCH--
tf.Tensor([14 16 18], shape=(3,), dtype=int32)



## Ground-Truth (Label) Values

__model.fit()__ expects dataset batches to be tuples of x,y.  This means you leave off the y argument of model.fit for this case.

In [54]:
def split_features_and_label(row):
    """Return (all values but the last, the last value) of a row."""
    return row[:-1], row[-1]

dataset = make_dataset().map(split_features_and_label)
print_batches(dataset)

--BATCH--
(<tf.Tensor: shape=(2,), dtype=int32, numpy=array([1, 2], dtype=int32)>, <tf.Tensor: shape=(), dtype=int32, numpy=3>)

--BATCH--
(<tf.Tensor: shape=(2,), dtype=int32, numpy=array([4, 5], dtype=int32)>, <tf.Tensor: shape=(), dtype=int32, numpy=6>)

--BATCH--
(<tf.Tensor: shape=(2,), dtype=int32, numpy=array([7, 8], dtype=int32)>, <tf.Tensor: shape=(), dtype=int32, numpy=9>)



Tuples are automatically unpacked into __multiple arguments__ of the function passed to map.  If the labels are present as a 2nd tuple item, you need to return them from the map function or they will be lost.

In [64]:
dataset = make_dataset().map(lambda row: (row[:-1], row[-1]))
# A tuple element arrives as separate positional arguments; return both to keep the label.
dataset = dataset.map(lambda features, target: (features, target))
print_batches(dataset)

--BATCH--
(<tf.Tensor: shape=(2,), dtype=int32, numpy=array([1, 2], dtype=int32)>, <tf.Tensor: shape=(), dtype=int32, numpy=3>)

--BATCH--
(<tf.Tensor: shape=(2,), dtype=int32, numpy=array([4, 5], dtype=int32)>, <tf.Tensor: shape=(), dtype=int32, numpy=6>)

--BATCH--
(<tf.Tensor: shape=(2,), dtype=int32, numpy=array([7, 8], dtype=int32)>, <tf.Tensor: shape=(), dtype=int32, numpy=9>)



You may need something like this if you want to use the same transformation function for validation and test data.

In [65]:
def transform_batch(batch, label=None):
    """Double the values in ``batch``, passing any label through untouched.

    Args:
        batch: Feature values; anything that supports ``* 2``.
        label: Optional label paired with ``batch``.

    Returns:
        The doubled batch alone when ``label`` is None, otherwise the tuple
        ``(doubled_batch, label)``.
    """
    batch = batch * 2
    # Identity check: ``label == None`` is evaluated element-wise by array
    # types such as NumPy arrays, which makes the ``if`` ambiguous.
    if label is None:
        return batch
    return (batch, label)
    
# Split each row into (features, label), then run the shared transform on the pair.
dataset = make_dataset().map(lambda row: (row[:-1], row[-1])).map(transform_batch)

print_batches(dataset)

--BATCH--
(<tf.Tensor: shape=(2,), dtype=int32, numpy=array([2, 4], dtype=int32)>, <tf.Tensor: shape=(), dtype=int32, numpy=3>)

--BATCH--
(<tf.Tensor: shape=(2,), dtype=int32, numpy=array([ 8, 10], dtype=int32)>, <tf.Tensor: shape=(), dtype=int32, numpy=6>)

--BATCH--
(<tf.Tensor: shape=(2,), dtype=int32, numpy=array([14, 16], dtype=int32)>, <tf.Tensor: shape=(), dtype=int32, numpy=9>)



## Tuples of Tensors

This is the manual way to create a structure like the __ground truth label__ scenario above.  The tuple remains a tuple with parallel batching inside.

In [69]:
t = tf.constant([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
u = tf.constant([[9, 8, 7], [6, 5, 4], [3, 2, 1]])

# Slice both tensors in parallel, then batch: each element is a (rows of t, rows of u) tuple.
paired = tf.data.Dataset.from_tensor_slices((t, u))
dataset = paired.batch(3)

print_batches(dataset)

--BATCH--
(<tf.Tensor: shape=(3, 3), dtype=int32, numpy=
array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]], dtype=int32)>, <tf.Tensor: shape=(3, 3), dtype=int32, numpy=
array([[9, 8, 7],
       [6, 5, 4],
       [3, 2, 1]], dtype=int32)>)



## Lists of Tensors

Although this looks similar to the tuple case, the __behavior is different__.  Instead of getting a list of tensors in the data stream, you get a __single tensor__ where the first dimension is the one selecting between the tensors.

In [70]:
t = tf.constant([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
u = tf.constant([[9, 8, 7], [6, 5, 4], [3, 2, 1]])

# A list is treated as one stacked tensor, so slicing yields t and u themselves as the 2 elements.
stacked = tf.data.Dataset.from_tensor_slices([t, u])
dataset = stacked.batch(3)

print_batches(dataset)

--BATCH--
tf.Tensor(
[[[1 2 3]
  [4 5 6]
  [7 8 9]]

 [[9 8 7]
  [6 5 4]
  [3 2 1]]], shape=(2, 3, 3), dtype=int32)



## Dictionary of Tensors

Zip (instead of from_tensor_slices) accepts a dictionary of datasets and makes a single dataset out of it.  This is the same format used by csv datasets.  The results will be weird if you don't make the batch sizes of the constituent datasets the same.

In [85]:
# Zip two batched copies of the demo dataset; each element is a dict with keys 'a' and 'b'.
columns = {'a': make_dataset().batch(3), 'b': make_dataset().batch(3)}
dataset = tf.data.Dataset.zip(columns)
print_batches(dataset)

--BATCH--
{'a': <tf.Tensor: shape=(3, 3), dtype=int32, numpy=
array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]], dtype=int32)>, 'b': <tf.Tensor: shape=(3, 3), dtype=int32, numpy=
array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]], dtype=int32)>}



This is how to manually do what a csv dataset does when a label is involved.  Strangely, it uses from_tensor_slices instead of zip.

In [87]:
features = {'a': make_dataset().batch(3), 'b': make_dataset().batch(3)}
labels = tf.data.Dataset.from_tensor_slices(tf.constant(tf.zeros(shape=3))).batch(3)
# Zip (features, labels) so each element is a (dict of features, label batch) tuple.
dataset = tf.data.Dataset.zip((features, labels))
print_batches(dataset)

--BATCH--
({'a': <tf.Tensor: shape=(3, 3), dtype=int32, numpy=
array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]], dtype=int32)>, 'b': <tf.Tensor: shape=(3, 3), dtype=int32, numpy=
array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]], dtype=int32)>}, <tf.Tensor: shape=(3,), dtype=float32, numpy=array([0., 0., 0.], dtype=float32)>)



You can also use map to transform between dataset formats.  In this example, we go from a dictionary dataset to a tuple dataset.

In [92]:
columns = {'a': make_dataset().batch(3), 'b': make_dataset().batch(3)}
dataset = tf.data.Dataset.zip(columns)
# Drop the keys: keep only the dictionary's values for each element.
dataset = dataset.map(lambda record: record.values())
print_batches(dataset)

--BATCH--
(<tf.Tensor: shape=(3, 3), dtype=int32, numpy=
array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]], dtype=int32)>, <tf.Tensor: shape=(3, 3), dtype=int32, numpy=
array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]], dtype=int32)>)



## Generator Dataset

Each yielded item counts as __1 sample__ and then batch size is applied to samples.  You can specify the format of the data (without batch size dimension) in output_signature.  A batch dimension is automatically added as appropriate.

In [100]:
def make_generator():
    """Yield two 2x2 integer tensors, one per sample."""
    for values in ([[1, 2], [3, 4]], [[5, 6], [7, 8]]):
        yield tf.constant(values)
# output_signature describes one yielded item (no batch dimension); batch(2) then stacks the 2 items.
signature = tf.TensorSpec(shape=(2,2,), dtype=tf.int32)
dataset = tf.data.Dataset.from_generator(make_generator, output_signature=signature).batch(2)
print_batches(dataset)

--BATCH--
tf.Tensor(
[[[1 2]
  [3 4]]

 [[5 6]
  [7 8]]], shape=(2, 2, 2), dtype=int32)



## Dataset from CSV File

The only __required__ arguments are the __filename and batch size__.  There are a lot of other arguments to control how CSV files are interpreted.  In this case, the defaults worked well for this CSV file.  The __first line__ contains __column names__.

The data is __streamed from disk__ instead of being loaded all at once.

By default, data is __prefetched__ with __autotune__ but that can be configured.

By default, entries are __shuffled__ for each epoch.  You can configure this behavior including the seed and buffer size.

You can provide the __label_name__ parameter to extract a label column and make it a tuple dataset automatically.

In [104]:
dataset = tf.data.experimental.make_csv_dataset('titanic.csv', batch_size=1)
sample = first_batch(dataset)
print(dict(sample))  # a plain dict prints more readably than the OrderedDict

{'PassengerId': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([230], dtype=int32)>, 'Survived': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>, 'Pclass': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([3], dtype=int32)>, 'Name': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Lefebre, Miss. Mathilde'], dtype=object)>, 'Sex': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'female'], dtype=object)>, 'Age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.], dtype=float32)>, 'SibSp': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([3], dtype=int32)>, 'Parch': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>, 'Ticket': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'4133'], dtype=object)>, 'Fare': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([25.4667], dtype=float32)>, 'Cabin': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b''], dtype=object)>, 'Embarked': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'S'], 

The __num_epochs__ argument causes it to repeat the whole dataset transparently (as if the extra copies were in the original data).  If not specified, the default is to __repeat forever__ this way.

If you want to control the epochs from the model instead, set num_epochs=1 and then the model will __reiterate for each epoch__.

In [114]:
# One pass over the file with batch_size=1: the element count equals the number of rows.
dataset = tf.data.experimental.make_csv_dataset('titanic.csv', batch_size=1, num_epochs=1)
row_count = count_batches(dataset)
print('Rows in dataset: ' + str(row_count))
# With num_epochs=2 the rows are repeated, so the count doubles.
dataset = tf.data.experimental.make_csv_dataset('titanic.csv', batch_size=1, num_epochs=2)
count_batches(dataset)

Rows in dataset: 891


1782

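Using __.batch()__ on a csv dataset is __rarely what you want__: it adds a second batch dimension on top of the batches that make_csv_dataset already produces (below, 446 = ceil(891 / 2) elements, each wrapping one batch of 2 rows).  Just use the batch_size parameter in make_csv_dataset.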

In [123]:
# Don't do this: .batch(1) is applied on top of the batches of 2 that make_csv_dataset already produces.
dataset = tf.data.experimental.make_csv_dataset('titanic.csv', batch_size=2, num_epochs=1)
dataset = dataset.batch(1)
count_batches(dataset)

446

By default, every time you iterate (__another epoch__ at the model level), the rows are __reshuffled__, which you would normally want in order to prevent order dependence in your training.

In [130]:
dataset = tf.data.experimental.make_csv_dataset('titanic.csv', batch_size=891, num_epochs=1, shuffle_seed=42)
# Fetch the first batch twice; each fetch starts a fresh iteration over the dataset.
for attempt in range(2):
    leading_id = first_batch(dataset)['PassengerId'][0]
    print(leading_id.numpy())

394
370


## Dataset from Pandas Dataframe

Keep in mind this will __not stream__.  But if you're ok with loading it all into (CPU) RAM ahead of time, you get the benefit of being able to do pandas transformations.

An interesting difference from loading directly from CSV (above) is that you get a bunch of scalar tensors instead of rank 1 tensors.

In [148]:
import pandas as pd
import numpy as np

# Clean the data so TF will take it: fill missing values in the two string
# columns with '' (fillna with a per-column dict is the idiomatic form).
df = pd.read_csv('titanic.csv').fillna({'Embarked': '', 'Cabin': ''})
x = df.drop(columns=['Survived']).to_dict('list')  # Feature columns as a dict of lists
y = df['Survived'].tolist()  # Labels

# Slicing the (features, labels) pair yields one (dict of scalars, label) element per row.
dataset = tf.data.Dataset.from_tensor_slices((x, y))
print(first_batch(dataset))

({'PassengerId': <tf.Tensor: shape=(), dtype=int32, numpy=1>, 'Pclass': <tf.Tensor: shape=(), dtype=int32, numpy=3>, 'Name': <tf.Tensor: shape=(), dtype=string, numpy=b'Braund, Mr. Owen Harris'>, 'Sex': <tf.Tensor: shape=(), dtype=string, numpy=b'male'>, 'Age': <tf.Tensor: shape=(), dtype=float32, numpy=22.0>, 'SibSp': <tf.Tensor: shape=(), dtype=int32, numpy=1>, 'Parch': <tf.Tensor: shape=(), dtype=int32, numpy=0>, 'Ticket': <tf.Tensor: shape=(), dtype=string, numpy=b'A/5 21171'>, 'Fare': <tf.Tensor: shape=(), dtype=float32, numpy=7.25>, 'Cabin': <tf.Tensor: shape=(), dtype=string, numpy=b''>, 'Embarked': <tf.Tensor: shape=(), dtype=string, numpy=b'S'>}, <tf.Tensor: shape=(), dtype=int32, numpy=0>)
