In [1]:
import math
import multiprocessing
import os

import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow_datasets as tfds
from   tensorflow.keras.datasets import fashion_mnist as fmnist
from   tensorflow.keras.layers import (
    Conv2D as Conv, MaxPooling2D as Pool, Dense, Dropout, Flatten)
from   tensorflow.keras.models import Sequential
from   tensorflow.nn import relu, softmax

In [2]:
# No `split` is requested here, so the result is iterated to list the
# split names it contains.
mnist_data = tfds.load('fashion_mnist')
for split_name in mnist_data:
    print(split_name)

train
test


In [3]:
# Load only the training split of Fashion-MNIST.
mnist_train = tfds.load(name='fashion_mnist', split='train')

In [4]:
# Check whether the single-split load produced a tf.data.Dataset.
isinstance(mnist_train, tf.data.Dataset)

True

In [5]:
# Same check for the load that did not name a split.
isinstance(mnist_data, tf.data.Dataset)

False

In [6]:
# Look at a single training example: its container type and feature keys.
for example in mnist_train.take(1):
    print(type(example))
    print(example.keys())

<class 'dict'>
dict_keys(['image', 'label'])


In [7]:
# Look at a single training example: the image shape and the label tensor.
for example in mnist_train.take(1):
    print(example['image'].shape)
    print(example['label'])

(28, 28, 1)
tf.Tensor(2, shape=(), dtype=int64)


In [8]:
# Request the test split explicitly so `mnist_test` really is the test
# dataset; without `split`, the load is not restricted to one split.
# with_info=True additionally returns the dataset metadata printed below.
mnist_test, info = tfds.load(
    name='fashion_mnist', split='test', with_info=True)
print(info)

tfds.core.DatasetInfo(
    name='fashion_mnist',
    full_name='fashion_mnist/3.0.1',
    description="""
    Fashion-MNIST is a dataset of Zalando's article images consisting of a training set of 60,000 examples and a test set of 10,000 examples. Each example is a 28x28 grayscale image, associated with a label from 10 classes.
    """,
    homepage='https://github.com/zalandoresearch/fashion-mnist',
    data_path='/Users/dsatterthwaite/tensorflow_datasets/fashion_mnist/3.0.1',
    download_size=29.45 MiB,
    dataset_size=36.42 MiB,
    features=FeaturesDict({
        'image': Image(shape=(28, 28, 1), dtype=tf.uint8),
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=10),
    }),
    supervised_keys=('image', 'label'),
    splits={
        'test': <SplitInfo num_examples=10000, num_shards=1>,
        'train': <SplitInfo num_examples=60000, num_shards=1>,
    },
    citation="""@article{DBLP:journals/corr/abs-1708-07747,
      author    = {Han Xiao and
                 

In [9]:
# Load Fashion-MNIST through the Keras built-in loader.
# NOTE: the next cell rebinds these same four names from the TFDS loader.
(X_train, y_train), (X_test, y_test) = fmnist.load_data()

In [10]:
# Load both splits through TFDS as (image, label) pairs and convert them to
# NumPy; batch_size=-1 asks for each split as one full batch.
(X_train, y_train), (X_test, y_test) = tfds.as_numpy(tfds.load(
    'fashion_mnist',
    split=['train', 'test'],
    batch_size=-1,
    as_supervised=True,
))

In [11]:
# Divide pixel values by 255 for both splits.
# NOTE: this rebinds the inputs, so re-running the cell divides again.
X_train, X_test = X_train / 255., X_test / 255.

In [12]:
# Small fully connected classifier: flatten the 28x28x1 image, one hidden
# layer with dropout, then a 10-way softmax output.
mod = Sequential([
    Flatten(input_shape=(28, 28, 1)),
    Dense(128, activation=relu),
    Dropout(0.2),
    Dense(10, activation=softmax),
])

In [13]:
# Labels are integer class ids, hence the sparse categorical loss.
mod.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [14]:
# Train for five epochs; the returned History object is the cell output.
mod.fit(X_train, y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fc49c702ac0>

In [15]:
# Load the horses_or_humans training split as (image, label) pairs.
data = tfds.load('horses_or_humans', split='train', as_supervised=True)
# Shuffle with a 100-element buffer, then group into batches of 100.
train_batches = data.shuffle(100).batch(100)

In [16]:
CKERN = (3, 3)  # convolution kernel size
PKERN = (2, 2)  # pooling window size

# Five conv/pool stages (16, 32, 64, 64, 64 filters) on 300x300 RGB input,
# followed by a dense head with a single sigmoid output.
mod = Sequential()
mod.add(Conv(16, CKERN, activation='relu', input_shape=(300, 300, 3)))
mod.add(Pool(PKERN))
for _filters in (32, 64, 64, 64):
    mod.add(Conv(_filters, CKERN, activation='relu'))
    mod.add(Pool(PKERN))
mod.add(Flatten())
mod.add(Dense(512, activation='relu'))
mod.add(Dense(1, activation='sigmoid'))

In [17]:
# Single sigmoid output, so use binary cross-entropy.
mod.compile(
    optimizer='Adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [18]:
# Train on the batched dataset for five epochs and keep the History object.
hist = mod.fit(train_batches, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Adding image augmentation

In [19]:
def augment(img, label):
    """Cast an image to float32, divide by 255, and randomly mirror it.

    Args:
        img: image tensor.
        label: returned unchanged.

    Returns:
        Tuple of (augmented image, label).
    """
    scaled = tf.cast(img, tf.float32) / 255
    return tf.image.random_flip_left_right(scaled), label

In [20]:
# Apply the augmentation function to every (image, label) element.
train = data.map(augment)
# Shuffle with a 100-element buffer, then group into batches of 100.
train_batches = train.shuffle(100).batch(100)

### Addons

In [21]:
def augment(img, label, max_rotation_deg=40.0):
    """Scale an image, then randomly mirror and rotate it.

    Args:
        img: image tensor; cast to float32 and divided by 255.
        label: returned unchanged.
        max_rotation_deg: the rotation angle is drawn uniformly from
            [-max_rotation_deg, +max_rotation_deg] degrees.

    Returns:
        Tuple of (augmented image, label).
    """
    img = tf.cast(img, tf.float32) / 255
    img = tf.image.random_flip_left_right(img)
    # tfa.image.rotate expects radians, so convert from degrees; passing a
    # bare 40 would rotate by 40 radians. Draw a fresh angle per image so the
    # rotation actually varies between examples.
    max_angle = math.radians(max_rotation_deg)
    angle = tf.random.uniform([], minval=-max_angle, maxval=max_angle)
    img = tfa.image.rotate(img, angle, interpolation='NEAREST')
    return img, label

## Data Splitting

```
data = tfds.load(
    'cats_vs_dogs', split='train[:10000]', as_supervised=True)
data = tfds.load(
    'cats_vs_dogs', split='train[:20%]', as_supervised=True)
data = tfds.load('cats_vs_dogs',     
                 split='train[-1000:]+train[:1000]',
                 as_supervised=True)

train = tfds.load(
    'cats_vs_dogs', split='train[:80%]', as_supervised=True)
valid = tfds.load(
    'cats_vs_dogs', split='train[80%:90%]', as_supervised=True)
test = tfds.load(
    'cats_vs_dogs', split='train[-10%:]', as_supervised=True)
    
train_len = [i for i, _ in enumerate(train)][-1] + 1
```

## TFRecord

In [22]:
# Load MNIST together with its metadata and print the metadata.
# NOTE: this rebinds `data` and `info`, which earlier cells also assigned.
data, info = tfds.load('mnist', with_info=True)
print(info)

tfds.core.DatasetInfo(
    name='mnist',
    full_name='mnist/3.0.1',
    description="""
    The MNIST database of handwritten digits.
    """,
    homepage='http://yann.lecun.com/exdb/mnist/',
    data_path='/Users/dsatterthwaite/tensorflow_datasets/mnist/3.0.1',
    download_size=11.06 MiB,
    dataset_size=21.00 MiB,
    features=FeaturesDict({
        'image': Image(shape=(28, 28, 1), dtype=tf.uint8),
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=10),
    }),
    supervised_keys=('image', 'label'),
    splits={
        'test': <SplitInfo num_examples=10000, num_shards=1>,
        'train': <SplitInfo num_examples=60000, num_shards=1>,
    },
    citation="""@article{lecun2010mnist,
      title={MNIST handwritten digit database},
      author={LeCun, Yann and Cortes, Corinna and Burges, CJ},
      journal={ATT Labs [Online]. Available: http://yann.lecun.com/exdb/mnist},
      volume={2},
      year={2010}
    }""",
)


```
filename = ('/root/tensorflow_datasets/mnist/3.0.0/'
            'mnist-test.tfrecord-00000-of-00001')
raw_dataset = tf.data.TFRecordDataset(filename)
for raw_record in raw_dataset.take(1):
    print(repr(raw_record))
    
>> <tf.Tensor: shape=(), dtype=string, numpy=b'\n\x85\x03...'> 
# (long binary str)
```

In [23]:
# Schema used to parse each serialized record: a scalar string feature for
# the image and a scalar int64 feature for the label.
feature_description = {
    'image': tf.io.FixedLenFeature([], dtype=tf.string),
    'label': tf.io.FixedLenFeature([], dtype=tf.int64),
}

In [24]:
def parse_func(example_proto):
    """Parse one serialized example using the module-level schema.

    Args:
        example_proto: a single serialized example.

    Returns:
        The result of parsing it against `feature_description`.
    """
    parsed = tf.io.parse_single_example(example_proto, feature_description)
    return parsed

```
parsed_dataset = raw_dataset.map(parse_func)
for record in parsed_dataset.take(1):
    print((record))
    
>>  {'image': <tf.Tensor: shape=(), dtype=string, 
               numpy=b'\x89PNG\r\n\x1a\n...'>,
     'label': <tf.Tensor: shape=(), dtype=int64, numpy=2>}
```

### ETL in TF

In [25]:
CONV = (3, 3)  # convolution kernel size
POOL = (2, 2)  # pooling window size

# Fresh copy of the five-stage CNN for 300x300 RGB input with a single
# sigmoid output, compiled for binary classification.
mod = Sequential([
    Conv(16, CONV, activation='relu', input_shape=(300, 300, 3)),
    Pool(POOL),
    Conv(32, CONV, activation='relu'),
    Pool(POOL),
    Conv(64, CONV, activation='relu'),
    Pool(POOL),
    Conv(64, CONV, activation='relu'),
    Pool(POOL),
    Conv(64, CONV, activation='relu'),
    Pool(POOL),
    Flatten(),
    Dense(512, activation='relu'),
    Dense(1, activation='sigmoid'),
])
mod.compile(
    optimizer='Adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [26]:
# Training split as (image, label) pairs.
data = tfds.load('horses_or_humans', split='train', as_supervised=True)
# The dataset's 'test' split is used here as validation data.
val_data = tfds.load('horses_or_humans', split='test', as_supervised=True)

In [27]:
def augment_images(img, label, max_rotation_deg=40.0):
    """Scale an image, then randomly mirror and rotate it.

    Args:
        img: image tensor; cast to float32 and divided by 255.
        label: returned unchanged.
        max_rotation_deg: the rotation angle is drawn uniformly from
            [-max_rotation_deg, +max_rotation_deg] degrees.

    Returns:
        Tuple of (augmented image, label).
    """
    img = tf.cast(img, tf.float32) / 255
    img = tf.image.random_flip_left_right(img)
    # tfa.image.rotate expects radians, so convert from degrees; passing a
    # bare 40 would rotate by 40 radians. Draw a fresh angle per image so the
    # rotation actually varies between examples.
    max_angle = math.radians(max_rotation_deg)
    angle = tf.random.uniform([], minval=-max_angle, maxval=max_angle)
    img = tfa.image.rotate(img, angle, interpolation='NEAREST')
    return img, label

In [28]:
BATCH = 32
EPOCHS = 5

# Training images are scaled and randomly augmented.
train = data.map(augment_images)
train_batches = train.shuffle(100).batch(BATCH)
# Validation images need the same cast-and-scale step as the training images
# (without the random augmentation); otherwise the model is validated on
# inputs in a different value range from the ones it was trained on.
valid_batches = val_data.map(
    lambda img, label: (tf.cast(img, tf.float32) / 255, label)).batch(BATCH)

In [29]:
# Train with per-epoch validation and keep the History object.
hist = mod.fit(
    train_batches,
    epochs=EPOCHS,
    validation_data=valid_batches,
    # NOTE(review): validation_steps=1 evaluates a single validation batch
    # per epoch; confirm that is intended rather than the full split.
    validation_steps=1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Parallelizing ETL for Improved Training Performance

In [30]:
# with_info=True makes tfds.load return a (dataset, info) pair; unpack it so
# `train_data` is the dataset itself rather than a two-element tuple.
train_data, train_info = tfds.load(
    'cats_vs_dogs', split='train', with_info=True)

In [31]:
# Resolve the home directory portably: os.environ['HOME'] raises KeyError on
# systems where HOME is not set, whereas expanduser falls back sensibly.
HOME = os.path.expanduser('~')
# Glob matching the cats_vs_dogs training TFRecord shard files.
# NOTE(review): the dataset version (4.0.0) is hard-coded; confirm it matches
# the version that was actually downloaded.
file_pattern = (f'{HOME}/tensorflow_datasets/cats_vs_dogs/4.0.0/'
                'cats_vs_dogs-train.tfrecord*')
files = tf.data.Dataset.list_files(file_pattern)

In [32]:
# Turn each matched file into a TFRecordDataset and interleave their records,
# cycling over four files at a time with an auto-tuned level of parallelism.
train_dataset = files.interleave(
    tf.data.TFRecordDataset,
    cycle_length=4,
    num_parallel_calls=tf.data.experimental.AUTOTUNE,
)

In [33]:
def read_tfrecord(serialized_example):
    """Decode one serialized record into a (300x300 image, label) pair.

    Args:
        serialized_example: a single serialized example.

    Returns:
        Tuple of (float32 image divided by 255 and resized to 300x300,
        int64 label). Missing features fall back to '' and -1 respectively.
    """
    schema = {
        'image': tf.io.FixedLenFeature((), tf.string, ''),
        'label': tf.io.FixedLenFeature((), tf.int64, -1),
    }
    parsed = tf.io.parse_single_example(serialized_example, schema)
    pixels = tf.io.decode_jpeg(parsed['image'], channels=3)
    pixels = tf.cast(pixels, tf.float32) / 255
    return tf.image.resize(pixels, (300, 300)), parsed['label']

In [34]:
# CPU count of this machine; used below as the parallelism for map().
cores = multiprocessing.cpu_count()
print(cores)

12


In [35]:
# Decode and preprocess records in parallel. The result must be assigned back
# to `train_dataset` so that the cache/shuffle/batch steps and the final fit
# operate on parsed (image, label) pairs, not on raw serialized records.
train_dataset = train_dataset.map(read_tfrecord, num_parallel_calls=cores)
# NOTE(review): cache() with no filename keeps elements in memory; confirm
# the decoded images fit in RAM.
train_dataset = train_dataset.cache()

In [37]:
BUFFER = 1024  # shuffle buffer size
EPOCHS = 10    # rebinds the EPOCHS value set in an earlier cell

# BATCH is the batch size defined in an earlier cell.
train_dataset = train_dataset.shuffle(BUFFER).batch(BATCH)
# Prefetch with an auto-tuned buffer size.
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [39]:
# NOTE(review): `mod` is the model already fitted on horses_or_humans in an
# earlier cell and is not rebuilt here, so training continues from those
# weights; confirm that is intended.
mod.fit(train_dataset, epochs=EPOCHS, verbose=1)