# Data in TensorFlow 2.0

Inspired by https://www.tensorflow.org/alpha/tutorials/load_data/images

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import pathlib
%matplotlib inline

In [None]:
# Shorthand for the tf.data autotune sentinel; passed as the prefetch
# buffer_size by the pipelines built later in this notebook.
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [None]:
# Root directory of the dataset; later cells glob 'train/...' beneath it.
data_root = pathlib.Path('../data/sports/')

In [None]:
# Echo the path as this cell's output.
data_root

In [None]:
# Class labels are the names of the directories directly under train/,
# kept in sorted order so the label -> index mapping is reproducible.
label_names = [entry.name
               for entry in data_root.glob('train/*/')
               if entry.is_dir()]
label_names.sort()
label_names

In [None]:
# Map every class name to its position in the sorted label list.
label_to_index = {name: index for index, name in enumerate(label_names)}
label_to_index

In [None]:
# Peek at (up to) the first ten entries directly under train/.
list(data_root.glob('train/*'))[:10]

In [None]:
# Collect every file two levels below train/ as a plain string path, then
# shuffle the list in place so consecutive entries are not grouped by class.
all_image_paths = [str(entry) for entry in data_root.glob('train/*/*')]
np.random.shuffle(all_image_paths)

image_count = len(all_image_paths)
image_count

In [None]:
# Peek at the first five (shuffled) path strings.
all_image_paths[:5]

In [None]:
# An image's label is the index of the directory that contains it; the list
# is built in the same order as all_image_paths.
all_image_labels = [
    label_to_index[class_dir]
    for class_dir in (pathlib.Path(p).parent.name for p in all_image_paths)
]

## Preprocessing and Loading Functions

In [None]:
def preprocess_image(image):
    """Turn encoded JPEG bytes into a resized, rescaled RGB image tensor.

    Args:
        image: Scalar string tensor holding the encoded JPEG file contents.

    Returns:
        The decoded image with 3 channels, resized to 192x192 and divided
        by 255.0.
    """
    decoded = tf.image.decode_jpeg(image, channels=3)
    resized = tf.image.resize(decoded, [192, 192])
    return resized / 255.0

def load_and_preprocess_image(path):
    """Read the file at ``path`` and run its bytes through ``preprocess_image``.

    Args:
        path: Path of the image file to read.

    Returns:
        Whatever ``preprocess_image`` returns for the file's contents.
    """
    raw_bytes = tf.io.read_file(path)
    preprocessed = preprocess_image(raw_bytes)
    return preprocessed

def load_and_preprocess_from_path_label(path, label):
    """Load the image at ``path`` and pair it with its unchanged ``label``.

    Shaped for ``Dataset.map`` over ``(path, label)`` elements.
    """
    image = load_and_preprocess_image(path)
    return image, label

In [None]:
# One element per (path, label) pair; the map step then replaces each path
# with its decoded image.
ds = tf.data.Dataset.from_tensor_slices(
    tensors=(all_image_paths, all_image_labels),
)
image_label_ds = ds.map(map_func=load_and_preprocess_from_path_label)

In [None]:
# Preview the first four images in a 2x2 grid without axis ticks, each
# titled with its integer label.
plt.figure(figsize=(8, 8))
for n, (image, label) in enumerate(image_label_ds.take(4)):
    grid_slot = n + 1  # subplot positions are 1-based
    plt.subplot(2, 2, grid_slot)
    plt.xticks([])
    plt.yticks([])
    plt.imshow(image)
    plt.title(label.numpy())

In [None]:
batch_size = 32

# Shuffle across the whole dataset, repeat indefinitely, batch, and prefetch
# upcoming batches while the current one is being consumed.
ds = (
    image_label_ds
    .shuffle(buffer_size=image_count)
    .repeat()
    .batch(batch_size)
    .prefetch(buffer_size=AUTOTUNE)
)
ds

In [None]:
# MobileNetV2 without its classification head, frozen so that only layers
# stacked on top of it are trained.
mobile_net = tf.keras.applications.MobileNetV2(
    input_shape=(192, 192, 3),
    include_top=False,
)
mobile_net.trainable = False

In [None]:
def change_range(image, label):
    """Apply ``2 * image - 1`` to the image and pass the label through.

    NOTE(review): presumably used to move [0, 1] inputs to [-1, 1] for
    MobileNetV2 -- confirm against the model's expected input range.
    """
    rescaled = 2 * image - 1
    return rescaled, label


keras_ds = ds.map(change_range)

In [None]:
# Pull a single (images, labels) batch from keras_ds; the next cells feed
# image_batch through the network as a sanity check.
image_batch, label_batch = next(iter(keras_ds))

In [None]:
# Run the batch through the MobileNetV2 base alone and print the shape of
# the resulting feature maps.
feature_map_batch = mobile_net(image_batch)
print(feature_map_batch.shape)

In [None]:
# Classifier: frozen MobileNetV2 base -> global average pooling -> one
# linear output unit per class (no activation, i.e. raw logits).
model = tf.keras.Sequential()
model.add(mobile_net)
model.add(tf.keras.layers.GlobalAveragePooling2D())
model.add(tf.keras.layers.Dense(len(label_names)))

In [None]:
# Forward one batch through the full model. The final Dense layer has no
# activation, so these values are raw logits.
logit_batch = model(image_batch).numpy()

print("min logit:", logit_batch.min())
print("max logit:", logit_batch.max())
print()

# One row per image in the batch, one column per output unit.
print("Shape:", logit_batch.shape)

In [None]:
# The model's last Dense layer has no activation, so it outputs raw logits.
# The string loss 'sparse_categorical_crossentropy' assumes probabilities;
# from_logits=True makes the loss apply the softmax itself.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Number of batches needed to see every image once, rounded up. Kept as a
# plain Python int because it is used as a step count by model.fit and
# Dataset.take further down.
steps_per_epoch = int(np.ceil(len(all_image_paths) / batch_size))
steps_per_epoch

In [None]:
# Train on keras_ds, not ds: keras_ds applies change_range (2 * image - 1)
# on top of ds and is the pipeline the model was sanity-checked with above.
model.fit(keras_ds, epochs=5, steps_per_epoch=steps_per_epoch)

## Exercise 1

[The Performance section](https://www.tensorflow.org/alpha/tutorials/load_data/images#performance) of the documentation shows a few ways to speed up fetching images.

Try to implement them and measure the speedup.

In [None]:
import time
# Default benchmark length: two epochs' worth of batches plus one. Forced to
# int so it is a valid count for Dataset.take even if steps_per_epoch is a
# float.
default_timeit_steps = int(2 * steps_per_epoch + 1)


def timeit(ds, steps=default_timeit_steps):
    """Iterate over ``steps`` batches of ``ds`` and print throughput figures.

    One extra batch is fetched before the timer starts, so the reported
    batch time excludes pipeline warm-up; "Total time" includes it.

    Args:
        ds: Batched dataset to benchmark. The images/s figure assumes it
            was batched with the notebook-level ``batch_size``.
        steps: Number of batches to time. Coerced to ``int`` so that float
            step counts are accepted.

    Returns:
        None. Results are printed.
    """
    steps = int(steps)
    overall_start = time.time()
    # Fetch a single batch to prime the pipeline (fill the shuffle buffer),
    # before starting the timer
    it = iter(ds.take(steps + 1))
    next(it)

    start = time.time()
    for i, _ in enumerate(it):
        # Progress marker every tenth batch.
        if i % 10 == 0:
            print('.', end='')
    # Stop the clock before any further printing.
    end = time.time()
    print()

    duration = end - start
    print("{} batches: {} s".format(steps, duration))
    print("{:0.5f} Images/s".format(batch_size * steps / duration))
    print("Total time: {}s".format(end - overall_start))

In [None]:
# Same pipeline as before, but with shuffle and repeat fused into a single
# transformation.
ds = (
    image_label_ds
    .apply(tf.data.experimental.shuffle_and_repeat(buffer_size=image_count))
    .batch(batch_size)
    .prefetch(buffer_size=AUTOTUNE)
)
ds

In [None]:
# Benchmark the fused shuffle-and-repeat pipeline (no caching).
timeit(ds)

In [None]:
# Cache the decoded images ahead of shuffling, so images are decoded only
# during the first pass over the data.
ds = (
    image_label_ds
    .cache()
    .apply(tf.data.experimental.shuffle_and_repeat(buffer_size=image_count))
    .batch(batch_size)
    .prefetch(buffer_size=AUTOTUNE)
)
ds

In [None]:
# Benchmark the cached pipeline.
timeit(ds)

In [None]:
import os

In [None]:
# Directory that holds the TFRecord file written and read by the cells below.
tfrecord_path = '../data/sports/tfrecord/'

In [None]:
# Create the output directory; exist_ok=True lets this cell be re-run
# without raising when the directory already exists.
os.makedirs(tfrecord_path, exist_ok=True)

In [None]:
# Serialize the raw (still encoded) bytes of every image, in all_image_paths
# order, into a single TFRecord file.
image_ds = tf.data.Dataset.from_tensor_slices(all_image_paths)
image_ds = image_ds.map(tf.io.read_file)
tfrec = tf.data.experimental.TFRecordWriter(tfrecord_path + 'images.tfrec')
tfrec.write(image_ds)

In [None]:
# List the directory contents to confirm the file was written.
os.listdir(tfrecord_path)

In [None]:
# Read the raw image records back and decode/resize each with preprocess_image.
image_ds = tf.data.TFRecordDataset(tfrecord_path + 'images.tfrec')
image_ds = image_ds.map(preprocess_image)

In [None]:
# Labels only, in all_image_paths order, which is also the order in which
# the images were written to the TFRecord file. Zipping with image_label_ds
# instead would yield (image, (image, label)) elements and decode every
# image from disk a second time, defeating the benchmark.
label_ds = tf.data.Dataset.from_tensor_slices(
    tf.cast(all_image_labels, tf.int64))

ds = tf.data.Dataset.zip((image_ds, label_ds))
ds = ds.apply(
    tf.data.experimental.shuffle_and_repeat(buffer_size=image_count))
ds = ds.batch(batch_size).prefetch(AUTOTUNE)
ds

In [None]:
# Benchmark the TFRecord-backed pipeline.
timeit(ds)

## Exercise 2

The [CsvDataset](https://www.tensorflow.org/guide/datasets#consuming_csv_data) class (`tf.data.experimental.CsvDataset`) provides a way to extract records from one or more CSV files.

Use it to feed a dataset of your choice to a model.