In [None]:
import os
import pathlib
import PIL
import PIL.Image
import matplotlib.pyplot as plt
import numpy as np

# TensorFlow C++ log threshold; "1" is meant to hide INFO-level messages.
# This cell runs before the one that imports TensorFlow, so the setting is in place at import time.
TF_CPP_MIN_LOG_LEVEL = "1"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = TF_CPP_MIN_LOG_LEVEL

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds

# Report the runtime: TensorFlow version, eager-execution state and GPU visibility
gpu_devices = tf.config.list_physical_devices("GPU")
gpu_status = "available" if gpu_devices else "not available"

print("TF Version: ", tf.__version__)
print("TF Eager mode: ", tf.executing_eagerly())
print("TF GPU is", gpu_status)

## Create dataset

In [None]:
# Download the flower photos archive and extract it
url = "https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz"
arc = tf.keras.utils.get_file(
    origin=url,
    extract=True,
)
# The images are expected in the "flower_photos" folder below the path returned by get_file
data_dir = pathlib.Path(arc).joinpath("flower_photos")

### Create dataset by utils

Create a dataset by loading image files from disk with the `tf.keras.utils.image_dataset_from_directory` function. The source directory must contain one subdirectory per class, each holding the images of that class.

In [63]:
# Build the training and validation datasets in one call:
# `subset="both"` returns the pair, split 80/20 with a fixed seed.
train_ds, val_ds = tf.keras.utils.image_dataset_from_directory(
    directory=data_dir,
    batch_size=32,
    image_size=(180, 180),
    seed=1,
    validation_split=0.2,
    subset="both",
)

Found 3670 files belonging to 5 classes.
Using 2936 files for training.
Using 734 files for validation.


In [64]:
# List available classes (read from the dataset object created by `image_dataset_from_directory`)
# NOTE(review): a later cell rebinds `train_ds` to a cached/prefetched dataset, which presumably
# no longer exposes `class_names` - keep this cell before that one; confirm.
class_names = train_ds.class_names
print(f"Class names: {class_names}")

Class names: ['daisy', 'dandelion', 'roses', 'sunflowers', 'tulips']


In [None]:
# Inspect the tensor shapes of a single (images, labels) batch
for image_batch, labels_batch in train_ds.take(1):
    print(image_batch.shape)
    print(labels_batch.shape)

In [None]:
# Show up to nine samples from the first training batch in a 3x3 grid.
# The axes are created explicitly instead of relying on pyplot's implicit "current axes".
fig, axes = plt.subplots(3, 3, figsize=(10, 10))
for ax in axes.flat:
    # Hide ticks/frames everywhere, so panels left empty by a short batch stay blank
    ax.axis("off")

for images, labels in train_ds.take(1):
    # zip stops at the shorter input: a batch with fewer than 9 images no longer raises
    for ax, image, label in zip(axes.flat, images, labels):
        # Pixel values are floats in [0, 255]; imshow needs uint8 to render them as-is
        ax.imshow(image.numpy().astype("uint8"))
        # `label` is a scalar integer tensor; convert it to index into `class_names`
        ax.set_title(class_names[int(label)])

In [None]:
# Cache the batches after they have been read once and
# prefetch upcoming batches while the current one is being consumed.
train_ds = train_ds.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=tf.data.AUTOTUNE)

### Create dataset by TFDS

Create a dataset by loading it from the TensorFlow Datasets (TFDS) catalog. The `split` argument slices the catalog's single `train` split into training (first 80%), validation (next 10%) and test (last 10%) subsets.

In [None]:
# Load `tf_flowers` as (image, label) pairs together with its metadata.
# The catalog only ships a `train` split, so it is sliced 80% / 10% / 10%.
tfds_splits, metadata = tfds.load(
    name="tf_flowers",
    split=["train[:80%]", "train[80%:90%]", "train[90%:]"],
    as_supervised=True,
    with_info=True,
)
tfds_train_ds, tfds_val_ds, tfds_test_ds = tfds_splits

In [None]:
# Number of label classes, read from the dataset metadata returned by `tfds.load`
num_classes = metadata.features['label'].num_classes
print(num_classes)

In [None]:
def _resize_example(image, label):
    """Resize one image to 180x180 and pass its label through unchanged.

    The raw `tf_flowers` images differ in height and width, so they cannot be
    stacked by `Dataset.batch` until they share a common shape. 180x180 matches
    the image size used by the other pipelines in this notebook.
    Note that `tf.image.resize` returns float32 pixels.
    """
    return tf.image.resize(image, [180, 180]), label


def _prepare_tfds_split(dataset, shuffle=False):
    """Resize, cache, optionally shuffle, then batch and prefetch a TFDS split.

    Args:
        dataset: un-batched dataset of (image, label) pairs.
        shuffle: shuffle the examples after caching (used for training only).

    Returns:
        A dataset yielding batches of 32 resized images with their labels.
    """
    dataset = dataset.map(_resize_example, num_parallel_calls=tf.data.AUTOTUNE)
    # Cache after resizing so that the resize cost is paid only once
    dataset = dataset.cache()
    if shuffle:
        dataset = dataset.shuffle(buffer_size=1000)
    return dataset.batch(32).prefetch(buffer_size=tf.data.AUTOTUNE)


tfds_train_ds = _prepare_tfds_split(tfds_train_ds, shuffle=True)
tfds_val_ds = _prepare_tfds_split(tfds_val_ds)
tfds_test_ds = _prepare_tfds_split(tfds_test_ds)

### Create dataset manually

Create a dataset by building the input pipeline step by step with `tf.data`, which gives full control over every stage.

In [None]:
# List the image files one level below the dataset root. The pattern is limited to
# `*.jpg` so that it selects exactly the files counted for the train/validation split.
list_ds = tf.data.Dataset.list_files(str(data_dir/'*/*.jpg'), shuffle=False)
# Shuffle once, with a fixed seed, so that the split is reproducible between runs.
# `reshuffle_each_iteration=False` keeps the order stable across iterations, which
# the later take()/skip() split relies on to keep both subsets disjoint.
list_ds = list_ds.shuffle(10_000, seed=1, reshuffle_each_iteration=False)

In [None]:
# Count the JPEG files that sit one level below the dataset root
jpg_paths = list(data_dir.glob('*/*.jpg'))
image_count = len(jpg_paths)
print(image_count)

In [None]:
# Hold out the first 20% of the shuffled file list for validation; train on the rest
val_size = int(image_count * 0.2)
manual_val_ds = list_ds.take(val_size)
manual_train_ds = list_ds.skip(val_size)

In [None]:
# Class names are the names of the sub-directories of the dataset root.
# Keeping directories only skips LICENSE.txt as well as any other stray file
# (e.g. .DS_Store) that would otherwise be treated as a class.
# A NumPy array is used because `get_label` compares a path component against it element-wise.
class_names = np.array(sorted(
    item.name
    for item in data_dir.glob('*')
    if item.is_dir()
))
print(f"Class names: {class_names}")

In [None]:
def get_label(file_path):
    """Return the integer class index for an image path.

    The class is taken from the name of the directory that contains the file,
    and the index is its position in the module-level `class_names` array.
    """
    path_parts = tf.strings.split(file_path, os.path.sep)
    # The parent directory is the second-to-last path component
    class_dir = path_parts[-2]
    # Element-wise comparison gives a boolean vector that is True at the matching class
    matches = class_dir == class_names
    return tf.math.argmax(matches)

In [None]:
def decode_img(img):
    """Decode JPEG-encoded bytes into a 3-channel image and resize it to 180x180."""
    rgb = tf.io.decode_jpeg(img, channels=3)
    resized = tf.image.resize(rgb, size=[180, 180])
    return resized

In [None]:
def process_path(file_path):
    """Turn an image file path into an `(image, label)` pair.

    The file content is read as raw bytes and decoded by `decode_img`;
    the label is derived from the path by `get_label`.
    """
    raw_bytes = tf.io.read_file(file_path)
    return decode_img(raw_bytes), get_label(file_path)

In [None]:
# Turn every file path into an (image, label) pair; AUTOTUNE lets tf.data
# choose how many elements are processed in parallel.
manual_train_ds, manual_val_ds = (
    split.map(process_path, num_parallel_calls=tf.data.AUTOTUNE)
    for split in (manual_train_ds, manual_val_ds)
)

In [None]:
# Training split: cache the decoded images, shuffle them, then batch and prefetch
manual_train_ds = manual_train_ds.cache().shuffle(buffer_size=1000)
manual_train_ds = manual_train_ds.batch(32).prefetch(buffer_size=tf.data.AUTOTUNE)

# Validation split: same steps without shuffling
manual_val_ds = manual_val_ds.cache().batch(32).prefetch(buffer_size=tf.data.AUTOTUNE)

## Standardize

### Rescaling

In [None]:
# Layer that multiplies its input by 1/255
normalization_layer = tf.keras.layers.Rescaling(scale=1.0 / 255)

# Option 1 - Apply re-scaling to the dataset by calling Dataset.map
normalized_train_ds = train_ds.map(
    lambda images, labels: (normalization_layer(images), labels)
)

# Option 2 - Add a `Rescaling` layer inside the model
# (the images are decoded as RGB, hence 3 input channels)
# model = tf.keras.models.Sequential([
#    tf.keras.Input(shape=(180, 180, 3)),
#    tf.keras.layers.Rescaling(1.0 / 255.0),
# ])

### Resizing

In [None]:
# Layer that resizes its input images to 96x96
resizing_layer = tf.keras.layers.Resizing(height=96, width=96)

# Option 1 - Apply re-sizing to the dataset by calling Dataset.map.
# The result gets its own name so that it does not overwrite `normalized_train_ds`
# (the re-scaled dataset) with data that is re-sized but not normalized.
resized_train_ds = train_ds.map(lambda x, y: (resizing_layer(x), y))

# Option 2 - Add a `Resizing` layer inside the model
# (the images are decoded as RGB, hence 3 input channels)
# model = tf.keras.models.Sequential([
#    tf.keras.Input(shape=(180, 180, 3)),
#    tf.keras.layers.Resizing(height=96, width=96),
# ])