* Data pipeline for the image ETL

In [170]:
from __future__ import absolute_import, division, print_function
import numpy as np
import tensorflow as tf
# Enable eager execution; this is the first TensorFlow call in the notebook.
tf.enable_eager_execution()
# Bare expression that is not the last line of the cell, so the notebook does not
# display its value.
tf.VERSION
# Value passed as num_parallel_calls / buffer_size by the (currently commented-out)
# map and prefetch calls further down.
AUTOTUNE = tf.contrib.data.AUTOTUNE

In [190]:
import pathlib
# Root folder of the training images.
data_root = pathlib.Path('./data/train/')
# Print the root followed by each of its immediate children, one entry per line.
print(data_root, *data_root.iterdir(), sep='\n')

data/train
data/train/sweetlips
data/train/barramundi
data/train/sugeonfishes
data/train/wrasse
data/train/meperors
data/train/snapper
data/train/bream
data/train/parrotfishes
data/train/cod
data/train/perches
data/train/trout


In [191]:
import random
# Every entry exactly one level below a child of the data root, as plain strings.
all_image_paths = [str(image_file) for image_file in data_root.glob('*/*')]
# Shuffle in place; no seed is set, so the order differs from run to run.
random.shuffle(all_image_paths)

image_count = len(all_image_paths)
image_count
all_image_paths[:10]

['data/train/barramundi/barramundi1.jpeg',
 'data/train/snapper/mutton_snapper.jpg',
 'data/train/sweetlips/Diagonal-banded Sweetlips.jpg',
 'data/train/sugeonfishes/sugeonfish1.jpeg',
 'data/train/trout/brook trout.jpg',
 'data/train/snapper/red-snapper.jpg',
 'data/train/wrasse/wrasse1.jpg',
 'data/train/barramundi/barramundi3.jpg',
 'data/train/cod/murray-cod-description.jpg',
 'data/train/barramundi/barramundi2.jpeg']

In [192]:
# Class names are the names of the sub-directories of the data root, sorted ascending.
label_names = [entry.name for entry in data_root.glob('*/') if entry.is_dir()]
label_names.sort()
label_names

['barramundi',
 'bream',
 'cod',
 'meperors',
 'parrotfishes',
 'perches',
 'snapper',
 'sugeonfishes',
 'sweetlips',
 'trout',
 'wrasse']

In [193]:
# Map each class name to its position in the sorted label list.
label_to_index = {name: position for position, name in enumerate(label_names)}
label_to_index

{'barramundi': 0,
 'bream': 1,
 'cod': 2,
 'meperors': 3,
 'parrotfishes': 4,
 'perches': 5,
 'snapper': 6,
 'sugeonfishes': 7,
 'sweetlips': 8,
 'trout': 9,
 'wrasse': 10}

In [194]:
# The label of an image is the index assigned to its parent directory's name.
all_image_labels = [label_to_index[pathlib.Path(path).parent.name]
                    for path in all_image_paths]

# Show exactly as many labels as the message announces (the slice used to be [:6]).
print("First 10 labels indices: ", all_image_labels[:10])

First 10 labels indices:  [0, 6, 8, 7, 9, 6, 10, 0, 2, 0]


In [195]:
# import IPython.display as display
# for n in range(2):
#   image_path = random.choice(all_image_paths)
#   display.display(display.Image(image_path))
# #   print(caption_image(image_path))
#   print()

The preprocessing includes a resize step. Currently we use the squash method: ignore the aspect ratio and resize the image to a square. Please check [this article](https://forums.fast.ai/t/impact-of-image-resizing-on-model-training-time-and-performance/1980) for a comparison among the different resizing methods.

In [196]:
def preprocess_image(image):
  """Turn the encoded contents of a JPEG file into a normalized image tensor.

  Args:
    image: Encoded JPEG contents, e.g. the result of `tf.read_file`.

  Returns:
    The image decoded with 3 channels, resized to 192x192 (the aspect ratio is
    not preserved), and divided by 255.0.
  """
  decoded = tf.image.decode_jpeg(image, channels=3)
  squashed = tf.image.resize_images(decoded, [192, 192])
  return squashed / 255.0  # normalize to [0,1] range

In [197]:
def load_and_preprocess_image(path):
  """Read the file at `path` and return it as a preprocessed image tensor.

  Args:
    path: Location of the image file to read.

  Returns:
    Whatever `preprocess_image` returns for the file's contents.
  """
  return preprocess_image(tf.read_file(path))

Take a look at what happens after the image preprocessing.

In [198]:
# import matplotlib.pyplot as plt

# image_path = all_image_paths[1]
# label = all_image_labels[1]

# load_and_preprocess_image(image_path)

# plt.imshow(load_and_preprocess_image(image_path))
# plt.grid(False)
# plt.title(label_names[label].title())
# plt.show()

Start preparing the dataset

In [199]:
# Dataset with one string element per image path. It is not referenced by any other
# active cell in the visible notebook.
path_ds = tf.data.Dataset.from_tensor_slices(all_image_paths)

In [200]:
# image_ds = path_ds.map(load_and_preprocess_image, num_parallel_calls=AUTOTUNE)
# image_ds
# label_ds = tf.data.Dataset.from_tensor_slices(tf.cast(all_image_labels, tf.int64))
# label_ds

In [201]:
# for label in label_ds.take(10):
#   print(label_names[label.numpy()])

In [202]:
# image_label_ds = tf.data.Dataset.zip((image_ds, label_ds))

In [288]:
# Eagerly load and preprocess every image up front, in the same order as the paths.
train_images = [load_and_preprocess_image(image_path) for image_path in all_image_paths]
# One int64 scalar tensor per image, aligned with train_images.
train_labels = [label_item for label_item in tf.cast(all_image_labels, tf.int64)]
# Slice the two lists into (image, label) examples, then batch by 32 and repeat endlessly.
ds = tf.data.Dataset.from_tensor_slices((train_images, train_labels))
ds = ds.batch(32).repeat()
    

Now make the model ready

In [289]:
# BATCH_SIZE = 32

# # Setting a shuffle buffer size as large as the dataset ensures that the data is
# # completely shuffled.
# ds = image_label_ds.shuffle(buffer_size=image_count)
# ds = ds.repeat()
# ds = ds.batch(BATCH_SIZE)
# # `prefetch` lets the dataset fetch batches, in the background while the model is training.
# ds = ds.prefetch(buffer_size=AUTOTUNE)
# ds

In [298]:
from tensorflow import keras
# Baseline classifier: flatten the 192x192 RGB image and apply two dense layers.
model = keras.Sequential([
    keras.layers.Flatten(input_shape=(192, 192, 3)),
    keras.layers.Dense(128, activation=tf.nn.relu),
    # One output unit per class, derived from the label list instead of a hard-coded 11
    # so the layer stays in sync with the class folders found under the data root.
    keras.layers.Dense(len(label_names), activation=tf.nn.softmax)
])


In [299]:
# Adam with its default arguments. The sparse loss variant is used because the labels
# are integer class indices (see label_to_index), not one-hot vectors.
model.compile(optimizer=tf.train.AdamOptimizer(), 
              loss=tf.keras.losses.sparse_categorical_crossentropy,
              metrics=["accuracy"])

In [300]:
# One epoch of one step: the model is fitted on a single batch of `ds`, then training stops.
model.fit(ds, epochs=1, steps_per_epoch=1)

Epoch 1/1


<tensorflow.python.keras.callbacks.History at 0x11fb4f4d0>

In [293]:
# NOTE(review): `ds` is the same dataset the model was fitted on, so despite the
# variable names and the printed label this is not a held-out test score.
# steps=1 evaluates a single batch.
test_loss, test_acc = model.evaluate(ds, steps=1)
print('Test accuracy:', test_acc)

Test accuracy: 0.0


In [295]:
# test_dataset = tf.reshape(train_images[0], [1, 192, 576])
# test_dataset

# predictions = model.predict(ds, steps=1)
# # predictions
# for item in predictions:
#     print(np.argmax(item))


0
3
0
0
3
0
0
3
3
0
3
3
3
0
0
0
0


In [216]:
# print(train_labels[1])

tf.Tensor(6, shape=(), dtype=int64)
