# Image classification: dogs & cats

In [None]:
# Handle imports up-front
import glob
import tensorflow as tf
import matplotlib.pyplot as plt

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from keras.preprocessing import image


## 1. Data preparation

### 1.1. Load the data paths

In [None]:
# Set the path to the training data
training_data_path='../data/processed/train'

# Get a list of training dog and cat images. glob returns matches in
# arbitrary order, so sort them to make every later cell that indexes or
# zips these lists pick the same files on every run and machine.
training_dogs=sorted(glob.glob(f'{training_data_path}/dog/dog.*'))
training_cats=sorted(glob.glob(f'{training_data_path}/cat/cat.*'))

### 1.2. Inspect

In [None]:
# Preview grid: one cat/dog pair per row, cat on the left, dog on the right
fig, axs = plt.subplots(3,2,figsize=(6, 4))

for cat_path, dog_path, axes_row in zip(training_cats, training_dogs, axs):
    for image_path, ax in zip((cat_path, dog_path), axes_row):

        # Load the file, convert it to an array and rescale by 1/255 for imshow
        pixels=image.img_to_array(image.load_img(image_path)) / 255.0

        ax.imshow(pixels)
        ax.axis('off')

plt.tight_layout()
plt.show()

## 2. EDA

Let's take a look at a few of our images to get a feel for how image data is structured.

### 2.1. Image data

In [None]:
# Load one of the images as an array and look at its shape - what do you see, what are the dimensions? Are they what you expect?

In [None]:
# Plot a histogram of the three 2D arrays which comprise the image. What do you see?

### 2.2. Image dimensions

Let's take a look at a random sample of images from the dataset and see what their dimensions are.

In [None]:
# Loop over a few hundred images and extract their width and height, then plot both as histograms. What do you see? Does this information matter to us? If so, how?

### 2.3. Image aspect ratios

In [None]:
# Plot a histogram of the image aspect ratios (i.e. width/height). What do you see? Does this information matter to us? If so, how?

## 3. Build the model

### 3.1. Prepare images for streaming

In [None]:
def make_datasets(training_data_path: str, image_dim: int, batch_size: int=16):
    '''Builds repeating training and validation datasets from an image directory.

    Both subsets are read from the same directory with a fixed 80/20 split
    (seed 315), resized to image_dim x image_dim and batched.

    Args:
        training_data_path: directory passed to
            tf.keras.utils.image_dataset_from_directory.
        image_dim: height and width every image is resized to.
        batch_size: number of images per batch.

    Returns:
        Tuple of (training_dataset, validation_dataset). Both repeat
        indefinitely, so callers must pass steps_per_epoch and
        validation_steps to model.fit().
    '''

    def load_subset(subset: str):
        '''Loads one side ('training' or 'validation') of the 80/20 split.'''

        return tf.keras.utils.image_dataset_from_directory(
            training_data_path,
            validation_split=0.2,
            subset=subset,
            seed=315,
            image_size=(image_dim, image_dim),
            batch_size=batch_size
        )

    def make_stream(dataset):
        '''Caches, shuffles, repeats and prefetches a finite batched dataset.'''

        # Cache BEFORE repeat: caching an already-infinite dataset never
        # completes and keeps growing. Cache one finite pass, then shuffle
        # the cached batches and repeat.
        return (
            dataset
            .cache()
            .shuffle(256, reshuffle_each_iteration=True)
            .repeat()
            .prefetch(buffer_size=tf.data.AUTOTUNE)
        )

    training_dataset=make_stream(load_subset('training'))

    # Build the validation stream from the validation subset, not from the
    # training data, so validation metrics measure held-out images.
    validation_dataset=make_stream(load_subset('validation'))

    return training_dataset, validation_dataset

training_dataset, validation_dataset=make_datasets(training_data_path, 128)

### 3.1. Model definition

In [None]:
def compile_model(image_dim, learning_rate):
    '''Builds and compiles the convolutional binary classifier.

    Args:
        image_dim: height and width of the square, 3-channel input images.
        learning_rate: learning rate handed to the Adam optimizer.

    Returns:
        A compiled Sequential model with a single sigmoid output unit,
        binary cross-entropy loss and binary accuracy as the metric.
    '''

    initializer=tf.keras.initializers.GlorotUniform(seed=315)

    # Input and rescaling by 1/255 come first
    stack=[
        layers.Input((image_dim, image_dim, 3)),
        layers.Rescaling(1./255)
    ]

    # Three convolution + pooling stages with 16, 32 and 64 filters
    for filters in (16, 32, 64):
        stack.append(
            layers.Conv2D(filters, 3, padding='same', activation='relu', kernel_initializer=initializer)
        )
        stack.append(layers.MaxPooling2D())

    # Dense classification head
    stack.append(layers.Flatten())
    stack.append(layers.Dense(128, activation='relu', kernel_initializer=initializer))
    stack.append(layers.Dense(1, activation='sigmoid', kernel_initializer=initializer))

    model=Sequential(stack)

    model.compile(
        loss='binary_crossentropy',
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        metrics=['binary_accuracy']
    )

    return model

model=compile_model(128, 0.001)
model.summary()

### 3.2. Model training

In [None]:
%%time

training_results=model.fit(
  training_dataset,
  validation_data=validation_dataset,
  epochs=10,
  steps_per_epoch=5,
  validation_steps=5
)

print()

In [None]:
# Take a look at what information 'training_results' contains. Plot the training and validation accuracy (and binary cross-entropy if you like) over the training epochs. Is the model learning? If not, what do you think is wrong?

# Per-epoch metrics recorded by model.fit(), keyed by metric name
history=training_results.history

# Set-up a 1x2 figure for accuracy and binary cross-entropy
fig, axs=plt.subplots(1,2, figsize=(8,4))

# Add the main title
fig.suptitle('CNN training curves', size='large')

# Plot training and validation accuracy as percentages. Plain Python scaling
# is used so the cell does not depend on numpy, which this notebook never imports.
axs[0].set_title('Accuracy')
axs[0].plot([value * 100 for value in history['binary_accuracy']], label='Training')
axs[0].plot([value * 100 for value in history['val_binary_accuracy']], label='Validation')
axs[0].set_xlabel('Epoch')
axs[0].set_ylabel('Accuracy (%)')
axs[0].legend(loc='upper left')

# Plot training and validation binary cross-entropy, labelled like the accuracy panel
axs[1].set_title('Binary cross-entropy')
axs[1].plot(history['loss'], label='Training')
axs[1].plot(history['val_loss'], label='Validation')
axs[1].set_xlabel('Epoch')
axs[1].set_ylabel('Binary cross-entropy')
axs[1].legend(loc='upper right')

# Show the plot
fig.tight_layout()
plt.show()

### 3.3. Model optimization

In [None]:
# Try optimizing the learning rate and the batch size using a few values near the default settings. Hint: use a loop!

In [None]:
# Pick the best values for learning rate and batch size and train the model for longer

## 4. Evaluate the model

In [None]:
# Test the model out on the test data - is it as good as you expected, given the training data? Worse? Better? Why?