## Training with a Larger Dataset - Cats and Dogs

In the previous lab you trained a classifier with a horses-v-humans dataset. You saw that despite getting great training results, when you tried to classify real images there were many errors, due primarily to overfitting -- where the network does very well with data that it has previously seen, but poorly with data it hasn't!

In this lab you'll look at a real, and very large, dataset and see how much training on more data helps to avoid overfitting.

In [None]:
import os
import zipfile
import random
import tensorflow as tf
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing import image
import matplotlib.image  as mpimg
import matplotlib.pyplot as plt
import numpy as np
from shutil import copyfile

In [None]:
# Extract the downloaded archive next to it. The `with` block closes the
# archive even if extraction fails part-way through.
local_zip = './content/tmp/kagglecatsanddogs_5340.zip'
with zipfile.ZipFile(local_zip, 'r') as zip_ref:
    zip_ref.extractall('./content/tmp')


In [None]:
# Report how many entries each extracted class folder contains.
for pet_dir in ('./content/tmp/PetImages/Cat/', './content/tmp/PetImages/Dog/'):
    print(len(os.listdir(pet_dir)))

# Expected Output:
# 12501
# 12501

In [None]:
# Create the training/testing directory tree for both classes.
# os.makedirs(..., exist_ok=True) creates any missing parent directories and
# tolerates directories that already exist, so re-running this cell still
# creates whichever leaf directories are missing. Other errors (for example
# permission problems) are no longer silently ignored.
for subset in ('training', 'testing'):
    for label in ('cats', 'dogs'):
        os.makedirs(os.path.join('./content/tmp/cats-v-dogs', subset, label),
                    exist_ok=True)

In [None]:
def split_data(SOURCE, TRAINING, TESTING, SPLIT_SIZE):
    """Randomly split the non-empty files of SOURCE into TRAINING and TESTING.

    Zero-length files are reported on stdout and skipped. The remaining files
    are shuffled; the first ``int(count * SPLIT_SIZE)`` are copied to TRAINING
    and the rest are copied to TESTING.

    Args:
        SOURCE: Directory holding the files to split.
        TRAINING: Existing directory that receives the training share.
        TESTING: Existing directory that receives the testing share.
        SPLIT_SIZE: Fraction of the usable files that goes to TRAINING.
    """
    files = []
    for filename in os.listdir(SOURCE):
        if os.path.getsize(os.path.join(SOURCE, filename)) > 0:
            files.append(filename)
        else:
            print(filename + " is zero length, so ignoring.")

    training_length = int(len(files) * SPLIT_SIZE)
    shuffled_set = random.sample(files, len(files))

    # Slice forward from training_length instead of using a negative index:
    # with an empty testing share, shuffled_set[-0:] would be the whole list
    # and every file would be copied into TESTING as well.
    training_set = shuffled_set[:training_length]
    testing_set = shuffled_set[training_length:]

    # os.path.join works whether or not the directory arguments end with a
    # path separator.
    for filename in training_set:
        copyfile(os.path.join(SOURCE, filename),
                 os.path.join(TRAINING, filename))

    for filename in testing_set:
        copyfile(os.path.join(SOURCE, filename),
                 os.path.join(TESTING, filename))


# Folders holding the extracted source images, one per class.
CAT_SOURCE_DIR = "./content/tmp/PetImages/Cat/"
DOG_SOURCE_DIR = "./content/tmp/PetImages/Dog/"

# Destination folders for the training and testing shares of each class.
TRAINING_CATS_DIR = "./content/tmp/cats-v-dogs/training/cats/"
TESTING_CATS_DIR = "./content/tmp/cats-v-dogs/testing/cats/"
TRAINING_DOGS_DIR = "./content/tmp/cats-v-dogs/training/dogs/"
TESTING_DOGS_DIR = "./content/tmp/cats-v-dogs/testing/dogs/"

# Fraction of each class that is copied into the training folder.
split_size = .9

# Split cats first, then dogs, using the same fraction for both.
for source_dir, training_dir, testing_dir in (
    (CAT_SOURCE_DIR, TRAINING_CATS_DIR, TESTING_CATS_DIR),
    (DOG_SOURCE_DIR, TRAINING_DOGS_DIR, TESTING_DOGS_DIR),
):
    split_data(source_dir, training_dir, testing_dir, split_size)

# Expected output
# 666.jpg is zero length, so ignoring
# 11702.jpg is zero length, so ignoring

In [None]:
# Report how many files landed in each training/testing class folder.
for split_dir in (
    './content/tmp/cats-v-dogs/training/cats/',
    './content/tmp/cats-v-dogs/training/dogs/',
    './content/tmp/cats-v-dogs/testing/cats/',
    './content/tmp/cats-v-dogs/testing/dogs/',
):
    print(len(os.listdir(split_dir)))

# Expected output:
# 11250
# 11250
# 1250
# 1250

In [None]:
# Build the network layer by layer: three convolution + max-pooling stages,
# then a flattened dense head ending in a single sigmoid unit.
model = tf.keras.models.Sequential()

model.add(tf.keras.layers.Conv2D(16, (3, 3), activation='relu',
                                 input_shape=(150, 150, 3)))
model.add(tf.keras.layers.MaxPooling2D(2, 2))

model.add(tf.keras.layers.Conv2D(32, (3, 3), activation='relu'))
model.add(tf.keras.layers.MaxPooling2D(2, 2))

model.add(tf.keras.layers.Conv2D(64, (3, 3), activation='relu'))
model.add(tf.keras.layers.MaxPooling2D(2, 2))

model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(512, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# The metric is registered under the name 'acc'; the plotting cell reads the
# training history using the 'acc' / 'val_acc' keys.
model.compile(
    loss='binary_crossentropy',
    optimizer=RMSprop(learning_rate=0.001),
    metrics=['acc'],
)


In [None]:

TRAINING_DIR = "./content/tmp/cats-v-dogs/training/"
VALIDATION_DIR = "./content/tmp/cats-v-dogs/testing/"

# Both generators only rescale pixel values by 1/255.
train_datagen = ImageDataGenerator(rescale=1.0/255.)
validation_datagen = ImageDataGenerator(rescale=1.0/255.)

# Read images from the class sub-folders, resized to 150x150, in batches of
# 250 with binary labels.
train_generator = train_datagen.flow_from_directory(
    TRAINING_DIR,
    target_size=(150, 150),
    batch_size=250,
    class_mode='binary',
)
validation_generator = validation_datagen.flow_from_directory(
    VALIDATION_DIR,
    target_size=(150, 150),
    batch_size=250,
    class_mode='binary',
)

# Expected Output:
# Found 22498 images belonging to 2 classes.
# Found 2500 images belonging to 2 classes.

In [None]:
# Note that this may take some time.
history = model.fit(
    x=train_generator,
    validation_data=validation_generator,
    epochs=15,
    steps_per_epoch=90,    # training batches drawn per epoch
    validation_steps=6,    # validation batches evaluated per epoch
)

In [None]:
%matplotlib inline



#-----------------------------------------------------------
# Retrieve the per-epoch metrics recorded by model.fit for
# the training and validation data
#-----------------------------------------------------------
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(acc))  # One x-axis point per recorded epoch

#------------------------------------------------
# Plot training and validation accuracy per epoch
#------------------------------------------------
# The series names must be passed as `label=`; a bare string after the format
# string is interpreted by plt.plot as another data series.
plt.figure()
plt.plot(epochs, acc, 'r', label="Training Accuracy")
plt.plot(epochs, val_acc, 'b', label="Validation Accuracy")
plt.title('Training and validation accuracy')
plt.legend()

#------------------------------------------------
# Plot training and validation loss per epoch
#------------------------------------------------
plt.figure()
plt.plot(epochs, loss, 'r', label="Training Loss")
plt.plot(epochs, val_loss, 'b', label="Validation Loss")
plt.title('Training and validation loss')
plt.legend()

plt.show()


# Desired output. Charts with training and validation metrics. No crash :)

## Previsão Colab

In [None]:
# Here's a codeblock just for fun. You should be able to upload an image here
# and have it classified without crashing


# from google.colab import files

# uploaded = files.upload()

# for fn in uploaded.keys():

#   # predicting images
#   path = './content/' + fn
#   img = image.load_img(path, target_size=(150, 150))
#   x = image.img_to_array(img)
#   x = np.expand_dims(x, axis=0)

#   images = np.vstack([x])
#   classes = model.predict(images, batch_size=10)
#   print(classes[0])
#   if classes[0]>0.9:
#     print(fn + " is a dog")
#   else:
#     print(fn + " is a cat")

## Previsão Local

In [40]:

# Directory containing the images to classify
PREDICTION_DIR = "./content/tmp/predict/"

# Prediction function
def predict_image_class(image_path, model, threshold=0.5):
    """Classify one image file as "dog" or "cat" using ``model``.

    The image is loaded at 150x150, given a leading batch axis and scaled by
    1/255 before being passed to ``model.predict``.

    Args:
        image_path: Path of the image file to classify.
        model: Object exposing ``predict``; presumably the single-output
            sigmoid network trained in this notebook.
        threshold: Score above which the image is labelled "dog".

    Returns:
        A ``(label, probability)`` pair. ``probability`` is the first row of
        the prediction for "dog", or one minus that row for "cat".
    """
    loaded = image.load_img(image_path, target_size=(150, 150))
    batch = np.expand_dims(image.img_to_array(loaded), axis=0) / 255.0  # Normalization

    dog_score = model.predict(batch)[0]
    if dog_score > threshold:
        return "dog", dog_score
    return "cat", 1 - dog_score

# List every .jpg image file in the prediction directory
jpg_names = (name for name in os.listdir(PREDICTION_DIR) if name.endswith(".jpg"))
image_files = [os.path.join(PREDICTION_DIR, name) for name in jpg_names]

# Classify each image and print its label with the associated probability
for image_path in image_files:
    predicted_class, probability = predict_image_class(image_path, model)
    print(f"{image_path} é um {predicted_class} com probabilidade {probability[0]:.2f}")


./content/tmp/predict/dog.4014.jpg é um dog com probabilidade 0.94
./content/tmp/predict/cat.4016.jpg é um cat com probabilidade 0.87
./content/tmp/predict/dog.4015.jpg é um dog com probabilidade 0.99
./content/tmp/predict/dog.4018.jpg é um dog com probabilidade 0.98
./content/tmp/predict/cat.4011.jpg é um cat com probabilidade 0.86
./content/tmp/predict/dog.4020.jpg é um dog com probabilidade 0.92
./content/tmp/predict/cat.4002.jpg é um cat com probabilidade 0.78
./content/tmp/predict/cat.4006.jpg é um cat com probabilidade 0.85
./content/tmp/predict/cat.4009.jpg é um dog com probabilidade 0.88
./content/tmp/predict/cat.4008.jpg é um cat com probabilidade 0.93
./content/tmp/predict/cat.4015.jpg é um cat com probabilidade 1.00
./content/tmp/predict/dog.4026.jpg é um dog com probabilidade 1.00
./content/tmp/predict/cat.4005.jpg é um cat com probabilidade 0.59
./content/tmp/predict/cat.4001.jpg é um cat com probabilidade 0.94
./content/tmp/predict/cat.4014.jpg é um cat com probabilidade 