In [26]:
import numpy as np 
import pandas as pd

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

/kaggle/input/cats-anddogs-small-pre-vgg16/cat_and_dogs_small_pre_vgg16.h5
/kaggle/input/dogs-vs-cats-redux-kernels-edition/test.zip
/kaggle/input/dogs-vs-cats-redux-kernels-edition/train.zip
/kaggle/input/dogs-vs-cats-redux-kernels-edition/sample_submission.csv
/kaggle/input/catdogs-small-finetune/cat_and_dogs_small_finetune.h5


In [27]:
import matplotlib.pyplot as plt
import time, shutil
from keras import optimizers
from keras import layers, models
from keras.preprocessing.image import ImageDataGenerator
from keras.applications import VGG16
from keras.applications.imagenet_utils import preprocess_input

In [28]:
# Folder that receives the unzipped original training archive.
original_dir_train = '/kaggle/original_data/train'
# exist_ok=True replaces the separate existence check and keeps the cell re-runnable.
os.makedirs(original_dir_train, exist_ok=True)

# TODO: the `test` archive still has to be unzipped

Now I unzip the training files to the folder `/kaggle/original_data/train`  

In [29]:
!unzip /kaggle/input/dogs-vs-cats-redux-kernels-edition/train.zip -d /kaggle/original_data/train

Archive:  /kaggle/input/dogs-vs-cats-redux-kernels-edition/train.zip
replace /kaggle/original_data/train/train/cat.0.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [36]:
# Root folder of the reduced dataset used in the rest of the notebook.
base_dir = '/kaggle/small_data'
# exist_ok=True replaces the check-then-create pattern and keeps the cell re-runnable.
os.makedirs(base_dir, exist_ok=True)

Let's create the folders where we'll save the divided original training data (training, validation and test).

In [37]:
# One sub-folder of base_dir per data split.
train_dir = os.path.join(base_dir, 'train')
validation_dir = os.path.join(base_dir, 'validation')
test_dir = os.path.join(base_dir, 'test')

# makedirs(exist_ok=True) replaces the repeated check-then-create blocks and is
# safe to run again when the folders already exist.
for split_dir in (train_dir, validation_dir, test_dir):
    os.makedirs(split_dir, exist_ok=True)

We create a `cats` and a `dogs` folder inside each of the recently created folders.

In [38]:
# One folder per class ('cats' / 'dogs') inside each split folder.
train_cats_dir = os.path.join(train_dir, 'cats')
train_dogs_dir = os.path.join(train_dir, 'dogs')
validation_cats_dir = os.path.join(validation_dir, 'cats')
validation_dogs_dir = os.path.join(validation_dir, 'dogs')
test_cats_dir = os.path.join(test_dir, 'cats')
test_dogs_dir = os.path.join(test_dir, 'dogs')

# makedirs(exist_ok=True) replaces the six check-then-create blocks and is safe
# to run again when the folders already exist.
for class_dir in (
        train_cats_dir, train_dogs_dir,
        validation_cats_dir, validation_dogs_dir,
        test_cats_dir, test_dogs_dir):
    os.makedirs(class_dir, exist_ok=True)

We copy the pictures of cats from `train.zip` into the three cat folders


In [39]:
original_path_dataset = '/kaggle/original_data/train/train'

# (image index range, destination folder) for each split of the cat pictures:
# 0-999 -> training, 1000-1499 -> validation, 1500-1999 -> test.
cat_splits = (
    (range(1000), train_cats_dir),
    (range(1000, 1500), validation_cats_dir),
    (range(1500, 2000), test_cats_dir),
)
for indices, target_dir in cat_splits:
    for i in indices:
        fname = 'cat.{}.jpg'.format(i)
        shutil.copyfile(os.path.join(original_path_dataset, fname),
                        os.path.join(target_dir, fname))

Now we copy the pictures of dogs into their folders

In [40]:
# (image index range, destination folder) for each split of the dog pictures:
# 0-999 -> training, 1000-1499 -> validation, 1500-1999 -> test.
dog_splits = (
    (range(1000), train_dogs_dir),
    (range(1000, 1500), validation_dogs_dir),
    (range(1500, 2000), test_dogs_dir),
)
for indices, target_dir in dog_splits:
    for i in indices:
        fname = 'dog.{}.jpg'.format(i)
        shutil.copyfile(os.path.join(original_path_dataset, fname),
                        os.path.join(target_dir, fname))

Counting the elements in each folder:

In [41]:
# Report how many files ended up in each split/class folder.
folder_reports = (
    ('total training cat images: {}', train_cats_dir),
    ('total training dog images: {}', train_dogs_dir),
    ('total validation cat images: {}', validation_cats_dir),
    ('total validation dog images: {}', validation_dogs_dir),
    ('total test cat images: {}', test_cats_dir),
    ('total test dog images: {}', test_dogs_dir),
)
for template, folder in folder_reports:
    print(template.format(len(os.listdir(folder))))

total training cat images: 1000
total training dog images: 1000
total validation cat images: 500
total validation dog images: 500
total test cat images: 500
total test dog images: 500


--------------------------
Now we can start building the keras model:

In [None]:
# Baseline convnet: four Conv2D + MaxPooling2D stages on 150x150 RGB inputs,
# followed by a dense classifier with a single sigmoid output.
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(150, 150, 3)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(512, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

In [None]:
# Print the layer-by-layer summary of the model defined above.
model.summary()

We compile it now:

In [None]:
# Configure training: binary cross-entropy loss, RMSprop with learning rate 1e-4,
# and accuracy reported under the key 'acc'.
model.compile(loss='binary_crossentropy', 
              optimizer=optimizers.RMSprop(lr=1e-4),
              metrics=['acc'])

As our images come in many different sizes, we create a data preprocessing step where the data is loaded in batches, converted to RGB, resized to `150x150 px`, and each pixel value is rescaled to the `0-1` range:

In [None]:
# Both generators only rescale pixel values by 1/255; no augmentation is configured here.
train_datagen = ImageDataGenerator(rescale=1./255)
test_datagen = ImageDataGenerator(rescale=1./255)

In [None]:
# Read images from the class sub-folders of train_dir, resized to 150x150,
# in batches of 20 with binary labels.
train_generator = train_datagen.flow_from_directory(
    train_dir, 
    target_size=(150, 150), 
    batch_size=20, 
    class_mode='binary')

# Same settings for the validation split, using the non-training generator.
validation_generator = test_datagen.flow_from_directory(
    validation_dir, 
    target_size=(150, 150), 
    batch_size=20, 
    class_mode='binary')

We have 2000 images for training and 1000 images for validation. We can check the sizes of the images and the batch:

In [None]:
# check the shape of the images and labels batch
import PIL
# Pull a single batch from the generator and report the shapes of its two arrays.
data_batch, labels_batch = next(iter(train_generator))
print('data batch shape: {}'.format(data_batch.shape))
print('labels batch shape: {}'.format(labels_batch.shape))

Now we have everything prepared to train our model with the training data and validate it using the validation data we have created.

In [None]:
# Measure the wall-clock duration of the whole training run.
start = time.perf_counter()

# 30 epochs of 100 training steps and 50 validation steps each; the returned
# History object is kept for the plots below.
history = model.fit_generator(
    train_generator,
    steps_per_epoch=100,
    epochs=30,
    validation_data=validation_generator,
    validation_steps=50)

elapsed = time.perf_counter() - start
print('Elapsed %.3f seconds.' % elapsed)

GPU time: 258.947 seconds

In [None]:
# Persist the trained baseline model to the Kaggle working directory.
model.save('/kaggle/working/cat_and_dogs_small_1.h5')

Let's plot the metrics

In [None]:
import matplotlib.pyplot as plt

# Per-epoch training and validation metrics recorded by the first training run.
metrics = history.history
acc, val_acc = metrics['acc'], metrics['val_acc']
loss, val_loss = metrics['loss'], metrics['val_loss']

epochs = range(1, len(acc) + 1)

# Accuracy curves on the current figure.
acc_ax = plt.gca()
acc_ax.plot(epochs, acc, 'bo', label='Training acc')
acc_ax.plot(epochs, val_acc, 'r*', label='Validation acc')
acc_ax.set_title('Training and validation accuracy')
acc_ax.legend()

# Loss curves on a second, new figure.
loss_ax = plt.figure().gca()
loss_ax.plot(epochs, loss, 'bo', label='Training loss')
loss_ax.plot(epochs, val_loss, 'r*', label='Validation loss')
loss_ax.set_title('Training and validation loss')
loss_ax.legend()
plt.show()

We see the model is overfitting: the validation accuracy stops increasing, whereas the training accuracy keeps increasing.

We'll add regularization to the model, specifically Data Augmentation and Dropout.

In [None]:
# Same convnet as before, with a Dropout(0.5) layer inserted between the
# flattened features and the dense classifier.
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(150, 150, 3)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dropout(0.5),
    layers.Dense(512, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

# Same loss, optimizer and metric as the baseline model.
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizers.RMSprop(lr=1e-4),
    metrics=['acc'],
)

Now we create the data generator with parameters for data augmentation:

In [None]:
# Training generator: rescaling plus random rotations, shifts, shear, zoom and
# horizontal flips as data augmentation.
train_datagen = ImageDataGenerator(
    rescale=1./255, 
    rotation_range=40, 
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True)

# Validation/test data is only rescaled, never augmented.
test_datagen = ImageDataGenerator(rescale=1./255)

In [None]:
# Augmented training batches of 32 images, resized to 150x150, with binary labels.
train_generator = train_datagen.flow_from_directory(
    train_dir, 
    target_size=(150, 150),
    batch_size=32,
    class_mode='binary')

# Non-augmented validation batches with the same size settings.
validation_generator = test_datagen.flow_from_directory(
    validation_dir, 
    target_size=(150, 150), 
    batch_size=32, 
    class_mode='binary')

Here we train the model for more epochs, as the increment in accuracy will be slower due to the regularization 

In [None]:
# Measure the wall-clock duration of the whole training run.
start = time.perf_counter()

# validation_steps is taken from the generator length (number of batches needed
# to see every validation image once). The previous fixed value of 50 steps, with
# the batch size of 32 used for this run, drew 1600 samples from the 1000
# validation images, so part of the set was counted twice in val_loss / val_acc.
history2 = model.fit_generator(
    train_generator,
    steps_per_epoch=100,
    epochs=100,
    validation_data=validation_generator,
    validation_steps=len(validation_generator))

elapsed = time.perf_counter() - start
print('Elapsed %.3f seconds.' % elapsed)

GPU time: 2615.118 seconds -> 43.5853 minutes

In [None]:
# Persist the model trained with augmentation and dropout.
model.save('/kaggle/working/cat_and_dogs_small_augmentation.h5')

Again, we plot the metrics:

In [None]:
# Per-epoch training and validation metrics recorded by the augmented training run.
metrics = history2.history
acc, val_acc = metrics['acc'], metrics['val_acc']
loss, val_loss = metrics['loss'], metrics['val_loss']

epochs = range(1, len(acc) + 1)

# Accuracy curves on the current figure.
acc_ax = plt.gca()
acc_ax.plot(epochs, acc, 'bo', label='Training acc')
acc_ax.plot(epochs, val_acc, 'r*', label='Validation acc')
acc_ax.set_title('Training and validation accuracy')
acc_ax.legend()

# Loss curves on a second, new figure.
loss_ax = plt.figure().gca()
loss_ax.plot(epochs, loss, 'bo', label='Training loss')
loss_ax.plot(epochs, val_loss, 'r*', label='Validation loss')
loss_ax.set_title('Training and validation loss')
loss_ax.legend()
plt.show()

We could continue adjusting model parameters such as the number of filters per convolution layer, or the number of layers in the network, but we will use another approach:

### Using a pretrained convnet

We'll use the VGG16 model [[ref]](https://arxiv.org/abs/1409.1556), trained on the `imagenet` dataset

In [None]:
# VGG16 with ImageNet weights, without its top classifier (include_top=False),
# for 150x150 RGB inputs.
conv_base = VGG16(weights='imagenet', include_top=False, input_shape=(150, 150, 3))

In [None]:
# Print the layer-by-layer summary of the convolutional base.
conv_base.summary()

Let's do feature extraction with data augmentation

In [None]:
# Stack a small dense classifier (256 relu units, 1 sigmoid output) on top of
# the VGG16 convolutional base.
model = models.Sequential([
    conv_base,
    layers.Flatten(),
    layers.Dense(256, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

In [None]:
# Print the summary of the combined model (convolutional base + classifier).
model.summary()

It is important to freeze the convolutional base of the VGG16 model. This prevents the already trained weights from being updated during training.

In [None]:
# Number of trainable weight tensors before the convolutional base is frozen.
print(len(model.trainable_weights))

In [None]:
# Freeze the convolutional base; this is done before the model is compiled below.
conv_base.trainable = False

In [None]:
# Number of trainable weight tensors after freezing the convolutional base.
print(len(model.trainable_weights))

Again, we train the model, but now with this new convolutional base:

In [None]:
# Augmenting training generator for the VGG16-based model.
train_datagen = ImageDataGenerator(
    # rescale=1./255 is intentionally not used here; preprocess_input is applied instead
    preprocessing_function=preprocess_input,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest')

In [42]:
# Validation/test generator: no augmentation and no 1/255 rescaling; it applies the
# same preprocess_input function as the training generator of the VGG16-based model.
test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

In [None]:
# Training batches of 20 images, resized to 150x150, with binary labels.
train_generator = train_datagen.flow_from_directory(
    train_dir,
    target_size=(150, 150),
    batch_size=20,
    class_mode='binary')

# Validation batches with the same size settings.
validation_generator = test_datagen.flow_from_directory(
    validation_dir,
    target_size=(150, 150),
    batch_size=20,
    class_mode='binary')

In [None]:
# Compile with binary cross-entropy, RMSprop at learning rate 2e-5,
# and accuracy reported under the key 'acc'.
model.compile(loss='binary_crossentropy',
    optimizer=optimizers.RMSprop(lr=2e-5),
    metrics=['acc'])

In [None]:
# Measure the wall-clock duration of the whole training run.
start = time.perf_counter()

# 30 epochs of 100 training steps and 50 validation steps each; verbose=2 is passed
# to reduce the amount of logging. The History object is kept for the plots below.
history_vgg16 = model.fit_generator(
    train_generator,
    steps_per_epoch=100,
    epochs=30,
    validation_data=validation_generator,
    validation_steps=50, 
    verbose=2)

elapsed = time.perf_counter() - start
print('Elapsed %.3f seconds.' % elapsed)

GPU time: 533.225 seconds -> 8.887 minutes

In [None]:
# Show the installed Keras version, for reference next to the saved model.
import keras 
keras.__version__

In [None]:
# Persist the model built on the VGG16 base.
model.save('/kaggle/working/cat_and_dogs_small_pre_vgg16.h5')

Let's plot the metrics

In [None]:
# Per-epoch training and validation metrics recorded by the VGG16 training run.
metrics = history_vgg16.history
acc, val_acc = metrics['acc'], metrics['val_acc']
loss, val_loss = metrics['loss'], metrics['val_loss']

epochs = range(1, len(acc) + 1)

# Accuracy curves on the current figure.
acc_ax = plt.gca()
acc_ax.plot(epochs, acc, 'bo', label='Training acc')
acc_ax.plot(epochs, val_acc, 'r*', label='Validation acc')
acc_ax.set_title('Training and validation accuracy')
acc_ax.legend()

# Loss curves on a second, new figure.
loss_ax = plt.figure().gca()
loss_ax.plot(epochs, loss, 'bo', label='Training loss')
loss_ax.plot(epochs, val_loss, 'r*', label='Validation loss')
loss_ax.set_title('Training and validation loss')
loss_ax.legend()
plt.show()

We obtained a validation accuracy of about 96%

To continue with the training, let's load the saved model, which was previously trained here in kaggle. 

This model consists of a convnet base (vgg16) pretrained on the imagenet dataset, plus a fully connected classifier on top.

We will now fine-tune this model by unfreezing the last convolutional block (`block5`) of the vgg16 section, followed by training the whole model again.

In [None]:
# Load the previously saved VGG16-based model from the attached Kaggle dataset.
model = models.load_model('/kaggle/input/cats-anddogs-small-pre-vgg16/cat_and_dogs_small_pre_vgg16.h5')

In [None]:
# Print the summary of the loaded model.
model.summary()

In [None]:
# Display the list of top-level layers of the loaded model.
model.layers

In [None]:
# Display the name of the first top-level layer, whose inner layers are unfrozen below.
model.layers[0].name

In [None]:
# Mark the first top-level layer (a nested model) as trainable, then decide
# layer by layer which of its inner layers actually stay trainable.
model.layers[0].trainable = True
set_trainable = False
for layer in model.layers[0].layers:
    # From 'block5_conv1' onward every inner layer is trainable; all earlier ones are frozen.
    set_trainable = set_trainable or layer.name == 'block5_conv1'
    layer.trainable = set_trainable

In [None]:
# Print the summary of the nested first layer after changing its trainable flags.
model.layers[0].summary()

In [None]:
# Print the summary of the whole model after changing the trainable flags.
model.summary()

Let's compile this new form of the model, with the top layers of the convolutional base unfrozen.

In [None]:
# Recompile after changing the trainable flags, with RMSprop at learning rate 1e-5.
model.compile(loss='binary_crossentropy',
              optimizer=optimizers.RMSprop(lr=1e-5),
              metrics=['acc'])

In [None]:
# Measure the wall-clock duration of the whole fine-tuning run.
start = time.perf_counter()

# Fine-tune for 100 epochs of 100 training steps and 50 validation steps each;
# the History object is kept for the plots below.
history_finetune = model.fit_generator(
    train_generator,
    steps_per_epoch=100,
    epochs=100,
    validation_data=validation_generator,
    validation_steps=50)

elapsed = time.perf_counter() - start
print('Elapsed %.3f seconds.' % elapsed)

GPU time: 1873.439 seconds -> 31.223 minutes

In [None]:
# Per-epoch training and validation metrics recorded by the fine-tuning run.
metrics = history_finetune.history
acc, val_acc = metrics['acc'], metrics['val_acc']
loss, val_loss = metrics['loss'], metrics['val_loss']

epochs = range(1, len(acc) + 1)

# Accuracy curves on the current figure.
acc_ax = plt.gca()
acc_ax.plot(epochs, acc, 'bo', label='Training acc')
acc_ax.plot(epochs, val_acc, 'r*', label='Validation acc')
acc_ax.set_title('Training and validation accuracy')
acc_ax.legend()

# Loss curves on a second, new figure.
loss_ax = plt.figure().gca()
loss_ax.plot(epochs, loss, 'bo', label='Training loss')
loss_ax.plot(epochs, val_loss, 'r*', label='Validation loss')
loss_ax.set_title('Training and validation loss')
loss_ax.legend()
plt.show()

In [None]:
# Persist the fine-tuned model.
model.save('/kaggle/working/cat_and_dogs_small_finetune.h5')

To remove the noise from the plot and have a clear trend, we smooth it using exponential moving averages:

In [43]:
# Load the fine-tuned model from the attached Kaggle dataset instead of retraining it.
# Only the model is restored this way; no training history comes with it.
model = models.load_model('/kaggle/input/catdogs-small-finetune/cat_and_dogs_small_finetune.h5')

In [44]:
def smooth_curve(points, factor=0.8):
    """Smooth a sequence of values with an exponential moving average.

    The first value is kept as is; every following value is replaced by
    ``previous_smoothed * factor + value * (1 - factor)``.

    Args:
        points: iterable of numbers (e.g. per-epoch accuracy or loss values).
        factor: weight given to the previous smoothed value; the current
            value is weighted by ``1 - factor``.

    Returns:
        A list with one smoothed value per input point (empty for empty input).
    """
    smoothed_points = []
    for point in points:
        if smoothed_points:
            previous = smoothed_points[-1]
            smoothed_points.append(previous * factor + point * (1 - factor))
        else:
            # The first point has no predecessor to blend with.
            smoothed_points.append(point)
    # Return only after every point has been processed (the return used to sit
    # inside the loop, which cut the result down to the first point).
    return smoothed_points

In [None]:
# NOTE: this cell needs `epochs`, `acc`, `val_acc`, `loss` and `val_loss` from a
# training run in the current session. They are not available when the model is
# only loaded from disk, because the training `history` was not saved with it.

plt.plot(epochs, smooth_curve(acc), 'bo', label='Smoothed training acc')
plt.plot(epochs, smooth_curve(val_acc), 'r', label='Smoothed validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()
plt.plot(epochs, smooth_curve(loss), 'bo', label='Smoothed training loss')
plt.plot(epochs, smooth_curve(val_loss), 'r', label='Smoothed validation loss')
# This figure shows the loss curves, so it is titled as such
# (it previously reused the accuracy title).
plt.title('Training and validation loss')
plt.legend()

plt.show()

However we can still evaluate the model on the test data:

In [48]:
# Build a generator over the held-out test split, using the same `test_datagen`
# object that feeds the validation generator.
test_generator = test_datagen.flow_from_directory(
    test_dir, 
    target_size=(150, 150),
    batch_size=20, 
    class_mode='binary')

# 50 steps of 20 images each, i.e. 1000 images in total.
test_loss, test_acc = model.evaluate_generator(test_generator, steps=50)
print('test acc: {}'.format(test_acc))

Found 1000 images belonging to 2 classes.
test acc: 0.968999981880188


We obtained a 97% test accuracy.

The main difference with the original competition is that we trained the model using only 2000 samples instead of the 25,000 samples of the original training set. This shows us that convolutional networks are a powerful technique for computer vision tasks.

### todo

- evaluate the model in the whole `test` folder
- submit

In [45]:
!unzip /kaggle/input/dogs-vs-cats-redux-kernels-edition/test.zip -d /kaggle/otest

Archive:  /kaggle/input/dogs-vs-cats-redux-kernels-edition/test.zip
replace /kaggle/otest/test/1.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


New evaluation, now using the original test folder

In [74]:
original_test_dir = '/kaggle/otest/test'