# Cats and dogs with a fine-tuned VGG16

In [1]:
import numpy as np
from numpy.random import random, permutation
from keras.models import Sequential
from keras.layers.normalization import BatchNormalization
from keras.layers.core import Flatten, Dense, Dropout, Lambda
from keras.layers.convolutional import Conv2D, MaxPooling2D, ZeroPadding2D
from keras.preprocessing import image
from keras.optimizers import Adam
from keras.utils.np_utils import to_categorical
from matplotlib import pyplot as plt
from PIL import Image
from sklearn.metrics import confusion_matrix
import itertools

Using TensorFlow backend.


In [2]:
%matplotlib inline

In [3]:
# Per-channel means in RGB order, shaped (1, 1, 3) so they broadcast over a
# channels-last (..., height, width, 3) image tensor.
vgg_mean = np.array([123.68, 116.779, 103.939], dtype=np.float32).reshape((1,1,3))
def vgg_preprocess(x):
    """Centre an RGB image batch on the VGG channel means and convert to BGR.

    Args:
        x: channels-last image tensor/array whose last axis holds the three
           colour channels in RGB order (the model input is (224, 224, 3)).

    Returns:
        A tensor of the same shape, mean-subtracted, with the channel axis
        reversed (RGB -> BGR).
    """
    x = x - vgg_mean
    # The colour channels are the LAST axis here. The previous `x[:, ::-1]`
    # reversed axis 1 (image rows), flipping every image upside down while
    # leaving the channels in RGB order.
    return x[..., ::-1] # reverse channel axis rgb->bgr

def ConvBlock(model, layers, filters):
    """Append one convolutional stage to `model` (mutates it in place).

    The stage is `layers` repetitions of a 1-pixel zero padding followed by
    a 3x3 ReLU convolution with `filters` output channels, closed by a 2x2
    max-pooling layer with stride 2.
    """
    stage = []
    for _ in range(layers):
        stage.append(ZeroPadding2D((1, 1)))
        stage.append(Conv2D(filters, (3, 3), activation='relu'))
    stage.append(MaxPooling2D((2, 2), strides=(2, 2)))
    for layer in stage:
        model.add(layer)
    
def FCBlock(model):
    """Append a 4096-unit ReLU dense layer and 50% dropout to `model` in place."""
    for layer in (Dense(4096, activation='relu'), Dropout(0.5)):
        model.add(layer)
    
def BuildVGG():
    """Build the 1000-class VGG network, load its saved weights and compile it.

    Returns:
        A compiled `Sequential` model: preprocessing Lambda, five
        convolutional stages, two fully connected blocks and a 1000-way
        softmax, with weights read from '/data/trained_models/vgg16_tf.h5'.
    """
    vgg = Sequential()
    vgg.add(Lambda(vgg_preprocess, input_shape=(224,224,3)))

    # (number of conv layers, number of filters) for each stage, in order.
    stage_specs = ((2, 64), (2, 128), (3, 256), (3, 512), (3, 512))
    for depth, width in stage_specs:
        ConvBlock(vgg, depth, width)

    vgg.add(Flatten())
    for _ in range(2):
        FCBlock(vgg)
    vgg.add(Dense(1000, activation='softmax'))

    # Weights are loaded before compiling, matching the layer stack above.
    vgg.load_weights('/data/trained_models/vgg16_tf.h5')
    vgg.compile(optimizer=Adam(lr=0.001),loss='categorical_crossentropy', metrics=['accuracy'])

    return vgg

In [4]:
# Instantiate the stock 1000-class VGG with its saved weights.
# NOTE(review): `model` is not referenced by any later cell visible in this
# notebook (the cells below build and use `cd_model`) — confirm it is needed.
model = BuildVGG()

# Train entire model

In [17]:
# Build new model: start from the pretrained VGG and replace its classifier head.
cd_model = BuildVGG()

# Remove the last five layers (the 1000-way softmax and both Dense/Dropout
# blocks), leaving the network ending at the Flatten layer.
for step in range(5):
    cd_model.pop()

# New head: two Dense -> BatchNorm -> Dropout(0.3) blocks, then a 3-way softmax.
for step in range(2):
    cd_model.add(Dense(4096, activation='relu'))
    cd_model.add(BatchNormalization())
    cd_model.add(Dropout(0.3))
cd_model.add(Dense(3, activation='softmax'))

# Recompile with a much smaller learning rate than BuildVGG uses.
cd_model.compile(optimizer=Adam(lr=0.00001),loss='categorical_crossentropy', metrics=['accuracy'])

In [18]:
# Images are of various sizes, but the smallest are 480x640; every image is
# resized to the 224x224 input size used by the model.

# Training data: random rotations, shifts, shear, zoom and horizontal flips.
train_augmenter = image.ImageDataGenerator(
    rotation_range=90,
    width_shift_range=0.3,
    height_shift_range=0.3,
    shear_range=0.15,
    zoom_range=0.3,
    horizontal_flip=True,
)
train_batches_augmented = train_augmenter.flow_from_directory(
    '/data/cervical/raw_data/train',
    target_size=(224,224),
    class_mode='categorical',
    shuffle=True,
    batch_size=16
)

# Validation data: no augmentation.
valid_plain = image.ImageDataGenerator()
sample_valid_batches = valid_plain.flow_from_directory(
    '/data/cervical/raw_data/valid',
    target_size=(224,224),
    class_mode='categorical',
    shuffle=True,
    batch_size=8
)

Found 1333 images belonging to 3 classes.
Found 148 images belonging to 3 classes.


In [1]:
# Keras 2 measures an epoch in batches ("steps"), not samples. The legacy
# Keras 1 arguments (samples_per_epoch / nb_epoch / nb_val_samples) are
# mapped onto step counts, so passing sample counts made every epoch run
# `samples` batches, i.e. batch_size passes over the data. Convert
# explicitly: one step per batch, rounding up to include the last partial
# batch.
train_steps = int(np.ceil(train_batches_augmented.samples / float(train_batches_augmented.batch_size)))
valid_steps = int(np.ceil(sample_valid_batches.samples / float(sample_valid_batches.batch_size)))

cd_model.fit_generator(train_batches_augmented,
                 steps_per_epoch=train_steps,
                 epochs=1,
                 validation_data=sample_valid_batches,
                 validation_steps=valid_steps
                )

NameError: name 'cd_model' is not defined

In [ ]:
# cd_model.save_weights('/data/trained_models/dog_cat_complete_retrain_model_v1.0.h5')

# Submit to kaggle

In [ ]:
# Unlabelled test images: class_mode=None yields image batches only, and
# shuffle=False keeps the batch order aligned with `test_batches.filenames`,
# which later cells rely on when pairing ids with predictions by index.
test_batches = image.ImageDataGenerator().flow_from_directory(
    '/data/dogscats/test1', 
    target_size=(224,224),
    class_mode=None, 
    shuffle=False, 
    batch_size=32
)

In [ ]:
# Numeric image id for every test file, in generator order.
# Each filename looks like '<subdir>/<id>.jpg'; take the basename and strip
# the extension instead of assuming the subdirectory prefix is exactly
# 8 characters long ('unknown/'). The stray `test_batches.filenames`
# expression that used to sit here had no effect and was removed.
ids = np.array([
    int(f.replace('\\', '/').rsplit('/', 1)[-1].split('.')[0])
    for f in test_batches.filenames
])

In [ ]:
# Keras 2 iterators expose `.samples` (not the Keras 1 `.nb_sample`), and
# predict_generator's second argument is the number of BATCHES to draw.
# Rounding up covers the final partial batch, so there is exactly one
# prediction row per test image.
test_steps = int(np.ceil(test_batches.samples / float(test_batches.batch_size)))
results = cd_model.predict_generator(test_batches, test_steps)

In [ ]:
def clip(x):
    """Clamp `x` to the closed interval [0.0001, 0.9999].

    Values inside the interval (bounds included) are returned unchanged.
    """
    lower, upper = 0.0001, 0.9999
    if x < lower:
        return lower
    return upper if x > upper else x

# Pair each image id with its clipped column-1 prediction, one [id, value] row per image.
cliped_data = [[ids[i], clip(results[i][1])] for i in range(ids.shape[0])]

In [ ]:
# Round each prediction to a "confident" label: 0.017 when the column-1
# probability is below one half, 0.983 otherwise. The previous threshold of
# 0 could never trigger, because softmax outputs are non-negative, so every
# row was assigned 0.983.
data = [[ids[i], 0.017 if results[i][1] < 0.5 else 0.983] for i in range(ids.shape[0])]

In [ ]:
# Preview the first five [id, label] rows.
data[:5]

In [ ]:
# Write the submission as "id,label" rows. `comments=''` is required:
# by default np.savetxt prefixes the header with '# ', which would produce
# the header line "# id,label" instead of "id,label".
np.savetxt('/data/dogscats/test_result_rounded.csv', data, fmt='%d,%.5f', header='id,label', comments='')

In [ ]:
from IPython.display import FileLink
# Link to the submission file written by the np.savetxt cell above. The
# previous target, 'test_sorted_result.csv', is not produced anywhere in
# this notebook.
FileLink('/data/dogscats/test_result_rounded.csv')

# Validating the results

In [ ]:
# Batches of 8 labelled images from the personal photo folder.
# Despite the variable name, shuffle=False: images are yielded in directory order.
suffled_valid_batches = image.ImageDataGenerator().flow_from_directory(
    '/data/dogscats/my_photos', 
    target_size=(224,224),
    class_mode='categorical', 
    shuffle=False, 
    batch_size=8
)

In [ ]:
# First batch from the iterator: the images and their one-hot labels.
imgs, original_ids = next(suffled_valid_batches)
dog_cat_result = cd_model.predict(imgs)

# Translate each arg-max class index into a readable name.
# NOTE(review): cd_model ends in a 3-way softmax, so a predicted index of 2
# would raise IndexError against this two-name list — confirm the class set.
predicted_class = np.argmax(dog_cat_result, axis=1)
class_names = ['cat','dog']
dog_cat_label = np.vectorize(lambda idx: class_names[idx])(predicted_class)

In [ ]:
# `plots` is called here and by the plotting helpers further down, but no
# cell in this notebook defines or imports it, so define it before first use.
def plots(ims, figsize=(12,6), rows=1, interp=False, titles=None):
    """Display a sequence of images in a grid with `rows` rows.

    Args:
        ims: sequence of images (PIL images or numpy arrays).
        figsize: matplotlib figure size in inches.
        rows: number of grid rows; columns are derived from len(ims).
        interp: if False, disable interpolation when drawing.
        titles: optional sequence with one title per image.
    """
    if len(ims) == 0:
        # Nothing to draw (e.g. an empty selection of indices).
        return
    if isinstance(ims[0], np.ndarray):
        # assumes array images hold 0-255 pixel values (the generators in
        # this notebook configure no rescaling) — TODO confirm
        ims = np.array(ims).astype(np.uint8)
    # Round up so every image gets a cell even when len(ims) % rows != 0.
    cols = int(np.ceil(len(ims) / float(rows)))
    fig = plt.figure(figsize=figsize)
    for i, im in enumerate(ims):
        ax = fig.add_subplot(rows, cols, i + 1)
        ax.axis('off')
        if titles is not None:
            ax.set_title(titles[i], fontsize=16)
        ax.imshow(im, interpolation=None if interp else 'none')

plots(imgs, titles=dog_cat_label, rows=2)

# Confusion matrix

In [ ]:
# Unshuffled validation iterator so predictions line up with
# `valid_batches.classes` and `valid_batches.filenames`.
valid_batches = image.ImageDataGenerator().flow_from_directory(
    '/data/dogscats/valid',
    target_size=(224,224),
    class_mode='categorical',
    shuffle=False,
    batch_size=32
)
# Keras 2 iterators expose `.samples` (not the Keras 1 `.nb_sample`), and
# predict_generator takes a number of batches. Rounding up includes the
# last partial batch, giving exactly one prediction row per image.
valid_steps = int(np.ceil(valid_batches.samples / float(valid_batches.batch_size)))
valid_preds = cd_model.predict_generator(valid_batches, valid_steps)

In [ ]:
# Predicted class index per validation image (arg-max across the class axis).
preds = np.argmax(valid_preds, axis=1)
# True class indices as reported by the directory iterator.
valid_labels = valid_batches.classes
# Column 1 of the predictions; the cells below treat class 0 as "cat" and
# class 1 as "dog". NOTE(review): confirm against valid_batches.class_indices.
probs = valid_preds[:,1]

In [ ]:
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# true classes and columns the predicted ones. The arguments were previously
# swapped, which transposed the matrix.
cm = confusion_matrix(valid_labels, preds)
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.xlabel('Predicted label')
plt.ylabel('True label')
# Write each count in its cell, in white on dark cells and black on light ones.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(j, i, cm[i, j], horizontalalignment="center", color="white" if cm[i, j] > cm.max() / 2 else "black")

# A few more examples

It's a good idea to see where our model is most certain about its decisions, and where it is relatively uncertain.

In [ ]:
# Scan the validation directory (unshuffled) to obtain the relative file
# names in the order used for indexing by the plotting helpers below.
valid_file_listing = image.ImageDataGenerator().flow_from_directory(
    '/data/dogscats/valid',
    target_size=(224,224),
    class_mode='categorical',
    shuffle=False,
)
filenames = valid_file_listing.filenames

In [ ]:
def plots_idx(idx, titles=None, rows=1, figsize=(12,6)):
    """Load the validation images at positions `idx` of `filenames` and plot them.

    `titles`, `rows` and `figsize` are forwarded unchanged to `plots`.
    """
    valid_dir = '/data/dogscats/valid/'
    loaded = []
    for position in idx:
        loaded.append(image.load_img(valid_dir + filenames[position]))
    plots(loaded, titles=titles, rows=rows, figsize=figsize)

## Random correctly labelled

In [ ]:
# Positions where the predicted class equals the true class.
correct = np.where(preds==valid_labels)[0]
# Shuffle those positions and keep at most eight of them.
correct_idx = permutation(correct)[:8]
# Show them, titled with their column-1 probability.
plots_idx(correct_idx, titles=probs[correct_idx], rows=2)

## Most confident about correct answer

In [ ]:
# Correct predictions of class 0.
correct_cats = np.where((preds==0) & (preds==valid_labels))[0]
# Ascending sort: the eight with the LOWEST column-1 probability come first.
most_correct_cats = np.argsort(probs[correct_cats])[:8]
plots_idx(correct_cats[most_correct_cats], probs[correct_cats][most_correct_cats], rows=2)

In [ ]:
# Correct predictions of class 1.
correct_dogs = np.where((preds==1) & (preds==valid_labels))[0]
# Descending sort: the eight with the HIGHEST column-1 probability come first.
most_correct_dogs = np.argsort(probs[correct_dogs])[::-1][:8]
plots_idx(correct_dogs[most_correct_dogs], probs[correct_dogs][most_correct_dogs], rows=2)

## All the wrong answers

In [ ]:
# Images predicted as class 0 whose true class is different.
incorrect_cats = np.where((preds==0) & (preds!=valid_labels))[0]
# Ascending by column-1 probability, so the most confidently wrong come first.
most_incorrect_cats = np.argsort(probs[incorrect_cats])
plots_idx(incorrect_cats[most_incorrect_cats], probs[incorrect_cats][most_incorrect_cats], rows=4,figsize=(14,12))

In [ ]:
# Images predicted as class 1 whose true class is different.
incorrect_dogs = np.where((preds==1) & (preds!=valid_labels))[0]
# Descending by column-1 probability, so the most confidently wrong come first.
most_incorrect_dogs = np.argsort(probs[incorrect_dogs])[::-1]
plots_idx(incorrect_dogs[most_incorrect_dogs], probs[incorrect_dogs][most_incorrect_dogs],rows=4,figsize=(14,12))

## Most uncertain

In [ ]:
# Order all validation images by how close their column-1 probability is to 0.5.
most_uncertain = np.argsort(np.abs(probs-0.5))
# Plot the 16 closest; titles are indexed by plot position, so passing the
# full reordered array still labels the first 16 images correctly.
plots_idx(most_uncertain[:16], probs[most_uncertain], rows=4, figsize=(14,12))

# Look at the test set

In [ ]:
# Load saved test predictions from a comma-separated file; later cells read
# column 0 as the image id and column 1 as the score.
# NOTE(review): no visible cell writes test_result_noclip.csv — confirm where it comes from.
test_results = np.loadtxt('/data/dogscats/test_result_noclip.csv', delimiter=',')

In [ ]:
# Split rows by their score in column 1: above 0.9, below 0.1, and strictly
# between 0.4 and 0.6.
confident_dogs = test_results[test_results[:,1] > 0.9]
confident_cats = test_results[test_results[:,1] < 0.1]
# Rebinds `most_uncertain`, which an earlier cell used for validation indices.
most_uncertain = test_results[(test_results[:,1] > 0.4) & (test_results[:,1] < 0.6)]

In [ ]:
# (number of rows with score below 0.1, number of columns).
confident_cats.shape

In [ ]:
# Shuffle the rows of each group and keep at most 16 per group.
random_dogs = permutation(confident_dogs)[:16]
random_cats = permutation(confident_cats)[:16]
random_uncerain = permutation(most_uncertain)[:16]

In [ ]:
def plots_test_by_idx(idx, titles=None, rows=1, figsize=(12,6)):
    """Load the test images named '<id>.jpg' for each id in `idx` and plot them.

    `titles`, `rows` and `figsize` are forwarded unchanged to `plots`.
    """
    test_dir = '/data/dogscats/test1/unknown/'
    loaded = []
    for image_id in idx:
        loaded.append(image.load_img(test_dir + str(image_id) + '.jpg'))
    plots(loaded, titles=titles, rows=rows, figsize=figsize)

## Certain dogs

In [ ]:
# Column 0 holds the image ids (cast to int for the file name), column 1 the scores used as titles.
plots_test_by_idx(random_dogs[:,0].astype(int), titles=random_dogs[:,1], rows=4, figsize=(14,12))

## Certain cats

In [ ]:
# Column 0 holds the image ids (cast to int for the file name), column 1 the scores used as titles.
plots_test_by_idx(random_cats[:,0].astype(int), titles=random_cats[:,1], rows=4, figsize=(14,12))

## Uncertain

In [ ]:
# Column 0 holds the image ids (cast to int for the file name), column 1 the scores used as titles.
plots_test_by_idx(random_uncerain[:,0].astype(int), titles=random_uncerain[:,1], rows=4, figsize=(14,12))