## State Farm Challenge

1. Data are prepared in their directories
2. What about a way to spin up VGG16 quickly? Let's save the model somewhere here so we can just load it directly.

In [1]:
%matplotlib inline
from keras.layers import Convolution2D
from keras.layers import ZeroPadding2D
from keras.layers import MaxPooling2D
from keras.models import Sequential
from keras.layers import Lambda
from keras.layers import Flatten
from keras.layers import Dense, Dropout
import numpy as np
from keras.optimizers import Adam
from keras.utils.data_utils import get_file
from keras.preprocessing import image
import re
import csv
from numpy.random import permutation
from utils.utils import plots
import glob
from os import path
import os

# Notebook-wide settings used by the helper functions defined below.
pname = 'state_farm_challenge'  # tag used for saved weight files and the submission CSV
batch_size = 64  # default number of images per generator batch

# Root of the image data. The original notebook also kept an alternative root,
# '/home/ubuntu/nbs/data/statefarm/sample', commented out next to this one.
proot = '/home/ubuntu/nbs/data/statefarm/'

# Trained weights are written under <proot>/models; create the directory on first use.
models_path = path.join(proot, 'models')
if not path.exists(models_path):
    os.makedirs(models_path)


def ConvBlock(layers, model, filters):
    """Append one convolutional block to ``model`` in place.

    The block consists of ``layers`` repetitions of a 1-pixel zero padding
    followed by a 3x3 ReLU convolution with ``filters`` filters, and is closed
    by a single 2x2 max-pooling layer with stride 2.

    Args:
        layers: how many padding + convolution pairs to add.
        model: object exposing ``add`` (a Keras ``Sequential`` in this notebook).
        filters: number of filters used by every convolution in the block.
    """
    for _ in range(layers):
        # Pad first so the 3x3 convolution that follows sees a 1-pixel border.
        for layer in (
            ZeroPadding2D((1, 1)),
            Convolution2D(filters, 3, 3, activation='relu'),
        ):
            model.add(layer)
    pooling = MaxPooling2D((2, 2), strides=(2, 2))
    model.add(pooling)
    
def DenseBlock(model):
    """Append a 4096-unit ReLU dense layer followed by 50% dropout to ``model``.

    Args:
        model: object exposing ``add`` (a Keras ``Sequential`` in this notebook);
            it is modified in place and nothing is returned.
    """
    for layer in (Dense(4096, activation='relu'), Dropout(0.5)):
        model.add(layer)
    
# Per-channel offsets subtracted from every image, shaped (3, 1, 1) so they
# broadcast over the two trailing (spatial) axes of a channels-first input.
VGG_MEAN = np.reshape(np.array([123.68, 116.779, 103.939]), (3, 1, 1))


def vgg_preprocess(x):
    """Subtract ``VGG_MEAN`` from ``x`` and reverse the order of axis 1.

    For the (3, 224, 224) input shape the model declares, axis 1 of a batch is
    the channel axis, so the reversal flips the channel order
    (presumably RGB -> BGR, as the pretrained weights expect - TODO confirm).
    """
    centered = x - VGG_MEAN
    return centered[:, ::-1]

def _make_architecture():
    """Build the (unweighted) VGG16-style network and return it.

    Layout: preprocessing ``Lambda`` on a (3, 224, 224) input, five
    convolutional blocks, a flatten, two dense blocks and a 1000-way softmax.
    """
    net = Sequential()
    net.add(Lambda(vgg_preprocess, input_shape=(3, 224, 224)))

    # (number of convolutions, number of filters) for each block, in order.
    conv_plan = ((2, 64), (2, 128), (3, 256), (3, 512), (3, 512))
    for n_convs, n_filters in conv_plan:
        ConvBlock(n_convs, net, n_filters)

    net.add(Flatten())
    for _ in range(2):
        DenseBlock(net)
    net.add(Dense(1000, activation='softmax'))
    return net


def _initialize_model_weights(model):
    """Fetch the ``vgg16.h5`` weight file via ``get_file`` and load it into ``model``.

    ``get_file`` is asked to place the file in its ``models`` cache
    sub-directory; the path it returns is handed to ``model.load_weights``.
    """
    weights_file = get_file(
        'vgg16.h5',
        'http://files.fast.ai/models/vgg16.h5',
        cache_subdir='models',
    )
    model.load_weights(weights_file)
    
def make_vgg():
    """Return a freshly built network with the downloaded VGG16 weights loaded."""
    vgg = _make_architecture()
    # Weights are loaded in place; the same object is handed back to the caller.
    _initialize_model_weights(vgg)
    return vgg

def get_batches(
    dirname,
    gen=image.ImageDataGenerator(),
    shuffle=True,
    batch_size=batch_size,
    class_mode='categorical'
):
    """Return a directory iterator over the images in ``<proot>/<dirname>``.

    Args:
        dirname: sub-directory of ``proot`` to read, e.g. ``'train'`` or ``'valid'``.
        gen: Keras ``ImageDataGenerator`` whose ``flow_from_directory`` is used.
        shuffle: forwarded to ``flow_from_directory``.
        batch_size: forwarded to ``flow_from_directory``; defaults to the
            notebook-level ``batch_size``.
        class_mode: forwarded to ``flow_from_directory``. This notebook uses
            ``'categorical'`` for labelled data and ``None`` for the test set.

    Returns:
        Whatever ``gen.flow_from_directory`` returns, with images resized to 224x224.
    """
    source_dir = path.join(proot, dirname)
    flow_options = dict(
        target_size=(224, 224),  # fixed: matches the network's declared input size
        class_mode=class_mode,
        shuffle=shuffle,
        batch_size=batch_size,
    )
    return gen.flow_from_directory(source_dir, **flow_options)

def train_model(model, batches, val_batches, epochs=1, model_tag=pname):
    """Fit ``model`` on ``batches`` and save its weights without overwriting earlier saves.

    After training, the weights are written to
    ``<models_path>/<model_tag>_<k>.h5`` where ``k`` is one more than the
    highest index already present for that tag (starting at 1).

    Args:
        model: compiled Keras model exposing ``fit_generator`` and ``save_weights``.
        batches: training iterator; its ``nb_sample`` defines the epoch size.
        val_batches: validation iterator; its ``nb_sample`` is the number of
            validation samples evaluated per epoch.
        epochs: number of training epochs.
        model_tag: filename prefix used for the saved weights.
    """
    model.fit_generator(
        batches,
        samples_per_epoch=batches.nb_sample,
        nb_epoch=epochs,
        validation_data=val_batches,
        nb_val_samples=val_batches.nb_sample,
    )

    # Previous saves live *inside* models_path. Joining with path.join matters:
    # plain string concatenation drops the separator, matches nothing, and makes
    # every run overwrite '<tag>_1.h5'.
    candidates = glob.glob(path.join(models_path, '%s_*.h5' % model_tag))

    # Accept only '<tag>_<digits>.h5' so unrelated files that share the prefix
    # can neither crash the integer parse nor skew the counter.
    save_pattern = re.compile(r'^%s_(\d+)\.h5$' % re.escape(model_tag))
    taken = []
    for saved in candidates:
        match = save_pattern.match(path.basename(saved))
        if match:
            taken.append(int(match.group(1)))

    fname = '%s_%d.h5' % (model_tag, max(taken or [0]) + 1)
    model.save_weights(path.join(models_path, fname))


def display_validation(model, val_batches, num=4):
    """Plot samples of correct and incorrect validation predictions.

    Shows up to ``num`` random correct and incorrect predictions, then for each
    class the ``num`` misses with the lowest predicted probability for that
    class and the ``num`` hits with the highest.

    Assumes ``val_batches`` yields images in the same order as its ``classes``
    and ``filenames`` attributes (i.e. it was created with ``shuffle=False``) -
    verify against the caller.

    Args:
        model: model exposing ``predict_generator``.
        val_batches: validation iterator with ``classes``, ``filenames``,
            ``nb_sample`` and ``nb_class`` attributes.
        num: maximum number of images shown per group.
    """
    def show(picks, captions):
        # Plot the validation images at the given indices, or say there are none.
        if len(picks) == 0:
            print('none!')
            return
        pictures = [image.load_img(path.join(proot, 'valid', filenames[k])) for k in picks]
        plots(pictures, titles=captions)

    labels = val_batches.classes
    probs = model.predict_generator(val_batches, val_batches.nb_sample)
    preds = np.argmax(probs, 1)
    filenames = val_batches.filenames

    # Random samples of overall hits and misses, titled with the predicted class.
    right_sample = permutation(np.where(preds == labels)[0])[:num]
    wrong_sample = permutation(np.where(preds != labels)[0])[:num]

    print('correct sample')
    show(right_sample, preds[right_sample])
    print('incorrect sample')
    show(wrong_sample, preds[wrong_sample])

    is_wrong = labels != preds
    for cls in range(val_batches.nb_class):
        in_class = labels == cls

        # Misclassified members of this class, least confident first.
        print('missed from class %d' % cls)
        missed = np.where(in_class & is_wrong)[0]
        least_confident = missed[np.argsort(probs[missed, cls])[:num]]
        show(least_confident, probs[least_confident, cls])

        # Correctly classified members of this class, most confident first.
        print('hits from class %d' % cls)
        hits = np.where(in_class & ~is_wrong)[0]
        most_confident = hits[np.argsort(probs[hits, cls])[::-1][:num]]
        show(most_confident, probs[most_confident, cls])

Using Theano backend.
Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)


In [11]:
def finetune(model, classes, lr=.001):
    """Replace the model's last layer with a new ``classes``-way softmax and compile.

    The final layer is popped, every remaining layer is marked non-trainable,
    a fresh ``Dense(classes, activation='softmax')`` is appended, and the model
    is compiled with Adam, categorical cross-entropy and an accuracy metric.

    Args:
        model: Keras ``Sequential``-like model; modified in place.
        classes: number of output classes for the new final layer.
        lr: learning rate passed to ``Adam``.
    """
    model.pop()
    for frozen in model.layers:
        frozen.trainable = False

    new_head = Dense(classes, activation='softmax')
    model.add(new_head)

    optimizer = Adam(lr=lr)
    model.compile(
        optimizer=optimizer,
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    
def prepare_submission(model, clip=(.05, .95)):
    """Predict on the test set and write ``<proot>/<pname>_submission.csv``.

    The file has a header row ``img, c0, c1, ...`` (one ``c<i>`` column per
    predicted class) followed by one row per test image: the filename reported
    by the test iterator and its clipped class probabilities.

    Args:
        model: model exposing ``predict_generator``.
        clip: ``(low, high)`` bounds; every predicted probability is clipped
            into this range before being written.
    """
    # shuffle=False keeps predictions aligned with test_batches.filenames.
    test_batches = get_batches('test', shuffle=False, batch_size=64, class_mode=None)
    predictions = model.predict_generator(test_batches, test_batches.nb_sample)

    low, high = clip
    clipped = np.clip(predictions, low, high)

    header = ['img'] + ['c%d' % i for i in range(clipped.shape[1])]
    # NOTE(review): filenames are written exactly as the iterator reports them
    # (they may include a sub-directory prefix) - confirm the expected format.
    with open(path.join(proot, pname + '_submission.csv'), 'w') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        for filename, ps in zip(test_batches.filenames, clipped):
            writer.writerow([filename] + list(ps))

In [3]:
# Build the pretrained network and swap its output layer for a 10-way softmax.
model = make_vgg()
finetune(model, classes=10)

# Training batches use the get_batches default (shuffled); validation order is fixed.
train_batches = get_batches(dirname='train')
valid_batches = get_batches(dirname='valid', shuffle=False)

train_model(model, batches=train_batches, val_batches=valid_batches, epochs=5)

Found 20187 images belonging to 10 classes.
Found 2237 images belonging to 10 classes.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Inspect sample hits and misses of the fine-tuned model on the validation set.
display_validation(model=model, val_batches=valid_batches)

In [6]:
# Unlabelled test images, read in a fixed order so predictions line up with filenames.
test_batches = get_batches(dirname='test', shuffle=False, batch_size=64, class_mode=None)

# One row of class probabilities per test image.
predictions = model.predict_generator(test_batches, test_batches.nb_sample)

Found 79726 images belonging to 1 classes.


In [19]:
import itertools  # no longer used by this cell; kept in case later cells rely on it

# Clip probabilities into [0.05, 0.95] before writing them out.
clipped = np.clip(predictions, .05, .95)

# Header: image name followed by one 'c<i>' column per predicted class.
header = ['img'] + ['c%d' % i for i in range(clipped.shape[1])]

with open(path.join(proot, pname + '_submission.csv'), 'w') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    # Builtin zip works on both Python 2 and 3 (itertools.izip is Python-2-only).
    for filename, ps in zip(test_batches.filenames, clipped):
        writer.writerow([filename] + list(ps))

In [18]:
# Raw (unclipped) class probabilities for the last test image.
predictions[-1, :]

array([  6.4168e-01,   1.1886e-04,   2.2242e-04,   5.4664e-04,   3.5171e-01,   6.1181e-06,
         3.2724e-03,   5.1362e-05,   6.4707e-04,   1.7532e-03], dtype=float32)

In [10]:
# Preview of one CSV row: a leading 'img' cell followed by the first image's probabilities.
predictions.shape  # evaluated but not shown: only a cell's last expression is echoed
['img'] + [p for p in predictions[0]]

['img',
 0.34027249,
 0.00084958336,
 2.8812028e-05,
 0.00078424788,
 0.021131529,
 0.082023025,
 0.24896514,
 0.0010023912,
 0.063092291,
 0.24185042]