## State Farm Challenge

1. Data are prepared in their directories
2. What about a way to spin up VGG16 quickly? Let's save the model somewhere here so we can just load it directly.

In [64]:
%matplotlib inline
from keras.layers import Convolution2D
from keras.layers import ZeroPadding2D
from keras.layers import MaxPooling2D
from keras.models import Sequential
from keras.layers import Lambda
from keras.layers import Flatten
from keras.layers import Dense, Dropout
import numpy as np
from keras.optimizers import Adam
from keras.utils.data_utils import get_file
from keras.preprocessing import image
import re
import csv
from numpy.random import permutation
from utils.utils import plots
from utils import utils
import glob
from os import path
import os
from fast_ai.vgg_setup import make_vgg
import itertools

# Notebook-wide configuration: project tag used in saved file names,
# mini-batch size, and the data root directory.
pname = 'state_farm_challenge'
batch_size = 64
proot = '/home/ubuntu/nbs/data/statefarm/'
# proot = '/home/ubuntu/nbs/data/statefarm/sample'

# Saved weights live in a 'models' directory under the data root;
# create it on first use.
models_path = os.path.join(proot, 'models')
if not os.path.exists(models_path):
    os.makedirs(models_path)

    
def do_predict(proot, model):
    """Predict on every image found under ``<proot>/test``.

    Args:
        proot: data root directory; batches are read from its ``test``
            subdirectory.
        model: object exposing ``predict_generator(generator, n_samples)``.

    Returns:
        A ``(filenames, preds)`` tuple: the batch generator's ``filenames``
        attribute and the result of ``model.predict_generator``.
    """
    test_dir = path.join(proot, 'test')
    # shuffle=False is passed so that, presumably, prediction rows come back
    # in the same order as ``filenames`` -- confirm against utils.get_batches.
    test_batches = utils.get_batches(test_dir, shuffle=False)
    image_names = test_batches.filenames
    n_images = test_batches.nb_sample
    return image_names, model.predict_generator(test_batches, n_images)
    
    
def get_next_filenum(base, tag):
    """Return the next unused sequence number for files named ``<tag>_<N>[.ext]``.

    Entries in ``base`` whose names start with ``tag`` are globbed, and the
    integer ``N`` directly following ``<tag>_`` is read from each basename.
    Names that do not follow that pattern (for example ``<tag>_final.h5`` or
    ``<tag>_other_7.csv``) are ignored rather than raising ``ValueError`` or
    being counted against this tag.

    Args:
        base: directory to scan (not searched recursively).
        tag: file name prefix that precedes the ``_<N>`` suffix.

    Returns:
        int: one more than the largest ``N`` found, or 1 when no numbered
        file exists (including when ``base`` itself does not exist).
    """
    # Anchored on the basename so underscores or dots in the directory part
    # of the path can never be mistaken for the number delimiters.
    numbered = re.compile(re.escape(tag) + r'_(\d+)(?:\..*)?$')
    highest = 0
    for f in glob.glob(path.join(base, tag + '*')):
        found = numbered.match(path.basename(f))
        if found:
            highest = max(highest, int(found.group(1)))
    return highest + 1
    
def train_model(model, batches, val_batches, epochs=1, model_tag=pname):
    """Fit ``model`` on the given batches, then save its weights to a new file.

    Args:
        model: object exposing ``fit_generator`` and ``save_weights``.
        batches: training batch generator; its ``nb_sample`` attribute sets
            the number of samples per epoch.
        val_batches: validation batch generator; its ``nb_sample`` attribute
            sets the number of validation samples.
        epochs: number of epochs to train for.
        model_tag: prefix of the saved file name, ``<model_tag>_<N>.h5``.

    The weights are written into the module-level ``models_path`` directory.
    ``N`` comes from ``get_next_filenum``, so each call writes a new file
    instead of overwriting an earlier one.
    """
    train_count = batches.nb_sample
    val_count = val_batches.nb_sample
    model.fit_generator(
        batches,
        samples_per_epoch=train_count,
        nb_epoch=epochs,
        validation_data=val_batches,
        nb_val_samples=val_count,
    )

    next_num = get_next_filenum(models_path, model_tag)
    weights_file = '%s_%d.h5' % (model_tag, next_num)
    model.save_weights(path.join(models_path, weights_file))


def display_validation(model, val_batches, num=4):
    """Plot sample validation images grouped by prediction outcome.

    Shows, in order: a random sample of correctly classified images, a random
    sample of misclassified images, and then for every class the ``num``
    misses with the lowest probability for that class followed by the ``num``
    hits with the highest probability for that class.

    Args:
        model: object exposing ``predict_generator(generator, n_samples)``.
        val_batches: validation batch generator providing ``classes``,
            ``filenames``, ``nb_sample`` and ``nb_class``.
        num: maximum number of images shown per group.

    Images are always loaded from ``<proot>/valid`` (module-level ``proot``).
    NOTE(review): labels are compared to predictions position by position, so
    this presumably requires ``val_batches`` to be unshuffled -- confirm.
    """
    def plots_idx(idxs, titles, filenames):
        # Nothing to draw for an empty selection.
        if not len(idxs):
            print('none!')
            return
        pictures = [image.load_img(path.join(proot, 'valid', filenames[i])) for i in idxs]
        plots(pictures, titles=titles)

    labels = val_batches.classes
    probs = model.predict_generator(val_batches, val_batches.nb_sample)
    preds = np.argmax(probs, 1)
    filenames = val_batches.filenames

    # Random samples of right and wrong predictions across all classes.
    right_positions = np.where(preds == labels)[0]
    wrong_positions = np.where(preds != labels)[0]
    correct = permutation(right_positions)[:num]
    incorrect = permutation(wrong_positions)[:num]

    print('correct sample')
    plots_idx(correct, preds[correct], filenames)
    print('incorrect sample')
    plots_idx(incorrect, preds[incorrect], filenames)

    for cls in range(val_batches.nb_class):
        class_probs = probs[:, cls]
        in_class = labels == cls

        # Images of this class that were misclassified, lowest probability first.
        print('missed from class %d' % cls)
        missed = np.where(in_class & (labels != preds))[0]
        weakest = missed[np.argsort(class_probs[missed])[:num]]
        plots_idx(weakest, class_probs[weakest], filenames)

        # Images of this class that were classified correctly, highest probability first.
        print('hits from class %d' % cls)
        hits = np.where(in_class & (labels == preds))[0]
        strongest = hits[np.argsort(class_probs[hits])[::-1][:num]]
        plots_idx(strongest, class_probs[strongest], filenames)

In [60]:
def finetune(model, classes, lr=.001):
    """Replace the model's last layer with a new softmax layer and recompile.

    The model is modified in place: its last layer is popped, every remaining
    layer is marked non-trainable, a ``Dense(classes)`` softmax layer is
    appended, and the model is compiled with Adam and categorical
    cross-entropy, tracking accuracy.

    Args:
        model: model exposing ``pop``, ``layers``, ``add`` and ``compile``.
        classes: number of units in the new output layer.
        lr: learning rate passed to the Adam optimizer.
    """
    model.pop()
    # Freeze what is left so only the new output layer is trainable.
    for kept_layer in model.layers:
        kept_layer.trainable = False

    output_layer = Dense(classes, activation='softmax')
    model.add(output_layer)

    model.compile(
        optimizer=Adam(lr=lr),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    
def prepare_submission(proot, pname, filenames, predictions, clip=(.05, .95)):
    """Write clipped predictions to a new ``<pname>_submission_<N>.csv`` file.

    Args:
        proot: directory the CSV file is written into.
        pname: prefix of the CSV file name.
        filenames: one image path per prediction row; only the final path
            component is written to the ``img`` column.
        predictions: array-like with one row of ten class probabilities per
            image (the header is fixed to ``c0`` .. ``c9``).
        clip: ``(low, high)`` bounds applied to every probability via
            ``np.clip`` before writing.

    ``N`` comes from ``get_next_filenum``, so an existing submission file is
    not overwritten.
    """
    clipped = np.clip(predictions, clip[0], clip[1])
    num = get_next_filenum(proot, pname + '_submission')
    fname = '%s_submission_%d.csv' % (pname, num)
    with open(path.join(proot, fname), 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['img', 'c0', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9'])
        # Builtin zip instead of itertools.izip: izip does not exist on
        # Python 3, while zip works on both major versions.
        for filename, ps in zip(filenames, clipped):
            # basename drops every directory component, not just the first,
            # so nested paths still produce a bare image name.
            writer.writerow([path.basename(filename)] + list(ps))
    

In [61]:
# Build a VGG model, swap in a 10-unit softmax output layer, and train it.
num_epochs = 3
model = make_vgg()
finetune(model, 10)
train_batches = utils.get_batches(path.join(proot, 'train'))
valid_batches = utils.get_batches(path.join(proot, 'valid'), shuffle=False)
# train_model saves a new weights file at the end of every call, so running
# one epoch per call leaves one saved file per epoch.
for _ in range(num_epochs):
    train_model(model, train_batches, valid_batches, epochs=1)

Found 100 images belonging to 10 classes.
Found 100 images belonging to 10 classes.
Epoch 1/1
Epoch 1/1
Epoch 1/1


In [65]:
# Rebuild the same architecture and restore weights saved by train_model.
model = make_vgg()
finetune(model, 10)
# NOTE(review): the '_3' suffix is hard-coded; it only names the last epoch
# of the training cell if the models directory was empty beforehand -- confirm.
model.load_weights(path.join(models_path, pname + '_3.h5'))

# NOTE(review): the disabled loop below refers to `remodel`, which is not
# defined in any visible cell.
#for _ in range(2):
#    train_model(remodel, train_batches, valid_batches, epochs=1)

In [None]:
# Run the restored model over everything under <proot>/test.
filenames, predictions = do_predict(proot, model)

Found 79726 images belonging to 1 classes.


In [63]:
# Write the submission CSV. clip=(0, 1) overrides the function's default
# (.05, .95) bounds. NOTE(review): this presumably leaves probabilities
# unchanged -- confirm that disabling clipping is intended.
prepare_submission(proot, pname, filenames, predictions, clip=(0, 1))