In [1]:
import os
# Keep a handle on the real os.listdir: setup_filter_to_exclude() replaces
# os.listdir with a filtering wrapper and clear_filter() restores it from here.
# NOTE(review): re-running this cell while a filter is installed would capture
# the wrapper instead of the real function.
_origlistdir = os.listdir

In [5]:
# Subject id handed to data_for() as validation_subject_id by the training
# cells below, i.e. the driver whose images are held out for validation.
target_subject='p024'

In [None]:
%%bash
# Build a small "sample" split under splits/<split_name>/{train,valid}/<class>/.
# For every class directory in train/, the last 150 files (in `ls` order) are
# copied to the sample training set and the 50 files just before them to the
# sample validation set.  The two sets are disjoint only when the class holds
# at least 200 files; with fewer files the `tail -n200 | head -n50` window
# slides into the last 150.
set -eu   # stop on the first failing command or unset variable

split_name='sample'
src_dir='train'
# ${split_name:?} aborts instead of expanding to "splits/" when the name is
# empty, which would otherwise let the rm -rf below delete every split.
dest_dir="splits/${split_name:?}"

rm -rf -- "$dest_dir"
mkdir -p "$dest_dir/train" "$dest_dir/valid"

# Glob for directories instead of parsing `ls train`, so stray files in train/
# are not mistaken for classes and names are never word-split.
for class_path in "$src_dir"/*/; do
  # An unmatched glob stays literal; skip it rather than failing on a bogus path.
  [ -d "$class_path" ] || continue
  obj_type=$(basename "$class_path")
  mkdir -p "$dest_dir/train/$obj_type" "$dest_dir/valid/$obj_type"

  # Training sample: the last 150 files of the class.
  ls "$src_dir/$obj_type" | tail -n150 \
    | xargs -I{} cp "$src_dir/$obj_type/{}" "$dest_dir/train/$obj_type"
  # Validation sample: the 50 files that precede those 150.
  ls "$src_dir/$obj_type" | tail -n200 | head -n50 \
    | xargs -I{} cp "$src_dir/$obj_type/{}" "$dest_dir/valid/$obj_type"
done

In [37]:
from keras.preprocessing.image import ImageDataGenerator
# Batch size shared by every flow_from_directory() call in this notebook
# (training, validation and test iterators).
batch_size = 32

def setup_filter_to_exclude(notallowed_filenames):
    """Monkeypatch ``os.listdir`` so that the given file names are hidden.

    After this call ``os.listdir(path)`` returns whatever the saved original
    (``_origlistdir``) reports for ``path``, minus every entry whose name is in
    ``notallowed_filenames``.  The patch is process-wide and stays installed
    until ``clear_filter()`` runs or another filter replaces it.

    notallowed_filenames: container of bare file names; only ``in`` is used on it.
    """
    def _listdir_without_excluded(path):
        return [entry for entry in _origlistdir(path)
                if entry not in notallowed_filenames]

    os.listdir = _listdir_without_excluded

def clear_filter():
    """Undo ``setup_filter_to_exclude`` by reinstating the saved ``os.listdir``."""
    # _origlistdir was captured before any filter was installed, so this
    # removes the filter no matter how many times one was set up.
    os.listdir = _origlistdir

def data_for(validation_subject_id, shuffle=False, verbose=False, augment=False,
             train_dir='train',
             validation_dir='train'):
    """Build train/validation directory iterators split by driver (subject).

    Rows of the notebook-level ``drivers`` frame whose ``subject`` equals
    ``validation_subject_id`` name the validation images; every other row
    names a training image.  The split is enforced by temporarily hiding the
    other side's file names from ``os.listdir`` (``setup_filter_to_exclude``)
    while each iterator is constructed, which is why both iterators may point
    at the same directory tree.
    NOTE(review): this relies on ``flow_from_directory`` enumerating files via
    ``os.listdir`` at construction time; the counts printed by the runs below
    suggest it does for the Keras version used here -- re-check after upgrades.

    Parameters
        validation_subject_id: value compared against ``drivers['subject']``.
        shuffle: forwarded to both ``flow_from_directory`` calls.
        verbose: accepted for backward compatibility; currently unused.
        augment: when true, *training* images are drawn through a randomly
            augmenting generator.  Validation images are never augmented.
        train_dir, validation_dir: class-per-subdirectory image roots.

    Returns
        ``(train_generator, validation_generator)``.

    Raises
        Exception: when the two iterators do not contain the same number of
            distinct classes.
    """
    if augment:
        print("Using augmenting generator")
        # Use the directly imported ImageDataGenerator (same name as the plain
        # branch) instead of `image.ImageDataGenerator`, which only resolved if
        # some star-import happened to provide `image`.
        train_datagen = ImageDataGenerator(rotation_range=12, width_shift_range=0.1, height_shift_range=0.025,
                                           shear_range=0.15, zoom_range=0.1, channel_shift_range=20,
                                           fill_mode='constant')
    else:
        train_datagen = ImageDataGenerator()
    # Random distortions on the validation set would make the reported
    # validation loss/accuracy noisy and not comparable between runs.
    validation_datagen = ImageDataGenerator()

    train_fnames = set(drivers[drivers['subject'] != validation_subject_id]['img'])
    validation_fnames = set(drivers[drivers['subject'] == validation_subject_id]['img'])

    try:
        # train: hide the held-out subject's files while the iterator is built
        setup_filter_to_exclude(validation_fnames)
        train_generator = train_datagen.flow_from_directory(train_dir,
            target_size=(224, 224),
            batch_size=batch_size,
            class_mode='categorical',
            shuffle=shuffle)

        # validation: hide everybody else's files
        setup_filter_to_exclude(train_fnames)
        validation_generator = validation_datagen.flow_from_directory(validation_dir,
            target_size=(224, 224),
            batch_size=batch_size,
            class_mode='categorical',
            shuffle=shuffle)
    finally:
        # Always restore the real os.listdir, even if a directory is missing or
        # Keras raises; otherwise the filter would leak into the whole kernel.
        clear_filter()

    if len(np.unique(train_generator.classes)) != len(np.unique(validation_generator.classes)):
        raise Exception("Not all the classes are represented in the sample")

    return train_generator, validation_generator

In [29]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from keras.preprocessing.image import load_img, img_to_array
from utils import *

In [38]:
import pandas as pd
# One row per training image; data_for() reads the 'subject' and 'img' columns.
# pd.read_csv replaces DataFrame.from_csv(..., index_col=None), which is
# deprecated and was removed in pandas 1.0; with no index column requested the
# two calls load the file the same way.
drivers = pd.read_csv('driver_imgs_list.csv')

In [31]:
from keras.models import Sequential
from keras.layers import BatchNormalization, MaxPooling2D, Convolution2D, Flatten, Dense

In [32]:
# Small convnet built layer by layer: two conv -> batch-norm -> max-pool stages,
# then a 200-unit dense layer and a 10-way softmax.
# The input is declared as (3, 224, 224), so within a batch the size-3 axis is
# axis 1; that is the axis the first three BatchNormalization layers are given.
model = Sequential()
model.add(BatchNormalization(axis=1, input_shape=(3, 224, 224)))

model.add(Convolution2D(32, 3, 3, activation='relu'))
model.add(BatchNormalization(axis=1))
model.add(MaxPooling2D((3, 3)))

model.add(Convolution2D(64, 3, 3, activation='relu'))
model.add(BatchNormalization(axis=1))
model.add(MaxPooling2D((3, 3)))

model.add(Flatten())
model.add(Dense(200, activation='relu'))
model.add(BatchNormalization())
model.add(Dense(10, activation='softmax'))

In [33]:
# Give the learning rate to the optimizer's constructor instead of assigning a
# bare float to model.optimizer.lr after compile().  The assignment swaps the
# optimizer's `lr` attribute for a plain Python number, which is only picked up
# if it happens before the training function is built and breaks anything that
# later updates the rate through the backend (e.g. scheduler callbacks).
# NOTE(review): Adam is not imported in any visible cell; presumably it is
# keras.optimizers.Adam provided by `from utils import *` -- confirm.
model.compile(Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [34]:
# Hold out `target_subject` for validation and train on every other driver,
# reading from the small sample split written by the %%bash cell above.
train_batches, validation_batches = data_for(
    validation_subject_id=target_subject,
    shuffle=True,
    augment=True,
    train_dir='splits/sample/train',
    validation_dir='splits/sample/valid')

Using augmenting generator
Found 1417 images belonging to 10 classes.
Found 31 images belonging to 10 classes.


In [35]:
# Five epochs over the current iterators; one epoch is one full pass over the
# training iterator, and validation uses every validation image.
model.fit_generator(
    train_batches,
    samples_per_epoch=train_batches.nb_sample,
    nb_epoch=5,
    validation_data=validation_batches,
    nb_val_samples=validation_batches.nb_sample,
    verbose=1)

INFO (theano.gof.compilelock): Refreshing lock /home/ubuntu/.theano/compiledir_Linux-4.4--generic-x86_64-with-debian-stretch-sid-x86_64-2.7.12-64/lock_dir/lock


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f046d5db250>

In [36]:
# Rebuild the iterators with data_for()'s default directories (train_dir and
# validation_dir both 'train'), still holding out `target_subject`, then
# continue training the same model for five more epochs.
# NOTE(review): the recorded output reports the same image counts as the
# sample-split run, so it was presumably produced before data_for()'s defaults
# changed -- re-run to refresh.
train_batches, validation_batches = data_for(
    target_subject,
    shuffle=True,
    augment=True)
model.fit_generator(
    train_batches,
    samples_per_epoch=train_batches.nb_sample,
    nb_epoch=5,
    validation_data=validation_batches,
    nb_val_samples=validation_batches.nb_sample,
    verbose=1)

Using augmenting generator
Found 1417 images belonging to 10 classes.
Found 31 images belonging to 10 classes.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f0460ba5fd0>

In [39]:
# Same steps as the previous cell: rebuild the iterators from the default
# 'train' directory with `target_subject` held out, then train the same model
# for another five epochs.
train_batches, validation_batches = data_for(
    target_subject,
    shuffle=True,
    augment=True)
model.fit_generator(
    train_batches,
    samples_per_epoch=train_batches.nb_sample,
    nb_epoch=5,
    validation_data=validation_batches,
    nb_val_samples=validation_batches.nb_sample,
    verbose=1)

Using augmenting generator
Found 21198 images belonging to 10 classes.
Found 1226 images belonging to 10 classes.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f0460ba9610>

In [40]:
import pandas as pd
from IPython.display import FileLink
def predict_to_csv(model, filename='submission.csv'):
    """Predict every image under ``test/`` and write the result as a CSV.

    The CSV has one row per test image, indexed by the image's bare file name
    under the header ``img``, and one column per predicted class named
    ``c0``, ``c1``, ... in prediction-column order.

    Parameters
        model: object exposing ``predict_generator(generator, n_samples)`` and
            returning a 2-D array (rows = images, columns = classes).
        filename: path of the CSV to write.

    Returns
        An ``IPython.display.FileLink`` pointing at ``filename``.
    """
    generator = ImageDataGenerator()
    # shuffle=False keeps predictions in the same order as
    # test_generator.filenames; class_mode=None because the test set has no labels.
    test_generator = generator.flow_from_directory(
        'test',
        target_size=(224, 224),
        batch_size=batch_size,
        shuffle=False,
        class_mode=None)

    predictions = model.predict_generator(test_generator, test_generator.nb_sample)

    # Real lists (not lazy `map` objects) so the index/columns are valid on
    # Python 3 as well; basename() drops the sub-directory prefix regardless of
    # the platform's path separator.
    filenames = [os.path.basename(name) for name in test_generator.filenames]
    # Derive the column count from the predictions instead of hard-coding 10,
    # and avoid the Python-2-only tuple-unpacking lambda.
    class_columns = ['c' + str(i) for i in range(predictions.shape[1])]

    df = pd.DataFrame(predictions, index=filenames, columns=class_columns)
    df.to_csv(filename, index_label='img')

    return FileLink(filename)

In [41]:
# This submission scores 1.24418 on Kaggle, which is 597 / 1440 on the
# leaderboard -- top 50% achieved.
%time predict_to_csv(model, 'submission_batchnorm_freestyle.csv')

Found 79726 images belonging to 1 classes.
CPU times: user 19min 22s, sys: 2min 15s, total: 21min 37s
Wall time: 19min 37s


In [42]:
# Hold out subject 'p002' instead: every other subject listed in `drivers`
# (including the one held out so far) now contributes training images.
train_batches, validation_batches = data_for('p002', augment=True, shuffle=True)

Using augmenting generator
Found 21699 images belonging to 10 classes.
Found 725 images belonging to 10 classes.


In [43]:
# A single additional epoch over the current iterators.
model.fit_generator(
    train_batches,
    samples_per_epoch=train_batches.nb_sample,
    nb_epoch=1,
    validation_data=validation_batches,
    nb_val_samples=validation_batches.nb_sample,
    verbose=1)

Epoch 1/1


<keras.callbacks.History at 0x7f0456448310>

In [44]:
# After the extra epoch with a different subject held out, this submission
# scores 1.13414 on Kaggle, which is 569 / 1440 on the leaderboard -- top 50%
# achieved.
%time predict_to_csv(model, 'submission_batchnorm_freestyle_trained_on_another_too.csv')

Found 79726 images belonging to 1 classes.
CPU times: user 19min 9s, sys: 2min 15s, total: 21min 25s
Wall time: 17min 42s
