In [1]:
"""
# https://www.kaggle.com/c/state-farm-distracted-driver-detection/data
# download data and then create validation from train

import os
import random

path = 'data/distracted_driver/imgs/'

if not os.path.isdir(path + 'val'): # make validation folder
    print('Making new directory: {}'.format(path + 'val'))
    os.mkdir(path + 'val')


# assumes there's a folder called 'train' which has all the labeled dataset
for driver_class in os.listdir(path + 'train/'): # move 20% of train to validation
    images = os.listdir(path + 'train/' + driver_class)
    val_list = random.sample(images, len(images) // 5)
    
    if not os.path.isdir(path + 'val/' + driver_class):
        print('Making new directory: {}'.format(path + 'val/' + driver_class))
        os.mkdir(path + 'val/' + driver_class)
    for image in val_list:
        os.rename(path + 'train/' + driver_class + '/' + image, path + 'val/' + driver_class + '/' + image)

# assumes there's a folder called test and will put all unlabeled dataset in test/unlabeled
if not os.path.isdir(path + 'test/unlabeled'): # moving test set into a folder of its own
    os.rename(path + 'test/', path + 'unlabeled/')
    os.mkdir(path + 'test/')
    os.rename(path + 'unlabeled/', path + 'test/unlabeled/')
""";

Making new directory: data/distracted_driver/imgs/val
Making new directory: data/distracted_driver/imgs/val/c7
Making new directory: data/distracted_driver/imgs/val/c8
Making new directory: data/distracted_driver/imgs/val/c6
Making new directory: data/distracted_driver/imgs/val/c0
Making new directory: data/distracted_driver/imgs/val/c2
Making new directory: data/distracted_driver/imgs/val/c4
Making new directory: data/distracted_driver/imgs/val/c9
Making new directory: data/distracted_driver/imgs/val/c1
Making new directory: data/distracted_driver/imgs/val/c3
Making new directory: data/distracted_driver/imgs/val/c5


In [1]:
from vgg16 import Vgg16
import os

path = 'data/distracted_driver/imgs/' # change for respective data set
model_path = 'data/distracted_driver/models/' # change for respective data set
batch_size = 64

if not os.path.isdir(model_path): # make validation folder
    print('Making new directory: {}'.format(model_path))
    os.mkdir(model_path)

vgg = Vgg16() # imagenet weights already loaded
# during training, shuffle should be set to true. Or else, it will likely receive training on the same class consecutively
train_batches = vgg.get_batches(path + 'train/', batch_size=batch_size, shuffle=True)
val_batches = vgg.get_batches(path + 'val/', batch_size=batch_size * 2, shuffle=False)
vgg.finetune(train_batches) # optimizer is Adam

Using Theano backend.
Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)


Found 17943 images belonging to 10 classes.
Found 4481 images belonging to 10 classes.


In [2]:
%%time
try:
    vgg.model.load_weights(model_path + 'complete_model_weights.h5')
    vgg.model.evaluate_generator(val_batches, val_batches.nb_sample)
except IOError:
    vgg.fit(train_batches, val_batches, nb_epoch=1)  # also can use val as train set for faster training, shuffle=True
    vgg.model.save_weights(model_path + 'complete_model_weights.h5')

CPU times: user 2min 47s, sys: 27.4 s, total: 3min 14s
Wall time: 2min 1s


In [3]:
%%time

import bcolz
from keras.layers.core import Dense, Dropout
from keras.models import Sequential


def save_array(fname, arr): 
    c = bcolz.carray(arr, rootdir=fname, mode='w')
    c.flush()
    
def load_array(fname): 
    return bcolz.open(fname)[:]

def split_model(model, layer_type=None):
    """
    Split model at first instance of layer_type.
    Note: could not perform deepcopy of model
    """
    if layer_type is None:
        layer_type = Dense
    first_dense_idx = [index for index, layer in enumerate(model.layers) if 
                       type(layer) is layer_type][0]    
    return model.layers[:first_dense_idx], model.layers[first_dense_idx:]

conv_layers, dense_layers = split_model(vgg.model)
conv_model = Sequential(conv_layers)
train_batches = vgg.get_batches(path + 'train/', batch_size=batch_size, shuffle=False)
val_batches = vgg.get_batches(path + 'val/', batch_size=batch_size, shuffle=False)
# have to turn shuffle off to computing features; once shuffled, cannot undo it

# this is possible since these features are much smaller than 
# raw images and are capable of being loaded into memory
try: 
    trn_features = load_array(model_path + 'train_conv_features.bc/')
    val_features = load_array(model_path + 'val_conv_features.bc/')
except IOError:
    trn_features = conv_model.predict_generator(
        train_batches, val_samples=train_batches.nb_sample)
    val_features = conv_model.predict_generator(
        val_batches, val_samples=val_batches.nb_sample)

    save_array(model_path + 'train_conv_features.bc/', trn_features)
    save_array(model_path + 'val_conv_features.bc/', val_features)
    
print(trn_features.shape, val_features.shape)

Found 17943 images belonging to 10 classes.
Found 4481 images belonging to 10 classes.
((17943, 25088), (4481, 25088))
CPU times: user 12min 59s, sys: 2min 15s, total: 15min 15s
Wall time: 9min 51s


In [4]:
# cannot modify layers after they have been created

from keras.layers.core import Dense, Dropout
from keras.models import Sequential
from keras.optimizers import Adam
import numpy as np
from sklearn.preprocessing import OneHotEncoder

def get_fc_model_2(conv_layers, dense_layers, opt=None, new_dropout_p=None):
    
    def proc_wgts(layer): # don't know how to reflate the weights if given
        # different p for different layers
        return [o / 2 for o in layer.get_weights()]
    
    def make_new_layer(layer, new_dropout_p):
        if type(layer) is Dense:
            return Dense(output_dim=layer.output_dim, input_shape=
                         layer.input_shape[1:], activation=layer.activation)
        # notice the input_shape is a slice, also takes care of number of 
        # output nodes and softmax
        elif type(layer) is Dropout:
            return Dropout(p=new_dropout_p, input_shape=layer.input_shape[1:])
        else:
            raise Exception('Unexpected layer')
    
    if new_dropout_p is None:
        new_dropout_p = 0.0
    model = Sequential([make_new_layer(layer, new_dropout_p) 
                        for layer in dense_layers])

    for l1,l2 in zip(model.layers, dense_layers): 
        l1.set_weights(proc_wgts(l2))
        
    if opt is None:
        opt = Adam(lr=0.00001) # need small learning rate or else 
        # loss will increase and accuracy will decrease
    model.compile(optimizer=opt, loss='categorical_crossentropy', 
                  metrics=['accuracy'])
    return model

def onehot(x): 
    return np.array(OneHotEncoder().fit_transform(x.reshape(-1, 1)).todense())


trn_labels = onehot(train_batches.classes)
val_labels = onehot(val_batches.classes)

dense_model = get_fc_model_2(conv_layers, dense_layers)
print(dense_model.evaluate(trn_features, trn_labels))
print(dense_model.evaluate(val_features, val_labels))
# it appears that splitting the model up and evaluate will give slightly different results than
# original VGG model

[1.6997923872738334, 0.86323357294456604]


In [5]:
dense_model.fit(trn_features, trn_labels, nb_epoch=2, 
             batch_size=batch_size, validation_data=(val_features, val_labels))
# dense_model.optimizer.lr.get_value(), dense_model.optimizer.lr.set_value()
# appears without loading weights from 1 epoch of training, training with small learning rate still works well

Train on 17943 samples, validate on 4481 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f7418bd8f50>

In [12]:
import itertools

list((element, len(list(_))) for (element, _) in itertools.groupby(np.argmax(val_labels, axis=1)))

[(0, 497),
 (1, 453),
 (2, 463),
 (3, 469),
 (4, 465),
 (5, 462),
 (6, 465),
 (7, 400),
 (8, 382),
 (9, 425)]

In [64]:
train_batches = vgg.get_batches(path + 'val/', batch_size=batch_size, shuffle=True)
train_batches.shuffle = False

Found 4481 images belonging to 10 classes.


In [65]:
for i, _ in enumerate(train_batches):
    if i > train_batches.N / train_batches.batch_size + 1:
        break
print np.argmax(next(train_batches)[1], axis=1)

IOError: [Errno 2] No such file or directory: 'data/distracted_driver/imgs/val/c6/img_20896.jpg'

In [66]:
for i, _ in enumerate(train_batches):
    if i > train_batches.N / train_batches.batch_size + 1:
        break
print np.argmax(next(train_batches)[1], axis=1)

IOError: [Errno 2] No such file or directory: 'data/distracted_driver/imgs/val/c8/img_52997.jpg'

In [62]:
train_batches.shuffle = True
for i, _ in enumerate(train_batches):
    if i > train_batches.N / train_batches.batch_size + 1:
        break
print np.argmax(next(train_batches)[1], axis=1)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [63]:
train_batches.shuffle = True
for i, _ in enumerate(train_batches):
    if i > train_batches.N / train_batches.batch_size + 1:
        break
print np.argmax(next(train_batches)[1], axis=1)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [5]:
# training shuffle, val not shuffled didn't work
# however, for preprocessing, must not shuffle training because won't match with the labels. Fit has shuffle parameter 

In [2]:
#model.fit_generator(val_batches, val_batches.nb_sample, 1)

In [None]:
model.evaluate(val_features, val_labels)



[0.082116212921133538, 0.98170051327828611]

In [None]:
model.save_weights(model_path+'no_dropout.h5')
model.load_weights(model_path+'no_dropout.h5')

In [None]:
model.save_weights(model_path+'no_dropout.h5')
model.load_weights(model_path+'no_dropout.h5')

In [None]:
## error: accuracy is low, essentially random. Could be features are bad or labels are shuffled, no: learning rate too high
## error: splitting model to dense doesn't work: don't know why input shape is causing problem, yes: input shape incorrect

In [None]:
from matplotlib import pyplot as plt
%matplotlib inlinefrom sklearn.metrics import confusion_matrix
from utils import get_batches, plot_confusion_matrix

In [8]:
"""
# submission
import pandas as pd

test_batches = vgg.get_batches(path + 'test/', batch_size=batch_size * 2, shuffle=False)
df_filesnames = pd.DataFrame({'img': [name.split('/')[1] for name in test_batches.filenames]}) # have to give correct key name
""";

Found 79726 images belonging to 1 classes.


In [None]:
from IPython.display import FileLink

predictions = pd.DataFrame(temp, columns=vgg.classes)
#pd.read_csv('temp.csv', names=sorted(val_batches.class_indices, key=val_batches.class_indices.get))
pd.concat([df_filesnames, predictions], axis=1).to_csv(path + 'submission_2.csv', index=False)

FileLink(path + 'submission_2.csv')

In [None]:
# cannot modify layers after they have been created

from keras.layers.core import Dense, Dropout
from keras.models import Sequential
from keras.optimizers import Adam
import numpy as np
from sklearn.preprocessing import OneHotEncoder

def get_fc_model_2(conv_layers, dense_layers, opt=None, new_dropout_p=None):
    
    def proc_wgts(layer): # don't know how to reflate the weights if given
        # different p for different layers
        return [o / 2 for o in layer.get_weights()]
    
    def make_new_layer(layer, new_dropout_p=0.0):
        if type(layer) is Dense:
            return Dense(output_dim=layer.output_dim, input_shape=
                         layer.input_shape[1:], activation=layer.activation)
        # notice the input_shape is a slice, also takes care of number of 
        # output nodes and softmax
        elif type(layer) is Dropout:
            return Dropout(p=new_dropout_p, input_shape=layer.input_shape[1:])
        else:
            raise Exception('Unexpected layer')
    
    if new_dropout_p is None:
        new_dropout_p = 0.0
    model = Sequential([make_new_layer(layer, new_dropout_p) for layer in dense_layers])

    for l1,l2 in zip(model.layers, dense_layers): 
        l1.set_weights(proc_wgts(l2))
        
    if opt is None:
        opt = Adam(lr=0.00001) # need small learning rate or else 
        # loss will increase and accuracy will decrease
    model.compile(optimizer=opt, loss='categorical_crossentropy', 
                  metrics=['accuracy'])
    return model


def onehot(x): 
    return np.array(OneHotEncoder().fit_transform(x.reshape(-1, 1)).todense())


trn_labels = onehot(train_batches.classes)
val_labels = onehot(val_batches.classes)

dense_model = get_fc_model_2(conv_layers, dense_layers)
print(dense_model.evaluate(trn_features, trn_labels))
print(dense_model.evaluate(val_features, val_labels))
"""
17943/17943 [==============================] - 6s     
[1.7024225849694168, 0.8673577439703285]
4480/4481 [============================>.] - ETA: 0s[1.7144375196447119, 0.85538942200401691]
""";
# it appears that splitting the model up and evaluate will give slightly different results than
# original VGG model

# Deprecated
Unless image augmentation, pre-calculating image features through conv-layers will speed up training in dense layers.  

In [None]:
%%time
### predict_generator is faster than manual because it has queue for loading images, though sometimes gives memory error
### try not to run any other scripts while this is performing
test_batches = vgg.get_batches(path + 'test/', batch_size=batch_size * 2, shuffle=False)
temp = vgg.model.predict_generator(test_batches, test_batches.N)

Found 79726 images belonging to 1 classes.


In [None]:
"""
%%time
# manual training model is more robost than generator due to intermediate weight saves 
# 6:52
from tqdm import tqdm
import psutil; import os

test_batches = vgg.get_batches(path + 'test/', batch_size=batch_size * 2, shuffle=False)

images = []
for i in tqdm(range(int(np.ceil(test_batches.N / float(test_batches.batch_size))))):
    minibatch = next(test_batches)[0]
    images.append(vgg.model.predict_on_batch(minibatch))
#    print(psutil.Process(os.getpid()).memory_info().rss / 1e9)

temp = np.concatenate(images)
""";

In [82]:
from IPython.display import FileLink

predictions = pd.DataFrame(temp, columns=vgg.classes)
#pd.read_csv('temp.csv', names=sorted(val_batches.class_indices, key=val_batches.class_indices.get))
pd.concat([df_filesnames, predictions], axis=1).to_csv(path + 'submission_2.csv', index=False)

FileLink(path + 'submission_2.csv')

In [None]:
test_batch, test_scores = vgg.test(path + 'test/', batch_size=batch_size * 2)

In [90]:
%time val_batch, val_scores = vgg.test(path + 'val/', batch_size=batch_size * 2)

Found 4481 images belonging to 10 classes.
CPU times: user 2min 48s, sys: 24.9 s, total: 3min 13s
Wall time: 1min 50s


In [110]:
val_labels = onehot(val_batches.classes)

In [137]:
%%time
#vgg.model.evaluate(val_data, val_labels)

#vgg.model.evaluate_generator(get_batches(path + 'valid', gen, False, batch_size*2), val_batches.N)
vgg.model.evaluate_generator(val_batches, val_batches.N)
#vgg.model.evaluate(val_batch, val_labels)

CPU times: user 2min 38s, sys: 24.7 s, total: 3min 3s
Wall time: 1min 50s


[3.2667276776813283, 0.090381611247489405]

In [10]:
%%time
#vgg.model.evaluate(val_data, val_labels)

#vgg.model.evaluate_generator(get_batches(path + 'valid', gen, False, batch_size*2), val_batches.N)
vgg.model.evaluate_generator(val_batches, val_batches.N)
#vgg.model.evaluate(val_batch, val_labels)

CPU times: user 2min 51s, sys: 27.8 s, total: 3min 18s
Wall time: 2min 1s


[0.53699896356502008, 0.84601651417094403]

In [131]:
from sklearn.metrics import log_loss
import numpy as np

print(log_loss(val_labels, val_scores))
print(log_loss(val_labels, np.clip(val_scores, 0.025, 0.975)))
print(log_loss(val_labels, np.clip(val_scores, 0.05, 0.95)))

3.26672767305
2.88110645346
2.65701011186


In [14]:
for layer in vgg.model.layers:
    if type(layer) is Dense:
        layer.trainable = True
    else:
        layer.trainable = False

In [16]:
train_data = get_data('distracted_driver/imgs/train/in/')

OSError: [Errno 2] No such file or directory: 'distracted_driver/imgs/train/in/'

In [None]:
# Use batch size of 1 since we're just doing preprocessing on the CPU
val_batches = get_batches(path + 'valid', shuffle=False, batch_size=1)
batches = get_batches(path + 'train', shuffle=False, batch_size=1)

In [None]:
# np.clip; keras.metrics.categorical_crossentropy(val_labels, do_clip(val_preds, 0.93)).eval()
# save data, weights
# plot confusion
# check most right, wrong, ambivalent
# train more dense layers

In [29]:
%%capture output

print('hello panda')
print(5 + 3)
print('done')

In [30]:
output.show()

hello


In [None]:
K.categorical_crossentropy(y_pred, y_true)

In [None]:
#(val_classes, trn_classes, val_labels, trn_labels, 
#    val_filenames, filenames, test_filenames) = vgg.get_classes(path)