In [21]:
# Read in Libraries
from __future__ import division, print_function
from logbook import Logger, StreamHandler
import sys
StreamHandler(sys.stdout).push_application()
log = Logger('Logbook')
import shutil, csv, time
timestr = time.strftime("%Y%m%d")
import ujson as json
import utils; reload(utils)
from utils import *
import gc
# from __future__ import division, print_function
from theano.sandbox import cuda
from vgg16bn import Vgg16BN
from sklearn import metrics
%matplotlib inline

def accuracyfunc(y_act, y_pred):
    return metrics.accuracy_score(np.argmax(y_act, axis=1), np.argmax(y_pred, axis=1))
    
def refresh_directory_structure(name, sub_dirs):
    gdir = os.path.join(path, name)
    if os.path.exists(gdir):
        shutil.rmtree(gdir)
    os.makedirs(gdir)
    for sub_dir in sub_dirs:
        os.makedirs(os.path.join(gdir, sub_dir))

In [22]:
# Set Parameters and check files
refresh_directories = False
input_exists = True
full = False
log.info('Set Paramters')
path = "../data/fish/crop/"
batch_size=32
clip = 0.99
bags = 1
load_size = (300,300) #(360, 640)
aug_batches = 4

[2017-03-10 23:34:55.045758] INFO: Logbook: Set Paramters


In [23]:
# Read in our VGG pretrained model
log.info('Get VGG')
model = vgg_ft_bn(8)

# Create our VGG model
log.info('Create VGG')
vgg640 = Vgg16BN(load_size).model
vgg640.pop()
vgg640.input_shape, vgg640.output_shape
vgg640.compile(Adam(), 'categorical_crossentropy', metrics=['accuracy'])

# get labels
(val_classes, trn_classes, val_labels, trn_labels,
    val_filenames, filenames, test_filenames) = get_classes(path)

# Read in filenames
log.info('Read filenames')
raw_filenames = [f.split('/')[-1] for f in filenames]
raw_test_filenames = [f.split('/')[-1] for f in test_filenames]
raw_val_filenames = [f.split('/')[-1] for f in val_filenames]

[2017-03-10 23:34:55.058683] INFO: Logbook: Get VGG
[2017-03-10 23:34:58.133834] INFO: Logbook: Create VGG
Found 2685 images belonging to 8 classes.
Found 622 images belonging to 8 classes.
Found 694 images belonging to 1 classes.
[2017-03-10 23:34:58.651447] INFO: Logbook: Read filenames


In [24]:
gen_t = image.ImageDataGenerator(rotation_range=180, height_shift_range=0.05, horizontal_flip=True,
                                 # zoom_range=0.2,
                shear_range=0.05, channel_shift_range=20, width_shift_range=0.05)
da_val_batches = get_batches(path+'valid', gen_t, batch_size=batch_size, shuffle=False, target_size=load_size)
da_trn_batches = get_batches(path+'train', gen_t, batch_size=batch_size, shuffle=False, target_size=load_size)
da_tst_batches = get_batches(path+'test', gen_t, batch_size=batch_size, shuffle=False, target_size=load_size)
gc.collect()

Found 622 images belonging to 8 classes.
Found 2685 images belonging to 8 classes.
Found 694 images belonging to 1 classes.


72913

In [25]:
log.info('Read in data')
if not input_exists:
    
    # Fetch our large images 
    # Precompute the output of the convolutional part of VGG
    log.info('Fetch images')
    log.info('Get VGG output')
    log.info('Write VGG output')
    
    #log.info('Save Val Weights')
    da_conv_val_feat = vgg640.predict_generator(da_val_batches, da_val_batches.nb_sample*aug_batches)
    save_array(path+'../results/da_conv_val_feat.dat', da_conv_val_feat)
    del da_conv_val_feat
    gc.collect()
    
    #log.info('Save Trn Weights')
    da_conv_trn_feat = vgg640.predict_generator(da_trn_batches, da_trn_batches.nb_sample*aug_batches)
    save_array(path+'../results/da_conv_trn_feat.dat', da_conv_trn_feat)
    del da_conv_trn_feat
    gc.collect()
    
    val = get_data(path+'valid', load_size)
    conv_val_feat = vgg640.predict(val, batch_size=16, verbose=1)
    save_array(path+'../results/dano_conv_val_feat.dat', conv_val_feat)
    del val, conv_val_feat
    gc.collect()
    
    test = get_data(path+'test', load_size)
    conv_test_feat = vgg640.predict(test, batch_size=16, verbose=1)
    save_array(path+'../results/dano_conv_test_feat.dat', conv_test_feat)     
    del test, conv_test_feat
    gc.collect()
    gc.collect()
    
    trn = get_data(path+'train', load_size)
    conv_trn_feat = vgg640.predict(trn, batch_size=16, verbose=1)    
    del trn
    gc.collect()
    save_array(path+'../results/dano_conv_trn_feat.dat', conv_trn_feat) 
    del conv_trn_feat
    gc.collect()
    
    # For memory purposes delete out the original train and validation
    log.info('Clear up memory')
    #del trn, val, test
    gc.collect()
    gc.collect()

[2017-03-10 23:34:59.428119] INFO: Logbook: Read in data


In [26]:
# Let's include the real training data as well in its non-augmented form.
da_conv_trn_feat = load_array(path+'../results/da_conv_trn_feat.dat')
dano_conv_trn_feat = load_array(path+'../results/dano_conv_trn_feat.dat')
gc.collect()
da_conv_trn_feat = np.concatenate([da_conv_trn_feat, dano_conv_trn_feat])
del dano_conv_trn_feat 
gc.collect()

# Validation set shouldonly be augmented for a full run
da_conv_val_feat = load_array(path+'../results/dano_conv_val_feat.dat')
if full:
    dano_conv_val_feat = load_array(path+'../results/da_conv_val_feat.dat')
    da_conv_val_feat = np.concatenate([da_conv_val_feat, dano_conv_val_feat])
    del dano_conv_val_feat 
    gc.collect()

conv_test_feat = load_array(path+'../results/dano_conv_test_feat.dat')

In [27]:
# Since we've now got a dataset 3x bigger than before, we'll need to copy our labels 6 times too.
da_trn_labels = np.concatenate([trn_labels]*(aug_batches + 1))
#da_trn_bbox = np.concatenate([trn_bbox]*(aug_batches + 1))

# Validation set shouldonly be augmented for a full run
if full:
    da_val_labels = np.concatenate([val_labels]*(aug_batches + 1))
    #da_val_bbox = np.concatenate([val_bbox]*(aug_batches + 1))
else:
    da_val_labels = val_labels
    #da_val_bbox = val_bbox

In [28]:
classes = ['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT']
def fish_only(mat):
    return np.delete(mat, 4, axis=1)

trn_of_labels = fish_only(da_trn_labels)
val_of_labels = fish_only(da_val_labels)

In [29]:
trn_of_labels

array([[ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.]])

In [30]:
if full:
    da_conv_trn_feat = np.concatenate([da_conv_trn_feat, da_conv_val_feat])
    trn_of_labels = np.concatenate([trn_of_labels, val_of_labels]) 
    #trn_bbox = np.concatenate([trn_bbox, val_bbox])
    
# Our Convolutional Net Architecture
log.info('Create and fit CNN')
p=0.6
# Set up the fully convolutional net (FCN); 
conv_layers,_ = split_at(vgg640, Convolution2D)
nf=128; p=0. # No dropout

[2017-03-10 23:35:17.267754] INFO: Logbook: Create and fit CNN


In [31]:
conv_layers[-1].output_shape[1:]

(512, 18, 18)

In [32]:
def create_model():
    inp = Input(conv_layers[-1].output_shape[1:])
    x = BatchNormalization(axis=1)(inp)
    x = MaxPooling2D()(x)
    x =   Dropout(p)(x)
    x = Convolution2D(nf,3,3, activation='relu', border_mode='same')(x)
    x = BatchNormalization(axis=1)(x)
    x = MaxPooling2D()(x)
    x =   Convolution2D(7,3,3, border_mode='same')(x)
    x =   Dropout(p)(x)
    x =   GlobalAveragePooling2D()(x)
    x_class = Dense(7, activation='softmax', name='class')(x)
    
    
    return inp, x_class

gc.collect()
nf = 512
p  = 0.5
batch_size=128

model, predsls, pvalsls = [], [], []

for ii in range(10):
    inp, x_class = create_model()
    model.append(Model([inp], [x_class]))
    model[ii].compile(Adam(lr=1e-3), loss=['categorical_crossentropy'], metrics=['accuracy']) # , decay=1e-6
    #model[ii].summary()
    model[ii].fit(da_conv_trn_feat, [trn_of_labels], batch_size=batch_size, nb_epoch=3, 
                 validation_data=(da_conv_val_feat, [val_of_labels]))
    
    model[ii].optimizer.lr = 1e-4
    model[ii].fit(da_conv_trn_feat, [trn_of_labels], batch_size=batch_size, nb_epoch=2, 
                 validation_data=(da_conv_val_feat, [val_of_labels]))
    count = 0
    while count < 2:
        model[ii].fit(da_conv_trn_feat, [trn_of_labels], batch_size=batch_size, nb_epoch=1, 
                     validation_data=(da_conv_val_feat, [val_of_labels]))
        predsls.append(model[ii].predict(conv_test_feat, batch_size=batch_size)) # or try 32 batch_size
        pvalsls.append(model[ii].predict(da_conv_val_feat, batch_size=batch_size))
        val_score = "%.3f" % metrics.log_loss(val_of_labels, sum(pvalsls)/len(pvalsls))
        #acc_score = "%.3f" % accuracyfunc(val_of_labels, do_clip(sum(pvalsls)/len(pvalsls), clip))
        log.info('Bagged Validation Logloss ' + str(val_score))
        #log.info('Bagged Validation Accuracy ' + str(acc_score))
        count += 1
    
    model[ii].optimizer.lr = 1e-5
    model[ii].fit(da_conv_trn_feat, [trn_of_labels], batch_size=batch_size, nb_epoch=2, 
                 validation_data=(da_conv_val_feat, [val_of_labels]))
    count = 0
    while count < 2:
        model[ii].fit(da_conv_trn_feat, [trn_of_labels], batch_size=batch_size, nb_epoch=1, 
                     validation_data=(da_conv_val_feat, [val_of_labels]))
        predsls.append(model[ii].predict(conv_test_feat, batch_size=batch_size)) # or try 32 batch_size
        pvalsls.append(model[ii].predict(da_conv_val_feat, batch_size=batch_size))
        val_score = "%.3f" % metrics.log_loss(val_of_labels, sum(pvalsls)/len(pvalsls))
        #acc_score = "%.3f" % accuracyfunc(val_of_labels, do_clip(sum(pvalsls)/len(pvalsls), clip))
        log.info('Bagged Validation Logloss ' + str(val_score))
        #log.info('Bagged Validation Accuracy ' + str(acc_score))
        count += 1
        
        
    model[ii].optimizer.lr = 1e-6
    model[ii].fit(da_conv_trn_feat, [trn_of_labels], batch_size=batch_size, nb_epoch=2, 
                 validation_data=(da_conv_val_feat, [val_of_labels]))
    count = 0
    while count < 2:
        model[ii].fit(da_conv_trn_feat, [trn_of_labels], batch_size=batch_size, nb_epoch=1, 
                     validation_data=(da_conv_val_feat, [val_of_labels]))
        predsls.append(model[ii].predict(conv_test_feat, batch_size=batch_size)) # or try 32 batch_size
        pvalsls.append(model[ii].predict(da_conv_val_feat, batch_size=batch_size))
        val_score = "%.3f" % metrics.log_loss(val_of_labels, sum(pvalsls)/len(pvalsls))
        #acc_score = "%.3f" % accuracyfunc(val_of_labels, do_clip(sum(pvalsls)/len(pvalsls), clip))
        log.info('Bagged Validation Logloss ' + str(val_score))
        #log.info('Bagged Validation Accuracy ' + str(acc_score))
        count += 1

th
Train on 16535 samples, validate on 3110 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train on 16535 samples, validate on 3110 samples
Epoch 1/2
Epoch 2/2
Train on 16535 samples, validate on 3110 samples
Epoch 1/1
[2017-03-10 23:38:17.184402] INFO: Logbook: Bagged Validation Logloss 0.069
Train on 16535 samples, validate on 3110 samples
Epoch 1/1
[2017-03-10 23:38:48.039072] INFO: Logbook: Bagged Validation Logloss 0.065
Train on 16535 samples, validate on 3110 samples
Epoch 1/2
Epoch 2/2
Train on 16535 samples, validate on 3110 samples
Epoch 1/1
[2017-03-10 23:40:15.006679] INFO: Logbook: Bagged Validation Logloss 0.046
Train on 16535 samples, validate on 3110 samples
Epoch 1/1
[2017-03-10 23:40:45.759452] INFO: Logbook: Bagged Validation Logloss 0.036
Train on 16535 samples, validate on 3110 samples
Epoch 1/2
Epoch 2/2
Train on 16535 samples, validate on 3110 samples
Epoch 1/1
[2017-03-10 23:42:12.771907] INFO: Logbook: Bagged Validation Logloss 0.029
Train on 16535 samples, validate on 

In [33]:
# metrics.log_loss(val_labels, do_clip(sum(pvalsls)/len(pvalsls), .9999))
preds = sum(predsls)/len(predsls)
subm = do_clip(preds, clip)

if full:
    subm_name = path+'../results/subm_full_crop_of_' + timestr + '.csv' #'.csv.gz'
else:
    subm_name = path+'../results/subm_part_crop_of_' + timestr + '.csv' #'.csv.gz'

classes = ['ALB', 'BET', 'DOL', 'LAG', 'OTHER', 'SHARK', 'YFT']
submission = pd.DataFrame(subm, columns=classes)
submission.insert(0, 'image', raw_test_filenames)
submission.to_csv(subm_name, index=False)#, compression='gzip')
log.info('Done - files @ ' + subm_name)

[2017-03-11 00:49:39.783825] INFO: Logbook: Done - files @ ../data/fish/crop/../results/subm_full_crop_of_20170310.csv


In [34]:
FileLink(subm_name)