## Train sea lion classifier with a convnet

In [1]:
# --- Paths -------------------------------------------------------------------
# INPUT_DIR holds the HDF5 patch dataset opened later with h5py;
# OUTPUT_DIR receives the log file, the tf-logs/ and weights/ sub-directories
# and the final weights file.
INPUT_DIR = '../../input/kaggle-sea-lion/02/'
OUTPUT_DIR = '../../output/kaggle-sea-lion/05/'

# --- Dataset selection -------------------------------------------------------
# IMAGE_DIMS is passed both to utils.dataset_name() (to pick the dataset file)
# and to the model builder. Alternative values left by the author:
# (148,148,3) and (32,32,3).
IMAGE_DIMS = (84, 84, 3)
INPUT_DATASET_NAME = 'lion-patches-30px'

# --- Weights -----------------------------------------------------------------
# LOAD_WEIGHTS_FILE: optional path of weights to load before training (None = skip).
# SAVE_WEIGHTS_FILE: where the weights are written once training finishes.
LOAD_WEIGHTS_FILE = None
SAVE_WEIGHTS_FILE = OUTPUT_DIR + 'last-weights.h5'

# --- Batching ----------------------------------------------------------------
# Batch size shared by the train and validation generators.
BATCH_SIZE = 128

In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import h5py
import matplotlib.pyplot as plt
import sklearn
import os
import glob

import keras
from keras.preprocessing.image import ImageDataGenerator

from modules.logging import logger
import modules.utils as utils
from modules.utils import Timer
import modules.logging
import modules.cnn as cnn
import modules.lions as lions

Using TensorFlow backend.


## Prepare

### Prepare output dir

In [3]:
# Sub-directories of OUTPUT_DIR; both are handed to cnn.get_callbacks_keras()
# in the training cell.
TF_LOGS_DIR = OUTPUT_DIR + 'tf-logs/'
WEIGHTS_DIR = OUTPUT_DIR + 'weights/'

# Create the output tree first so the file logger has a directory to write to.
utils.mkdirs(
    OUTPUT_DIR,
    dirs=['tf-logs', 'weights'],
    recreate=False
)
modules.logging.setup_file_logger(OUTPUT_DIR + 'out.log')

# Full path of the HDF5 dataset selected by name and patch dimensions.
input_dataset_path = INPUT_DIR + utils.dataset_name(INPUT_DATASET_NAME, IMAGE_DIMS)

logger.info('Output dirs created')

2017-04-15 03:54:07,558 INFO Output dirs created


### Prepare CNN model

In [4]:
logger.info('Load CNN model')

# Build the "simple" convnet for the configured patch dimensions.
# lions.convnet_alexnet2_lion_keras(IMAGE_DIMS) is the alternative builder the
# author left commented out.
model = lions.convnet_simple_lion_keras(IMAGE_DIMS)

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

2017-04-15 03:54:07,566 INFO Load CNN model


### Prepare train, validate and test data flows

In [5]:
logger.info('Using dataset ' + input_dataset_path + ' as input')
# Opened without a `with` block: the generators created below receive this
# handle and are consumed again in the training cell, so it must stay open.
h5file = h5py.File(input_dataset_path, 'r')

logger.info('train data')
train_batch_generator = utils.BatchGeneratorXYH5(
    h5file,
    start_ratio=0,
    end_ratio=1,
    batch_size=BATCH_SIZE
)
logger.info('train size ' + str(train_batch_generator.size) + ' ' + str(train_batch_generator.nr_batches))

# Random augmentation: arbitrary rotation only, no shifts or flips.
# NOTE(review): Keras applies featurewise_center / featurewise_std_normalization
# only after ImageDataGenerator.fit() has been called on sample data; no fit()
# call is visible in this notebook -- confirm whether modules.utils does it.
image_randomize_generator = ImageDataGenerator(
    rotation_range=360,
    width_shift_range=0,
    height_shift_range=0,
    horizontal_flip=False,
    vertical_flip=False,
    featurewise_center=True,
    featurewise_std_normalization=True,
    samplewise_center=False,
    samplewise_std_normalization=False,
    zca_whitening=False
)

# Class balancing on top of the raw batch flow, with equal weight per class.
# NOTE(review): the batch generator spans ratios 0..1 while the balancer is
# given 0..0.02; how the two ranges combine is defined in modules.utils --
# confirm the effective training range does not overlap the validation and
# test ranges used elsewhere in this notebook.
train_balance_generator = utils.ClassBalancerGeneratorXY(
    train_batch_generator.flow(),
    train_batch_generator.size,
    start_ratio=0,
    end_ratio=0.02,
    batch_size=BATCH_SIZE,
    classes_distribution_weight=(1,1,1,1,1,1),
    max_augmentation_ratio=0,
    max_undersampling_ratio=0,
    image_augmentation=image_randomize_generator
)

# Generator that is fed to model.fit_generator() in the training cell.
train_generator = utils.image_augmentation_xy(
    train_balance_generator.flow(),
    image_randomize_generator
)


logger.info('dump generator data')
# Dump the balanced generator output into X_train / Y_train so that the shapes
# and the resulting class distribution can be inspected below.
# (A redundant `dataset_path` assignment was dropped here: it duplicated
# `input_dataset_path` and was never read.)
X_train, Y_train = utils.dump_xy_to_array(train_balance_generator.flow(), train_balance_generator.size)
print(np.shape(X_train))
print(np.shape(Y_train))
print(utils.class_distribution(Y_train).astype('uint'))


print('show images from generator')
# Display the first few images produced by the balanced generator.
# The limit is checked right after each image is shown, so iteration stops
# without requesting another batch from the generator once enough images
# have been displayed.
preview_limit = 3
shown_count = 0
for xs, ys in train_balance_generator.flow():
    for x in xs:
        print(np.shape(x))
        utils.show_image(x, is_bgr=True)
        shown_count += 1
        if shown_count >= preview_limit:
            break
    if shown_count >= preview_limit:
        break

logger.info('validate data')
# Validation reads the last 1% of the dataset (ratios 0.99..1).
validate_batch_generator = utils.BatchGeneratorXYH5(
    h5file,
    start_ratio=0.99,
    end_ratio=1,
    batch_size=BATCH_SIZE
)
logger.info(
    'validate size '
    + str(validate_batch_generator.size)
    + ' '
    + str(validate_batch_generator.nr_batches)
)
# The raw batch generator is used as-is for validation (no balancing or
# augmentation wrapper); the training cell calls .flow() on it.
validate_generator = validate_batch_generator

2017-04-15 03:54:07,770 INFO Using dataset ../../input/kaggle-sea-lion/02/lion-patches-30px-84-84.h5 as input
2017-04-15 03:54:07,772 INFO train data
2017-04-15 03:54:07,774 INFO train size 74240 580.0
2017-04-15 03:54:07,776 INFO analysing input data class distribution
2017-04-15 03:54:07,777 INFO > [started] generator dump...


74240/74240

2017-04-15 03:54:32,550 INFO > [done]    generator dump (24772.583 ms)
2017-04-15 03:54:32,592 INFO raw sample class distribution
2017-04-15 03:54:32,593 INFO 0: 9860
2017-04-15 03:54:32,594 INFO 1: 1160
2017-04-15 03:54:32,596 INFO 2: 38860
2017-04-15 03:54:32,597 INFO 3: 4060
2017-04-15 03:54:32,598 INFO 4: 20300
2017-04-15 03:54:32,599 INFO 5: 0
2017-04-15 03:54:32,601 INFO overall items per class: 38860
2017-04-15 03:54:32,602 INFO augmentation/undersampling ratio per class
2017-04-15 03:54:32,603 INFO 0: 3.94117647059
2017-04-15 03:54:32,604 INFO 1: 33.5
2017-04-15 03:54:32,605 INFO 2: 1.0
2017-04-15 03:54:32,607 INFO 3: 9.57142857143
2017-04-15 03:54:32,608 INFO 4: 1.91428571429
2017-04-15 03:54:32,609 INFO 5: 0.0
2017-04-15 03:54:32,610 INFO dump generator data
2017-04-15 03:54:32,612 INFO > [started] generator dump...
2017-04-15 03:54:32,613 INFO generating next batch 1


NameError: name 'image_data_generator' is not defined

## Train model

In [None]:
# Optionally resume from previously saved weights. Skipped when no path is
# configured or the file does not exist.
if LOAD_WEIGHTS_FILE is not None and os.path.isfile(LOAD_WEIGHTS_FILE):
    logger.info('Loading previous weights...')
    # Load from the configured path (the previous code referenced an undefined
    # name here and raised NameError whenever a weights file was configured).
    model.load_weights(LOAD_WEIGHTS_FILE)

logger.info('Starting CNN training...')

# NOTE(review): one step of one epoch, validated on a single batch -- this
# looks like a smoke-test setting; raise these values for a real training run.
history = model.fit_generator(
    train_generator,
    steps_per_epoch=1,
    epochs=1,  # Keras 2 name of the legacy `nb_epoch` argument
    callbacks=cnn.get_callbacks_keras(model, WEIGHTS_DIR, TF_LOGS_DIR),
    validation_data=validate_generator.flow(),
    validation_steps=1,
    verbose=1
)

# Persist the final weights unless saving is disabled in the config cell.
if SAVE_WEIGHTS_FILE is not None:
    logger.info('Saving last weights...')
    model.save_weights(SAVE_WEIGHTS_FILE)

### Epoch accuracy/loss

In [None]:
logger.info('Training info')
# Pass the history object returned by model.fit_generator() to the project
# helper that presents it (per the section heading: epoch accuracy/loss).
cnn.show_training_info_keras(history)

### Confusion matrix

In [None]:
from sklearn import metrics  # not referenced in this cell

# Evaluate the model on a small slice of the dataset (ratios 0.7..0.71).
# The file handle gets its own name so that the global `h5file`, which backs
# the train/validation generators, is not rebound to a handle that is closed
# when this `with` block exits.
# NOTE(review): confirm this slice is not part of the data the model was
# trained on; the training batch generator is created over ratios 0..1.
with h5py.File(input_dataset_path, 'r') as test_h5file:
    test_batch_generator = utils.BatchGeneratorXYH5(test_h5file, start_ratio=0.7, end_ratio=0.71, batch_size=10)
    cnn.evaluate_dataset_keras(test_batch_generator.flow(), test_batch_generator.nr_batches, test_batch_generator.size, model)