Source: https://www.kaggle.com/blackcore/the-nature-conservancy-fisheries-monitoring/fish-keras-test.

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
import os
import glob
import cv2
from tqdm import tqdm, tqdm_notebook
import datetime
import time
import warnings
# warnings.filterwarnings("ignore")

In [3]:
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers.convolutional import (Convolution2D, MaxPooling2D,
                                        ZeroPadding2D, AveragePooling2D)
from keras.optimizers import SGD, Adagrad
from keras.callbacks import EarlyStopping
from keras.utils import np_utils
from keras.constraints import maxnorm
from sklearn.metrics import log_loss
from keras import __version__ as keras_version

Using TensorFlow backend.


In [4]:
# Report the installed Keras version. The model/training code below uses
# Keras 1.x-style arguments (nb_epoch, dim_ordering, init).
print('Keras version: {}'.format(keras_version))

Keras version: 1.2.0


In [5]:
# Seed NumPy's global random generator.
# NOTE(review): whether this alone makes Keras/TensorFlow training fully
# deterministic is not established in this notebook -- confirm if needed.
np.random.seed(1989)

In [6]:
def get_im_cv2(path):
    """Read an image file and return it as a 64x64 RGB array.

    Args:
        path: Path of the image file to read.

    Returns:
        The image resized to 64x64 with channels in RGB order.

    Raises:
        IOError: If OpenCV cannot read/decode the file (``cv2.imread``
            returns ``None`` in that case instead of raising).
    """
    img = cv2.imread(path)
    if img is None:
        # Fail with the offending path rather than an opaque
        # "'NoneType' object is not subscriptable" TypeError.
        raise IOError('Could not read image: {}'.format(path))
    img = img[:, :, ::-1]  # BGR -> RGB
    # The third positional argument of cv2.resize is `dst`, not the
    # interpolation flag, so the flag has to be passed by keyword.
    resized = cv2.resize(img, (64, 64), interpolation=cv2.INTER_LINEAR)
    return resized

In [7]:
def load_train(train_dir=os.path.join('..', 'data', 'train')):
    """Load every training image, its class index and its file name.

    Each class lives in its own sub-folder of ``train_dir``; the class index
    is the position of the folder name in the ``folders`` list below.

    Args:
        train_dir: Directory holding one sub-folder per class. Defaults to
            ``../data/train``.

    Returns:
        A tuple ``(X_train, y_train, X_train_id)`` of three parallel lists:
        the images as returned by ``get_im_cv2``, the integer class indexes,
        and the image file base names.
    """
    X_train = []
    X_train_id = []
    y_train = []
    start_time = time.time()

    tqdm.write('Read train images')
    folders = ['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT']
    for index, fld in enumerate(folders):
        tqdm.write('Load folder {} (Index: {})'.format(fld, index))
        path = os.path.join(train_dir, fld, '*.jpg')
        # glob returns files in arbitrary order; sort (as load_test does) so
        # the sample order -- and hence the seeded KFold splits -- does not
        # depend on the filesystem.
        files = sorted(glob.glob(path))
        for fl in tqdm_notebook(files):
            flbase = os.path.basename(fl)
            img = get_im_cv2(fl)
            X_train.append(img)
            X_train_id.append(flbase)
            y_train.append(index)

    tqdm.write('Read train data time: {} seconds'.format(
        round(time.time() - start_time, 2)))
    return X_train, y_train, X_train_id

In [8]:
def load_test():
    """Load all stage-1 test images in sorted file-path order.

    Returns:
        A tuple ``(X_test, X_test_id)`` of two parallel lists: the images as
        returned by ``get_im_cv2`` and the image file base names.
    """
    pattern = os.path.join('..', 'data', 'test_stg1', '*.jpg')
    X_test, X_test_id = [], []

    # Sorted so that the row order of the predictions is deterministic.
    for image_path in tqdm_notebook(sorted(glob.glob(pattern))):
        X_test.append(get_im_cv2(image_path))
        X_test_id.append(os.path.basename(image_path))

    return X_test, X_test_id

In [9]:
def create_submission(predictions, test_id, info, out_dir='../data/submission'):
    """Write the per-class predictions to a timestamped submission CSV.

    The file is named ``submission_<info>_<YYYY-mm-dd-HH-MM>.csv`` and has one
    column per class followed by an ``image`` column.

    Args:
        predictions: Row-per-image data with eight values per row, in the
            class order ALB, BET, DOL, LAG, NoF, OTHER, SHARK, YFT.
        test_id: Image identifiers, one per prediction row.
        info: Free-form tag embedded in the file name.
        out_dir: Directory the CSV is written to; created if it is missing.
            Defaults to ``../data/submission``.
    """
    result1 = pd.DataFrame(predictions,
                           columns=['ALB', 'BET', 'DOL', 'LAG',
                                    'NoF', 'OTHER', 'SHARK', 'YFT'])
    result1.loc[:, 'image'] = pd.Series(test_id, index=result1.index)

    # Without this, to_csv fails on a fresh checkout where the output
    # directory has never been created.
    os.makedirs(out_dir, exist_ok=True)

    now = datetime.datetime.now()
    sub_file = os.path.join(
        out_dir,
        'submission_' + info + '_' + now.strftime("%Y-%m-%d-%H-%M") + '.csv')
    result1.to_csv(sub_file, index=False)

In [10]:
def read_and_normalize_train_data():
    """Load the training set and convert it to model-ready arrays.

    Images from ``load_train`` are stacked into a uint8 array, cast to
    float32 and divided by 255; the class indexes are passed through
    ``np_utils.to_categorical`` with 8 classes.

    Returns:
        A tuple ``(train_data, train_target, train_id)``.
    """
    images, labels, train_id = load_train()

    print('Convert to numpy...')
    images = np.array(images, dtype=np.uint8)
    labels = np.array(labels, dtype=np.uint8)

    print('Reshape...')
    # Nothing is reshaped for the current layout. For Theano, enable the
    # transpose below:
    # images = images.transpose((0, 3, 1, 2))

    print('Convert to float...')
    train_data = images.astype('float32') / 255
    train_target = np_utils.to_categorical(labels, 8)

    sample_count = train_data.shape[0]
    print('Train shape:', train_data.shape)
    print(sample_count, 'train samples')
    return train_data, train_target, train_id

In [11]:
def read_and_normalize_test_data():
    """Load the test set and convert it to a model-ready array.

    Images from ``load_test`` are stacked into a uint8 array, cast to
    float32 and divided by 255. The elapsed time is printed.

    Returns:
        A tuple ``(test_data, test_id)``.
    """
    started = time.time()
    images, test_id = load_test()

    images = np.array(images, dtype=np.uint8)
    # For Theano, enable the transpose below:
    # images = images.transpose((0, 3, 1, 2))

    test_data = images.astype('float32') / 255

    print('Test shape:', test_data.shape)
    print(test_data.shape[0], 'test samples')
    elapsed = round(time.time() - started, 2)
    print('Read and process test data time: {} seconds'.format(elapsed))
    return test_data, test_id

In [12]:
def dict_to_list(d):
    """Return the values of ``d`` as a list, in ``d.items()`` iteration order."""
    return [value for _key, value in d.items()]

In [13]:
def merge_several_folds_mean(data, nfolds):
    """Average the first ``nfolds`` prediction arrays element-wise.

    Args:
        data: Sequence of equally-shaped array-likes (one per fold).
        nfolds: Number of leading entries of ``data`` to average.

    Returns:
        The element-wise mean as a (nested) Python list.

    Raises:
        ValueError: If ``nfolds`` is smaller than 1 or larger than
            ``len(data)``.
    """
    if nfolds < 1 or nfolds > len(data):
        raise ValueError(
            'nfolds must be between 1 and len(data) ({}), got {}'.format(
                len(data), nfolds))

    # np.array copies, so the caller's first array is never modified.
    total = np.array(data[0])
    if not np.issubdtype(total.dtype, np.inexact):
        # In-place true division is rejected for integer/bool arrays, so
        # accumulate those in float64. Float inputs keep their own dtype.
        total = total.astype(np.float64)
    for fold in range(1, nfolds):
        total += np.array(data[fold])
    total /= nfolds
    return total.tolist()

In [14]:
def create_model():
    """Build and compile the small CNN used for the 8-class problem.

    Layer sequence: two (zero-pad -> 3x3 conv -> 2x2 max-pool) stages with
    8 and 16 filters and 0.2 dropout, then Flatten, Dense(96), Dense(24) and
    a softmax Dense(8). Note the first stage applies dropout before pooling
    and the second after it. Compiled with SGD and categorical
    cross-entropy.

    Returns:
        The compiled ``Sequential`` model.
    """
    # Keyword sets shared by several layers.
    tf_order = dict(dim_ordering='tf')
    conv_opts = dict(activation='relu', dim_ordering='tf', init='he_uniform')
    pool_opts = dict(pool_size=(2, 2), strides=(2, 2), dim_ordering='tf')
    dense_opts = dict(activation='relu', init='he_uniform')

    model = Sequential()
    add = model.add

    # Convolutional stage 1 (input is 64x64x3, channels last).
    add(ZeroPadding2D((1, 1), input_shape=(64, 64, 3), **tf_order))
    add(Convolution2D(8, 3, 3, **conv_opts))
    add(Dropout(0.2))
    add(MaxPooling2D(**pool_opts))

    # Convolutional stage 2.
    add(ZeroPadding2D((1, 1), **tf_order))
    add(Convolution2D(16, 3, 3, **conv_opts))
    add(MaxPooling2D(**pool_opts))
    add(Dropout(0.2))

    # Classifier head.
    add(Flatten())
    add(Dense(96, **dense_opts))
    add(Dropout(0.4))
    add(Dense(24, **dense_opts))
    add(Dropout(0.2))
    add(Dense(8, activation='softmax'))

    optimizer = SGD(lr=1e-2, decay=1e-4, momentum=0.88, nesterov=False)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy')

    return model

In [15]:
def get_validation_predictions(train_data, predictions_valid):
    """Return the first ``len(train_data)`` entries of ``predictions_valid``.

    Entries are looked up by integer position ``0 .. len(train_data) - 1``
    and returned as a new list.
    """
    return [predictions_valid[position]
            for position in range(len(train_data))]

In [16]:
def run_cross_validation_create_models(train_data, train_target, train_id,
                                       nfolds=10, batch_size=32, nb_epoch=8,
                                       random_state=51):
    """Train one model per K-fold split and report the validation log loss.

    For every fold a fresh model from ``create_model`` is fitted on the
    training part (with early stopping on ``val_loss``) and scored on the
    held-out part with ``log_loss``. The overall score is the per-fold score
    weighted by the number of held-out samples.

    Args:
        train_data: Array of training images, indexable by an index array.
        train_target: Array of targets aligned with ``train_data``.
        train_id: Sequence of sample ids; only its length is used, to drive
            the split.
        nfolds: Number of folds.
        batch_size: Batch size for fitting and validation prediction.
        nb_epoch: Maximum number of epochs per fold.
        random_state: Seed for the shuffled ``KFold`` split.

    Returns:
        A tuple ``(info_string, models)``: a tag describing the run (score,
        folds, epochs) and the list of fitted models, one per fold.
    """
    # Only used to tag the run name; presumably mirrors the Dense(96) layer
    # in create_model -- keep in sync manually.
    first_rl = 96

    kf = KFold(n_splits=nfolds, shuffle=True,
               random_state=random_state)
    sum_score = 0
    models = []
    for num_fold, (train_index, test_index) in enumerate(
            kf.split(range(len(train_id))), start=1):
        model = create_model()
        X_train = train_data[train_index]
        Y_train = train_target[train_index]
        X_valid = train_data[test_index]
        Y_valid = train_target[test_index]

        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        print('Split train: ', len(X_train), len(Y_train))
        print('Split valid: ', len(X_valid), len(Y_valid))

        callbacks = [
            EarlyStopping(monitor='val_loss', patience=3, verbose=0),
        ]
        model.fit(X_train, Y_train, batch_size=batch_size,
                  nb_epoch=nb_epoch, shuffle=True, verbose=2,
                  validation_data=(X_valid, Y_valid), callbacks=callbacks)

        # copy=False: skip the extra copy when the data is already float32.
        predictions_valid = model.predict(
            X_valid.astype('float32', copy=False),
            batch_size=batch_size, verbose=2)
        score = log_loss(Y_valid, predictions_valid)
        print('Score log_loss: ', score)
        # Weight by fold size so uneven folds average correctly.
        sum_score += score*len(test_index)

        models.append(model)

    score = sum_score/len(train_data)
    print("Log_loss train independent avg: ", score)

    info_string = ('_' + str(np.round(score,3)) + '_flds_' + str(nfolds)
                   + '_eps_' + str(nb_epoch) + '_fl_' + str(first_rl))
    return info_string, models

In [17]:
def run_cross_validation_process_test(info_string, models):
    """Predict the test set with every fold model and write a submission.

    The test data is loaded once, each model in ``models`` predicts on it,
    the per-model predictions are averaged with
    ``merge_several_folds_mean`` and the result is handed to
    ``create_submission``.

    Args:
        info_string: Run tag; embedded in the submission file name.
        models: Fitted models, one per fold.
    """
    batch_size = 24
    nfolds = len(models)
    test_data, test_id = read_and_normalize_test_data()

    yfull_test = []
    for num_fold, model in enumerate(models, start=1):
        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        yfull_test.append(
            model.predict(test_data, batch_size=batch_size, verbose=2))

    test_res = merge_several_folds_mean(yfull_test, nfolds)
    submission_tag = 'loss_' + info_string + '_folds_' + str(nfolds)
    create_submission(test_res, test_id, submission_tag)

In [18]:
# Number of cross-validation folds for this run (overrides the default of 10
# in run_cross_validation_create_models).
num_folds = 3
# Load and normalise the full training set once; it is reused by every fold.
train_data, train_target, train_id = read_and_normalize_train_data()

Read train images
Load folder ALB (Index: 0)

Load folder BET (Index: 1)

Load folder DOL (Index: 2)

Load folder LAG (Index: 3)

Load folder NoF (Index: 4)

Load folder OTHER (Index: 5)

Load folder SHARK (Index: 6)

Load folder YFT (Index: 7)

Read train data time: 103.3 seconds
Convert to numpy...
Reshape...
Convert to float...
Train shape: (3777, 64, 64, 3)
3777 train samples


In [19]:
# Train one model per fold; keep the run tag and the fitted models for the
# test-set prediction step.
info_string, models = run_cross_validation_create_models(
    train_data, train_target, train_id, num_folds)

Start KFold number 1 from 3
Split train:  2518 2518
Split valid:  1259 1259
Train on 2518 samples, validate on 1259 samples
Epoch 1/8
2s - loss: 1.6843 - val_loss: 1.5718
Epoch 2/8
0s - loss: 1.5357 - val_loss: 1.4274
Epoch 3/8
0s - loss: 1.3770 - val_loss: 1.2500
Epoch 4/8
0s - loss: 1.2660 - val_loss: 1.2107
Epoch 5/8
0s - loss: 1.1478 - val_loss: 1.0646
Epoch 6/8
0s - loss: 1.0084 - val_loss: 0.8901
Epoch 7/8
0s - loss: 0.8832 - val_loss: 0.7676
Epoch 8/8
0s - loss: 0.7912 - val_loss: 0.6779
Score log_loss:  0.677940759999
Start KFold number 2 from 3
Split train:  2518 2518
Split valid:  1259 1259
Train on 2518 samples, validate on 1259 samples
Epoch 1/8
1s - loss: 1.7077 - val_loss: 1.5981
Epoch 2/8
0s - loss: 1.5533 - val_loss: 1.4235
Epoch 3/8
0s - loss: 1.4160 - val_loss: 1.2790
Epoch 4/8
0s - loss: 1.2519 - val_loss: 1.1230
Epoch 5/8
0s - loss: 1.1213 - val_loss: 1.0233
Epoch 6/8
0s - loss: 0.9412 - val_loss: 0.8254
Epoch 7/8
0s - loss: 0.8179 - val_loss: 0.6892
Epoch 8/8
0s - 

In [20]:
# Average the fold models' test predictions and write the submission CSV.
run_cross_validation_process_test(info_string, models)


Test shape: (1000, 64, 64, 3)
1000 test samples
Read and process test data time: 27.42 seconds
Start KFold number 1 from 3
Start KFold number 2 from 3
Start KFold number 3 from 3
