In [25]:
import numpy as np

import os

os.environ['KERAS_BACKEND']='theano'

import glob
import cv2
import math
import pickle
import datetime
import pandas as pd

from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D, \
                                       ZeroPadding2D

# from keras.layers.normalization import BatchNormalization
# from keras.optimizers import Adam
from keras.optimizers import SGD
from keras.utils import np_utils
from keras.models import model_from_json
# from sklearn.metrics import log_loss
from numpy.random import permutation

In [26]:
np.random.seed(2016)
use_cache = 1
# color type: 1 - grey, 3 - rgb
color_type_global = 3
img_rows = 64
img_cols = 64

In [27]:


def get_im(path, img_rows, img_cols, color_type=3):
    # Load as color
    if color_type == 1:
        img = cv2.imread(path, 0)
    elif color_type == 3:
        img = cv2.imread(path)
    # Reduce size
    resized = cv2.resize(img, (img_cols, img_rows),cv2.INTER_LINEAR)
    # mean_pixel = [103.939, 116.799, 123.68]
    # resized = resized.astype(np.float32, copy=False)

    # for c in range(3):
    #    resized[:, :, c] = resized[:, :, c] - mean_pixel[c]
    # resized = resized.transpose((2, 0, 1))
    # resized = np.expand_dims(img, axis=0)
    return resized



In [28]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
encoded = le.fit(['LAG', 'YFT', 'OTHER', 'DOL', 'SHARK', 'NoF', 'BET', 'ALB'])

In [29]:
# def load_train(img_rows=64, img_cols=64, color_type=3):    
#     X_train = []
#     y_train = []
#     print('Read train images')
#     folders = ['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT']
#     for j in folders:
#         index = folders.index(j)
#         print('Load folder {}'.format(j))
#         path = os.path.join('..', 'input', 'train', j , '*.jpg')
#         files = glob.glob(path)
#         for fl in files:
#             flbase = os.path.basename(fl)
#             img = get_im(fl, img_rows, img_cols, color_type)
#             X_train.append(img)
#             y_train.append(index)
# #     y_train = le.transform(y_train)
# #     y_train = LabelEncoder().fit_transform(y_train)
# #     y_train = np_utils.to_categorical(y_train)
#     return X_train, y_train

def get_im_cv2(path):
    img = cv2.imread(path)
    resized = cv2.resize(img, (64, 64), cv2.INTER_LINEAR)
    return resized

def load_train():
    X_train = []
    X_train_id = []
    y_train = []
    start_time = time.time()

    print('Read train images')
    folders = ['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT']
    for fld in folders:
        index = folders.index(fld)
        print('Load folder {} (Index: {})'.format(fld, index))
        path = os.path.join('/nfs/science/shared/ipythonNotebooks/anantk/Kgl/Fish/train/', 'train', fld, '*.jpg')
        files = glob.glob(path)
        for fl in files:
            flbase = os.path.basename(fl)
            img = get_im_cv2(fl)
            X_train.append(img)
            X_train_id.append(flbase)
            y_train.append(index)

    print('Read train data time: {} seconds'.format(round(time.time() - start_time, 2)))
    return X_train, y_train, X_train_id

In [30]:
def cache_data(data, path):
    if not os.path.isdir('cache'):
        os.mkdir('cache')
    if os.path.isdir(os.path.dirname(path)):
        file = open(path, 'wb')
        pickle.dump(data, file)
        file.close()
    else:
        print('Directory doesnt exists')
        
def restore_data(path):
    data = dict()
    if os.path.isfile(path):
        print('Restore data from pickle....')
        file = open(path, 'rb')
        data = pickle.load(file)
    return data

def save_model(model, index, cross=''):
    json_string = model.to_json()
    if not os.path.isdir('cache'):
        os.mkdir('cache')
    json_name = 'architecture' + str(index) + cross + '.json'
    weight_name = 'model_weights' + str(index) + cross + '.h5'
    open(os.path.join('cache', json_name), 'w').write(json_string)
    model.save_weights(os.path.join('cache', weight_name), overwrite=True)


def read_model(index, cross=''):
    json_name = 'architecture' + str(index) + cross + '.json'
    weight_name = 'model_weights' + str(index) + cross + '.h5'
    model = model_from_json(open(os.path.join('cache', json_name)).read())
    model.load_weights(os.path.join('cache', weight_name))
    return model


def split_validation_set(train, target, test_size):
    random_state = 1234
    X_train, X_test, y_train, y_test = \
        train_test_split(train, target,
                         test_size=test_size,
                         random_state=random_state)
    return X_train, X_test, y_train, y_test


def dict_to_list(d):
    ret = []
    for i in d.items():
        ret.append(i[1])
    return ret


def merge_several_folds_mean(data, nfolds):
    a = np.array(data[0])
    for i in range(1, nfolds):
        a += np.array(data[i])
    a /= nfolds
    return a.tolist()


def merge_several_folds_geom(data, nfolds):
    a = np.array(data[0])
    for i in range(1, nfolds):
        a *= np.array(data[i])
    a = np.power(a, 1/nfolds)
    return a.tolist()

In [31]:
def load_test(img_rows, img_cols, color_type=1):
    print('Read test images')
    path = os.path.join('..', 'input','test_stg1', '*.jpg')
    files = glob.glob(path)
    X_test = []
    X_test_id = []
    total = 0
    thr = math.floor(len(files)/100)
    for fl in files:
        flbase = os.path.basename(fl)
        img = get_im(fl, img_rows, img_cols, color_type)
        X_test.append(img)
        X_test_id.append(flbase)
        total += 1
        if total % thr == 0:
            print('Read {} images from {}'.format(total, len(files)))

    return X_test, X_test_id


def create_submission(predictions, test_id, info):
    result1 = pd.DataFrame(predictions, columns=['image', 'LAG', 'YFT', 'OTHER', 'DOL', 'SHARK', 'NoF', 'BET', 'ALB'])
    result1.loc[:, 'image'] = pd.Series(test_id, index=result1.index)
    now = datetime.datetime.now()
    if not os.path.isdir('subm'):
        os.mkdir('subm')
    suffix = info + '_' + str(now.strftime("%Y-%m-%d-%H-%M"))
    sub_file = os.path.join('subm', 'submission_' + suffix + '.csv')
    result1.to_csv(sub_file, index=False)

In [46]:
def vgg_std16_model(img_rows, img_cols, color_type=3):
    model = Sequential()
    model.add(ZeroPadding2D((1, 1), input_shape=(color_type,
                                                 img_rows, img_cols),dim_ordering='th'))
    model.add(Convolution2D(64, 3, 3, activation='relu',dim_ordering='th'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(64, 3, 3, activation='relu',dim_ordering='th'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    model.add(ZeroPadding2D((1, 1),dim_ordering='th'))
    model.add(Convolution2D(128, 3, 3, activation='relu',dim_ordering='th'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(128, 3, 3, activation='relu',dim_ordering='th'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    model.add(ZeroPadding2D((1, 1),dim_ordering='th'))
    model.add(Convolution2D(256, 3, 3, activation='relu',dim_ordering='th'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(256, 3, 3, activation='relu',dim_ordering='th'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(256, 3, 3, activation='relu',dim_ordering='th'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    model.add(ZeroPadding2D((1, 1),dim_ordering='th'))
    model.add(Convolution2D(512, 3, 3, activation='relu',dim_ordering='th'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(512, 3, 3, activation='relu',dim_ordering='th'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(512, 3, 3, activation='relu',dim_ordering='th'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    model.add(ZeroPadding2D((1, 1),dim_ordering='th'))
    model.add(Convolution2D(512, 3, 3, activation='relu',dim_ordering='th'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(512, 3, 3, activation='relu',dim_ordering='th'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(512, 3, 3, activation='relu',dim_ordering='th'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2),dim_ordering='th'))

    model.add(Flatten())
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1000, activation='softmax'))

    model.load_weights('/nfs/science/shared/ipythonNotebooks/anantk/vgg16.h5')

    # Code above loads pre-trained data and
    model.layers.pop()
    model.add(Dense(8, activation='softmax'))
    # Learning rate is changed to 0.001
    sgd = SGD(lr=1e-3, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(optimizer=sgd, loss='categorical_crossentropy')
    print('model loading and compilation finished succesfully')
    return model

In [36]:
# def read_and_normalize_and_shuffle_train_data(img_rows=64, img_cols=64,
#                                               color_type=3):

#     cache_path = os.path.join('cache', 'train_r_' + str(img_rows) +
#                               '_c_' + str(img_cols) + '_t_' +
#                               str(color_type) + '.dat')

#     if not os.path.isfile(cache_path) or use_cache == 0:
#         train_data, train_target = load_train(img_rows, img_cols, color_type)
#         cache_data((train_data, train_target),cache_path)
#     else:
#         print('Restore train from cache!')
#         (train_data, train_target) = restore_data(cache_path)
# #     print('Convert to numpy...')
# #     train_data = np.array(train_data, dtype=np.uint8)
# #     train_target = np.array(train_target, dtype=np.uint8)
        
#     train_data = np.array(train_data, dtype=np.uint8)    
#     train_target = np.array(train_target, dtype=np.uint8)

#     train_data = train_data.transpose((0, 3, 1, 2))

#     train_target = np_utils.to_categorical(train_target, 8)
#     train_data = train_data.astype('float32')
    
#     ## check mean pixel value
#     mean_pixel = [103.939, 116.779, 123.68]
#     for c in range(3):
#         train_data[:, c, :, :] = train_data[:, c, :, :] - mean_pixel[c]
#     # train_data /= 255
#     perm = permutation(len(train_target))
#     train_data = train_data[perm]
#     train_target = train_target[perm]
#     print('Train shape:', train_data.shape)
#     print(train_data.shape[0], 'train samples')
#     return train_data, train_target


# def read_and_normalize_test_data(img_rows=64, img_cols=64, color_type=3):
#     cache_path = os.path.join('cache', 'test_r_' + str(img_rows) +
#                               '_c_' + str(img_cols) + '_t_' +
#                               str(color_type) + '.dat')
#     if not os.path.isfile(cache_path) or use_cache == 0:
#         test_data, test_id = load_test(img_rows, img_cols, color_type)
#         cache_data((test_data, test_id), cache_path)
#     else:
#         print('Restore test from cache!')
#         (test_data, test_id) = restore_data(cache_path)

#     test_data = np.array(test_data, dtype=np.uint8)

#     test_data = test_data.transpose((0, 3, 1, 2))

#     test_data = test_data.astype('float32')
#     mean_pixel = [103.939, 116.779, 123.68]
#     for c in range(3):
#         test_data[:, c, :, :] = test_data[:, c, :, :] - mean_pixel[c]
#     # test_data /= 255
#     print('Test shape:', test_data.shape)
#     print(test_data.shape[0], 'test samples')
#     return test_data, test_id


def read_and_normalize_train_data():
    train_data, train_target, train_id = load_train()

    print('Convert to numpy...')
    train_data = np.array(train_data, dtype=np.uint8)
    train_target = np.array(train_target, dtype=np.uint8)

    print('Reshape...')
    train_data = train_data.transpose((0, 3, 1, 2))

    print('Convert to float...')
    train_data = train_data.astype('float32')
    train_data = train_data / 255
    train_target = np_utils.to_categorical(train_target, 8)

    print('Train shape:', train_data.shape)
    print(train_data.shape[0], 'train samples')
    return train_data, train_target, train_id


def read_and_normalize_test_data():
    start_time = time.time()
    test_data, test_id = load_test()

    test_data = np.array(test_data, dtype=np.uint8)
    test_data = test_data.transpose((0, 3, 1, 2))

    test_data = test_data.astype('float32')
    test_data = test_data / 255

    print('Test shape:', test_data.shape)
    print(test_data.shape[0], 'test samples')
    print('Read and process test data time: {} seconds'.format(round(time.time() - start_time, 2)))
    return test_data, test_id


In [42]:
def run_cross_validation(nfolds=10, nb_epoch=10, split=0.2, modelStr='initial'):

    # Now it loads color image
    # input image dimensions
    img_rows, img_cols = 64, 64
    batch_size = 16
    random_state = 20

    train_data, train_target, train_id = read_and_normalize_train_data()

#         read_and_normalize_and_shuffle_train_data(64, 64,3)
        
    # ishuf_train_data = []
    # shuf_train_target = []
    # index_shuf = range(len(train_target))
    # shuffle(index_shuf)
    # for i in index_shuf:
    #     shuf_train_data.append(train_data[i])
    #     shuf_train_target.append(train_target[i])

    # yfull_train = dict()
    # yfull_test = []
    num_fold = 0
    kf = KFold(len(train_target), n_folds=nfolds,
               shuffle=True, random_state=random_state)
    for train_imgs, test_imgs in kf:
        num_fold += 1
        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        # print('Split train: ', len(X_train), len(Y_train))
        # print('Split valid: ', len(X_valid), len(Y_valid))
        # print('Train drivers: ', unique_list_train)
        # print('Test drivers: ', unique_list_valid)
        # model = create_model_v1(img_rows, img_cols, color_type_global)
        # model = vgg_bn_model(img_rows, img_cols, color_type_global)
        model = vgg_std16_model(64, 64, 3)
        # correct validation to kfold rather than singel fold
        model.fit(train_data, train_target, batch_size=batch_size,
                  nb_epoch=nb_epoch,
                  show_accuracy=True, verbose=1,
                  validation_split=split, shuffle=True)

        # print('losses: ' + hist.history.losses[-1])

        # print('Score log_loss: ', score[0])

        save_model(model, num_fold, modelStr)

        # predictions_valid = model.predict(X_valid, batch_size=128, verbose=1)
        # score = log_loss(Y_valid, predictions_valid)
        # print('Score log_loss: ', score)
        # Store valid predictions
        # for i in range(len(test_index)):
        #    yfull_train[test_index[i]] = predictions_valid[i]

    print('Start testing............')
#     test_data, test_id = read_and_normalize_test_data(64, 64,3)
    test_data, test_id = read_and_normalize_test_data()
    yfull_test = []

    for index in range(1, num_fold + 1):
        # 1,2,3,4,5
        # Store test predictions
        model = read_model(index, modelStr)
        test_prediction = model.predict(test_data, batch_size=128, verbose=1)
        yfull_test.append(test_prediction)

    info_string = 'loss_' + modelStr \
                  + '_r_' + str(img_rows) \
                  + '_c_' + str(img_cols) \
                  + '_folds_' + str(nfolds) \
                  + '_ep_' + str(nb_epoch)

    test_res = merge_several_folds_mean(yfull_test, nfolds)
    create_submission(test_res, test_id, info_string)

In [47]:
import time
from keras import backend as K
K.set_image_dim_ordering('th')
# nfolds, nb_epoch, split
run_cross_validation(3, 20, 0.15, '_vgg_16_2x20')

# nb_epoch, split
# run_one_fold_cross_validation(10, 0.1)

# test_model_and_submit(1, 10, 'high_epoch')

Read train images
Load folder ALB (Index: 0)
Load folder BET (Index: 1)
Load folder DOL (Index: 2)
Load folder LAG (Index: 3)
Load folder NoF (Index: 4)
Load folder OTHER (Index: 5)
Load folder SHARK (Index: 6)
Load folder YFT (Index: 7)
Read train data time: 60.5 seconds
Convert to numpy...
Reshape...
Convert to float...
('Train shape:', (3777, 3, 64, 64))
(3777, 'train samples')
Start KFold number 1 from 3
model loading and compilation finished succesfully
Train on 3210 samples, validate on 567 samples
Epoch 1/20


ValueError: Shape mismatch: x has 2048 cols (and 16 rows) but y has 25088 rows (and 4096 cols)
Apply node that caused the error: Dot22(Reshape{2}.0, dense_9_W)
Toposort index: 203
Inputs types: [TensorType(float32, matrix), TensorType(float32, matrix)]
Inputs shapes: [(16, 2048), (25088, 4096)]
Inputs strides: [(8192, 4), (16384, 4)]
Inputs values: ['not shown', 'not shown']
Outputs clients: [[Elemwise{add,no_inplace}(Dot22.0, InplaceDimShuffle{x,0}.0), Elemwise{Composite{Switch(i0, (Composite{(Abs(i0) + i1 + i2)}(i1, i2, i3) * i4), (i5 * Composite{(Abs(i0) + i1 + i2)}(i1, i2, i3)))}}[(0, 2)](InplaceDimShuffle{x,x}.0, Elemwise{add,no_inplace}.0, Dot22.0, InplaceDimShuffle{x,0}.0, Elemwise{Composite{Cast{float32}(LT(i0, i1))}}[(0, 0)].0, TensorConstant{(1, 1) of 0.5})]]

HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.