In [2]:
import os, cv2, random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder

os.environ['KERAS_BACKEND']='theano'

import matplotlib.pyplot as plt
from matplotlib import ticker
import seaborn as sns
%matplotlib inline 

from keras.models import Sequential
from keras.layers import Dropout, Flatten, Convolution2D, MaxPooling2D, ZeroPadding2D, Dense, Activation
from keras.optimizers import RMSprop, Adam
from keras.callbacks import EarlyStopping
from keras.utils import np_utils
from keras import backend as K

TRAIN_DIR = '/nfs/science/shared/ipythonNotebooks/anantk/Kgl/Fish/train/train/'
TEST_DIR = '/nfs/science/shared/ipythonNotebooks/anantk/Kgl/Fish/test_stg1/'
FISH_CLASSES = ['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT']
ROWS = 90  #720
COLS = 160 #1280
CHANNELS = 3

Using Theano backend.


In [57]:
def get_images(fish):
    """Load files from train folder"""
    fish_dir = TRAIN_DIR+'{}'.format(fish)
    images = [fish+'/'+im for im in os.listdir(fish_dir)]
    return images

def read_image(src):
    """Read and resize individual images"""
    im = cv2.imread(src, cv2.IMREAD_COLOR)
    im = cv2.resize(im, (COLS, ROWS), interpolation=cv2.INTER_CUBIC)
    return im


files = []
y_all = []

for fish in FISH_CLASSES:
    fish_files = get_images(fish)
    files.extend(fish_files)
    
    y_fish = np.tile(fish, len(fish_files))
    y_all.extend(y_fish)
    print("{0} photos of {1}".format(len(fish_files), fish))
    
y_all = np.array(y_all)



1719 photos of ALB
200 photos of BET
117 photos of DOL
67 photos of LAG
465 photos of NoF
299 photos of OTHER
176 photos of SHARK
734 photos of YFT


In [32]:
X_all = np.ndarray((len(files), ROWS, COLS, CHANNELS), dtype=np.uint8)

for i, im in enumerate(files): 
    X_all[i] = read_image(TRAIN_DIR+im)
    if i%1000 == 0: print('Processed {} of {}'.format(i, len(files)))

print(X_all.shape)

Processed 0 of 3777
Processed 1000 of 3777
Processed 2000 of 3777
Processed 3000 of 3777
(3777, 90, 160, 3)


In [33]:
y_all.shape

# print('Convert to float...')
X_all = X_all.astype('float32')
X_all = X_all / 255
# train_target = np_utils.to_categorical(train_target, 8)

In [54]:
# One Hot Encoding Labels
y_all = LabelEncoder().fit_transform(y_all)
y_all = np_utils.to_categorical(y_all)

# X_train, X_valid, y_train, y_valid = train_test_split(X_all, y_all, 
#                                                     test_size=0.2, random_state=23, 
#                                                     stratify=y_all)


In [59]:
y_all.shape

(3777,)

In [38]:
optimizer = RMSprop(lr=1e-4)
objective = 'categorical_crossentropy'

def center_normalize(x):
    return (x - K.mean(x)) / K.std(x)

model = Sequential()

# model.add(Activation(activation=center_normalize, input_shape=(32, 32, CHANNELS)))

model.add(ZeroPadding2D((1, 1), input_shape=(3, 32, 32), dim_ordering='th'))
model.add(Convolution2D(32, 5, 5, border_mode='same', activation='relu', dim_ordering='th'))
model.add(Convolution2D(32, 5, 5, border_mode='same', activation='relu', dim_ordering='th'))
model.add(MaxPooling2D(pool_size=(2, 2), dim_ordering='th'))

model.add(Convolution2D(64, 3, 3, border_mode='same', activation='relu', dim_ordering='th'))
model.add(Convolution2D(64, 3, 3, border_mode='same', activation='relu', dim_ordering='th'))
model.add(MaxPooling2D(pool_size=(2, 2), dim_ordering='th'))

model.add(Convolution2D(128, 3, 3, border_mode='same', activation='relu', dim_ordering='th'))
model.add(Convolution2D(128, 3, 3, border_mode='same', activation='relu', dim_ordering='th'))
model.add(MaxPooling2D(pool_size=(2, 2), dim_ordering='th'))

model.add(Convolution2D(256, 3, 3, border_mode='same', activation='relu', dim_ordering='th'))
model.add(Convolution2D(256, 3, 3, border_mode='same', activation='relu', dim_ordering='th'))
model.add(MaxPooling2D(pool_size=(2, 2), dim_ordering='th'))


model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(len(FISH_CLASSES)))
model.add(Activation('sigmoid'))

model.compile(loss=objective, optimizer=optimizer)

# def create_model():
#     model = Sequential()
#     model.add(ZeroPadding2D((1, 1), input_shape=(3, 32, 32), dim_ordering='th'))
#     model.add(Convolution2D(4, 3, 3, activation='relu', dim_ordering='th'))
#     model.add(ZeroPadding2D((1, 1), dim_ordering='th'))
#     model.add(Convolution2D(4, 3, 3, activation='relu', dim_ordering='th'))
#     model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2), dim_ordering='th'))

#     model.add(ZeroPadding2D((1, 1), dim_ordering='th'))
#     model.add(Convolution2D(8, 3, 3, activation='relu', dim_ordering='th'))
#     model.add(ZeroPadding2D((1, 1), dim_ordering='th'))
#     model.add(Convolution2D(8, 3, 3, activation='relu', dim_ordering='th'))
#     model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2), dim_ordering='th'))

#     model.add(Flatten())
#     model.add(Dense(32, activation='relu'))
#     model.add(Dropout(0.5))
#     model.add(Dense(32, activation='relu'))
#     model.add(Dropout(0.5))
#     model.add(Dense(8, activation='softmax'))

#     sgd = SGD(lr=1e-2, decay=1e-6, momentum=0.9, nesterov=True)
#     model.compile(optimizer=sgd, loss='categorical_crossentropy')

# return model

In [39]:
early_stopping = EarlyStopping(monitor='val_loss', patience=4, verbose=1, mode='auto')        
        
model.fit(X_train, y_train, batch_size=16, nb_epoch=30,
              validation_split=0.2, verbose=4, shuffle=True, callbacks=[early_stopping])



Exception: Error when checking model input: expected zeropadding2d_input_1 to have shape (None, 3, 32, 32) but got array with shape (3021, 90, 160, 3)

In [17]:
preds = model.predict(X_valid, verbose=1)
print("Validation Log Loss: {}".format(log_loss(y_valid, preds)))

Validation Log Loss: 0.207261213378


In [18]:
test_files = [im for im in os.listdir(TEST_DIR)]
test = np.ndarray((len(test_files), ROWS, COLS, CHANNELS), dtype=np.uint8)

for i, im in enumerate(test_files): 
    test[i] = read_image(TEST_DIR+im)
    
test_preds = model.predict(test, verbose=1)



In [20]:


submission = pd.DataFrame(test_preds, columns=FISH_CLASSES)
submission.insert(0, 'image', test_files)
submission.head()

submission.to_csv('/nfs/science/shared/ipythonNotebooks/anantk/Kgl/Fish/nn_2.csv', index=None)


In [37]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn import preprocessing

import numpy as np
import pandas as pd

from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

import sys
# The path to XGBoost wrappers goes here
sys.path.append('C:\\Users\\Amine\\Documents\\GitHub\\xgboost\\wrapper')
import xgboost as xgb


def score(params):
    print "Training with params : "
    print params
    num_round = int(params['n_estimators'])
    del params['n_estimators']
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_test, label=y_test)
    # watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
    model = xgb.train(params, dtrain, num_round)
    predictions = model.predict(dvalid).reshape((X_test.shape[0], 9))
    score = log_loss(y_test, predictions)
    print "\tScore {0}\n\n".format(score)
    return {'loss': score, 'status': STATUS_OK}


def optimize(trials):
    space = {
             'n_estimators' : hp.quniform('n_estimators', 100, 1000, 1),
             'eta' : hp.quniform('eta', 0.025, 0.5, 0.025),
             'max_depth' : hp.quniform('max_depth', 1, 13, 1),
             'min_child_weight' : hp.quniform('min_child_weight', 1, 6, 1),
             'subsample' : hp.quniform('subsample', 0.5, 1, 0.05),
             'gamma' : hp.quniform('gamma', 0.5, 1, 0.05),
             'colsample_bytree' : hp.quniform('colsample_bytree', 0.5, 1, 0.05),
             'num_class' : 9,
             'eval_metric': 'mlogloss',
             'objective': 'multi:softprob',
             'nthread' : 6,
             'silent' : 1
             }

    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=250)

    print best

# data = np.array(X_all.reshape(X_all.size), copy=False, dtype=np.float32)
# X, y = load_train()
# print "Splitting data into train and valid ...\n\n"
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, 
                                                    test_size=0.2, random_state=23, 
                                                    stratify=y_all)
#Trials object where the history of search will be stored
trials = Trials()

optimize(trials)

Training with params : 
{'colsample_bytree': 1.0, 'silent': 1, 'eval_metric': 'mlogloss', 'nthread': 6, 'min_child_weight': 4.0, 'n_estimators': 950.0, 'subsample': 0.9, 'eta': 0.375, 'objective': 'multi:softprob', 'num_class': 9, 'max_depth': 4.0, 'gamma': 0.65}


ValueError: Input numpy.ndarray must be 2 dimensional

In [3]:
# __author__ = 'ZFTurbo: https://kaggle.com/zfturbo'

import numpy as np
np.random.seed(2016)

import os
import glob
import cv2
import datetime
import pandas as pd
import time
import warnings
warnings.filterwarnings("ignore")

from sklearn.cross_validation import KFold,StratifiedKFold
# from sklearn.model_selection import StratifiedKFold
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.optimizers import SGD,RMSprop
from keras.callbacks import EarlyStopping
from keras.regularizers import l2, activity_l2
from keras.utils import np_utils
from sklearn.metrics import log_loss
from keras import __version__ as keras_version



def get_im_cv2(path):
    img = cv2.imread(path)
    resized = cv2.resize(img, (64, 64), cv2.INTER_LINEAR)
    return resized


def load_train():
    X_train = []
    X_train_id = []
    y_train = []
    start_time = time.time()

    print('Read train images')
    folders = ['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT']
    for fld in folders:
        index = folders.index(fld)
        print('Load folder {} (Index: {})'.format(fld, index))
        path = os.path.join('/nfs/science/shared/ipythonNotebooks/anantk/Kgl/Fish/train/', 'train', fld, '*.jpg')
        files = glob.glob(path)
        for fl in files:
            flbase = os.path.basename(fl)
            img = get_im_cv2(fl)
            X_train.append(img)
            X_train_id.append(flbase)
            y_train.append(index)

    print('Read train data time: {} seconds'.format(round(time.time() - start_time, 2)))
    return X_train, y_train, X_train_id


def load_test():
    path = os.path.join('/nfs/science/shared/ipythonNotebooks/anantk/Kgl/Fish/', 'test_stg1', '*.jpg')
    files = sorted(glob.glob(path))

    X_test = []
    X_test_id = []
    for fl in files:
        flbase = os.path.basename(fl)
        img = get_im_cv2(fl)
        X_test.append(img)
        X_test_id.append(flbase)

    return X_test, X_test_id


def create_submission(predictions, test_id, info):
    result1 = pd.DataFrame(predictions, columns=['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT'])
    result1.loc[:, 'image'] = pd.Series(test_id, index=result1.index)
    now = datetime.datetime.now()
    sub_file = 'submission_' + info + '_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
    result1.to_csv(sub_file, index=False)


def read_and_normalize_train_data():
    train_data, train_target, train_id = load_train()

    print('Convert to numpy...')
    train_data = np.array(train_data, dtype=np.uint8)
    train_target = np.array(train_target, dtype=np.uint8)

    print('Reshape...')
    train_data = train_data.transpose((0, 3, 1, 2))

    print('Convert to float...')
    train_data = train_data.astype('float32')
    train_data = train_data / 255
    train_target = np_utils.to_categorical(train_target, 8)

    print('Train shape:', train_data.shape)
    print(train_data.shape[0], 'train samples')
    return train_data, train_target, train_id


def read_and_normalize_test_data():
    start_time = time.time()
    test_data, test_id = load_test()

    test_data = np.array(test_data, dtype=np.uint8)
    test_data = test_data.transpose((0, 3, 1, 2))

    test_data = test_data.astype('float32')
    test_data = test_data / 255

    print('Test shape:', test_data.shape)
    print(test_data.shape[0], 'test samples')
    print('Read and process test data time: {} seconds'.format(round(time.time() - start_time, 2)))
    return test_data, test_id


def dict_to_list(d):
    ret = []
    for i in d.items():
        ret.append(i[1])
    return ret


def merge_several_folds_mean(data, nfolds):
    a = np.array(data[0])
    for i in range(1, nfolds):
        a += np.array(data[i])
    a /= nfolds
    return a.tolist()


# def create_model():
#     model = Sequential()
#     model.add(ZeroPadding2D((1, 1), input_shape=(3, 64, 64), dim_ordering='th'))
#     model.add(Convolution2D(16, 3, 3, activation='relu', dim_ordering='th'))
#     model.add(Dropout(0.2))
#     model.add(ZeroPadding2D((1, 1), dim_ordering='th'))    
#     model.add(Convolution2D(16, 3, 3, activation='relu', dim_ordering='th'))
#     model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2), dim_ordering='th'))
#     model.add(Dropout(0.2))

# #     model.add(ZeroPadding2D((1, 1), dim_ordering='th'))
# #     model.add(Convolution2D(8, 3, 3, activation='relu', dim_ordering='th'))
# #     model.add(ZeroPadding2D((1, 1), dim_ordering='th'))
# #     model.add(Convolution2D(8, 3, 3, activation='relu', dim_ordering='th'))
# #     model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2), dim_ordering='th'))
# #     model.add(Dropout(0.2))

#     model.add(ZeroPadding2D((1, 1), dim_ordering='th'))
#     model.add(Convolution2D(32, 3, 3, activation='relu', dim_ordering='th'))
#     model.add(Dropout(0.2))
#     model.add(ZeroPadding2D((1, 1), dim_ordering='th'))
#     model.add(Convolution2D(32, 3, 3, activation='relu', dim_ordering='th'))
#     model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2), dim_ordering='th'))
#     model.add(Dropout(0.2))

#     model.add(Flatten())
#     model.add(Dense(108, activation='relu'))
#     model.add(Dropout(0.5))
#     model.add(Dense(36, activation='relu'))
#     model.add(Dropout(0.5))
#     model.add(Dense(8, activation='softmax'))

#     sgd = SGD(lr=1e-2, decay=1e-6, momentum=0.89, nesterov=True)
#     model.compile(optimizer=sgd, loss='categorical_crossentropy')

#     return model

# optimizer = RMSprop(lr=1e-4)

def create_model():
    model = Sequential()
    model.add(ZeroPadding2D((1, 1), input_shape=(3, 64, 64), dim_ordering='th'))
    model.add(Convolution2D(8, 3, 3, activation='relu', dim_ordering='th', init='he_uniform'))
    model.add(Dropout(0.1))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2), dim_ordering='th'))
    
    model.add(ZeroPadding2D((1, 1), dim_ordering='th'))
    model.add(Convolution2D(16, 3, 3, activation='relu', dim_ordering='th', init='he_uniform'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2), dim_ordering='th'))
    model.add(Dropout(0.25))

#     model.add(ZeroPadding2D((1, 1), dim_ordering='th'))
#     model.add(Convolution2D(32, 3, 3, activation='relu', dim_ordering='th', init='he_uniform'))
#     model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2), dim_ordering='th'))
#     model.add(Dropout(0.2))
    
    model.add(Flatten())
    model.add(Dense(200, activation='relu',init='he_uniform'))
    model.add(Dropout(0.5))
    model.add(Dense(40, activation='relu',init='he_uniform',W_regularizer=l2(0.01), activity_regularizer=activity_l2(0.01)))
    model.add(Dropout(0.25))
    model.add(Dense(8, activation='softmax'))

    sgd = SGD(lr=1e-2, decay=1e-4, momentum=0.89, nesterov=False)
    model.compile(optimizer=sgd, loss='categorical_crossentropy')

    return model

# model.add(Dense(64, input_dim=64, W_regularizer=l2(0.01), activity_regularizer=activity_l2(0.01)))

def get_validation_predictions(train_data, predictions_valid):
    pv = []
    for i in range(len(train_data)):
        pv.append(predictions_valid[i])
    return pv


def run_cross_validation_create_models(nfolds=10):
    # input image dimensions
    batch_size = 32
    nb_epoch = 45
    random_state = 51

    train_data, train_target, train_id = read_and_normalize_train_data()

    yfull_train = dict()
#     kf = KFold(len(train_id), n_folds=nfolds, shuffle=True, random_state=random_state)
    kf = StratifiedKFold(y_all,n_folds =nfolds, shuffle=True, random_state=random_state)
    num_fold = 0
    sum_score = 0
    models = []
    for train_index, test_index in kf:
        model = create_model()
        X_train = train_data[train_index]
        Y_train = train_target[train_index]
        X_valid = train_data[test_index]
        Y_valid = train_target[test_index]

#         X_train, X_valid, Y_train, Y_valid = train_test_split(train_data, train_target, 
#                                                     test_size=0.25, random_state=23, 
#                                                     stratify=train_target)

        num_fold += 1
        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        print('Split train: ', len(X_train), len(Y_train))
        print('Split valid: ', len(X_valid), len(Y_valid))

        callbacks = [
            EarlyStopping(monitor='val_loss', patience=3, verbose=0),
        ]
        model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch,
              shuffle=True, verbose=2, validation_data=(X_valid, Y_valid),
              callbacks=callbacks)

        predictions_valid = model.predict(X_valid.astype('float32'), batch_size=batch_size, verbose=2)
        score = log_loss(Y_valid, predictions_valid)
        print('Score log_loss: ', score)
        sum_score += score*len(test_index)

        # Store valid predictions
        for i in range(len(test_index)):
            yfull_train[test_index[i]] = predictions_valid[i]

        models.append(model)

    score = sum_score/len(train_data)
    print("Log_loss train independent avg: ", score)

    info_string = 'loss_' + str(score) + '_folds_' + str(nfolds) + '_ep_' + str(nb_epoch)
    return info_string, models


def run_cross_validation_process_test(info_string, models):
    batch_size = 30
    num_fold = 0
    yfull_test = []
    test_id = []
    nfolds = len(models)

    for i in range(nfolds):
        model = models[i]
        num_fold += 1
        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        test_data, test_id = read_and_normalize_test_data()
        test_prediction = model.predict(test_data, batch_size=batch_size, verbose=2)
        yfull_test.append(test_prediction)

    test_res = merge_several_folds_mean(yfull_test, nfolds)
    info_string = 'loss_' + info_string \
                + '_folds_' + str(nfolds)
    create_submission(test_res, test_id, info_string)


if __name__ == '__main__':
    print('Keras version: {}'.format(keras_version))
    num_folds = 5
    info_string, models = run_cross_validation_create_models(num_folds)
    run_cross_validation_process_test(info_string, models)

Keras version: 1.1.2
Read train images
Load folder ALB (Index: 0)
Load folder BET (Index: 1)
Load folder DOL (Index: 2)
Load folder LAG (Index: 3)
Load folder NoF (Index: 4)
Load folder OTHER (Index: 5)
Load folder SHARK (Index: 6)
Load folder YFT (Index: 7)
Read train data time: 71.36 seconds
Convert to numpy...
Reshape...
Convert to float...
('Train shape:', (3777, 3, 64, 64))
(3777, 'train samples')


NameError: global name 'y_all' is not defined

In [None]:
ValueError: Shape mismatch: x has 2048 cols (and 16 rows) but y has 25088 rows (and 4096 cols)
Apply node that caused the error: Dot22(Reshape{2}.0, dense_5_W)
Toposort index: 203
Inputs types: [TensorType(float32, matrix), TensorType(float32, matrix)]
Inputs shapes: [(16, 2048), (25088, 4096)]
Inputs strides: [(8192, 4), (16384, 4)]
Inputs values: ['not shown', 'not shown']
Outputs clients: [[Elemwise{add,no_inplace}(Dot22.0, InplaceDimShuffle{x,0}.0), 
                   Elemwise{Composite{Switch(i0, (Composite{(Abs(i0) + i1 + i2)}(i1, i2, i3) * i4), 
                                             (i5 * Composite{(Abs(i0) + i1 + i2)}(i1, i2, i3)))}}[(0, 2)]
                   (InplaceDimShuffle{x,x}.0, Elemwise{add,no_inplace}.0, Dot22.0, InplaceDimShuffle{x,0}.0, 
                    Elemwise{Composite{Cast{float32}(LT(i0, i1))}}[(0, 0)].0, TensorConstant{(1, 1) of 0.5})]]

HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. 
    This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, 
    Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.