In [329]:
# Inspirations
# 1. https://www.kaggle.com/lextoumbourou/humpback-whale-id-data-and-aug-exploration
# 2. https://www.kaggle.com/sunnybeta322/what-am-i-whale-let-me-tell-you
# 3. https://www.kaggle.com/mmrosenb/whales-some-image-processing
# 4. https://www.kaggle.com/orangutan/keras-vgg19-starter
# 5. https://www.kaggle.com/gimunu/data-augmentation-with-keras-into-cnn/notebook

# Datasets
# You must download the datasets from https://www.kaggle.com/c/whale-categorization-playground/data
# and place them under the "inputs" folder
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import keras
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Dropout
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split

import os
from tqdm import tqdm
from sklearn import preprocessing
import cv2
# Input data files are expected in the "../inputs/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

## Initializing Hyperparameters

In [318]:
# --- Dataset locations ---
training_dir = '../inputs/train/'
testing_dir = '../inputs/test/'
train_csv_path = '../inputs/train.csv'

# --- On-disk caches of the pre-processed arrays ---
X_train_check = './X_train.npy'
y_train_check = './y_train.npy'
X_test_check = './X_test.npy'
y_train_onehot_check = './y_train_onehot.npy'

# --- Image geometry ---
im_size = 128                      # images are resized to im_size x im_size
input_shape = (im_size,) * 2       # (height, width) only, no channel axis

# --- Network layout ---
k_size = (4, 4)                    # convolution kernel size
pool_size = (2, 2)                 # max-pooling window
hidden_size = 256
drop_probability = 0.5

# --- Training schedule ---
learning_rate = 0.07
num_of_epochs = epoch = 10         # both names are kept and hold the same value
batch_size = 64  # Reduce the batch size if you run out of GPU memory

# Utils
def process_image(file_dir, name):
    """Read one image, resize it to ``im_size`` x ``im_size`` and scale it to [0, 1].

    Args:
        file_dir: Directory that contains the image (with or without a
            trailing path separator).
        name: File name of the image inside ``file_dir``.

    Returns:
        A floating-point NumPy array holding the resized image, with pixel
        values divided by 255.

    Raises:
        IOError: If OpenCV cannot read the file (missing, unreadable or not
            an image).
    """
    fname = os.path.join(file_dir, name)
    x = cv2.imread(fname)
    # cv2.imread signals failure by returning None instead of raising, which
    # would otherwise surface later as an opaque error inside cv2.resize.
    if x is None:
        raise IOError('Could not read image: {}'.format(fname))
    resized_x = cv2.resize(x, (im_size, im_size))
    # Divide by a float: under Python 2, uint8 / 255 is floor division and
    # would collapse almost every pixel to 0.
    return resized_x / 255.0

## Code

In [324]:
# Load the pre-processed arrays from their .npy caches when ALL of them are
# present; otherwise rebuild every array from the raw images and refresh the
# caches. Checking for the files up front (rather than using a bare `except`)
# means a partially populated cache can no longer leave X_train as an ndarray
# when the rebuild path starts appending, and real errors such as
# KeyboardInterrupt or a corrupt cache file are no longer silently swallowed.
cache_paths = (X_train_check, y_train_check, X_test_check)

if all(os.path.exists(path) for path in cache_paths):
    X_train = np.load(X_train_check)
    y_train = np.load(y_train_check)
    X_test = np.load(X_test_check)
else:
    print('Loading data from scratch')
    df_train = pd.read_csv(train_csv_path)

    # Loading training and label data.
    # Each CSV row is unpacked as (image file name, label).
    train_images = []
    train_labels = []
    for image_name, label in tqdm(df_train.values):
        train_images.append(process_image(training_dir, image_name))
        train_labels.append(label)

    # Loading testing data.
    # os.walk yields entries in an unspecified order, so sort the names to
    # make the row order of X_test reproducible between runs.
    test_files = sorted(
        filename
        for _root, _dirs, files in os.walk(testing_dir)
        for filename in files
    )
    test_images = [process_image(testing_dir, image_name)
                   for image_name in tqdm(test_files)]

    X_train = np.array(train_images)
    y_train = np.array(train_labels)
    X_test = np.array(test_images)

    np.save(X_train_check, X_train)
    np.save(y_train_check, y_train)
    np.save(X_test_check, X_test)

In [325]:
# Sanity check: report the dimensions of each loaded array, one per line
for loaded in (X_train, y_train, X_test):
    print(loaded.shape)

(9850, 128, 128, 3)
(9850,)
(15610, 128, 128, 3)


In [326]:
# Replace y_train with its one-hot encoding, reusing the cached matrix when it
# exists and has one row per current label. An explicit existence check is
# used instead of a bare `except` so that unrelated failures are not hidden.
y_train_onehot = None
if os.path.exists(y_train_onehot_check):
    cached_onehot = np.load(y_train_onehot_check)
    # A cache built from a different training set would silently misalign
    # labels and images, so only accept it when the row count matches.
    if cached_onehot.shape[0] == len(y_train):
        y_train_onehot = cached_onehot
    else:
        print('Ignoring stale one-hot cache: {}'.format(y_train_onehot_check))

if y_train_onehot is None:
    # Label one hot encoder.
    # The frame is converted to a dense array straight away, so a sparse
    # intermediate is not requested; uint8 keeps the saved file small and loadable.
    one_hot = pd.get_dummies(y_train)
    one_hot_labels = np.asarray(one_hot, dtype=np.uint8)
    np.save(y_train_onehot_check, one_hot_labels)
    y_train_onehot = one_hot_labels

y_train = y_train_onehot
print(y_train.shape)
print(y_train[:5])

(9850, 4251)
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


In [294]:
# Copy the labels to uint8 and both image sets to float32
y_train_raw = np.array(y_train, dtype=np.uint8)
x_train_raw = np.array(X_train, dtype=np.float32)
x_test = np.array(X_test, dtype=np.float32)

In [295]:
# Report the dimensions of the converted arrays, one per line
for converted in (y_train_raw, x_train_raw, x_test):
    print(converted.shape)

(9850, 4251)
(9850, 128, 128, 3)
(15610, 128, 128, 3)


In [298]:
# The number of classes is the length of axis 1 of the label matrix
num_classes = np.size(y_train_raw, 1)
print(num_classes)

4251


In [327]:
# Hold out 30% of the samples for validation; random_state pins the shuffle
# so the same split is produced on every run.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    x_train_raw,
    y_train_raw,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Build the CNN classifier: three convolution + max-pooling stages, then a
# stack of dense layers ending in a softmax over `num_classes`.
model = Sequential()

# Take the per-sample shape from the training split that is passed to fit().
# (The previous `X_train_raw` name is never defined in this notebook.)
input_shape = X_train.shape[1:]
print('checking input_shape ', input_shape)

# Conv2D is the layer class imported at the top of the notebook;
# `Convolution2D` was never imported and raised NameError on a fresh kernel.
model.add(Conv2D(32, kernel_size=k_size, activation="relu", input_shape=input_shape))
model.add(MaxPooling2D(pool_size=pool_size, strides=(2,2)))
model.add(Conv2D(64, kernel_size=k_size, activation="relu"))
model.add(MaxPooling2D(pool_size=pool_size, strides=(1,1)))
model.add(Conv2D(512, kernel_size=k_size, activation="relu"))
model.add(MaxPooling2D(pool_size=pool_size, strides=(2,2)))
model.add(Flatten())
model.add(Dense(1024, activation="relu"))
model.add(Dropout(0.25))
model.add(Dense(512, activation="relu"))
model.add(Dense(32, activation="relu"))
model.add(Dropout(0.25))
model.add(Dense(num_classes, activation="softmax"))

# COST AND OPTIMIZER
# NOTE(review): the `learning_rate` constant from the config cell is not used
# here; the step size is the literal below. Newer Keras releases appear to
# spell this argument `learning_rate=` instead of `lr=` - confirm against the
# installed version before changing it.
model.compile(loss=categorical_crossentropy,
              optimizer=Adam(lr=0.01),
              metrics=['accuracy'])

model.summary()

('checking input_shape ', (128, 128, 3))
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_55 (Conv2D)           (None, 125, 125, 32)      1568      
_________________________________________________________________
max_pooling2d_50 (MaxPooling (None, 62, 62, 32)        0         
_________________________________________________________________
conv2d_56 (Conv2D)           (None, 59, 59, 64)        32832     
_________________________________________________________________
max_pooling2d_51 (MaxPooling (None, 58, 58, 64)        0         
_________________________________________________________________
conv2d_57 (Conv2D)           (None, 55, 55, 512)       524800    
_________________________________________________________________
max_pooling2d_52 (MaxPooling (None, 27, 27, 512)       0         
_________________________________________________________________
flatten_33 (Flatten)         (None,

In [328]:
# Train on the training split and report validation metrics after each epoch
model.fit(
    x=X_train,
    y=Y_train,
    batch_size=batch_size,
    epochs=num_of_epochs,
    verbose=1,
    validation_data=(X_valid, Y_valid))

Train on 6895 samples, validate on 2955 samples
Epoch 1/10


KeyboardInterrupt: 

In [None]:
# Score the model on the validation split. The model is compiled with a
# single 'accuracy' metric, so scores[1] is the accuracy (scores[0] is the loss).
scores = model.evaluate(X_valid, Y_valid, verbose=1)
accuracy_percent = scores[1] * 100
print('Accuracy: {}'.format(accuracy_percent))