# Humpback Whale Identification 

This kernel contains our first-draft solution to the [Kaggle competition](https://www.kaggle.com/c/humpback-whale-identification). A few things need to be taken care of for the notebook to run properly.

In [2]:
# General Imports
import gc
import os

import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Sklearn Imports
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Keras Imports
from keras import layers
from keras.preprocessing import image
from keras.applications.imagenet_utils import preprocess_input
from keras.layers import Input, Dense, Activation, BatchNormalization, Flatten, Conv2D
from keras.layers import AveragePooling2D, MaxPooling2D, Dropout
from keras.models import Model
import keras.backend as K
from keras.models import Sequential

# Remove Warnings
import warnings
warnings.simplefilter("ignore", category=DeprecationWarning)

Using TensorFlow backend.


In [3]:
# List the contents of the data directory.
os.listdir("Data")

['all.zip', 'sample_submission.csv', 'Test', 'Train', 'train.csv']

In [4]:
# Load the training table and preview its first rows.
train_df = pd.read_csv("Data/train.csv")
train_df.head()

Unnamed: 0,Image,Id
0,0000e88ab.jpg,w_f48451c
1,0001f9222.jpg,w_c3d896a
2,00029d126.jpg,w_20df2c5
3,00050a15a.jpg,new_whale
4,0005c1ef8.jpg,new_whale


In [64]:
# Group the training rows by whale id once and reuse the result for every statistic.
id_groups = train_df.groupby('Id')
class_sizes = id_groups.size()
images_per_class = id_groups.count()['Image']

n_images = len(train_df)
n_classes = len(train_df['Id'].unique())
print('In total, we have {} training images that correspond to {} different classes'
      .format(n_images, n_classes))

# Largest class and how many observations it holds.
largest_class = class_sizes.idxmax()
largest_size = class_sizes.max()
print("The class that contains most pictures is '{}', and {} observations belong to this class."
      .format(largest_class, largest_size))

# Classes represented by exactly one image.
n_single = len(images_per_class[images_per_class == 1])
print('{} classes have only one observation.'
      .format(n_single))

In total, we have 25361 training images that correspond to 5005 different classes
The class that contains most pictures is 'new_whale', and 9664 observations belong to this class.
2073 classes have only one observation.


The dataset is heavily imbalanced: there are 5005 classes, yet 38.1% of the training images (9664 of 25361) belong to the single class 'new_whale', while 2073 classes have only one image.

In [82]:
def prepareImages(data, m, dataset, target_size=(100, 100), data_dir="Data"):
    """Load the images listed in ``data['Image']`` into one preprocessed array.

    This function is inspired by this notebook:
    https://www.kaggle.com/pestipeti/keras-cnn-starter
    and it is slightly modified.

    Parameters
    ----------
    data : mapping or DataFrame
        Must provide an ``'Image'`` entry holding the image file names.
    m : int
        Number of rows to allocate in the output array.
    dataset : str
        Sub-directory of ``data_dir`` that contains the image files.
    target_size : tuple of int, optional
        ``(height, width)`` every image is resized to while loading.
    data_dir : str, optional
        Root directory of the data.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(m, height, width, 3)``. Row ``i`` holds the i-th
        listed image after ``preprocess_input``; rows beyond the number of
        listed images stay zero.

    Raises
    ------
    ValueError
        If ``data['Image']`` lists more than ``m`` files.
    """
    file_names = data['Image']
    if len(file_names) > m:
        # Fail before any image is decoded instead of hitting an IndexError
        # part-way through a long load.
        raise ValueError(
            "data lists {} images but only m={} rows were requested"
            .format(len(file_names), m))

    print("Preparing images")
    height, width = target_size
    X_train = np.zeros((m, height, width, 3))

    for count, fig in enumerate(file_names):
        # load_img expects target_size as (height, width); the channel axis is
        # added by img_to_array.
        img = image.load_img(os.path.join(data_dir, dataset, fig),
                             target_size=(height, width))
        x = image.img_to_array(img)
        x = preprocess_input(x)

        X_train[count] = x

        # Periodic progress report, to estimate how much time is left
        if (count%500 == 0):
            print("Processing image: ", count+1, ", ", fig)

    return X_train

In [83]:
def prepare_labels(y):
    """Encode class labels as a dense one-hot matrix.

    Parameters
    ----------
    y : array-like
        Class label of every observation.

    Returns
    -------
    onehot_encoded : numpy.ndarray
        Float array of shape ``(len(y), n_classes)`` with a single 1 per row,
        in the column given by the label's integer code.
    label_encoder : LabelEncoder
        The fitted encoder, so integer codes can be mapped back to labels with
        ``inverse_transform``.
    """
    values = np.array(y)
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(values)

    # Build the one-hot matrix directly with NumPy. The integer codes already
    # index the columns, and this avoids OneHotEncoder's `sparse` keyword,
    # which newer scikit-learn releases no longer accept.
    n_samples = len(integer_encoded)
    n_classes = len(label_encoder.classes_)
    onehot_encoded = np.zeros((n_samples, n_classes))
    onehot_encoded[np.arange(n_samples), integer_encoded] = 1.0

    return onehot_encoded, label_encoder

In [84]:
# Load every training image. The folder name uses the on-disk spelling
# ("Train") so the path also resolves on case-sensitive filesystems.
X = prepareImages(train_df, train_df.shape[0], "Train")

# Scale the array by 255, the highest 8-bit pixel intensity.
X /= 255

Preparing images
Processing image:  1 ,  0000e88ab.jpg
Processing image:  501 ,  04c72257b.jpg
Processing image:  1001 ,  09cacb84d.jpg
Processing image:  1501 ,  0ef961892.jpg
Processing image:  2001 ,  141b56a1a.jpg
Processing image:  2501 ,  199a417aa.jpg
Processing image:  3001 ,  1ec170983.jpg
Processing image:  3501 ,  23f084b93.jpg
Processing image:  4001 ,  29163ad0b.jpg
Processing image:  4501 ,  2e0fab120.jpg
Processing image:  5001 ,  3347515d9.jpg
Processing image:  5501 ,  3842d71dc.jpg
Processing image:  6001 ,  3d7f4c7d5.jpg
Processing image:  6501 ,  425f763ca.jpg
Processing image:  7001 ,  4714400cd.jpg
Processing image:  7501 ,  4c082fbdf.jpg
Processing image:  8001 ,  50c683e23.jpg
Processing image:  8501 ,  560d986ad.jpg
Processing image:  9001 ,  5b68c83ed.jpg
Processing image:  9501 ,  60410f111.jpg
Processing image:  10001 ,  654951f81.jpg
Processing image:  10501 ,  6a572256c.jpg
Processing image:  11001 ,  6f96f55b6.jpg
Processing image:  11501 ,  74da2b511.jpg

In [85]:
# One-hot encode the whale ids; the fitted encoder is kept so that predicted
# class indices can be mapped back to ids later.
y, label_encoder = prepare_labels(train_df['Id'])

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [86]:
# Dimensions of the one-hot label matrix.
y.shape

(25361, 5005)

Below we define our baseline model to check that everything works and that the submission can be done properly.

In [87]:
# Baseline network, only meant to check that training and submission work end to end.
# Improvements will be made in the future.
baseline_layers = [
    # First convolutional stage: conv -> batch norm -> ReLU -> max pooling.
    Conv2D(32, (7, 7), strides=(1, 1), name='conv0', input_shape=(100, 100, 3)),
    BatchNormalization(axis=3, name='bn0'),
    Activation('relu'),
    MaxPooling2D((2, 2), name='max_pool'),

    # Second convolutional stage: conv -> ReLU -> average pooling.
    Conv2D(64, (3, 3), strides=(1,1), name="conv1"),
    Activation('relu'),
    AveragePooling2D((3, 3), name='avg_pool'),

    # Classifier head.
    Flatten(),
    Dense(500, activation="relu", name='rl'),
    Dropout(0.4),

    # The last layer's length must correspond to the number of classes.
    Dense(y.shape[1], activation='softmax', name='sm'),
]

model = Sequential(baseline_layers)
model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv0 (Conv2D)               (None, 94, 94, 32)        4736      
_________________________________________________________________
bn0 (BatchNormalization)     (None, 94, 94, 32)        128       
_________________________________________________________________
activation_1 (Activation)    (None, 94, 94, 32)        0         
_________________________________________________________________
max_pool (MaxPooling2D)      (None, 47, 47, 32)        0         
_________________________________________________________________
conv1 (Conv2D)               (None, 45, 45, 64)        18496     
_________________________________________________________________
activation_2 (Activation)    (None, 45, 45, 64)        0         
_________________________________________________________________
avg_pool (AveragePooling2D)  (None, 15, 15, 64)        0         
__________

In [None]:
# Fit the baseline network; the returned History object holds the per-epoch metrics.
fit_kwargs = dict(epochs=100, batch_size=100, verbose=1)
history = model.fit(X, y, **fit_kwargs)

# Run a garbage-collection pass once training has finished.
gc.collect()

Epoch 1/100
  600/25361 [..............................] - ETA: 31:55 - loss: 7.3736 - acc: 0.3217

In [None]:
# Keras logs the accuracy metric as 'acc' in older releases and as 'accuracy'
# in newer ones, so use whichever key this training run produced.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'

fig, ax = plt.subplots()
ax.plot(history.history[acc_key])
ax.set_title('Model accuracy')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Epoch')
plt.show()

In [None]:
# List the test images from the same "Data" root used by the rest of the
# notebook, with the folder name spelled as it appears on disk.
test = os.listdir("Data/Test")
print(len(test))

In [None]:
# Start the submission frame: one row per test image, with 'Id' left blank
# until the predictions are filled in.
col = ['Image']
test_df = pd.DataFrame(test, columns=col).assign(Id='')

In [None]:
# Load the test images with the same preprocessing and scaling as the training
# set. The folder name uses the on-disk spelling ("Test") so the path also
# resolves on case-sensitive filesystems.
X_test = prepareImages(test_df, test_df.shape[0], "Test")
X_test /= 255

In [None]:
# X_test is already a NumPy array, so pass it straight through; wrapping it in
# np.array() would duplicate the whole image tensor in memory.
predictions = model.predict(X_test, verbose=1)

In [None]:
# The competition asks to provide five predictions per image: take the five
# highest-scoring class indices of every row, best first.
top5_idx = np.argsort(predictions, axis=1)[:, -5:][:, ::-1]

# Map all indices back to whale ids in one call, then join each row with spaces.
top5_ids = label_encoder.inverse_transform(top5_idx.ravel()).reshape(top5_idx.shape)
test_df['Id'] = [' '.join(row_ids) for row_ids in top5_ids]

In [None]:
# Write the submission file first, then end the cell with the preview so the
# notebook actually displays it (only the last expression of a cell is shown).
test_df.to_csv('submission.csv', index=False)
test_df.head(10)