In [8]:
import numpy as np
import pandas as pd
import seaborn as sns
from glob import glob
from PIL import Image
import matplotlib.pylab as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
from keras.preprocessing.image import ImageDataGenerator

## Importing the data

In [9]:
train_images = glob("../../.kaggle/competitions/whale-categorization-playground/train/*jpg")
test_images = glob("../../.kaggle/competitions/whale-categorization-playground/test/*jpg")
df = pd.read_csv("~/.kaggle/competitions/whale-categorization-playground/train.csv")

df["Image"] = df["Image"].map( lambda x : "../input/train/"+x)
ImageToLabelDict = dict( zip( df["Image"], df["Id"]))

In [10]:
SIZE = 64
#image are imported with a resizing and a black and white conversion
def ImportImage( filename):
    img = Image.open(filename).convert("LA").resize( (SIZE,SIZE))
    return np.array(img)[:,:,0]
train_img = np.array([ImportImage( img) for img in train_images])
x = train_img

### Training sample distribution

In [11]:
print( "%d training images" %x.shape[0])

print( "Nbr of samples/class\tNbr of classes")
for index, val in df["Id"].value_counts().value_counts().sort_index().iteritems():
    print( "%d\t\t\t%d" %(index,val))

9850 training images
Nbr of samples/class	Nbr of classes
1			2220
2			1034
3			492
4			192
5			102
6			61
7			40
8			23
9			21
10			9
11			7
12			7
13			9
14			5
15			4
16			5
17			4
18			2
19			2
20			1
21			3
22			3
23			1
26			1
27			1
34			1
810			1


We read that the classes are very unbalanced: one class has ~800 samples while ~2000 have only one example in the training set. This calls for a lot of data augmentation. 

#### One hot encoding on the labels

Using a composition of a LabelEncoder and OneHotEncoder to one hot encode the target tail kinds.

In [13]:
class LabelOneHotEncoder():
    def __init__(self):
        self.ohe = OneHotEncoder()
        self.le = LabelEncoder()
    def fit_transform(self, x):
        features = self.le.fit_transform( x)
        return self.ohe.fit_transform( features.reshape(-1,1))
    def transform( self, x):
        return self.ohe.transform( self.la.transform( x.reshape(-1,1)))
    def inverse_tranform( self, x):
        return self.le.inverse_transform( self.ohe.inverse_tranform( x))
    def inverse_labels( self, x):
        return self.le.inverse_transform( x)

y = list(map(ImageToLabelDict.get, train_images))
lohe = LabelOneHotEncoder()
y_cat = lohe.fit_transform(y)

In [17]:
train_images

['../../.kaggle/competitions/whale-categorization-playground/train/42cf9ddf.jpg',
 '../../.kaggle/competitions/whale-categorization-playground/train/f80cf403.jpg',
 '../../.kaggle/competitions/whale-categorization-playground/train/e5dfd9ab.jpg',
 '../../.kaggle/competitions/whale-categorization-playground/train/8bb4b9a9.jpg',
 '../../.kaggle/competitions/whale-categorization-playground/train/fc5a9810.jpg',
 '../../.kaggle/competitions/whale-categorization-playground/train/773436f1.jpg',
 '../../.kaggle/competitions/whale-categorization-playground/train/01e25788.jpg',
 '../../.kaggle/competitions/whale-categorization-playground/train/e22715b2.jpg',
 '../../.kaggle/competitions/whale-categorization-playground/train/90befe0d.jpg',
 '../../.kaggle/competitions/whale-categorization-playground/train/7849d58b.jpg',
 '../../.kaggle/competitions/whale-categorization-playground/train/b987724d.jpg',
 '../../.kaggle/competitions/whale-categorization-playground/train/fc1882b1.jpg',
 '../../.kaggle/

In [14]:
#constructing class weights
WeightFunction = lambda x : 1./x**0.75
ClassLabel2Index = lambda x : lohe.le.inverse_tranform( [[x]])
CountDict = dict( df["Id"].value_counts())
class_weight_dic = { lohe.le.transform( [image_name])[0] : WeightFunction(count) for image_name, count in CountDict.items()}
del CountDict

ValueError: y contains new labels: ['w_f78c287']

#### Some visualization

In [33]:
def plotImages( images_arr, n_images=4):
    fig, axes = plt.subplots(n_images, n_images, figsize=(12,12))
    axes = axes.flatten()
    for img, ax in zip( images_arr, axes):
        if img.ndim != 2:
            img = img.reshape( (SIZE,SIZE))
        ax.imshow( img, cmap="Greys_r")
        ax.set_xticks(())
        ax.set_yticks(())
    plt.tight_layout()

In [34]:
#plotting training images from training set after resizing and BW conversion
plotImages( x)

### Image augmentation with Keras prebuilt

In [35]:
#use of an image generator for preprocessing and data augmentation
x = x.reshape( (-1,SIZE,SIZE,1))
input_shape = x[0].shape
x_train = x.astype("float32")
y_train = y_cat

image_gen = ImageDataGenerator(
    #featurewise_center=True,
    #featurewise_std_normalization=True,
    rescale=1./255,
    rotation_range=15,
    width_shift_range=.15,
    height_shift_range=.15,
    horizontal_flip=True)

#training the image preprocessing
image_gen.fit(x_train, augment=True)

#visualization of some images out of the preprocessing
#augmented_images, _ = next( image_gen.flow( x_train, y_train.toarray(), batch_size=4*4))
#plotImages( augmented_images)

## Building and training model

In [36]:
batch_size = 128
num_classes = len(y_cat.toarray()[0])
epochs = 9

print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')

model = Sequential()
model.add(Conv2D(48, kernel_size=(3, 3),
                 activation='relu',
                 input_shape=input_shape))
model.add(Conv2D(48, (3, 3), activation='sigmoid'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(48, (5, 5), activation='sigmoid'))
model.add(MaxPooling2D(pool_size=(3, 3)))
model.add(Dropout(0.33))
model.add(Flatten())
model.add(Dense(36, activation='sigmoid'))
model.add(Dropout(0.33))
model.add(Dense(36, activation='sigmoid'))
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])
model.summary()
model.fit_generator(image_gen.flow(x_train, y_train.toarray(), batch_size=batch_size),
          steps_per_epoch=  x_train.shape[0]//batch_size,
          epochs=epochs,
          verbose=1,
          class_weight=class_weight_dic)

#score = model.evaluate(x_train, y_train, verbose=0)
#print('Training loss: {0:.4f}\nTraining accuracy:  {1:.4f}'.format(*score))

('x_train shape:', (9850, 64, 64, 1))
(9850, 'train samples')
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 62, 62, 48)        480       
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 60, 60, 48)        20784     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 30, 30, 48)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 26, 26, 48)        57648     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 8, 8, 48)          0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 8, 8, 48)          0         
_________________________________________________________________
flatten_1 (Fla

NameError: name 'class_weight_dic' is not defined

### Predictions on test samples and export for submission

In [None]:
import warnings
from os.path import split

with open("sample_submission.csv","w") as f:
    with warnings.catch_warnings():
        f.write("Image,Id\n")
        warnings.filterwarnings("ignore",category=DeprecationWarning)
        for image in test_images:
            img = ImportImage( image)
            x = img.astype( "float32")
            #applying preprocessing to test images
            x = image_gen.standardize( x.reshape(1,SIZE,SIZE))
            
            y = model.predict_proba(x.reshape(1,SIZE,SIZE,1))
            predicted_args = np.argsort(y)[0][::-1][:5]
            predicted_tags = lohe.inverse_labels( predicted_args)
            image = split(image)[-1]
            predicted_tags = " ".join( predicted_tags)
            f.write("%s,%s\n" %(image, predicted_tags))