In [2]:
import keras
# Display the installed Keras version so the notebook shows which release it ran with.
keras.__version__

Using TensorFlow backend.


'2.1.2'

Data downloaded from: https://www.kaggle.com/c/whale-categorization-playground

I renamed `train.csv` to `targets.csv` and the `train/` directory to `kaggle_train/`.

In [3]:
import os

home_dir = os.getcwd()
fname = os.path.join(home_dir, 'targets.csv') # targets for both train and validation

# Read the whole CSV; the context manager closes the file even if reading fails.
with open(fname) as f:
    data = f.read()

# splitlines() copes with both "\n" and "\r\n" line endings. Filtering out
# blank rows (rather than always discarding the last element) keeps the final
# record when the file does not end with a newline.
rows = [row for row in data.splitlines() if row.strip()]

# First row is the column header, every remaining row is one "Image,Id" record.
header = rows[0].split(',') if rows else []
lines = rows[1:]

print(header)
print(len(lines))

['Image', 'Id']
9850


# Encoding the whale ids


In [4]:
import numpy as np
# Distinct whale ids, taken from the second CSV column of every row.
# A set removes the duplicates, but a set has no defined order, so the ids are
# sorted to make each whale's one-hot index reproducible between runs.
# NOTE(review): sorted order should also match the alphanumeric class order
# that Keras' flow_from_directory infers from subdirectory names - confirm.
whale_ids = sorted(set(line.split(',')[1] for line in lines))

# Map every whale id to its one-hot vector: all zeros except a 1 at the
# position of that id in whale_ids.
whale_dict = {}
for i, whale in enumerate(whale_ids):
    vec = np.zeros(len(whale_ids))
    vec[i] = 1
    whale_dict[whale] = vec
    
def whale2vec(whale):
    """Return the one-hot encoded vector for a whale id.

    Looks ``whale`` up in the module-level ``whale_dict``. When the id is not
    present, a notice is printed and the vector stored under the key
    ``'new_whale'`` is returned instead.

    Raises:
        KeyError: if ``whale`` is unknown and ``whale_dict`` has no
            ``'new_whale'`` entry either.
    """
    # Membership is tested on the dict itself (hash lookup) rather than on
    # .keys(), which is a list scan on Python 2.
    if whale in whale_dict:
        return whale_dict[whale]
    print("whale not found. Returning new_whale vector")
    return whale_dict['new_whale']
               
# Smoke test: encode the whale id stored at index 2 of whale_ids and print
# the resulting one-hot vector.
vec = whale2vec(whale_ids[2])
print(vec)

[ 0.  0.  1. ...,  0.  0.  0.]


# Preprocessing images

In [5]:
import errno 

def mkdir_p(path):
    """Create ``path`` together with any missing parent directories.

    Behaves like the shell's ``mkdir -p``: it is not an error for ``path`` to
    already exist as a directory. Any other ``OSError`` (including ``path``
    existing as a regular file) is propagated to the caller.
    """
    try:
        os.makedirs(path)
    except OSError as err:
        # Only swallow "already exists" when the existing entry is a directory.
        already_a_directory = err.errno == errno.EEXIST and os.path.isdir(path)
        if not already_a_directory:
            raise

First we split the training images into a training set (the first 7500 rows of `targets.csv`) and a validation set (the remaining rows), copying each image into a subdirectory named after its whale id.

In [None]:
import shutil
original_dataset_dir = os.path.join(home_dir, 'kaggle_train')

train_dir = os.path.join(home_dir, 'train')
mkdir_p(train_dir)

validation_dir = os.path.join(home_dir, 'validation')
mkdir_p(validation_dir)

# Number of leading rows of targets.csv that form the training set; every
# remaining row goes to the validation set.
n_train = 7500

# Create a subdirectory for each whale_id in both the train and validation
# directories so we can use ImageDataGenerator.flow_from_directory. Every id
# gets a directory in BOTH splits, even if a split ends up with no image of
# that whale.
for whale_id in whale_ids:
    mkdir_p(os.path.join(train_dir, whale_id))
    mkdir_p(os.path.join(validation_dir, whale_id))


def copy_split(rows, split_dir):
    """Copy the image named in each "Image,Id" row to split_dir/<Id>/<Image>.

    Args:
        rows: iterable of CSV rows whose first field is the image file name
            and whose second field is the whale id.
        split_dir: root directory of the split; it must already contain one
            subdirectory per whale id.
    """
    for row in rows:
        fields = row.split(',')
        pic, whale_id = fields[0], fields[1]
        src = os.path.join(original_dataset_dir, pic)
        dst = os.path.join(split_dir, whale_id, pic)
        shutil.copyfile(src, dst)


# Slicing (instead of indexing with range(7500)) also works when the CSV has
# fewer than n_train rows: the validation split is then simply empty.
copy_split(lines[:n_train], train_dir)
copy_split(lines[n_train:], validation_dir)


In [22]:
from keras.preprocessing.image import ImageDataGenerator

# Both generators only rescale pixel values from [0, 255] to [0, 1].
train_datagen = ImageDataGenerator(rescale=1./255)
validation_datagen = ImageDataGenerator(rescale=1./255)
image_size = (100,100)
batch_size = 20

train_generator = train_datagen.flow_from_directory(
        # Root of the training split: one subdirectory per whale id
        train_dir,
        # All images will be resized to image_size - defined above
        target_size=image_size,
        color_mode='grayscale',
        batch_size=batch_size,
        # One-hot labels, as required by the categorical_crossentropy loss
        class_mode='categorical')

# The validation data goes through validation_datagen (not train_datagen), so
# any augmentation later added to the training generator cannot leak into the
# validation images.
validation_generator = validation_datagen.flow_from_directory(
        # Root of the validation split: one subdirectory per whale id
        validation_dir,
        # All images will be resized to image_size - defined above
        target_size=image_size,
        color_mode='grayscale',
        batch_size=batch_size,
        # One-hot labels, as required by the categorical_crossentropy loss
        class_mode='categorical')

Found 7500 images belonging to 4251 classes.
Found 2350 images belonging to 4251 classes.


In [28]:
from keras import layers
from keras import models

# Small convnet: four Conv2D + MaxPooling2D stages followed by a dense
# classifier with one softmax output per whale id.
model = models.Sequential([
    # Input is a single-channel (grayscale) image of image_size pixels.
    layers.Conv2D(32, (3, 3), activation='relu',
                  input_shape=(image_size[0], image_size[1], 1)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    # Flatten the feature maps and classify.
    layers.Flatten(),
    layers.Dense(512, activation='relu'),
    layers.Dense(len(whale_ids), activation='softmax'),
])


In [29]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_11 (Conv2D)           (None, 98, 98, 32)        320       
_________________________________________________________________
max_pooling2d_9 (MaxPooling2 (None, 49, 49, 32)        0         
_________________________________________________________________
conv2d_12 (Conv2D)           (None, 47, 47, 64)        18496     
_________________________________________________________________
max_pooling2d_10 (MaxPooling (None, 23, 23, 64)        0         
_________________________________________________________________
conv2d_13 (Conv2D)           (None, 21, 21, 128)       73856     
_________________________________________________________________
max_pooling2d_11 (MaxPooling (None, 10, 10, 128)       0         
_________________________________________________________________
conv2d_14 (Conv2D)           (None, 8, 8, 128)         147584    
__________

In [30]:
from keras import optimizers

# Multi-class, one-hot targets -> categorical cross-entropy, optimised with
# RMSprop at a learning rate of 1e-4; accuracy is tracked as 'acc'.
model.compile(
    optimizer=optimizers.RMSprop(lr=1e-4),
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [None]:
# Derive the number of batches per pass from the generators themselves
# (ceiling division), so that one epoch covers every training image and each
# validation run covers the whole validation set, whatever the split or batch
# size. With 7500 / 2350 images and a batch size of 20 this gives 375 / 118.
train_steps = (train_generator.samples + train_generator.batch_size - 1) // train_generator.batch_size
validation_steps = (validation_generator.samples + validation_generator.batch_size - 1) // validation_generator.batch_size

history = model.fit_generator(
      train_generator,
      steps_per_epoch=train_steps,
      epochs=20,
      validation_data=validation_generator,
      validation_steps=validation_steps)

Epoch 1/20
Epoch 2/20