# Prepare data

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
from IPython.display import display # Allows the use of display() for DataFrames
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Seed NumPy's global RNG so NumPy-based randomness in the notebook is repeatable.
np.random.seed(100)

# The first row of train.csv is the header; the code below uses its
# `Image` and `Id` columns.
df = pd.read_csv("train.csv", header=0)

# Restrict std() to numeric columns: on a frame that contains string columns
# recent pandas versions raise a TypeError instead of silently skipping them.
display(df.std(numeric_only=True))

# Size of the unfiltered dataset, reported in the summary further down.
total_images = len(df)

# Drop rows labelled 'new_whale': that label groups flukes that have not
# been identified yet, so it is not a real class.
df = df[df.Id != 'new_whale']

# Keep only whales with 3+ samples
df = df.groupby("Id").filter(lambda x: len(x) >= 3)

# Use top N targets by image count.
# `value_counts` is computed before the top-N filter on purpose: it is
# plotted later as the distribution over all remaining whales.
value_counts = df.Id.value_counts()
top_hitters = value_counts.nlargest(20).index
df = df[df['Id'].isin(top_hitters)]

# Histogram of the per-whale image counts on a log-scaled y axis.
fig, ax = plt.subplots()
ax.set(yscale='log', ylabel="Whales", xlabel="Images per whale")
value_counts.hist(bins=10, bottom=1, figsize=(20, 5), ax=ax)

# Distinct whale ids of the filtered frame and how many there are.
classes = df.Id.unique()
num_classes = len(classes)

# Accumulators: X_* collect image paths, y_* collect the matching whale ids.
X_train, y_train = [], []
X_test, y_test = [], []
X_valid, y_valid = [], []

# Directory holding the source images, resolved once instead of per file.
train_image_dir = os.path.join(os.getcwd(), 'train')

# Split each whale's images separately into train / validation / test.
for whale_id in classes:
    whale_rows = df[df.Id == whale_id]

    whale_paths = np.array([os.path.join(train_image_dir, name)
                            for name in whale_rows.Image])
    whale_ids = np.array(whale_rows.Id.values)

    # Hold out 20% of this whale's images for testing ...
    rest_paths, test_paths, rest_ids, test_ids = train_test_split(
        whale_paths, whale_ids, test_size=0.2, random_state=1)
    # ... then 20% of what is left for validation.
    fit_paths, valid_paths, fit_ids, valid_ids = train_test_split(
        rest_paths, rest_ids, test_size=0.2, random_state=1)

    X_test.extend(test_paths)
    y_test.extend(test_ids)
    X_train.extend(fit_paths)
    y_train.extend(fit_ids)
    X_valid.extend(valid_paths)
    y_valid.extend(valid_ids)

# Summary of the dataset before and after filtering / splitting.
print('\nThere are %d total images.' % total_images)
print("Trainable...")
print('There are %d total classes.' % num_classes)

print('There are %d training images.' % len(X_train))
print('There are %d validation images.' % len(X_valid))
print('There are %d test images.' % len(X_test))

print("After data filtering")

# A notebook only renders a cell's *last* expression automatically. Another
# statement follows, so display() is needed for the table to show up at all.
display(df.describe())

# Number of worker threads/processes passed to the Keras generator methods.
workers = 8

# Visualize 12 random training images

In [None]:
import cv2
import matplotlib.pyplot as plt
import random
%matplotlib inline

def visualize_img(img_path, ax):
    """Draw the image stored at ``img_path`` on ``ax``.

    Args:
        img_path: Path of the image file, read with ``cv2.imread``.
        ax: Object exposing ``imshow`` (a matplotlib ``Axes`` in this notebook).

    Raises:
        FileNotFoundError: If no image could be read from ``img_path``.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports an unreadable file by returning None rather than
        # raising; fail here with the offending path instead of a cryptic
        # cvtColor error.
        raise FileNotFoundError("Could not read image: %s" % img_path)
    # Convert from OpenCV's BGR channel order to RGB before plotting.
    ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    
# 3 x 4 grid of randomly chosen training images, without axis ticks.
fig = plt.figure(figsize=(50, 30))

num_images = 12
rand_images = random.sample(X_train, num_images)
for slot, img_path in enumerate(rand_images, start=1):
    ax = fig.add_subplot(3, 4, slot, xticks=[], yticks=[])
    visualize_img(img_path, ax)

# Prepare Dataset for ImageDataGenerator

In [None]:
import shutil
import pathlib
import os

def copy_class_of_files(files, dst, labels):
    """Copy each file into a per-label sub-directory of ``dst``.

    ``files[i]`` is copied into ``<dst>/<labels[i]>/``; that directory is
    created (parents included) when it does not exist yet.

    Args:
        files: Source file paths.
        dst: Root directory that receives one sub-directory per label.
        labels: Sub-directory name for each entry of ``files``, by position.
    """
    for position, src_path in enumerate(files):
        class_dir = os.path.join(dst, labels[position])
        os.makedirs(class_dir, exist_ok=True)
        shutil.copy(src_path, class_dir)
        
# Start from an empty ./data tree, then lay out one sub-tree per subset.
shutil.rmtree('./data', ignore_errors=True)
for subset_dir, subset_files, subset_labels in (
        ('data/train', X_train, y_train),
        ('data/valid', X_valid, y_valid),
        ('data/test', X_test, y_test)):
    copy_class_of_files(subset_files, subset_dir, subset_labels)

# Load CNN without top layer

In [None]:
from keras.applications.inception_resnet_v2 import InceptionResNetV2
from keras.applications.vgg16 import VGG16

# InceptionResNetV2 without its classifier head, fed square 3-channel images
image_side_size = 299
image_dim = (image_side_size, image_side_size, 3)
base_model = InceptionResNetV2(
    include_top=False,
    weights='imagenet',
    input_tensor=None,
    input_shape=image_dim,
    pooling=None)

# Mark every layer of the base model as non-trainable
for base_layer in base_model.layers:
    base_layer.trainable = False
                      
# Train several last layers in base model
# for layer in base_model.layers[-3:]:
#     layer.trainable = True
    
#base_model.summary()

# Define training and validation image generators

In [None]:
from keras.preprocessing.image import ImageDataGenerator

batch_size = 1
target_size = (image_side_size, image_side_size)

# Augmentation applied to training images only.
augmentation = dict(
    rescale=1./255,
    horizontal_flip=False,
    vertical_flip=True,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    zoom_range=0.2,
    shear_range=0.2,
    fill_mode='nearest',
)
train_datagen = ImageDataGenerator(**augmentation)

# Validation images are only rescaled.
valid_datagen = ImageDataGenerator(rescale=1./255)

# Both flows yield images without labels (class_mode=None) and unshuffled.
flow_options = dict(
    target_size=target_size,
    batch_size=batch_size,
    class_mode=None,
    shuffle=False,
)

print("Train generator")
train_generator = train_datagen.flow_from_directory('data/train', **flow_options)

print("Valid generator")
valid_generator = valid_datagen.flow_from_directory('data/valid', **flow_options)

# Record bottleneck features of the base pre-trained model
This approach follows the Keras blog post: https://blog.keras.io/building-powerful-image-classification-models-using-very-little-data.html

In [None]:
# Run every training image through the frozen base model exactly once.
# The step count is taken from the generator itself (number of batches it
# holds) rather than from the global `batch_size`: that global is rebound
# later in the notebook, which would silently truncate the features if this
# cell were re-run afterwards.
train_generator.reset()
bottleneck_features_train = base_model.predict_generator(
    train_generator,
    workers=workers, verbose=1,
    steps=len(train_generator))

# save the output as a Numpy array
np.save('bottleneck_features_train.npy', bottleneck_features_train)

# Same for the validation images.
valid_generator.reset()
bottleneck_features_validation = base_model.predict_generator(
    valid_generator,
    workers=workers, verbose=1,
    steps=len(valid_generator))

np.save('bottleneck_features_validation.npy', bottleneck_features_validation)

# Train top level dense layers

In [None]:
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential, Model
from keras.layers import Dropout, Flatten, Dense, GlobalAveragePooling2D
from keras import utils as np_utils

# Bottleneck features saved by the previous step.
train_data = np.load('bottleneck_features_train.npy')
validation_data = np.load('bottleneck_features_validation.npy')

# Small classifier head; its input shape is one sample of the saved features.
top_model = Sequential([
    Flatten(input_shape=train_data.shape[1:]),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])

top_model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)
#model.summary()

from keras.callbacks import ModelCheckpoint, EarlyStopping

best_weights_path = 'whale.flukes.weights.best.hdf5'

# Checkpoint to `best_weights_path`, keeping only the best result so far.
checkpointer = ModelCheckpoint(best_weights_path, save_best_only=True, verbose=1)

# Stop the training if the model shows no improvement
stopper = EarlyStopping(
    mode='auto',
    monitor='val_loss',
    min_delta=0.005,
    patience=25,
    verbose=1,
)

# Encoder fitted on the training ids. It is no longer used to build the
# targets (see below) but stays available under its original name.
label_encoder = LabelEncoder().fit(y_train)

# The bottleneck features were recorded in generator order: with
# shuffle=False, flow_from_directory walks the class sub-directories in
# alphanumeric order. y_train / y_valid are in dataframe order instead, so
# encoding them directly pairs features with the wrong whales. The
# generators' own `classes` arrays are aligned with the features.
# NOTE: this relies on train_generator / valid_generator still being the
# unshuffled generators the features were extracted with.
train_targets = np_utils.to_categorical(
    train_generator.classes[:len(train_data)], num_classes)
valid_targets = np_utils.to_categorical(
    valid_generator.classes[:len(validation_data)], num_classes)

assert len(train_targets) == len(train_data), "train features/labels differ in length"
assert len(valid_targets) == len(validation_data), "validation features/labels differ in length"

top_model.fit(x=train_data,
              y=train_targets,
              epochs=100,
              batch_size=32,
              callbacks=[checkpointer, stopper],
              validation_data=(validation_data, valid_targets))

# Tune final model

In [None]:
from keras import optimizers

# Restore the best classifier head found while training on bottleneck features.
top_model.load_weights(best_weights_path)

# Stack the head on top of the convolutional base.
# `inputs=` / `outputs=` are the Keras 2 keyword names; the singular
# `input=` / `output=` spelling is a deprecated legacy alias.
model = Model(inputs=base_model.input,
              outputs=top_model(base_model.output))

# NOTE(review): top_model presumably counts as a single nested layer in
# model.layers, so this number is smaller than the number of base layers and
# the last few base layers end up trainable too - confirm this is intended.
layers_to_freeze = len(model.layers) - len(top_model.layers)

print(f"Freezing {layers_to_freeze} layers")

for layer in model.layers[:layers_to_freeze]:
    layer.trainable = False
for layer in model.layers[layers_to_freeze:]:
    layer.trainable = True

# Small learning rate with momentum for fine-tuning.
model.compile(loss='categorical_crossentropy',
              optimizer=optimizers.SGD(lr=1e-4, momentum=0.9),
              metrics=['accuracy'])
#model.summary()

# The batch size has to be set BEFORE the generators are created: each
# generator keeps the value it was built with. Assigning it afterwards left
# the generators at the earlier batch size of 1 while the step counts were
# divided by 32, so every epoch only saw a small fraction of the data.
batch_size = 32

print("Traning dataset...")
# Training batches are shuffled; unshuffled directory order would feed the
# network one class after another.
train_generator = train_datagen.flow_from_directory(
        'data/train',
        target_size = target_size,
        batch_size = batch_size,
        class_mode="categorical",
        shuffle=True)

print("Validation dataset...")
valid_generator = valid_datagen.flow_from_directory(
        'data/valid',
        target_size = target_size,
        class_mode = "categorical",
        batch_size = batch_size,
        shuffle=False)

tuned_checkpointer = ModelCheckpoint(filepath='whale.flukes.weights.tuned.hdf5', 
                                     verbose=1, 
                                     save_best_only=True)

stopper = EarlyStopping(monitor='val_loss', 
                        min_delta=0.005, 
                        patience=20, 
                        verbose=1, 
                        mode='auto')

# len(generator) is the number of batches per pass, including a final
# partial batch, so no sample is dropped and the count can never be zero.
model.fit_generator(generator=train_generator,
                    epochs=50,
                    steps_per_epoch = len(train_generator),
                    validation_data=valid_generator, 
                    validation_steps = len(valid_generator),
                    callbacks=[tuned_checkpointer, stopper],
                    verbose=1, workers=workers)


# Calculate Classification Accuracy on Test Set

In [None]:
from keras.models import load_model

# Reload the best fine-tuned model saved by the checkpoint callback.
model = load_model('whale.flukes.weights.tuned.hdf5')
#model.summary()

test_datagen = ImageDataGenerator(rescale=1. / 255)

# shuffle=False keeps predictions aligned with test_generator.classes and
# test_generator.filenames, which the following cells rely on.
test_generator = test_datagen.flow_from_directory(
    "data/test",
    target_size=target_size,
    batch_size=batch_size,
    class_mode='categorical', shuffle=False)

score = model.evaluate_generator(
    generator=test_generator,
    steps=len(test_generator),
    workers=workers)

print("Loss: ", score[0], "Accuracy: ", score[1])

# Rewind the same generator instead of building an identical second one.
test_generator.reset()

# predict_generator has no `batch_size` parameter (the generator already
# defines the batch size); passing it raised a TypeError.
scores = model.predict_generator(
    generator=test_generator,
    steps=len(test_generator),
    workers=workers,
    verbose=1)

# Print confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a short message and plot the confusion matrix on the current figure.

    Args:
        cm: Square 2-D array of counts; row = true class, column = predicted.
        classes: Tick labels, one per row/column of ``cm``.
        normalize: When True, every row is divided by its sum before plotting.
        title: Figure title.
        cmap: Matplotlib colormap used for the cells.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis].astype('float')
        # A row without any true samples would divide by zero and inject NaN,
        # which also breaks the colour threshold below. Dividing such a row
        # by 1 leaves it at 0 instead.
        row_totals[row_totals == 0] = 1.0
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cell annotations: white text on dark cells, black on light ones.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Predicted class index of every test image (position of the highest score).
predicted_classes = np.argmax(scores, axis=1)
print(len(predicted_classes))

# Class names ordered by the index the generator assigned to them. The
# `classes` array built from the dataframe is in a different order and would
# put the wrong whale id on every tick.
class_names = sorted(test_generator.class_indices,
                     key=test_generator.class_indices.get)

# Passing `labels` keeps the matrix size equal to the number of tick labels
# even if some class never shows up in the predictions.
cnf_matrix = confusion_matrix(test_generator.classes, predicted_classes,
                              labels=np.arange(len(class_names)))
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure(figsize=(70,10))
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plt.figure(figsize=(70,10))
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True, title='Normalized confusion matrix')

plt.show()

# Write classification result

In [None]:
import csv
import ntpath

# Class names ordered by the index the generator assigned to them; column k
# of `scores` belongs to class_names[k].
class_names = sorted(test_generator.class_indices,
                     key=test_generator.class_indices.get)

# newline='' is what the csv module expects for files it writes; without it
# extra blank rows appear on platforms that translate line endings.
with open('predictions.csv', 'w', newline='') as csvfile:
    fieldnames = ['Image', 'Id']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()

    # Row i of `scores` belongs to test_generator.filenames[i]. X_test and
    # `classes` are in dataframe order, so using them here attached the
    # scores to the wrong image and the wrong whale ids.
    for file_name, probs in zip(test_generator.filenames, scores):
        # Indices of the three highest scores, best first.
        top_three = sorted(range(len(probs)),
                           key=lambda k: probs[k],
                           reverse=True)[:3]
        writer.writerow({
            'Image': ntpath.basename(file_name),
            'Id': ' '.join(class_names[k] for k in top_three),
        })