In [1]:
import sys

from data_helper import ImageGenerator, ValidGenerator, get_train_matrices, get_test_matrices
from sklearn.cross_validation import train_test_split
from keras_helper import KerasModel
import gc
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import pandas as pd
from itertools import chain

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

Using TensorFlow backend.


In [2]:
# Image size forwarded to the data loaders and to KerasModel.
img_size = (224, 224)

# Neither value is referenced by the visible cells; presumably the colour
# channel count and the number of output tags -- TODO confirm.
img_channels, output_size = 3, 17

# Mini-batch size shared by both generators, and the step count handed to
# model.fit through its `batches` argument (850 when batch_size is 128).
batch_size = 128
batches = (850 * 128) // batch_size

In [3]:
# Load the labelled training set; img_size is forwarded to the loader, which
# also hands back labels_map.
X_train, y_train, labels_map = get_train_matrices(
    "../data/train_v2.csv",
    "../data/train-jpg",
    img_size,
)

# Hold out 20% for validation. The fixed random_state keeps the split
# repeatable. X_train / y_train are rebound to the training portion only.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=21
)

# Batch generators over the validation and training portions.
validation_generator = ValidGenerator().get_valid_generator(
    X_valid, y_valid, batch_size=batch_size
)
generator = ImageGenerator()
flow = generator.get_train_generator(
    X_train, y_train, batch_size=batch_size
)

# Last expression of the cell: its return value is what the cell displays.
gc.collect()

100%|██████████| 40479/40479 [02:08<00:00, 315.08it/s]


7

In [None]:
from tensorflow.contrib.keras.api.keras.callbacks import ModelCheckpoint

# File the callback saves to. With save_best_only=True it is only rewritten
# when the monitored quantity ('val_acc') improves.
filepath = "weights.best.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
)

In [None]:
model = KerasModel(img_size)
pretrain = True

# Bind everything that later cells read (loss curves, F-beta score,
# thresholds) before the branch, so the notebook still runs top to bottom
# when pretrain is switched off instead of failing with NameError.
train_losses, val_losses = [], []
fbeta_score, thresholds = None, None

if pretrain:
    # Step-wise schedule: each learning rate is paired with its own number
    # of epochs.
    learn_rates = [0.001, 0.0001, 0.00001, 0.000001]
    epochs_list = [30, 20, 10, 5]

    for lr, epochs in zip(learn_rates, epochs_list):
        # fbeta_score and thresholds are rebound on every stage, so only the
        # values returned by the last stage survive the loop.
        tmp_train_losses, tmp_val_losses, fbeta_score, thresholds = model.fit(
            flow,
            epochs,
            lr,
            validation_generator,
            y_valid,
            validation_batch=batch_size,
            train_callbacks=[checkpoint],
            batches=batches,
        )
        # Accumulate the histories of all stages into one curve each.
        train_losses += tmp_train_losses
        val_losses += tmp_val_losses

Epoch 1/30
113/850 [==>...........................] - ETA: 735s - loss: 0.5222 - acc: 0.7944

In [None]:
# Load weights from "weights.best.hdf5" -- the same file name the
# ModelCheckpoint callback is configured to save to -- then confirm on stdout.
model.load_weights("weights.best.hdf5")
print("Weights loaded")

In [None]:
# Overlay the accumulated training and validation loss histories on the same
# axes; the trailing semicolon suppresses the legend's repr.
for history, curve_name in ((train_losses, 'Training loss'),
                            (val_losses, 'Validation loss')):
    plt.plot(history, label=curve_name)
plt.legend();

In [None]:
# Display the F-beta score returned by the most recent model.fit call.
fbeta_score

In [None]:
def predict_test_dir(test_dir):
    """Predict every image found under ``test_dir``.

    Loads the images with ``get_test_matrices`` (using the notebook-level
    ``img_size``), runs them through the notebook-level ``model`` and drops
    the image matrix before returning, so the two test sets are never held in
    memory at the same time.

    Args:
        test_dir: Path of the directory holding the test images.

    Returns:
        A ``(predictions, filenames)`` pair, exactly as produced by
        ``model.predict`` and ``get_test_matrices`` respectively.
    """
    images, filenames = get_test_matrices(test_dir, img_size)
    dir_predictions = model.predict(images)

    del images
    gc.collect()
    return dir_predictions, filenames


# The cells below do not use the training matrices; drop these names before
# loading the test sets. Guarded so that re-running this cell (e.g. after an
# interrupted prediction run) does not fail with NameError.
try:
    del X_train, y_train
except NameError:
    pass
gc.collect()

# Predict the labels of both test directories, one after the other.
predictions, x_test_filename = predict_test_dir("../data/test-jpg")
new_predictions, x_test_filename_additional = predict_test_dir("../data/test-jpg-additional")

# Stack both result sets, keeping predictions and file names aligned.
predictions = np.vstack((predictions, new_predictions))
x_test_filename = np.hstack((x_test_filename, x_test_filename_additional))
print("Predictions shape: {}\nFiles name shape: {}\n1st predictions entry:\n{}".format(
    predictions.shape,
    x_test_filename.shape,
    predictions[0],
))

In [None]:
# Recompute the F-beta score and thresholds on the validation data; the
# thresholds are what model.map_predictions receives further down.
fb_score, thresholds = model.get_fbeta_score(validation_generator, y_valid, batch_size)

# Transpose so that each row holds one output column of `predictions`.
tags_pred = np.array(predictions).T

# Size the grid from the number of rows to plot instead of hard-coding 5 x 4
# (17 rows still give 5 x 4 and a 15 x 20 figure). squeeze=False keeps `axs`
# two-dimensional even for a single row of panels.
n_cols = 4
n_rows = max(1, -(-len(tags_pred) // n_cols))  # ceiling division
# Bind the figure to `fig` rather than `_`, which IPython uses for the last
# cell output.
fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)
axs = axs.ravel()

for i, tag_vals in enumerate(tags_pred):
    # Pass the values as `y=` explicitly: the meaning of the first positional
    # argument of sns.boxplot differs between seaborn versions.
    sns.boxplot(y=tag_vals, orient='v', palette='Set2', ax=axs[i]).set_title(labels_map[i])

# Hide the panels left over when the row count does not fill the grid.
for unused_ax in axs[len(tags_pred):]:
    unused_ax.set_visible(False)

In [None]:
# Convert the raw predictions using labels_map and the thresholds. Judging by
# how the following cells consume the result, it holds one collection of tags
# per test image -- TODO confirm against KerasModel.map_predictions.
predicted_labels = model.map_predictions(predictions, labels_map, thresholds)

In [None]:
# One space-separated tag string per entry of predicted_labels.
tags_list = [' '.join(str(tag) for tag in tags) for tags in predicted_labels]

# Rows of [file name up to its first '.', tag string], pairing file names and
# tag strings by position.
final_data = [
    [filename.split(".")[0], tag_string]
    for filename, tag_string in zip(x_test_filename, tags_list)
]

In [None]:
# Two-column table built from the [name, tags] rows; head() previews the
# first rows as the cell output.
final_df = pd.DataFrame(final_data, columns=['image_name', 'tags'])
final_df.head()

In [None]:
# How often each tag occurs across all entries of predicted_labels;
# value_counts() orders them from most to least frequent.
tags_s = pd.Series(list(chain.from_iterable(predicted_labels))).value_counts()

fig, ax = plt.subplots(figsize=(16, 8))
# Draw on the axes created above explicitly instead of relying on pyplot's
# "current axes" state, and label the figure so it can be read on its own.
sns.barplot(x=tags_s, y=tags_s.index, orient='h', ax=ax)
ax.set(xlabel='Number of predictions', ylabel='Tag', title='Predicted tag frequency');

In [None]:
# Write the submission table to CSV without the DataFrame index column.
final_df.to_csv('submission_file.csv', index=False)
# close() is implemented by KerasModel outside this notebook; presumably it
# releases the model's resources -- TODO confirm.
model.close()