In [1]:
import json
import threading
import json
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
from skimage.io import imread
from skimage.transform import resize

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score

from keras.utils.data_utils import Sequence
from keras.callbacks import ModelCheckpoint   
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, GlobalAveragePooling2D, GlobalMaxPooling2D, MaxPooling2D
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input, decode_predictions

DATA_DIR = "../input/"
NUM_CLASSES = 228
IMAGE_SIZE = 128

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
%%time
with open(DATA_DIR + "train.json") as train, open(DATA_DIR + "test.json") as test, open(DATA_DIR + "validation.json") as validation:
    train_json = json.load(train)
    test_json = json.load(test)
    validation_json = json.load(validation)
    
train_paths = ["../input/train/{}.jpg".format(obj['imageId']) for obj in train_json['images']]
test_paths = ["../input/test/{}.jpg".format(obj['imageId']) for obj in test_json['images']]
validation_paths = ["../input/validation/{}.jpg".format(obj['imageId']) for obj in validation_json['images']]

def generate_label_array(json_obj):
    result = []
    for data in json_obj['annotations']:
        temp_array = [0] * NUM_CLASSES
        for elem in data['labelId']:
            temp_array[int(elem) - 1] = 1
        result.append(temp_array)
    return np.array(result)

train_labels = generate_label_array(train_json)
validation_labels = generate_label_array(validation_json)

Wall time: 18.2 s


In [3]:
train_paths = train_paths[:1000]
validation_paths = validation_paths[:1000]

train_labels = train_labels[:1000]
validation_labels = validation_labels[:1000]

In [4]:
class BatchSequence(Sequence):
    def __init__(self, x_set, y_set, batch_size):
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size

    def __len__(self):
        return int(np.ceil(len(self.x) / float(self.batch_size)))

    def __getitem__(self, idx):
        batch_x = self.x[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_y = self.y[idx * self.batch_size:(idx + 1) * self.batch_size]

        return np.array([
            resize(imread(file_name), (IMAGE_SIZE, IMAGE_SIZE))
               for file_name in batch_x]), np.array(batch_y)


In [5]:
conv_base = VGG16(
    weights='imagenet',
    input_shape = (IMAGE_SIZE, IMAGE_SIZE, 3),
    include_top = False,
    classes = NUM_CLASSES
)

model = Sequential()
model.add(conv_base)
model.add(GlobalMaxPooling2D())
model.add(Dropout(0.3))
model.add(Dense(30, activation = 'relu'))
model.add(Dropout(0.1))
model.add(Dense(NUM_CLASSES, activation='softmax'))
conv_base.trainable = False
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
vgg16 (Model)                (None, 4, 4, 512)         14714688  
_________________________________________________________________
global_max_pooling2d_1 (Glob (None, 512)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 30)                15390     
_________________________________________________________________
dropout_2 (Dropout)          (None, 30)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 228)               7068      
Total params: 14,737,146
Trainable params: 22,458
Non-trainable params: 14,714,688
___________________________________________________________

In [None]:
EPOCHS = 5
BATCH = 32
STEPS = len(train_paths) // BATCH
VAL_STEPS = len(validation_paths) // BATCH

train_gen = BatchSequence(train_paths, train_labels, BATCH)
val_gen = BatchSequence(validation_paths, validation_labels, BATCH)


model.compile(
    loss='categorical_crossentropy', 
    optimizer='rmsprop', 
    metrics=['accuracy']
)

checkpointer = ModelCheckpoint(
    filepath='model.best.hdf5', 
    verbose=1,
    save_best_only=True
)

history = model.fit_generator(
    generator = train_gen,
    validation_data = val_gen,
    epochs = EPOCHS,
    steps_per_epoch = STEPS,
    callbacks = [checkpointer],
)

Epoch 1/5


  warn("The default mode, 'constant', will be changed to 'reflect' in "



Epoch 00001: val_loss improved from inf to 36.64746, saving model to model.best.hdf5
Epoch 2/5

Epoch 00002: val_loss improved from 36.64746 to 35.54256, saving model to model.best.hdf5
Epoch 3/5

Epoch 00003: val_loss improved from 35.54256 to 35.26704, saving model to model.best.hdf5
Epoch 4/5

Epoch 00004: val_loss improved from 35.26704 to 35.08957, saving model to model.best.hdf5
Epoch 5/5
 5/31 [===>..........................] - ETA: 1:20 - loss: 23.4729 - acc: 0.1437