In [7]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from skimage.transform import resize

from keras.utils.data_utils import Sequence
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, TensorBoard, EarlyStopping
from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Dropout, Flatten, Convolution2D, GlobalAveragePooling2D, GlobalMaxPooling2D, MaxPooling2D
from keras import applications
from keras import optimizers

# Folder prefix for the on-disk inputs read below (test.json, model weights).
DATA_DIR = "../data/"
# Width of the model's final Dense layer.
NUM_CLASSES = 228
# Side length, in pixels, of the square 3-channel input the model is built for.
IMAGE_SIZE = 256

In [8]:
# Read the test-set manifest; only each entry's 'imageId' is used below.
with open(DATA_DIR + "test.json") as test:
    test_json = json.load(test)

# test_urls = [obj['url'] for obj in test_json['images']]
# Build the paths from DATA_DIR instead of repeating the "../data/" literal,
# so relocating the data folder only requires changing the constant.
test_paths = [
    "{}test/{}.jpg".format(DATA_DIR, obj['imageId'])
    for obj in test_json['images']
]
print(test_paths[:3])

['../data/test/1.jpg', '../data/test/2.jpg', '../data/test/3.jpg']


In [9]:
# VGG19 convolutional stack with ImageNet weights; its stock classifier head is
# left out because a custom head is attached below.
conv_base = applications.VGG19(
    weights="imagenet",
    include_top=False,
    input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3),
)
# Freeze the first five layers of the backbone.
for layer in conv_base.layers[:5]:
    layer.trainable = False

# Backbone, then two 1024-unit ReLU layers (dropout after the first) and a
# NUM_CLASSES-way softmax output.
model = Sequential([
    conv_base,
    Flatten(),
    Dense(1024, activation='relu'),
    Dropout(0.5),
    Dense(1024, activation='relu'),
    Dense(NUM_CLASSES, activation='softmax'),
])

# Restore the saved checkpoint, then compile.
model.load_weights(DATA_DIR + "weights-improvement-04-0.05.hdf5")
model.compile(
    optimizer=optimizers.SGD(lr=0.0001, momentum=0.9),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# model = Sequential()
# model.add(Convolution2D(32, kernel_size=(3, 3),padding='same',input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3)))
# model.add(Activation('relu'))
# model.add(Convolution2D(32, (3, 3)))
# model.add(Activation('relu'))
# model.add(MaxPooling2D(pool_size=(2, 2)))
# model.add(Dropout(0.25))

# model.add(Convolution2D(64,(3, 3), padding='same'))
# model.add(Activation('relu'))
# model.add(Convolution2D(64, (3, 3)))
# model.add(Activation('relu'))
# model.add(MaxPooling2D(pool_size=(2, 2)))
# model.add(Dropout(0.25))

# model.add(Flatten())
# model.add(Dense(512))
# model.add(Activation('relu'))
# model.add(Dropout(0.5))
# model.add(Dense(NUM_CLASSES, activation='sigmoid'))

# sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
# model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])
# print(model.summary())


In [11]:
class TestBatchSequence(Sequence):
    """Keras ``Sequence`` that yields batches of test images, without labels.

    Each item is an array of shape ``(batch, IMAGE_SIZE, IMAGE_SIZE, 3)`` built
    from the image files listed in ``x_set``. A file that cannot be loaded into
    that shape is reported with ``print`` and replaced by a constant
    placeholder (every value 1), so one bad file does not abort a whole
    prediction run.

    Args:
        x_set: Image file paths; must support ``len`` and slicing.
        batch_size: Maximum number of images per batch.
        resize: When true, images are scaled to ``IMAGE_SIZE`` x ``IMAGE_SIZE``
            after loading; when false they are expected to have that size
            already.
    """

    def __init__(self, x_set, batch_size, resize = False):
        self.x = x_set
        self.batch_size = batch_size
        self.resize = resize

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        return int(np.ceil(len(self.x) / float(self.batch_size)))

    def _load_image(self, path):
        """Load ``path`` as an ``(IMAGE_SIZE, IMAGE_SIZE, 3)`` array.

        Raises:
            ValueError: If the decoded image does not have the expected shape.
        """
        # The context manager releases the file handle that a bare
        # Image.open() would keep open until garbage collection.
        with Image.open(path) as img:
            if self.resize:
                img.thumbnail((IMAGE_SIZE, IMAGE_SIZE))
            # Grayscale, palette, RGBA and CMYK files do not decode to three
            # channels on their own; convert() returns an independent copy.
            rgb = img.convert('RGB')
        if self.resize and rgb.size != (IMAGE_SIZE, IMAGE_SIZE):
            # thumbnail() keeps the aspect ratio and never enlarges, so it
            # cannot guarantee the exact target size by itself.
            rgb = rgb.resize((IMAGE_SIZE, IMAGE_SIZE))
        image = np.array(rgb)
        expected_shape = (IMAGE_SIZE, IMAGE_SIZE, 3)
        if image.shape != expected_shape:
            raise ValueError(
                "{}: expected image shape {}, got {}".format(
                    path, expected_shape, image.shape))
        return image

    def __getitem__(self, idx):
        """Return batch ``idx`` as a stacked array of images."""
        batch_x = self.x[idx * self.batch_size:(idx + 1) * self.batch_size]
        images = np.empty([len(batch_x), IMAGE_SIZE, IMAGE_SIZE, 3])
        for i, path in enumerate(batch_x):
            try:
                image = self._load_image(path)
            except Exception as e:
                # Deliberate best effort: report the failure and substitute a
                # placeholder so the batch keeps its expected shape.
                print(e)
                image = np.ones((IMAGE_SIZE, IMAGE_SIZE, 3), dtype='uint8')
            images[i, ...] = image
        return images

In [12]:
%%time
BATCH = 64
# Number of *full* batches; kept in case another cell still reads it.
STEPS = len(test_paths) // BATCH

test_seq = TestBatchSequence(test_paths, BATCH)

# Run exactly one step per batch in the sequence. `len(test_paths) // BATCH + 1`
# asks for one batch too many whenever the image count is an exact multiple of
# BATCH; len(test_seq) already rounds up for a trailing partial batch.
probs = model.predict_generator(
    test_seq,
    steps = len(test_seq),
    workers = 5,
    verbose = 1
)


Wall time: 4min 6s


In [27]:
def generate_prob_labels(probas, threshold=0.04):
    """Turn per-class scores into lists of 1-based label ids.

    Args:
        probas: Rows of per-class scores, one row per sample (for example a
            2-D array or a list of lists).
        threshold: A class is selected when its score is strictly greater than
            this value. Defaults to 0.04.

    Returns:
        A list with one entry per row: the selected label ids in ascending
        order, where id ``k`` corresponds to column ``k - 1``.
    """
    # Distinct names for the row and column loops; the column index is shifted
    # by one because label ids start at 1.
    return [
        [class_idx + 1 for class_idx, score in enumerate(row) if score > threshold]
        for row in probas
    ]

# Threshold the predicted scores into per-image label lists and preview the
# first five.
prob_labels = generate_prob_labels(probs)
print(prob_labels[:5])
# print(generate_prob_labels

[[17, 36, 62, 66, 105, 214, 225], [17, 20, 62, 66, 105, 154, 164, 214], [2, 44, 66, 71, 139, 154, 158, 180, 186], [53, 66, 82, 138, 153, 164, 190], [36, 62, 66, 70, 133, 153, 171, 184]]


In [28]:
# images = [np.array(Image.open(path)) for path in test_paths[:1000]]
# subset = test_paths[:1000]
# Load the whole test set into one uint8 array and predict on it in one call.
images = np.empty([len(test_paths), IMAGE_SIZE, IMAGE_SIZE, 3], dtype='uint8')
for i, path in enumerate(test_paths):
    try:
        # The context manager closes the file handle promptly; convert('RGB')
        # forces three channels so grayscale/RGBA/CMYK files fit the array.
        with Image.open(path) as img:
            image = np.array(img.convert('RGB'))
        if image.shape != (IMAGE_SIZE, IMAGE_SIZE, 3):
            raise ValueError(
                "{}: expected image shape {}, got {}".format(
                    path, (IMAGE_SIZE, IMAGE_SIZE, 3), image.shape))
    except Exception as e:
        # Best effort: report the failure and use a constant placeholder
        # (every value 1) so the row still has the expected shape.
        print(e)
        image = np.ones((IMAGE_SIZE, IMAGE_SIZE, 3), dtype='uint8')
    images[i, ...] = image

evals = model.predict_proba(images)

In [10]:
# Sanity check: dimensions of the prediction array.
print(probs.shape)
# print(evals.shape)

def generate_label_predictions(probas):
    """Collect, for every row, the 1-based positions whose value equals 1.0.

    Args:
        probas: Indexable collection of rows of per-class values; must
            support ``len``.

    Returns:
        A list with one list per row, holding ``position + 1`` for each entry
        of the row that compares equal to ``1.0``.
    """
    label_preds = []
    for row_idx in range(len(probas)):
        row = list(probas[row_idx])
        label_preds.append(
            [pos + 1 for pos, value in enumerate(row) if value == 1.0]
        )
    return label_preds

# Convert the predictions into per-row label lists (entries equal to 1.0).
label_predicts = generate_label_predictions(probs)

# Print the label lists of the first ten rows.
for i in range(10):
    print(label_predicts[i])

(39706, 228)
[1, 10, 11, 12, 14, 15, 16, 17, 19, 21, 28, 29, 32, 36, 41, 43, 44, 45, 46, 47, 48, 49, 53, 56, 57, 62, 63, 66, 67, 71, 74, 75, 76, 78, 83, 87, 88, 89, 90, 92, 93, 94, 95, 96, 99, 106, 107, 108, 113, 115, 118, 119, 120, 123, 128, 131, 132, 133, 134, 136, 139, 140, 142, 143, 147, 148, 151, 152, 153, 157, 158, 161, 164, 165, 169, 171, 179, 182, 184, 185, 186, 189, 191, 194, 196, 197, 198, 205, 207, 209, 212, 215, 216, 218, 219, 221, 224, 225, 228]
[1, 10, 11, 12, 14, 15, 16, 17, 19, 21, 28, 29, 32, 36, 41, 43, 44, 45, 46, 47, 48, 49, 53, 56, 57, 62, 63, 66, 67, 71, 74, 75, 76, 78, 83, 88, 89, 90, 92, 93, 94, 95, 96, 98, 99, 106, 107, 108, 113, 115, 118, 119, 120, 123, 128, 131, 132, 133, 134, 136, 139, 140, 142, 143, 147, 148, 151, 152, 153, 157, 158, 161, 164, 165, 169, 171, 179, 182, 184, 185, 186, 189, 194, 196, 197, 198, 205, 207, 209, 212, 215, 216, 218, 219, 221, 224, 225, 228]
[1, 10, 11, 12, 14, 15, 16, 17, 19, 21, 28, 29, 32, 36, 41, 43, 44, 45, 46, 47, 48, 49, 53, 

In [29]:
# Write the thresholded labels as a two-column CSV: a 1-based row number, then
# that row's label ids separated by single spaces.
# NOTE(review): `tqdm` is not imported in the cells visible here; confirm it is
# brought into scope elsewhere.
# NOTE(review): the id column is the row position, which presumably matches
# the manifest's imageId values; verify they are sequential from 1.
with open("submission-4-6-2018.csv","w") as f:
    f.write("image_id,label_id\n")
    for image_id, labels in tqdm(enumerate(prob_labels, start=1), total = len(prob_labels)):
        f.write("{},{}\n".format(image_id, " ".join(map(str, labels))))

# with open("submission-6.1.csv","w") as f:
#     f.write("image_id, label_id\n")
#     for i, labels in tqdm(enumerate(label_predicts), total = len(label_predicts)):
#         output_labels = " ".join(str(x) for x in labels)
#         f.write("{}, {}\n".format(i + 1, output_labels))




In [12]:
def path_to_image(path):
    """Load an image file as an ``(IMAGE_SIZE, IMAGE_SIZE, 3)`` array.

    Args:
        path: Location of the image file to open.

    Returns:
        The image as a numpy array with three colour channels, scaled to
        ``IMAGE_SIZE`` x ``IMAGE_SIZE``.
    """
    # The context manager releases the file handle that a bare Image.open()
    # would keep open until garbage collection.
    with Image.open(path) as img:
        img.thumbnail((IMAGE_SIZE, IMAGE_SIZE))
        # Force three channels; convert() returns an independent copy, so the
        # pixel data stays usable after the file is closed.
        rgb = img.convert('RGB')
    if rgb.size != (IMAGE_SIZE, IMAGE_SIZE):
        # thumbnail() keeps the aspect ratio and never enlarges, so non-square
        # or small images still need an explicit resize to the exact size.
        rgb = rgb.resize((IMAGE_SIZE, IMAGE_SIZE))
    return np.array(rgb)
#     return np.array(Image.open(path))

In [None]:
# Predict each test image on its own and write its ten highest-scoring labels.
# NOTE(review): `tqdm` is not imported in the cells visible here; confirm it is
# brought into scope elsewhere.
with open("submission.csv","w") as f:
    f.write("image_id,label_id\n")
    for i, image_url in tqdm(enumerate(test_paths), total = len(test_paths)):
        # image = url_to_image(image_url)
        image = path_to_image(DATA_DIR + "test/{}.jpg".format(i + 1))
        prob = model.predict_proba(image.reshape(1, IMAGE_SIZE, IMAGE_SIZE, 3))
        # argsort is ascending, so reverse it and keep the first ten entries.
        sorted_args = np.argsort(prob)[0][::-1][:10]
        # argsort yields 0-based column indices, whereas label ids are 1-based
        # (the other label helpers in this notebook emit `index + 1`).
        output_labels = " ".join(str(x + 1) for x in sorted_args)
#         print("{}, {}\n".format(i + 1, output_labels))
        f.write("{},{}\n".format(i + 1, output_labels))