In [None]:
!pip install kaggle
!mkdir .kaggle
!touch .kaggle/kaggle.json 
!chmod 600 .kaggle/kaggle.json
# add kaggle_creds to kaggle.json
!kaggle competitions download -c imaterialist-challenge-fashion-2018

In [None]:
!unzip /content/.kaggle/competitions/imaterialist-challenge-fashion-2018/test.json.zip -d data/
!unzip /content/.kaggle/competitions/imaterialist-challenge-fashion-2018/train.json.zip -d data/
!unzip /content/.kaggle/competitions/imaterialist-challenge-fashion-2018/validation.json.zip -d data/

In [2]:
import json
import threading
import json
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
from urllib.request import urlopen
from PIL import Image
from skimage.io import imread
from skimage.transform import resize

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score

from keras.utils.data_utils import Sequence
from keras.callbacks import ModelCheckpoint   
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, GlobalAveragePooling2D, GlobalMaxPooling2D, MaxPooling2D
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input, decode_predictions

# Directory the train/test/validation JSON files are read from.
DATA_DIR = "data/"
# Length of each multi-hot label row and width of the model's output layer.
NUM_CLASSES = 228
# Side length, in pixels, of the square images fed to the network.
IMAGE_SIZE = 64

In [3]:
# Parse each split's annotation file from DATA_DIR.
with open(DATA_DIR + "train.json") as train:
    train_json = json.load(train)
with open(DATA_DIR + "test.json") as test:
    test_json = json.load(test)
with open(DATA_DIR + "validation.json") as validation:
    validation_json = json.load(validation)

# Keep only the download URL of every image record, split by split.
train_urls, test_urls, validation_urls = (
    [record['url'] for record in split['images']]
    for split in (train_json, test_json, validation_json)
)

# Show the first three URLs of each split as a quick sanity check.
print(train_urls[:3], test_urls[:3], validation_urls[:3], sep="\n")

FileNotFoundError: [Errno 2] No such file or directory: 'data/train.json'

In [None]:
def generate_label_array(json_obj):
    """Build one multi-hot label row per annotation in ``json_obj``.

    Every entry of ``json_obj['annotations']`` supplies a ``'labelId'``
    iterable. Each id is converted with ``int`` and position ``id - 1`` of a
    row of ``NUM_CLASSES`` zeros is set to 1.

    Returns:
        The rows stacked into a NumPy array, in annotation order.
    """
    def to_row(label_ids):
        # Start from all zeros and switch on the slot of every listed label.
        row = [0] * NUM_CLASSES
        for label_id in label_ids:
            row[int(label_id) - 1] = 1
        return row

    return np.array([to_row(entry['labelId']) for entry in json_obj['annotations']])

# Multi-hot targets for the two splits whose JSON is passed to the label builder.
train_labels, validation_labels = map(
    generate_label_array, (train_json, validation_json)
)

In [None]:
TARGET_SIZE = (IMAGE_SIZE, IMAGE_SIZE)

# Pick one random training example and preview it at the network's input size.
rand_img = np.random.randint(0, len(train_urls))
# Label row of the same example; the width follows NUM_CLASSES instead of a
# hard-coded 228 so it cannot drift from the config constant.
img_label = np.array(train_labels[rand_img]).reshape(1, NUM_CLASSES)
img_path = train_urls[rand_img]
# Close the HTTP response once the pixels are read. convert()/resize() force
# PIL to decode the image while the stream is still open.
with urlopen(img_path) as img_file:
    image = Image.open(img_file)
    # Image.LANCZOS is the filter that Image.ANTIALIAS used to alias; the
    # ANTIALIAS name no longer exists in Pillow 10+. Converting to RGB first
    # makes palette/grayscale images display with their real colours.
    # resize() already yields exactly TARGET_SIZE, so no thumbnail() pass is needed.
    image_resized = image.convert('RGB').resize(TARGET_SIZE, Image.LANCZOS)
plt.imshow(np.asarray(image_resized))
plt.show()

In [None]:
class BatchSequence(Sequence):
    """Keras ``Sequence`` that downloads and decodes images one batch at a time.

    Args:
        x_set: indexable collection of image URLs.
        y_set: indexable collection of label rows, aligned with ``x_set``.
        batch_size: number of examples per batch.
    """

    def __init__(self, x_set, y_set, batch_size):
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        return int(np.ceil(len(self.x) / float(self.batch_size)))

    def __getitem__(self, idx):
        """Return ``(images, labels)`` arrays for batch number ``idx``."""
        start = idx * self.batch_size
        stop = start + self.batch_size
        batch_x = self.x[start:stop]
        batch_y = self.y[start:stop]

        images = [self.url_to_image(file_name) for file_name in batch_x]
        return np.array(images), np.array(batch_y)

    def url_to_image(self, url):
        """Download ``url`` and return it as an RGB array of the model's input size.

        Downloading is best effort: if the request fails or the payload cannot
        be decoded, a constant placeholder image (all ones, uint8) is used
        instead so that one bad URL does not abort a whole batch.

        Returns:
            The image (or placeholder) after ``skimage.transform.resize`` to
            ``(IMAGE_SIZE, IMAGE_SIZE)``.
        """
        try:
            # ``urlopen`` is the name imported from urllib.request; the module
            # itself is not imported, so ``urllib.urlopen`` was a NameError
            # that the former bare ``except`` silently swallowed.
            with urlopen(url) as resp:
                raw = np.frombuffer(resp.read(), dtype='uint8')
            image = cv2.imdecode(raw, cv2.IMREAD_COLOR)
            if image is None:
                # imdecode signals undecodable data by returning None, not by raising.
                raise ValueError("could not decode image data")
            # OpenCV decodes to BGR; reverse the channel axis to get RGB.
            image = np.array(image[..., ::-1])
        except Exception:
            # Placeholder stays a NumPy array so the resize below works on it
            # exactly as it does on a decoded image.
            image = np.ones((IMAGE_SIZE, IMAGE_SIZE, 3), dtype='uint8')
        return resize(image, (IMAGE_SIZE, IMAGE_SIZE))

In [None]:
# Pretrained VGG16 convolutional stack used as a fixed feature extractor.
# ``classes`` is not passed: it only configures the classification head,
# which include_top=False leaves out.
conv_base = VGG16(
    weights='imagenet',
    input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3),
    include_top=False,
)
# Freeze the pretrained weights so only the new head below is trained.
conv_base.trainable = False

model = Sequential()
model.add(conv_base)
model.add(GlobalMaxPooling2D())
model.add(Dropout(0.3))
model.add(Dense(30, activation='relu'))
model.add(Dropout(0.1))
# Sigmoid rather than softmax: the label rows are multi-hot (one image can
# carry several labels), so each class needs its own independent probability
# instead of sharing a single distribution that sums to 1.
# NOTE(review): a per-class loss such as binary_crossentropy is the matching
# choice at compile time.
model.add(Dense(NUM_CLASSES, activation='sigmoid'))
# summary() prints the table itself and returns None, so it is not wrapped
# in print() (which added a stray "None" line).
model.summary()

In [None]:
# Train on a subset of the images, since the full set takes very long.
TRAIN_SUBSET = 200000
train_urls = train_urls[:TRAIN_SUBSET]
train_labels = train_labels[:TRAIN_SUBSET]

EPOCHS = 1
BATCH = 128

train_gen = BatchSequence(train_urls, train_labels, BATCH)
val_gen = BatchSequence(validation_urls, validation_labels, BATCH)

# Take the step counts from the sequences themselves: their length rounds up,
# so the final partial batch is included instead of being dropped by a floor
# division.
STEPS = len(train_gen)
VAL_STEPS = len(val_gen)

model.compile(
    # The targets are multi-hot rows (several labels per image), so every class
    # is scored as its own yes/no decision rather than as one categorical choice.
    # NOTE(review): this expects independent per-class probabilities (a sigmoid
    # output layer) from the model.
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy']
)

# Keep only the best weights seen so far on disk.
checkpointer = ModelCheckpoint(
    filepath='model.best.hdf5',
    verbose=1,
    save_best_only=True
)

history = model.fit_generator(
    generator=train_gen,
    validation_data=val_gen,
    epochs=EPOCHS,
    steps_per_epoch=STEPS,
    validation_steps=VAL_STEPS,
    callbacks=[checkpointer],
)