In [1]:
import os
import h5py
import numpy as np
import pandas as pd
import itertools
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.models import Sequential
from keras.layers import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.layers import Activation, Dropout, Flatten, Dense
from keras.utils.np_utils import to_categorical
import matplotlib.pyplot as plt
%matplotlib inline

Using Theano backend.


### Create labels

In [16]:
# Image folders for the three data splits; the label helpers treat each
# sub-folder of a split as one category.
train_dir, val_dir, test_dir = (
    '../data/images_clothes/train_subset/',
    '../data/images_clothes/val_subset/',
    '../data/images_clothes/test_subset/',
)

In [3]:
def create_category_dict(train_dir):
    """Map consecutive integer indices to the category folder names in ``train_dir``.

    Folder names are sorted before they are numbered, so the mapping is the
    same on every run and platform (``os.listdir`` returns entries in
    arbitrary order).  Hidden entries (names starting with ``.``, such as
    ``.DS_Store``) and plain files are ignored; only sub-directories count
    as categories.

    Parameters
    ----------
    train_dir : str
        Directory that contains one sub-directory per category.

    Returns
    -------
    dict
        ``{0: name_0, 1: name_1, ...}`` with the names in sorted order.
    """
    categories = sorted(
        entry for entry in os.listdir(train_dir)
        if not entry.startswith('.')
        and os.path.isdir(os.path.join(train_dir, entry))
    )
    return dict(enumerate(categories))

In [4]:
# Build the index -> category-name lookup from the training folders.
category_dict = create_category_dict(train_dir)

In [5]:
# Display the mapping so the class indices can be checked by eye.
category_dict

{0: 'Clothing, Shoes & Jewelry -> Men -> Accessories -> Bow Ties & Cummerbunds -> Bow Ties',
 1: 'Clothing, Shoes & Jewelry -> Men -> Accessories -> Gloves & Mittens -> Cold Weather Gloves',
 2: 'Clothing, Shoes & Jewelry -> Men -> Accessories -> Hats & Caps -> Fedoras',
 3: 'Clothing, Shoes & Jewelry -> Men -> Clothing -> Active -> Active Shorts',
 4: 'Clothing, Shoes & Jewelry -> Men -> Clothing -> Active -> Athletic Socks',
 5: 'Clothing, Shoes & Jewelry -> Women -> Accessories -> Scarves & Wraps -> Cold Weather Scarves & Wraps',
 6: 'Clothing, Shoes & Jewelry -> Women -> Clothing -> Coats & Jackets -> Leather & Faux Leather',
 7: 'Clothing, Shoes & Jewelry -> Women -> Clothing -> Lingerie, Sleep & Lounge -> Intimates -> Panties -> Briefs',
 8: 'Clothing, Shoes & Jewelry -> Women -> Clothing -> Swimsuits & Cover Ups -> Tankinis'}

In [6]:
def create_labels(train_dir, categories=None):
    """Build per-category image counts and one-hot labels for an image folder.

    Category folders are visited in sorted order and every image in a folder
    gets that folder's class index, so the rows of the label matrix are
    grouped by category in sorted folder-name order.  This is the class order
    Keras' ``flow_from_directory`` documents for inferred classes; the labels
    are only meaningful when paired with features produced in that order
    (``shuffle=False``).

    Parameters
    ----------
    train_dir : str
        Directory with one sub-directory per category.  Hidden entries
        (names starting with ``.``) and plain files are skipped.
    categories : dict, optional
        ``{class_index: folder_name}`` mapping.  Defaults to the
        notebook-level ``category_dict``.

    Returns
    -------
    train_tuples : list of (int, int)
        ``(class_index, image_count)`` for each category folder found.
    train_labels : numpy.ndarray
        One-hot matrix of shape ``(total_images, n_classes)`` where
        ``n_classes`` is the largest class index in ``categories`` plus one.

    Raises
    ------
    ValueError
        If a folder in ``train_dir`` is not a known category.
    """
    if categories is None:
        # Fall back to the mapping built by create_category_dict in this notebook.
        categories = category_dict

    # Invert the mapping once instead of searching the values for every folder.
    index_by_name = {name: idx for idx, name in categories.items()}
    n_classes = max(categories) + 1 if categories else 0

    train_tuples = []
    for image_dir in sorted(os.listdir(train_dir)):
        image_dir_path = os.path.join(train_dir, image_dir)
        if image_dir.startswith('.') or not os.path.isdir(image_dir_path):
            continue
        if image_dir not in index_by_name:
            raise ValueError('Unknown category folder: {!r}'.format(image_dir))
        # Hidden files (e.g. .DS_Store) are not images and must not get a label.
        image_count = sum(
            1 for fname in os.listdir(image_dir_path) if not fname.startswith('.'))
        train_tuples.append((index_by_name[image_dir], image_count))

    # One class index per image, grouped by category.
    class_indices = np.array(
        [idx for idx, count in train_tuples for _ in range(count)], dtype=int)

    # One-hot encode with a fixed width: a width inferred from the largest
    # label present would shrink for a split that lacks the last categories.
    train_labels = np.zeros((len(class_indices), n_classes))
    train_labels[np.arange(len(class_indices)), class_indices] = 1

    return train_tuples, train_labels

In [7]:
# Per-category counts and one-hot labels for the training split.
train_tuples, train_labels = create_labels(train_dir)

In [18]:
# Per-category counts and one-hot labels for the validation split.
val_tuples, val_labels = create_labels(val_dir)

In [19]:
# Per-category counts and one-hot labels for the test split.
test_tuples, test_labels = create_labels(test_dir)

In [20]:
# Number of label rows (one per image) in each split.
# print() with a single argument behaves the same on Python 2 and 3 and
# matches the call style already used in load_vgg16.
print(train_labels.shape[0])
print(val_labels.shape[0])
print(test_labels.shape[0])

10465
1169
54


### Train model

In [24]:
# --- Files on disk -----------------------------------------------------------
# Pre-trained VGG16 weights (input) and the trained classifier weights (output).
weights_path = '../data/images/model/vgg16_weights.h5'
top_model_weights_path = '../data/images/model/top_model_weights.h5'
# Cached outputs of the VGG16 base ("bottleneck features"), one file per split.
train_bottleneck_features_path = '../data/images/model/bottleneck_features_train_subset.npy'
val_bottleneck_features_path = '../data/images/model/bottleneck_features_val_subset.npy'
test_bottleneck_features_path = '../data/images/model/bottleneck_features_test_subset.npy'

# --- Image size fed to the network -------------------------------------------
img_width = img_height = 150

# --- Number of images per split (one label row per image) --------------------
nb_train_samples = len(train_labels)
nb_validation_samples = len(val_labels)
nb_test_samples = len(test_labels)

# --- Training ----------------------------------------------------------------
nb_epoch = 3

In [22]:
def load_vgg16(weights_path='../data/images/model/vgg16_weights.h5'):
    """Build the convolutional part of VGG16 and load pre-trained weights into it.

    The network consists of five blocks, each a run of zero-padded 3x3
    convolutions followed by 2x2 max pooling; the fully-connected layers of
    the original network are not built.  The input shape is
    ``(3, img_width, img_height)``, read from the notebook-level
    ``img_width`` / ``img_height`` settings.

    Parameters
    ----------
    weights_path : str
        HDF5 file holding the VGG16 weights, organised as ``layer_<k>``
        groups with ``param_<p>`` datasets and ``nb_layers`` / ``nb_params``
        attributes.

    Returns
    -------
    keras.models.Sequential
        The convolutional base with the weights of its layers set from the file.

    Raises
    ------
    AssertionError
        If ``weights_path`` does not exist.
    """
    # (number of filters, number of 3x3 convolutions) for each VGG16 block.
    vgg_blocks = [(64, 2), (128, 2), (256, 3), (512, 3), (512, 3)]

    # build the VGG16 network
    model = Sequential()
    for block_no, (nb_filter, nb_conv) in enumerate(vgg_blocks, 1):
        for conv_no in range(1, nb_conv + 1):
            if block_no == 1 and conv_no == 1:
                # Only the very first layer declares the input shape.
                model.add(ZeroPadding2D((1, 1), input_shape=(3, img_width, img_height)))
            else:
                model.add(ZeroPadding2D((1, 1)))
            model.add(Convolution2D(nb_filter, 3, 3, activation='relu',
                                    name='conv{}_{}'.format(block_no, conv_no)))
        model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    # load the weights of the VGG16 networks
    # (trained on ImageNet, won the ILSVRC competition in 2014)
    # note: when there is a complete match between your model definition
    # and your weight savefile, you can simply call model.load_weights(filename)
    assert os.path.exists(weights_path), 'Model weights not found (see "weights_path" variable in script).'
    # Open read-only, and let the context manager close the file even if a
    # layer rejects its weights.
    with h5py.File(weights_path, 'r') as f:
        for k in range(f.attrs['nb_layers']):
            if k >= len(model.layers):
                # we don't look at the last (fully-connected) layers in the savefile
                break
            g = f['layer_{}'.format(k)]
            weights = [g['param_{}'.format(p)] for p in range(g.attrs['nb_params'])]
            model.layers[k].set_weights(weights)
    print('Model loaded.')

    return model

In [23]:
# Convolutional VGG16 base, used below to compute the bottleneck features.
model = load_vgg16()

Model loaded.


In [None]:
# Scales pixel values by 1/255. Defined in this cell so that it does not
# depend on a `datagen` left over in the kernel.
datagen = ImageDataGenerator(rescale=1./255)
generator = datagen.flow_from_directory(
        train_dir,
        target_size=(img_width, img_height),
        batch_size=32,
        class_mode=None,   # yield images only, no labels
        shuffle=False)     # fixed order, so features can be paired with train_labels
bottleneck_features_train = model.predict_generator(generator, nb_train_samples)
# np.save writes binary data: open the file in binary mode and close it afterwards.
with open(train_bottleneck_features_path, 'wb') as features_file:
    np.save(features_file, bottleneck_features_train)

In [None]:
# Scales pixel values by 1/255. Defined in this cell so that it does not
# depend on a `datagen` left over in the kernel.
datagen = ImageDataGenerator(rescale=1./255)
generator = datagen.flow_from_directory(
        val_dir,
        target_size=(img_width, img_height),
        batch_size=32,
        class_mode=None,   # yield images only, no labels
        shuffle=False)     # fixed order, so features can be paired with val_labels
bottleneck_features_validation = model.predict_generator(generator, nb_validation_samples)
# np.save writes binary data: open the file in binary mode and close it afterwards.
with open(val_bottleneck_features_path, 'wb') as features_file:
    np.save(features_file, bottleneck_features_validation)

In [None]:
# Scales pixel values by 1/255. Defined in this cell so that it does not
# depend on a `datagen` left over in the kernel.
datagen = ImageDataGenerator(rescale=1./255)
generator = datagen.flow_from_directory(
        test_dir,
        target_size=(img_width, img_height),
        batch_size=1,
        class_mode=None,   # yield images only, no labels
        shuffle=False)     # fixed order, so features can be paired with the test images
bottleneck_features_test = model.predict_generator(generator, nb_test_samples)
# np.save writes binary data: open the file in binary mode and close it afterwards.
with open(test_bottleneck_features_path, 'wb') as features_file:
    np.save(features_file, bottleneck_features_test)

In [None]:
# Load the cached bottleneck features (binary .npy files) and close the
# files once they have been read.
with open(train_bottleneck_features_path, 'rb') as features_file:
    train_data = np.load(features_file)

with open(val_bottleneck_features_path, 'rb') as features_file:
    validation_data = np.load(features_file)

# Small fully-connected classifier trained on top of the bottleneck features.
model = Sequential()
model.add(Flatten(input_shape=train_data.shape[1:]))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
# One output per label column. Softmax makes the outputs a probability
# distribution over the classes, which is what categorical_crossentropy
# with one-hot targets expects.
model.add(Dense(output_dim=train_labels.shape[1], activation='softmax'))

model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

# The validation labels are the `val_labels` returned by create_labels(val_dir).
model.fit(train_data, train_labels,
          nb_epoch=nb_epoch, batch_size=32,
          validation_data=(validation_data, val_labels))
model.save_weights(top_model_weights_path)

### Check model on the test set

In [None]:
# Read the cached test-set bottleneck features; .npy files are binary, and
# the context manager closes the file once it has been read.
with open(test_bottleneck_features_path, 'rb') as features_file:
    test_data = np.load(features_file)

In [None]:
# Class predictions of the top model for every row of the test features.
preds = model.predict_classes(test_data)

In [None]:
# Per-class scores of the top model for every row of the test features;
# used below to rank the classes of each image.
preds_proba = model.predict_proba(test_data)

In [None]:
# Number of highest-scoring categories to keep per image.
top = 3

In [None]:
# Sorting the negated scores along the last axis orders the class indices of
# each row from highest to lowest score; the first `top` columns are kept.
indices = np.argsort(-preds_proba, axis=-1)
top_preds = indices[:, :top]

In [None]:
# Collect the test image paths, skipping hidden entries.
# Both directory levels are sorted: os.listdir returns entries in arbitrary
# order, while the predictions come from flow_from_directory(shuffle=False),
# whose class order is alphanumeric.
# NOTE(review): the per-class file order of flow_from_directory is assumed to
# be sorted as well - confirm against the installed Keras version.
image_path_list = []
for image_dir in sorted(os.listdir(test_dir)):
    if image_dir.startswith('.'):
        continue
    for image in sorted(os.listdir(os.path.join(test_dir, image_dir))):
        if not image.startswith('.'):
            image_path_list.append(os.path.join(test_dir, image_dir, image))

In [None]:
# Show every test image in its own figure with its `top` predicted
# category names written as a numbered list.
for position, image_path in enumerate(image_path_list):
    pixels = plt.imread(image_path)
    plt.figure()
    # NOTE(review): plt.imread returns floats in [0, 1] for PNG files, which
    # astype('uint8') would truncate to 0 - confirm the images are not PNGs.
    plt.imshow(pixels.astype('uint8'))
    plt.axis('off')
    # Labels are drawn at fixed data coordinates: x=350, starting at y=50
    # and 25 units apart.
    for rank, label in enumerate(top_preds[position], 1):
        plt.text(350, 50 + (rank - 1) * 25, '{}. {}'.format(rank, category_dict[label]))