# Farfetch case study

Product description generation.

The objective of this script is to use deep learning techniques (CNN, LSTM) to generate product descriptions.

- Author: Kai Chen
- Date: Apr, 2018


### Reference
- https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/



In [4]:
import string
import os
import sys
import itertools
import operator
from random import shuffle

import numpy as np
from numpy import argmax

import pandas as pd

from pickle import dump
from pickle import load

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline

from keras.preprocessing.image import load_img, img_to_array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers.merge import add
from keras.callbacks import ModelCheckpoint
from keras import callbacks, applications, optimizers
from keras.models import load_model
from keras.applications.vgg16 import preprocess_input

from nltk.translate.bleu_score import corpus_bleu

np.random.seed(42)

In [5]:
# ---------------------
# Define the file paths (relative to the working directory of the notebook)

# Product table; it is read into `df_product` in the next step.
PRODUCT_CSV_FILE = 'data/products.csv'
# Presumably the product attribute table; it is not read anywhere in the
# visible part of this notebook.
ATTRIBUTE_CSV_FILE = 'data/attributes.csv'

## Step 1: read and explore the data

In [6]:
# Load the product table, then preview its first five rows and its shape.
df_product = pd.read_csv(PRODUCT_CSV_FILE)

display(df_product.head())
display(df_product.shape)

Unnamed: 0,ProductId,Description,DescriptionDate,SeasonOriginal,ProductName,Brand,Gender,Colour,Family,Category,ArticlePhotoId,CreateDate
0,11295277,VIPE6CE-169953MCC 38NO,2016-01-07 13:13:09.527,SS15,Celine Black Phantom Bag,CELINE COLLARD,WOMEN,BLACK,Bags,Tote Bags,6129459,2016-01-07 13:10:46.507
1,11292059,Grey cotton 'Skip' crewneck from S.N.S. Herni...,2016-01-08 14:45:59.673,SS16,'Skip' crewneck,S.N.S. HERNING,MEN,GREY,Clothing,Sweaters & Knitwear,6156126,2016-01-04 19:52:05.203
2,11290981,Multicolour cotton 'Pak’r Tatenda' backpack fr...,2016-01-11 19:27:45.330,SS16,'Pakr Tatenda' backpack,EASTPAK,UNISEX,YELLOW & ORANGE,Bags,Backpacks,6216609,2016-01-03 15:21:20.480
3,11293179,Ivory white and black hemp-cotton blend 'Honey...,2016-01-13 11:33:11.150,SS16,'Honey' wide brim hat,EUGENIA KIM,WOMEN,WHITE,Accessories,Hats,6199465,2016-01-05 18:08:57.317
4,11293099,Ivory white cotton embroidered lace frilled dr...,2016-01-13 15:22:08.247,SS16,embroidered lace frilled dress,RED VALENTINO,WOMEN,WHITE,Clothing,Dresses,6173011,2016-01-05 16:55:35.427


(12631, 12)

In [7]:
# Distinct product ids found in the CSV file, kept as a NumPy array.
list_product_id_df = np.array(df_product['ProductId'].unique())

print('number of products {} in the csv file'.format(len(list_product_id_df)))

number of products 12631 in the csv file


In [8]:
# Create a dictionary with key: photo id -> value: product id
# Note one photo belongs only to one product

list_photo_id = df_product['ArticlePhotoId'].unique()

# Keep the first row of every photo id and build the mapping in one pass.
# Filtering the whole DataFrame once per photo id is quadratic in the number
# of rows; the result (first matching product id per photo) is the same.
_first_row_per_photo = df_product.drop_duplicates(subset='ArticlePhotoId', keep='first')
dict_photo_product_id = dict(
    zip(_first_row_per_photo['ArticlePhotoId'], _first_row_per_photo['ProductId'])
)

In [11]:
# Update the list_product_id, such that each product should have an image

list_product_id = []

img_dir_path = "data/images/"

dirs = os.listdir(img_dir_path)

# Build the lookup set once: membership tests become O(1) instead of a scan
# of the whole id array for every image file.
known_product_ids = set(list_product_id_df.tolist())

for file_name in dirs:
    file_path = os.path.join(img_dir_path, file_name)

    # The product id is the part of the file name before the first '_'.
    # Files without a numeric prefix (e.g. hidden system files) are reported
    # and skipped instead of aborting the whole scan with a ValueError.
    try:
        product_id = int(file_name.split('_')[0])
    except ValueError:
        product_id = None

    if product_id is None or product_id not in known_product_ids:
        print('photo {} does not have product information'.format(file_path))
    else:
        list_product_id.append(product_id)

print('number of products: {}'.format(len(list_product_id)))

number of products: 12436


## Step 2: image data preparation

In [None]:
print('Preparing the image data ...')

# extract VGG16 features
def extract_features(dict_product_img):
    """
    extract VGG16 features for every product image

    dict_product_img: dictionary product id -> image array, passed as-is to
        model.predict (the caller is expected to have resized and
        preprocessed it already)
    return: dictionary product id -> output of the second-to-last VGG16 layer
    """
    model = applications.VGG16()
    # Take the output of the second-to-last layer, i.e. drop the final
    # classification layer. Indexing with -2 replaces the former
    # `model.layers.pop()` + `layers[-1]` combination: on recent Keras
    # releases `model.layers` is a copy, so pop() silently removes nothing
    # and the class probabilities would be returned instead of the features.
    model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
    # summary() prints by itself and returns None; wrapping it in print()
    # only adds a stray "None" line.
    model.summary()

    features = dict()
    for product_id, img in dict_product_img.items():
        features[product_id] = model.predict(img, verbose=0)

    return features

# Create a dictionary with
# key: product id, value: image
dict_product_img = dict()

img_width, img_height = 224, 224

img_dir_path = "data/images/"
dirs = os.listdir(img_dir_path)

product_image_feature_file_path = 'product-vgg-features.pkl'

# Build the lookup set once so that the per-file membership test is O(1)
# instead of a scan of the whole list.
set_product_id_with_info = set(list_product_id)

# NOTE(review): every preprocessed image is kept in memory until the features
# are extracted (224 * 224 * 3 float32 values, roughly 0.6 MB per image).
# If this does not fit in RAM, the features have to be extracted image by image.
for file_name in dirs:
    file_path = os.path.join(img_dir_path, file_name)

    # The product id is the part of the file name before the first '_';
    # files without a numeric prefix are reported and skipped.
    try:
        product_id = int(file_name.split('_')[0])
    except ValueError:
        product_id = None

    # Check the id before decoding the image: files that are not used do not
    # need to be loaded at all.
    if product_id is None or product_id not in set_product_id_with_info:
        print('photo {} does not have product information'.format(file_path))
        continue

    # load_img expects target_size as (height, width)
    img = load_img(file_path, target_size=(img_height, img_width))   # this is a PIL image
    x = img_to_array(img)                                            # Numpy array, (img_height, img_width, 3) with the default channels_last format
    x = x.reshape((1, x.shape[0], x.shape[1], x.shape[2]))           # add a leading batch axis: (1, img_height, img_width, 3)
    # prepare the image for the VGG model
    x = preprocess_input(x)

    dict_product_img[product_id] = x

for product_id in list_product_id_df:
    if product_id not in dict_product_img:
        print('product {} does not have an image'.format(product_id))

# extract VGG16 features
dict_product_img_features = extract_features(dict_product_img)
# save the features to file; the context manager closes (and flushes) the
# file even if pickling fails half-way
with open(product_image_feature_file_path, 'wb') as feature_file:
    dump(dict_product_img_features, feature_file)

print('save product image features to {}'.format(product_image_feature_file_path))


## Step 3: prepare text data

In [None]:
print('Preparing text data ...')

def clean_descriptions(descriptions):
    """
    https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/

    clean every description of the dictionary in place (nothing is returned):
    - lowercase each word
    - strip the ASCII punctuation characters from each word
    - drop words that are one character or less in length
    - drop words that are not purely alphabetic (e.g. words with digits)

    a description that ends up empty is reported and its original text is
    left untouched in the dictionary
    """
    # translation table that deletes every ASCII punctuation character
    punctuation_remover = str.maketrans('', '', string.punctuation)

    for product_id, raw_text in descriptions.items():
        kept_words = []
        for token in raw_text.split():
            token = token.lower().translate(punctuation_remover)
            # the length filter removes hanging 's' and 'a'
            if len(token) > 1 and token.isalpha():
                kept_words.append(token)

        cleaned = ' '.join(kept_words)
        if cleaned:
            descriptions[product_id] = cleaned
        else:
            print('cleaned description of product {} is empty'.format(product_id))

def to_vocabulary(descriptions):
    """
    convert the loaded descriptions into a vocabulary of words

    descriptions: dictionary whose values are either one description string
        or an iterable of description strings
    return: set of all distinct whitespace-separated words
    """
    vocabulary = set()
    for value in descriptions.values():
        # A plain string must be treated as ONE description. Iterating over
        # it would walk its characters and produce a vocabulary of letters.
        texts = [value] if isinstance(value, str) else value
        for text in texts:
            vocabulary.update(text.split())
    return vocabulary


def save_descriptions(descriptions, filename):
    """
    save descriptions to file, one per line

    every line has the form "<product id> <description>"; the lines are
    joined with a newline and no trailing newline is written

    descriptions: dictionary product id -> description string
    filename: path of the text file to (over)write
    """
    lines = list()
    for key, desc in descriptions.items():
        if not desc:
            # reported only; the (empty) description is still written
            print('product {} does not have a description'.format(key))
        lines.append(str(key) + ' ' + desc)

    # the context manager closes the file even if the write fails
    with open(filename, 'w') as file:
        file.write('\n'.join(lines))
    

product_description_file_path = 'product-descriptions.txt'

# create a dictionary with key: product id -> value: description
dict_product_des = dict()

# Look the descriptions up once (first row per product id) instead of
# filtering the whole DataFrame for every product, which is quadratic.
_first_row_per_product = df_product.drop_duplicates(subset='ProductId', keep='first')
_description_by_product_id = dict(
    zip(_first_row_per_product['ProductId'], _first_row_per_product['Description'])
)

for product_id in list_product_id:
    # we assume that one product has only one description.
    if product_id in dict_product_des:
        print('product {} has more than one description'.format(product_id))
    description = _description_by_product_id[product_id]
    # pandas reads an empty CSV cell as NaN, which is a truthy float: a plain
    # `not description` test lets it through and the cleaning step then fails
    # on `.split()`. Only non-empty strings are accepted.
    if not isinstance(description, str) or not description:
        print('product {} does not have a description'.format(product_id))
    else:
        dict_product_des[product_id] = description

print('before clean')
for product_id in list_product_id[0:5]:
    # products without a description were skipped above
    if product_id in dict_product_des:
        print(dict_product_des[product_id])

# clean the descriptions
clean_descriptions(dict_product_des)

print('after clean')
for product_id in list_product_id[0:5]:
    if product_id in dict_product_des:
        print(dict_product_des[product_id])

# summarize vocabulary
vocabulary = to_vocabulary(dict_product_des)
print('Vocabulary Size: %d' % len(vocabulary))

# save descriptions
save_descriptions(dict_product_des, product_description_file_path)

print('save product description to {}'.format(product_description_file_path))
    

## Step 4: prepare the train and test sets

In [None]:

def load_doc(filename):
    """
    load doc into memory

    filename: path of the text file to read
    return: the whole content of the file as one string
    """
    # the context manager closes the file even if reading fails
    with open(filename, 'r') as file:
        return file.read()

def load_clean_descriptions(filename, list_product_id):
    """
    load clean descriptions into memory

    every non-blank line of the file must have the form
    "<product id> <word> <word> ..."; a first token that is not an integer
    raises ValueError

    filename: path of the description file
    list_product_id: product ids to keep; other lines are ignored
    return: dictionary product id -> list of descriptions, each one wrapped
        as 'startseq ... endseq'
    """
    # set membership is O(1); testing against the list for every line is not
    wanted_ids = set(list_product_id)

    descriptions = dict()
    for line in load_doc(filename).split('\n'):
        tokens = line.split()
        if not tokens:
            # blank line (empty file, trailing newline): nothing to parse
            continue
        product_id, product_desc = int(tokens[0]), tokens[1:]
        if product_id in wanted_ids:
            desc = 'startseq ' + ' '.join(product_desc) + ' endseq'
            descriptions.setdefault(product_id, list()).append(desc)

    return descriptions



def load_photo_features(filename, list_product_id):
    """
    load photo features

    filename: path of the pickle file holding a dictionary
        product id -> features; the ids may be stored as int or as str
    list_product_id: product ids to keep
    return: dictionary int product id -> features, restricted to the
        requested ids that are present in the file
    """
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # produced by this pipeline.
    with open(filename, 'rb') as file:
        all_features = load(file)

    features = dict()
    for product_id in list_product_id:
        # Read the value with the key under which it is actually stored.
        # An id that exists only as a str key used to pass the membership
        # test and then fail with KeyError on the int lookup.
        if product_id in all_features:
            features[int(product_id)] = all_features[product_id]
        elif str(product_id) in all_features:
            features[int(product_id)] = all_features[str(product_id)]

    return features


def to_lines(descriptions):
    """
    flatten a dictionary of clean descriptions into one list: the items of
    every dictionary value, in dictionary order
    """
    return [text for texts in descriptions.values() for text in texts]


def create_tokenizer(descriptions):
    """
    build a Tokenizer and fit it on all the descriptions of the dictionary
    """
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(to_lines(descriptions))
    return fitted_tokenizer


def max_length(descriptions):
    """
    return the number of words of the longest description
    (words are whitespace-separated)
    """
    word_counts = [len(line.split()) for line in to_lines(descriptions)]
    return max(word_counts)


# ---------
# encode the text
# for example, the input sequence “little girl running in field” would be split into 6 input-output pairs to train the model:
"""
X1,		X2 (text sequence), 						 y (word)
photo	startseq, 									 little
photo	startseq, little,							 girl
photo	startseq, little, girl, 					 running
photo	startseq, little, girl, running, 			 in
photo	startseq, little, girl, running, in, 		 field
photo	startseq, little, girl, running, in, field,  endseq
"""

def create_sequences(tokenizer, max_length, desc_list, photo, vocab_size=None):
    """
    create sequences of images, input sequences and output words for an image

    every description of n encoded words yields n - 1 training samples:
    the first i words (padded to max_length) as input and word i as output

    tokenizer: fitted Tokenizer used to encode the descriptions
    max_length: length the input sequences are padded to
    desc_list: list of description strings of one image
    photo: feature vector of the image, repeated for every sample
    vocab_size: number of classes of the one-hot encoded output word;
        when omitted it is derived from the tokenizer, so the function no
        longer relies on a module-level `vocab_size` variable
    return: three Numpy arrays (photo features, padded input sequences,
        one-hot encoded output words)
    """
    if vocab_size is None:
        # the Keras Tokenizer never assigns index 0 to a word, hence the + 1
        vocab_size = len(tokenizer.word_index) + 1

    X1, X2, y = list(), list(), list()
    for desc in desc_list:
        # encode the sequence
        seq = tokenizer.texts_to_sequences([desc])[0]
        # split one sequence into multiple X,y pairs
        for i in range(1, len(seq)):
            # split into input and output pair
            in_seq, out_seq = seq[:i], seq[i]
            # pad input sequence
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            # encode output sequence
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            # store
            X1.append(photo)
            X2.append(in_seq)
            y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)


def data_generator(descriptions, photos, tokenizer, max_length):
    """
    data generator, intended to be used in a call to model.fit_generator()

    loops for ever over the descriptions and yields, for every product that
    has a photo feature, one batch ([image features, input sequences],
    output words) built by create_sequences

    descriptions: dictionary product id -> list of description strings
    photos: dictionary product id -> photo features (only item [0] is used)
    tokenizer: fitted Tokenizer
    max_length: length the input sequences are padded to
    raise: ValueError when a complete pass produced no batch at all
    """
    while True:
        produced_batch = False
        for key, desc_list in descriptions.items():
            # products without a photo feature are skipped
            if key not in photos:
                continue
            photo = photos[key][0]
            in_img, in_seq, out_word = create_sequences(tokenizer, max_length, desc_list, photo)
            produced_batch = True
            # Keras documents the generator output as a TUPLE
            # (inputs, targets); a list is misread by recent releases.
            yield [in_img, in_seq], out_word
        if not produced_batch:
            # without this guard the loop would spin for ever and the
            # training call would hang silently
            raise ValueError('no description has a matching photo feature')
                

# image features
all_features = load_photo_features(product_image_feature_file_path, list_product_id)
print('All image features: %d' % len(all_features))

# descriptions
all_descriptions = load_clean_descriptions(product_description_file_path, list_product_id)
print('Descriptions: %d' % len(all_descriptions))

# prepare train and test sets
# (the split follows the order of list_product_id: first 90% train, rest test)
percentage_train = 0.9
list_train_product_id = list_product_id[0:int(len(list_product_id)*percentage_train)]
list_test_product_id = list_product_id[len(list_train_product_id):]

# Both loaders return only the products they actually found, so an id of
# list_product_id may be missing from either dictionary. Such products are
# reported and left out instead of stopping the cell with a KeyError.
train_features = dict()
train_descriptions = dict()
for product_id in list_train_product_id:
    if product_id not in all_features or product_id not in all_descriptions:
        print('product {} is skipped: image features or description missing'.format(product_id))
        continue
    train_features[product_id] = all_features[product_id]
    train_descriptions[product_id] = all_descriptions[product_id]

print('Descriptions: train=%d' % len(train_descriptions))
print('Photos: train=%d' % len(train_features))

test_features = dict()
test_descriptions = dict()
for product_id in list_test_product_id:
    if product_id not in all_features or product_id not in all_descriptions:
        print('product {} is skipped: image features or description missing'.format(product_id))
        continue
    test_features[product_id] = all_features[product_id]
    test_descriptions[product_id] = all_descriptions[product_id]

print('Descriptions: test=%d' % len(test_descriptions))
print('Photos: test=%d' % len(test_features))


# prepare tokenizer
tokenizer = create_tokenizer(all_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

# determine the maximum sequence length
# The variable is named like the helper function max_length(), which it
# shadows from here on. The value is therefore computed inline: calling
# max_length(...) here would fail with "'int' object is not callable" as soon
# as this cell is run a second time.
max_length = max(len(d.split()) for d in to_lines(all_descriptions))
print('Description Length: %d' % max_length)


## Step 5: define the model

In [None]:
def define_model(vocab_size, max_length, feature_size=4096, units=256, dropout_rate=0.5):
    """
    define the captioning model

    the image features and the partial description are each reduced to a
    vector of `units` values, added together and decoded into a softmax over
    the vocabulary (the next word)

    vocab_size: size of the embedding input and of the softmax output
    max_length: length of the (padded) input word sequences
    feature_size: length of the image feature vector; the default 4096
        matches the features used so far
    units: width of the dense, embedding and LSTM layers
    dropout_rate: rate of both dropout layers
    return: the compiled Keras model
    """
    # feature extractor model
    inputs1 = Input(shape=(feature_size,))
    fe1 = Dropout(dropout_rate)(inputs1)
    fe2 = Dense(units, activation='relu')(fe1)

    # sequence model
    inputs2 = Input(shape=(max_length,))
    # mask_zero=True: index 0 is the padding value and is ignored by the LSTM
    se1 = Embedding(vocab_size, units, mask_zero=True)(inputs2)
    se2 = Dropout(dropout_rate)(se1)
    se3 = LSTM(units)(se2)

    # decoder model
    decoder1 = add([fe2, se3])
    decoder2 = Dense(units, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)

    # tie it together [image, seq] [word]
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    # summarize model; summary() prints by itself and returns None, so it is
    # not wrapped in print()
    model.summary()
    return model


# define the model
# vocab_size (tokenizer vocabulary size + 1) and max_length (number of words
# of the longest description) are the module-level values computed in step 4.
model = define_model(vocab_size, max_length)

## Step 6: train the model

In [None]:
print('Train model ... ')
# Run the epochs one by one so that a checkpoint is written after each of them.
epochs = 2
# one generator batch per product of the training set
steps = len(train_descriptions)
for i in range(epochs):
    # a fresh data generator for this epoch
    generator = data_generator(train_descriptions, train_features, tokenizer, max_length)
    # train for exactly one epoch
    model.fit_generator(generator, steps_per_epoch=steps, epochs=1, verbose=1)
    # checkpoint the model of this epoch
    model_path = 'model_{}.h5'.format(i)
    model.save(model_path)
    print('save model to {}'.format(model_path))