# Farfetch case study

Product description generation.

The objective of this script is to use deep learning techniques (CNN, LSTM) to generate product descriptions.

- Author: Kai Chen
- Date: Apr, 2018


### Reference
- https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/



In [4]:
import string
import os
import sys
import itertools
import operator
from random import shuffle

import numpy as np
from numpy import argmax

import pandas as pd

from pickle import dump
from pickle import load

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline

from keras.preprocessing.image import load_img, img_to_array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers.merge import add
from keras.callbacks import ModelCheckpoint
from keras import callbacks, applications, optimizers
from keras.models import load_model
from keras.applications.vgg16 import preprocess_input

from nltk.translate.bleu_score import corpus_bleu

np.random.seed(42)

In [5]:
# ---------------------
# Locations of the input CSV files

ATTRIBUTE_CSV_FILE = 'data/attributes.csv'
PRODUCT_CSV_FILE = 'data/products.csv'

## Step 1: read and explore the data

In [6]:
# Load the product table, then preview its first rows and its (rows, columns) shape
df_product = pd.read_csv(PRODUCT_CSV_FILE)

for preview in (df_product.head(5), df_product.shape):
    display(preview)

Unnamed: 0,ProductId,Description,DescriptionDate,SeasonOriginal,ProductName,Brand,Gender,Colour,Family,Category,ArticlePhotoId,CreateDate
0,11295277,VIPE6CE-169953MCC 38NO,2016-01-07 13:13:09.527,SS15,Celine Black Phantom Bag,CELINE COLLARD,WOMEN,BLACK,Bags,Tote Bags,6129459,2016-01-07 13:10:46.507
1,11292059,Grey cotton 'Skip' crewneck from S.N.S. Herni...,2016-01-08 14:45:59.673,SS16,'Skip' crewneck,S.N.S. HERNING,MEN,GREY,Clothing,Sweaters & Knitwear,6156126,2016-01-04 19:52:05.203
2,11290981,Multicolour cotton 'Pak’r Tatenda' backpack fr...,2016-01-11 19:27:45.330,SS16,'Pakr Tatenda' backpack,EASTPAK,UNISEX,YELLOW & ORANGE,Bags,Backpacks,6216609,2016-01-03 15:21:20.480
3,11293179,Ivory white and black hemp-cotton blend 'Honey...,2016-01-13 11:33:11.150,SS16,'Honey' wide brim hat,EUGENIA KIM,WOMEN,WHITE,Accessories,Hats,6199465,2016-01-05 18:08:57.317
4,11293099,Ivory white cotton embroidered lace frilled dr...,2016-01-13 15:22:08.247,SS16,embroidered lace frilled dress,RED VALENTINO,WOMEN,WHITE,Clothing,Dresses,6173011,2016-01-05 16:55:35.427


(12631, 12)

In [7]:
# Distinct product ids found in the csv file, as a numpy array
list_product_id_df = np.asarray(df_product['ProductId'].unique())

print('number of products {} in the csv file'.format(len(list_product_id_df)))

number of products 12631 in the csv file


In [8]:
# Create a dictionary with key: photo id -> value: product id
# Note one photo belongs only to one product; if a photo id appears on several
# rows, the product id of the first such row is used.

list_photo_id = df_product['ArticlePhotoId'].unique()

# Keep the first row of every photo id in a single pass instead of
# re-filtering the whole DataFrame once per photo id (quadratic in the number of rows).
first_row_per_photo = df_product.drop_duplicates(subset='ArticlePhotoId', keep='first')

dict_photo_product_id = dict(zip(first_row_per_photo['ArticlePhotoId'].values,
                                 first_row_per_photo['ProductId'].values))

In [11]:
# Update the list_product_id, such that each product should have an image.
# Image file names are expected to start with the product id followed by '_'.

list_product_id = []

img_dir_path = "data/images/"

dirs = os.listdir(img_dir_path)

# Build the lookup set once: testing membership against the numpy array
# would scan every known product id for every file.
set_product_id_df = set(list_product_id_df.tolist())

for file_name in dirs:
    file_path = os.path.join(img_dir_path, file_name)

    try:
        product_id = int(file_name.split('_')[0])
    except ValueError:
        # Stray files (e.g. hidden system files) have no numeric prefix; skip them
        # instead of aborting the whole scan.
        print('skip file {}: its name does not start with a product id'.format(file_path))
        continue

    if product_id not in set_product_id_df:
        print('photo {} does not have product information'.format(file_path))
    else:
        list_product_id.append(product_id)

print('number of products: {}'.format(len(list_product_id)))

number of products: 12436


## Step 2: image data preparation

In [None]:
print('preparing the image data ...')

# extract VGG16 features
def extract_features(dict_product_img):
    """
    Extract a VGG16 feature array for every product image.

    - dict_product_img: dictionary with key: product id -> value: image batch
      that can be passed directly to `model.predict` (the caller is expected
      to have resized and preprocessed it for VGG16).
    - Returns a dictionary with key: product id -> value: output of the layer
      just before the final classification layer of VGG16 for that image.
    """
    model = applications.VGG16()
    # Re-wire the model to stop at the second-to-last layer, i.e. drop the
    # final classification layer. Indexing with [-2] is used instead of
    # `model.layers.pop()` because on Keras versions where `layers` returns a
    # fresh list the pop is a silent no-op and the class probabilities would
    # be extracted instead of the features.
    model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
    # summary() prints by itself and returns None, so it is not wrapped in print()
    model.summary()

    features = dict()
    for product_id, img in dict_product_img.items():
        features[product_id] = model.predict(img, verbose=0)

    return features

# Create a dictionary with
# key: product id, value: preprocessed image batch of shape (1, img_height, img_width, 3)
# NOTE(review): every image is kept in memory before feature extraction;
# this may be heavy for large image folders.
dict_product_img = dict()

img_width, img_height = 224, 224

img_dir_path = "data/images/"
dirs = os.listdir(img_dir_path)

product_image_feature_file_path = 'product-vgg-features.pkl'

# Set lookup instead of scanning the list of product ids for every file
set_product_id = set(list_product_id)

for file_name in dirs:
    file_path = os.path.join(img_dir_path, file_name)

    try:
        product_id = int(file_name.split('_')[0])
    except ValueError:
        # Stray files (e.g. hidden system files) have no numeric prefix; skip them
        print('skip file {}: its name does not start with a product id'.format(file_path))
        continue

    # Check the product first so that unused images are never loaded
    if product_id not in set_product_id:
        print('photo {} does not have product information'.format(file_path))
        continue

    # Keras expects target_size as (height, width)
    img = load_img(file_path, target_size=(img_height, img_width))   # this is a PIL image
    x = img_to_array(img)                                            # Numpy array, shape (img_height, img_width, 3) here
    x = np.expand_dims(x, axis=0)                                    # add the batch axis: (1, img_height, img_width, 3)
    # prepare the image for the VGG model
    x = preprocess_input(x)

    # If a product has several photos, the last one listed wins
    dict_product_img[product_id] = x

for product_id in list_product_id_df:
    if product_id not in dict_product_img:
        print('product {} does not have an image'.format(product_id))

# extract VGG16 features
dict_product_img_features = extract_features(dict_product_img)

# save the features to file; the context manager closes the file even if dump fails
with open(product_image_feature_file_path, 'wb') as feature_file:
    dump(dict_product_img_features, feature_file)

print('save product image features to {}'.format(product_image_feature_file_path))


## Step 3: prepare text data

In [None]:
def clean_descriptions(descriptions):
    """
    Clean the product descriptions in place.

    https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/
    - Convert all words to lowercase.
    - Remove all punctuation (the ASCII characters of `string.punctuation`).
    - Remove all words that are one character or less in length (e.g. 'a').
    - Remove all words with numbers in them (any word that is not purely alphabetic).

    descriptions: dictionary with key: product id -> value: description text.
    The dictionary is modified in place and nothing is returned.

    When a cleaned description is empty, a message is printed and the entry
    is left unchanged. A value that is not a string (e.g. a missing
    description read as NaN) is handled the same way instead of raising.
    """
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for key, desc in descriptions.items():
        words = []
        if isinstance(desc, str):
            for token in desc.split():
                # lower-case, then strip punctuation from the token
                word = token.lower().translate(table)
                # drop one-character leftovers (e.g. 'a', or the 's' left after
                # removing an apostrophe) and any word containing non-letters
                if len(word) > 1 and word.isalpha():
                    words.append(word)
        # store as string
        clean_str = ' '.join(words)
        if not clean_str:
            print('cleaned description of product {} is empty'.format(key))
        else:
            # re-assigning an existing key is safe while iterating over items()
            descriptions[key] = clean_str


product_description_file_path = 'product-descriptions.txt'