In [1]:
import os
import sys
import wget
import json
import gzip
import re
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from itertools import chain
from google.cloud import vision
from google.cloud.vision import types
from google.protobuf.json_format import MessageToDict
from datetime import datetime, timedelta

In [None]:
# Data source: http://deepyeti.ucsd.edu/jianmo/amazon/index.html
product_url = 'http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles/meta_Clothing_Shoes_and_Jewelry.json.gz'

# Download only when the archive is not on disk yet. Re-running this cell would
# otherwise fetch the whole file again, and the copy may end up under a different
# name than the fixed one the loading step opens.
filename = os.path.basename(product_url)
if not os.path.exists(filename):
    filename = wget.download(product_url, out=filename)

In [None]:
# Load metadata: each line of the gzipped file is parsed as one JSON document
with gzip.open('meta_Clothing_Shoes_and_Jewelry.json.gz') as f:
    data = [json.loads(raw_line.strip().decode("utf-8")) for raw_line in f]

# Build the dataframe from the parsed records
df = pd.DataFrame.from_dict(data)

# Drop rows that share a title, keeping the last occurrence
df = df.drop_duplicates(subset='title', keep='last')
df.shape

In [None]:
# Specific filtering for training data

# Some rows have code embedded in the title column, which makes the title extremely
# long. Keep only rows whose title is shorter than 50000 characters. Rows without a
# title are dropped as well, because a NaN length never passes the comparison.
df = df[df['title'].str.len() < 50000]

# List of columns that must have a value for the product to be included
key_cols = ['asin', 'title', 'brand', 'description', 'image', 'feature', 'category', 'similar_item']

# Keep only products that have a non-NaN value in every key column
df = df[df[key_cols].notna().all(axis=1)]

# Select columns of interest
df = df[key_cols + ['also_buy', 'also_view']]

# Keep products with 'women' or ' men' in the title. The match runs on the lower-cased
# raw title: the 'adj_title' column is only created by a later cell and is not among
# the columns selected above, so filtering on it here raised a KeyError.
df = df[df['title'].str.lower().str.contains('women| men', na=False)].copy()

In [None]:
# Functions to clean title
def remove_num_and_empty_strings(lst):
    """Return the strings of ``lst`` that are non-empty and contain no digit."""
    return [s for s in lst if s and not any(map(str.isdigit, s))]


def clean_title(row):
    """Build a cleaned, de-duplicated title string for one product row.

    The title is lower-cased, stripped of every character that is not a letter,
    digit, whitespace or colon, and split on single spaces. Tokens that are empty
    or contain a digit are dropped, as are tokens equal to one of the lower-cased,
    space-separated words of the brand.

    Args:
        row: mapping-like object (e.g. a dataframe row) with string values under
            the keys 'title' and 'brand'.

    Returns:
        The remaining unique tokens joined by single spaces, in the order in which
        they first appear in the title.
    """
    title, brand = row['title'], row['brand']
    adj_title = re.sub(r'[^a-zA-Z\d\s:]', '', title.lower())
    title_tokens = remove_num_and_empty_strings(adj_title.split(' '))
    brand_tokens = set(brand.lower().split(' '))
    # dict.fromkeys removes duplicates but keeps first-seen order. A plain set
    # difference yields an arbitrary order that can change between Python sessions,
    # which made the cleaned titles non-reproducible.
    adj_title_tokens = [tok for tok in dict.fromkeys(title_tokens) if tok not in brand_tokens]
    return ' '.join(adj_title_tokens)

# Clean title
df['adj_title'] = df.apply(clean_title, axis=1)

# Clean brand: lower-case it and remove all whitespace. The pattern is written as a
# raw string so the regex engine receives `\s` without relying on Python's handling
# of unrecognised string escapes, which emits a warning.
df['adj_brand'] = df['brand'].apply(lambda x: re.sub(r'\s', '', x.lower()))

In [None]:
# Clean category
def filter_category(ctgy):
    """Reduce nested category lists to lower-cased labels without spaces.

    ``ctgy`` is an iterable of lists of category strings. Each string is split on
    single spaces and kept only when it is a single word or has a standalone ``&``
    among its pieces. The pieces of a kept string are glued back together without
    spaces and lower-cased.
    """
    kept = []
    for entry in chain.from_iterable(list(ctgy)):
        pieces = entry.split(' ')
        if '&' in pieces or len(pieces) == 1:
            kept.append(''.join(pieces).lower())
    return kept
    
# Run the category filter row by row over a frame holding only the 'category' column
df = df.assign(category_filt=df[['category']].apply(filter_category, axis=1))

In [None]:
# Collect the 'asin' of every similar item, skipping entries whose 'asin' is empty
df['similar_item_ids'] = df['similar_item'].apply(
    lambda items: [entry['asin'] for entry in items if entry['asin'] != '']
)

In [None]:
# Get labels for images

def detect_labels(uri, client=None):
    """Detects labels in the file located in Google Cloud Storage or on the Web.

    Args:
        uri: location of the image; it is assigned to ``image.source.image_uri``.
        client: optional ``vision.ImageAnnotatorClient`` to reuse across calls.
            A new client is created when it is omitted.

    Returns:
        A list with the description of every label annotation, lower-cased and
        with all whitespace removed. The list is empty when the response holds
        no label annotations.

    Raises:
        Exception: when the response carries an error message.
    """
    # Instantiates a client unless the caller supplied one to reuse
    if client is None:
        client = vision.ImageAnnotatorClient()

    image = vision.types.Image()
    image.source.image_uri = uri

    # Performs label detection on the image file
    response = client.label_detection(image=image)

    # Report API errors before doing any work on the response
    if response.error.message:
        raise Exception(
            '{}\nFor more info on error messages, check: '
            'https://cloud.google.com/apis/design/errors'.format(
                response.error.message))

    # Convert the response to dictionary
    resp_dict = MessageToDict(response)

    # Extract annotations. MessageToDict leaves out empty repeated fields by
    # default, so the key can be absent when no labels were returned.
    annotations = resp_dict.get('labelAnnotations', [])

    # Extract descriptions
    labels = [re.sub(r'\s', '', annotation['description'].lower()) for annotation in annotations]

    return labels

# Flat list of every image uri, used for progress reporting. A cell of the 'image'
# column that is a single uri string (a case label_images handles explicitly) is
# kept whole instead of being broken up into its characters.
images = [
    uri
    for entry in df['image'].tolist()
    for uri in ([entry] if isinstance(entry, str) else entry)
]
image_count = len(images)

def clean_uri_and_retrieve_labels(uri):
    """Print a progress estimate for ``uri`` and return its image labels.

    Progress is based on the position of ``uri`` in the module-level ``images``
    list and on the rate of 8 responses per minute quoted in the message. Before
    the lookup, the span from the first underscore of the uri up to and including
    the last underscore or dot after it is removed.

    Args:
        uri: image uri; it must be present in ``images``.

    Returns:
        The list returned by ``detect_labels`` for the cleaned uri, or ``[""]``
        when the lookup raises an error.
    """
    index = images.index(uri)
    remaining = image_count - index
    percent_complete = round(index / image_count * 100, 2)
    hours_left = remaining / 8 / 60
    delta = timedelta(hours=hours_left)
    estimated_completion = datetime.now() + delta
    print('\nWorking on image', index, 'of', image_count, '.', 
          percent_complete, '% complete. At 8 responses',
          'per minute, completion in', hours_left, 'hours.',
          'Estimated completion: ', estimated_completion)
    clean_uri = re.sub('[_].+[_.]', '', uri)
    print("Cleaned uri is: ", clean_uri)
    try:
        labels = detect_labels(clean_uri)
        print("Successfully processed uri:", uri)
        return labels
    except Exception as err:
        # Catch Exception rather than everything: a bare except also swallows
        # KeyboardInterrupt, which made this long-running loop impossible to stop.
        print("****WARNING: Could not process uri: ", uri, "-", repr(err))
        return [""]
        
def label_images(uris):
    """Return the image labels for one uri or for a collection of uris.

    A single string is labelled directly and its labels are returned as they
    come. For any other iterable every uri is labelled in turn and the labels
    are merged into a list without duplicates.
    """
    if isinstance(uris, str):
        return clean_uri_and_retrieve_labels(uris)

    per_image = list(map(clean_uri_and_retrieve_labels, uris))
    merged = []
    for image_labels in per_image:
        merged.extend(image_labels)
    return list(set(merged))

In [None]:
# Retrieve image labels. This is a lengthy process (~8 images per minute) and requires
# access to the Google Cloud Vision API:
# https://cloud.google.com/vision/docs/quickstart-client-libraries
# Bulk processing is also available, but it is not used here.
df['image_labels'] = df['image'].map(label_images)

In [None]:
# Check result: cleaned title and image labels of the first row
print(df.iloc[0][['adj_title', 'image_labels']])

In [None]:
# Combine adj_title, adj_brand, category_filt, and image_labels into 'document' (sentence)

# Flattening of arbitrarily nested lists
def flatten(S):
    """Return a flat list with the non-list items of ``S`` in their original order.

    Only ``list`` instances are expanded; strings, tuples and other values are
    kept as single items. The walk is iterative, so lists with thousands of
    elements do not run into Python's recursion limit.
    """
    flat = []
    # Stack of iterators: the top one is the list currently being walked
    pending = [iter(S)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, list):
                # Descend into the nested list, resume the outer one afterwards
                pending.append(iter(item))
                break
            flat.append(item)
        else:
            # Current list is exhausted
            pending.pop()
    return flat

# Remove duplicate words and join into single string
def concat_descrip(S):
    """Join the unique words of all (possibly nested) strings in ``S``.

    Every string is split on single spaces and empty pieces are skipped. Each
    word is kept once, at the position of its first appearance, so the result
    is the same on every run.

    Args:
        S: iterable of strings and/or nested lists of strings.

    Returns:
        The unique words separated by single spaces.
    """
    lst = flatten(list(S))
    words = [word for text in lst for word in text.split(' ') if word]
    # dict.fromkeys de-duplicates while keeping order; a set would not
    return ' '.join(dict.fromkeys(words))

# Build the 'document' string row by row from the four text columns
df = df.assign(
    document=df[['adj_title', 'adj_brand', 'category_filt', 'image_labels']].apply(concat_descrip, axis=1)
)

In [None]:
# Reset index and save. reset_index returns a new frame instead of changing df,
# so the result is assigned back; otherwise the old index would be saved.
df = df.reset_index(drop=True)

# Check first row
print(df.iloc[0])

# Save data
df.to_pickle("df_with_image_labels.pkl")