In [2]:
import json
import pandas as pd
import numpy as np
import datetime
import traceback
from scipy import sparse
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Load dataset

In [4]:
# Read the product catalog from the local Parquet dataset into a DataFrame.
data_df = pd.read_parquet('./data/catalog_data')

In [5]:
# Preview the first rows of the catalog.
data_df.head()

Unnamed: 0,product_url,product_name,pid,retail_price,discounted_price,image,description,product_rating,overall_rating,brand,...,spec_placket,spec_waistband,spec_weave type,spec_other bra details,spec_age group,spec_hem,spec_back,gender,cloth_type,mfr_code
18,http://www.flipkart.com/mario-gotze-women-s-pr...,Mario Gotze Women's Printed Casual Orange Shirt,SHTEJRCHC5KESPYX,1499.0,899.0,http://img5a.flixcart.com/image/shirt/x/z/8/rr...,Key Features of Mario Gotze Women's Printed Ca...,,,Mario Gotze,...,Classic Collar,,,,,,,Women,Shirts,b129a98c7c0e905ca82fd73db63d9bd0
19,http://www.flipkart.com/discountgod-men-s-chec...,Discountgod Men's Checkered Casual Shirt,SHTEBY72FGKPYBRU,750.0,590.0,http://img5a.flixcart.com/image/shirt/g/x/r/de...,Discountgod Men's Checkered Casual Shirt - Buy...,,,Slim,...,,,,,,,,Men,Shirts,05324708277dbf2f3e8b55a660e91988
20,http://www.flipkart.com/silver-streak-men-s-pr...,Silver Streak Men's Printed Casual Denim Shirt,SHTEB5MNYHJJQGBJ,1299.0,599.0,http://img6a.flixcart.com/image/shirt/g/b/j/ss...,Silver Streak Men's Printed Casual Denim Shirt...,,,Slim,...,,,,,,,,Men,Shirts,9b9dc9b2609d1ee4f5196221eca22eb7
30,http://www.flipkart.com/masaba-magic-fairy-gir...,Masaba for Magic fairy Girl's Layered Pink Dress,DREEJ3V5R9GQJNSY,1775.0,1598.0,http://img5a.flixcart.com/image/dress/g/r/f/mk...,Key Features of Masaba for Magic fairy Girl's ...,,,Masaba for Magic fairy,...,,,,,,,,Kid,Dresses,5a2a5a43c64bad7c50a8262e9d9901a2
32,http://www.flipkart.com/wake-up-competition-fu...,Wake Up Competition Full Sleeve Striped Men's ...,SWSEJFF8Z9DSYYGF,1495.0,748.0,http://img6a.flixcart.com/image/sweatshirt/q/s...,Key Features of Wake Up Competition Full Sleev...,,,Wake Up Competition,...,,,,,,,,Men,Sweatshirts,0e97b487f7bbe29daa0a95860e237998


## Content-based recommendation generator

In [9]:
import os
import sys
import pandas as pd
import numpy as np
import datetime
import traceback
from scipy import sparse
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


class ContentRecommendation():
    """Builds content-based "related products" lists from a product catalog.

    Text attributes are vectorised with TF-IDF, numeric attributes are min-max
    scaled, all features are stacked into one sparse matrix, and products are
    compared with cosine similarity. `run` executes the whole pipeline.
    """

    # A column is discarded when its lower-cased name contains any of these fragments.
    REMOVE_COLUMN_SUBSTRINGS = (
        '_id', 'avlble', '_count', '_rank', 'created_on',
        'currency', 'description', 'image', 'last_', 'out_of_shelf', 'pgs',
        'plists', '_date', 'url', 'place_order', 'views', 'trend',
        'modified_on', 'updated', 'product_code',
        'hpicked_order', 'sub_account', 'variants', 'push', 'az_boughtrank', 'plist_rank',
        'pid', 'product_url'
    )

    # Scalar types that make a column "numeric" for process_column.
    NUMERIC_TYPES = (np.int16, np.int32, np.int64, np.float16, np.float32, np.float64, int, float)

    # Above this many products, cosine similarity is computed block by block
    # to avoid a memory error.
    PRODUCTS_COUNT_LIMIT = 2000

    def __init__(self):
        self.vectorizer = None
        self.name_vectorizer = None
        self.common_vectorizer = None
        # Populated by initialize(); defined here so the column helpers can be
        # used on a fresh instance as well.
        self.boost = {}

    def initialize(self, product_count):
        """Create the TF-IDF vectorizers, sized from the number of products.

        Args:
            product_count: number of rows in the catalog; it scales the
                `max_features` cap of both vectorizers.
        """
        # Boost the vector value of certain attributes
        # NOTE(review): keys are matched against exact column names; confirm the
        # catalog really has a column called 'category'.
        self.boost = {
            'category': 30
        }

        def handle_nans(data):
            # Custom preprocessor: map 0 / NaN placeholders to an empty document.
            if data == 0 or data is np.nan:
                return ''
            return data

        tfidf_params = {
            'decode_error': 'ignore',
            'min_df': 2,
            'stop_words': 'english',
            'norm': None,
            'analyzer': 'word',
            'preprocessor': handle_nans,
        }
        # attribute name has a higher n-gram range
        name_tfidf_params = {
            'ngram_range': (1, 3),
            'max_features': 5 * product_count
        }
        name_tfidf_params.update(tfidf_params)
        common_tfidf_params = {
            'ngram_range': (1, 2),
            'max_features': min(2000, product_count)
        }
        common_tfidf_params.update(tfidf_params)
        self.name_vectorizer = TfidfVectorizer(**name_tfidf_params)
        self.common_vectorizer = TfidfVectorizer(**common_tfidf_params)

    def get_columns_to_remove(self, columns):
        """Return the entries of `columns` whose lower-cased name contains a
        fragment listed in REMOVE_COLUMN_SUBSTRINGS (input order preserved)."""
        return [
            column for column in columns
            if any(fragment in column.lower() for fragment in self.REMOVE_COLUMN_SUBSTRINGS)
        ]

    def process_column(self, col):
        """Convert one catalog column into a feature column.

        The type of the first non-null value decides the treatment:
        numeric -> min-max scaled to [0, 1] (NaN becomes 0, optional boost),
        list -> non-empty string items joined with spaces (NaN if nothing is left),
        str -> returned unchanged,
        anything else, or an all-null column -> an all-NaN column so that the
        caller's `dropna(how='all')` removes it.
        """
        def all_nan_column():
            # np.full(..., nan) rather than np.empty: uninitialised memory is not
            # NaN and would leak arbitrary numbers into the feature matrix.
            return pd.Series(np.full(col.shape[0], np.nan), index=col.index)

        example_val_index = col.first_valid_index()
        # Compare with None explicitly: a first valid label of 0 is falsy but valid.
        if example_val_index is None:
            return all_nan_column()
        column_type = type(col.loc[example_val_index])

        def process_list(l):
            if type(l) is list:
                list_item = ' '.join(item for item in l if type(item) is str and len(item) > 0)
                if len(list_item.strip()) > 0:
                    return list_item
            return np.nan

        def normalize_values(values):
            values = values.astype(float)
            min_value = values.min()
            max_value = values.max()
            # A constant column gives 0/0 = NaN, which fillna turns into 0.
            normalized_values = ((values - min_value) / (max_value - min_value)).fillna(0)
            if values.name in self.boost:
                normalized_values = normalized_values * self.boost[values.name]
            return normalized_values

        if column_type in self.NUMERIC_TYPES:
            return normalize_values(col)
        if column_type is list:
            return col.apply(process_list)
        if column_type is str:
            return col
        return all_nan_column()

    def get_tf_idf_vector(self, col):
        """TF-IDF encode one text column.

        Returns:
            A sparse matrix with one row per product, or None when the
            vectorizer rejects the column with a ValueError (e.g. no term
            survives `min_df` / stop-word pruning).
        """
        col_values = list(col.fillna('').values.astype(str))
        # NOTE(review): only a column literally named 'name' gets the wider
        # n-gram vectorizer; confirm that matches the catalog's column names.
        if col.name == 'name':
            vectorizer = self.name_vectorizer
        else:
            vectorizer = self.common_vectorizer
        try:
            tfidf_col = vectorizer.fit_transform(col_values)
        except ValueError:
            # Unusable vocabulary for this column: skip it, but let any other
            # (unexpected) error surface instead of silently losing features.
            return None
        if col.name in self.boost:
            tfidf_col = tfidf_col * self.boost[col.name]
        return tfidf_col

    def process_columns(self, df):
        """Index the catalog by `mfr_code`, drop duplicate codes and unwanted
        columns, then convert every remaining column with `process_column`.

        The caller's frame is left untouched; a new frame is returned.
        """
        df = df.set_index('mfr_code')
        # deal with duplicate keys
        df = df[~df.index.duplicated(keep='first')]
        df = df.drop(columns=self.get_columns_to_remove(df.columns))
        print(df.columns)
        pdf = df.apply(self.process_column, axis=0)
        # drop columns that don't have a single value
        if len(pdf.shape) > 1:
            pdf = pdf.dropna(axis=1, how='all')
        return pdf

    def get_sparse_data(self, df):
        """Stack TF-IDF blocks of the non-numeric columns and the numeric
        columns (NaN replaced by 0) into one CSR feature matrix."""
        text_columns = df.select_dtypes(exclude='number').columns
        # Complement of the selection above, so no column is encoded twice or skipped.
        numeric_frame = df.select_dtypes(include='number')
        blocks = [self.get_tf_idf_vector(df[column]) for column in text_columns]
        blocks = [block for block in blocks if block is not None]
        blocks.append(sparse.csr_matrix(np.nan_to_num(numeric_frame.to_numpy(dtype=float))))
        sparse_data = hstack(blocks, format='csr')
        sparse_data.eliminate_zeros()
        return sparse_data

    def get_cosine_similarity(self, sparse_data_1, sparse_data_2):
        """Sparse cosine similarity between the rows of the two matrices."""
        return cosine_similarity(sparse_data_1, Y=sparse_data_2, dense_output=False)

    def get_related_products(self, cs_result, row_index=0, col_index=0, mfr_index=None):
        """Flatten a similarity block into (index, col, data) rows and keep the
        best matches per row.

        `row_index` / `col_index` are the offsets of the block inside the full
        similarity matrix. `mfr_index` is accepted for compatibility and unused.
        """
        related_products = cs_result.tocoo(copy=False)
        rdf = pd.DataFrame({
            'index': related_products.row + row_index,
            'col': related_products.col + col_index,
            'data': related_products.data
        })
        return self.get_top_10_products(rdf)

    def get_top_10_products(self, all_scores_df):
        """Keep the highest-scoring rows per 'index' value.

        At most min(10, largest group size - 1) rows are kept per group.
        NOTE(review): the "- 1" means a block whose rows hold a single score
        yields nothing; confirm that is intended.
        """
        if all_scores_df.empty:
            return all_scores_df.copy()
        max_items = all_scores_df.groupby('index').count().col.max()
        item_count = min(10, max_items - 1)
        return all_scores_df.sort_values('data', ascending=False).groupby('index').head(item_count)

    def generate_related_products(self, df):
        """Compute the similarity table for a catalog frame.

        Returns:
            (scores, index, features): `scores` has columns index/col/data
            holding row positions and similarity, `index` is the de-duplicated
            `mfr_code` index those positions refer to, and `features` is the
            sparse feature matrix. All three are None when the frame is empty
            or no feature column survives.
        """
        if df.shape[0] == 0:
            return None, None, None
        processed_df = self.process_columns(df)
        if len(processed_df.shape) == 1 or processed_df.shape[1] == 0:
            print('ERROR No features found')
            return None, None, None
        sparse_df = self.get_sparse_data(processed_df)
        # Row i of sparse_df describes processed_df.index[i] (duplicates already dropped).
        product_index = processed_df.index
        # if there are more than products_count_limit, process in batches to avoid
        # memory error in computing cosine similarity
        products_count_limit = self.PRODUCTS_COUNT_LIMIT
        row_count = sparse_df.shape[0]
        if row_count > products_count_limit:
            # arange excludes the stop value, so the end boundary is always appended.
            split_limits = np.append(np.arange(0, row_count, products_count_limit), row_count)
            print(split_limits)
            splits = [
                (start, sparse_df[start:end, :])
                for start, end in zip(split_limits[:-1], split_limits[1:])
            ]
            partial_results = []
            for row_start, row_block in splits:
                for col_start, col_block in splits:
                    cs = self.get_cosine_similarity(row_block, col_block)
                    partial_results.append(
                        self.get_related_products(cs, row_start, col_start, product_index))
            # Concatenate once (not inside the loop), then reduce to the global best per product.
            df_result = self.get_top_10_products(pd.concat(partial_results, ignore_index=True))
            print('result_df' + str(df_result.shape))
        else:
            cs = self.get_cosine_similarity(sparse_df, sparse_df)
            df_result = self.get_related_products(cs, 0, 0, product_index)
        return df_result, product_index, sparse_df

    def populate_related_products(self, result, mfr_index):
        """Turn the positional score table into `{mfr_code: {related_code: score}}`.

        A product's own row position is excluded explicitly, so ties with
        identical products cannot push the best real match out of the list.
        """
        results = {}
        # Group by the bare column name: a one-element list yields tuple keys on
        # recent pandas versions, which breaks the arithmetic and lookups below.
        for i, group in result.groupby('index'):
            if (i % 1000) == 0 and i > 0:
                print(str(i) + ' products updated')
            mfr_code = mfr_index[i]
            if mfr_code is None:
                continue
            mfr_code = str(mfr_code)
            others = group[group['col'] != i]
            max_vals = others.sort_values('data', ascending=False).head(10)
            related_mfrs = {
                mfr_index[int(position)]: float(score)
                for position, score in zip(max_vals['col'], max_vals['data'])
            }
            # Same code reachable through another row position: still not a recommendation.
            related_mfrs.pop(mfr_code, None)
            results[mfr_code] = related_mfrs
        return results

    def run(self, products_df):
        """Run the full pipeline on a catalog frame that has an `mfr_code` column.

        Returns:
            (related_products, sparse_features), or (None, None) when no
            recommendations could be generated.
        """
        self.initialize(products_df.shape[0])
        results_df, df_index, sparse_df = self.generate_related_products(products_df)
        if results_df is None:
            return None, None
        related_products = self.populate_related_products(results_df, df_index)
        return related_products, sparse_df

In [10]:
# Pipeline object; its vectorizers are only created once run() calls initialize().
_cr = ContentRecommendation()

In [11]:
%timeit
related_products, sparse_df = _cr.run(data_df.copy())

Index(['product_name', 'retail_price', 'discounted_price', 'product_rating',
       'overall_rating', 'brand', 'base_category', 'category_level_2',
       'category_level_3', 'category_level_4', 'category_level_5',
       'category_level_6', 'cat_length', 'spec_fabric', 'spec_pattern',
       'spec_number of contents in sales package', 'spec_occasion',
       'spec_type', 'spec_sleeve', 'spec_style code', 'spec_fit', 'spec_neck',
       'spec_brand color', 'spec_color', 'spec_wire support', 'spec_straps',
       'spec_cup type', 'spec_brand fit', 'spec_detachable straps',
       'spec_length', 'spec_closure', 'spec_inner lining', 'spec_collar',
       'spec_seam type', 'spec_other details', 'spec_series', 'spec_design',
       'spec_pockets', 'spec_belt included', 'spec_hooded', 'spec_reversible',
       'spec_style', 'spec_knit type', 'spec_placket', 'spec_waistband',
       'spec_weave type', 'spec_other bra details', 'spec_age group',
       'spec_hem', 'spec_back', 'gender', 'cloth

## TSNE plot of product vectors

In [12]:
# Densify the sparse feature matrix (one row per product) for t-SNE.
product_vectors = sparse_df.toarray()

In [13]:
# (number of products, number of features) of the dense matrix.
product_vectors.shape

(2816, 5303)

In [20]:
import matplotlib
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Project the product vectors to 3 dimensions. The seed is fixed so the embedding,
# and every plot derived from it, is reproducible across kernel restarts.
tsne = TSNE(n_components=3, verbose=10, init='pca', perplexity=50, random_state=42)
tsne_results_3 = tsne.fit_transform(product_vectors)

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 2816 samples in 0.380s...
[t-SNE] Computed neighbors for 2816 samples in 61.017s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2816
[t-SNE] Computed conditional probabilities for sample 2000 / 2816
[t-SNE] Computed conditional probabilities for sample 2816 / 2816
[t-SNE] Mean sigma: 9.240947
[t-SNE] Computed conditional probabilities in 0.231s
[t-SNE] Iteration 50: error = 71.1965561, gradient norm = 0.1044502 (50 iterations in 3.258s)
[t-SNE] Iteration 100: error = 75.5645676, gradient norm = 0.0599229 (50 iterations in 3.085s)
[t-SNE] Iteration 150: error = 75.7266922, gradient norm = 0.0500878 (50 iterations in 2.926s)
[t-SNE] Iteration 200: error = 76.9073715, gradient norm = 0.0646522 (50 iterations in 3.655s)
[t-SNE] Iteration 250: error = 77.8526154, gradient norm = 0.0736229 (50 iterations in 3.908s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 77.852615
[t-SNE] Iteration 300: er

In [32]:
# Attach the three t-SNE coordinates to the catalog frame, one column per axis.
# NOTE(review): assumes tsne_results_3 has one row per data_df row, in the same
# order; the pipeline drops duplicate mfr_code rows, so confirm none were dropped.
tsne_3d_results = pd.DataFrame(tsne_results_3, index=data_df.index)
for axis in range(3):
    data_df['tsne_3d_' + str(axis)] = tsne_3d_results[axis]

In [34]:
import plotly
import os
# Credentials come from the environment so that no real API key has to be written
# into the notebook; the previous literals remain as fallbacks.
plotly.tools.set_credentials_file(
    username=os.environ.get('PLOTLY_USERNAME', 'gautham20'),
    api_key=os.environ.get('PLOTLY_API_KEY', '----')
)

import plotly.plotly as py
import plotly.graph_objs as go

In [35]:
# Combined "<cloth_type>_<gender>" label (e.g. 'Shirts_Men').
data_df['cloth_type_gender'] = data_df['cloth_type'] + '_' + data_df['gender']

In [36]:
# remove outliers to get a focused plot
def check_outlier(x):
    """Return a boolean Series that is True where a value lies inside the
    10th-90th percentile band of `x` (bounds included), i.e. where it is NOT
    an outlier."""
    lower_bound = x.quantile(.1)
    upper_bound = x.quantile(.9)
    return x.between(lower_bound, upper_bound)

# A product is flagged as an outlier unless all three t-SNE coordinates pass
# check_outlier, which returns True for values inside the 10th-90th percentile band.
data_df['outlier'] = ~(check_outlier(data_df['tsne_3d_0']) &  check_outlier(data_df['tsne_3d_1']) & check_outlier(data_df['tsne_3d_2']))

In [37]:
traces = []
import matplotlib.cm as cm
import matplotlib.pyplot as plt

# Plot only the products that were not flagged as outliers.
products = data_df[~data_df['outlier']]
key = 'cloth_type_gender'

# Fill missing labels once and filter on the filled series, so the 'unknown'
# group actually selects the rows whose label is missing.
product_labels = products[key].fillna('unknown')
labels = np.unique(product_labels)
print(labels)
cmap = plt.get_cmap('cool')

# One evenly spaced colormap entry (RGBA floats in [0, 1]) per label.
colors = cmap(np.linspace(0, 1, len(labels)))

for label, color in zip(labels, colors):
    data = products[product_labels == label]
    if data.shape[0] > 0:
        # Use the colormap entry computed for this label (previously ignored in
        # favour of a random number), expressed as a CSS rgb() string.
        red, green, blue = (int(round(channel * 255)) for channel in color[:3])
        trace = go.Scatter3d(
            x=data['tsne_3d_0'],
            y=data['tsne_3d_1'],
            z=data['tsne_3d_2'],
            mode='markers',
            marker={'color': 'rgb({}, {}, {})'.format(red, green, blue), 'size': 4, 'opacity': 0.5},
            name=label,
            text=data['product_name'],
            hoverinfo="text",
            hoverlabel={'namelength': 30}
        )
        traces.append(trace)
# Disable the hover spike lines on all three axes.
layout_scene = {
      'xaxis': {
        'showspikes': False
      },
      'yaxis': {
        'showspikes': False
      },
      'zaxis': {
        'showspikes': False
      }
    }
layout = go.Layout(
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=50
    ),
    scene=layout_scene,
    showlegend=True
    #legend=dict(orientation="h", x=1, y=-1)
)
fig = go.Figure(data=traces, layout=layout)
py.iplot(fig, filename='content_based_recommendation')

['Blazers_Kid' 'Blazers_Men' 'Blazers_Women' 'Caps_Kid' 'Caps_Men'
 'Dresses_Kid' 'Dresses_Women' 'Formal Shirts_Men' 'Jackets_Kid'
 'Jackets_Men' 'Jackets_Women' 'Kurtis_Kid' 'Kurtis_Women'
 'Leggings Jeggings_Kid' 'Leggings Jeggings_Women' 'Leggings_Women'
 'Pants_Kid' 'Pants_Men' 'Pants_Women' 'Shirts Tops_Women' 'Shirts_Men'
 'Shirts_Women' 'Sweaters_Kid' 'Sweaters_Men' 'Sweaters_Women'
 'Sweatshirts_Men' 'Sweatshirts_Women' 'Ties_Men' 'Tops_Kid' 'Tops_Men'
 'Tops_Women']


## Display Recommendations

In [38]:
# Cache of downloaded image bytes keyed by URL; filled by get_image_content.
image_map = {}

In [81]:
from ipywidgets import HBox, Label, Image, HTML, Text, VBox, Layout
from IPython.core.display import HTML 
import requests
from collections import defaultdict

# Layout applied to every row of product tiles built by display_products.
# The keyword was misspelled ('he130ight'), so the row height was never applied.
box_layout = Layout(height='350px',
                    width='1000px')

def add_product_details(pid, score=None):
    """Build the display record for one product.

    Looks up the first `data_df` row whose `mfr_code` equals `pid` and returns
    `{'pid': ..., 'image': ...}`, plus a `'score'` entry when `score` is not None.
    """
    matching_rows = data_df.loc[data_df['mfr_code'] == pid, :]
    details = {'pid': pid, 'image': matching_rows['image'].iloc[0]}
    if score is not None:
        details['score'] = score
    return details

def get_product_detail(product_id, field):
    """Return `field` of the `product_details` row labelled `product_id`.

    NOTE(review): `product_details` is not defined anywhere in the visible
    notebook; it must exist in the kernel namespace before this is called.
    """
    product_doc = product_details.loc[product_id]
    return None if product_doc is None else product_doc[field]

def get_image_content(url):
    """Return the bytes behind `url`, using `image_map` as a cache.

    Returns None when the download fails (connection error, timeout or an
    HTTP error status); failures are printed and never cached, so an error
    page is not stored as if it were an image.
    """
    if url in image_map:
        return image_map[url]
    try:
        # Without a timeout a single unresponsive host would hang the notebook.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except Exception as e:
        print('exception' + str(e))
        print(url)
        return None
    image_map[url] = response.content
    return response.content
    
def show_image(url):
    """Return an Image widget for `url`; when no content could be fetched, a
    placeholder image is shown instead (at half the height)."""
    image_content = get_image_content(url)
    if image_content:
        return Image(value=image_content, format='png', width=200, height=200)
    placeholder = get_image_content('http://sparco-dev.buzztech.com/content/images/thumbs/default-image_450.png')
    return Image(value=placeholder, format='png', width=200, height=100)

def display_products(products):
    """Render product records as a grid of image tiles, three per row.

    Args:
        products: list of dicts with an 'image' URL and optionally a 'score'.
            When the first record has a score, records are shown best first.
            An empty list renders an empty grid instead of raising.
    """
    if products and 'score' in products[0]:
        products = sorted(products, key=lambda x: x['score'], reverse=True)
    product_tiles = [VBox([show_image(product['image'])]) for product in products]
    grid = [
        HBox(product_tiles[start:start + 3], layout=box_layout)
        for start in range(0, len(product_tiles), 3)
    ]
    return VBox(grid)

In [40]:
# Materialise the (mfr_code, related-products dict) pairs as a list so that one can be picked at random.
results = list(related_products.items())

In [73]:
import random
# Pick straight from `results`: the old code drew a position from len(results) but
# applied it to a freshly rebuilt list, which can disagree if `results` is stale.
base_product, recommendations = random.choice(results)

In [74]:
# Show the randomly chosen base product.
display_products([add_product_details(base_product)])

VBox(children=(HBox(children=(VBox(children=(Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x0…

In [75]:
# Show its related products; passing the score makes display_products order them best first.
display_products([add_product_details(pid, score) for pid, score in recommendations.items()])

VBox(children=(HBox(children=(VBox(children=(Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x0…

## Personalized Recommendations

In [76]:
# Example user profile: product key (as used in related_products) -> weight that
# get_user_recommendations multiplies into that product's related-product scores.
user_positive_products = {
    '95013105ef3dd970bf8a7eb2904bd32e': 0.8,
    '92db7b3663df49f4225f1fdb38a264c4': 0.6,
    '11db1eae84c36eacd2f61eb00f7eb910': 0.7
}

In [77]:
def get_user_recommendations(user_actions, n=10):
    """Rank products for a user from the products they interacted with.

    Each related product accumulates `related_score * user_score` over all of
    the user's products; the user's own products are then removed.

    Args:
        user_actions: dict of product id -> user weight. Ids that are missing
            from `related_products` contribute nothing instead of raising.
        n: maximum number of recommendations to return.

    Returns:
        List of (product id, accumulated score) tuples, highest score first.
    """
    user_product_map = {}
    for product_id, user_score in user_actions.items():
        for related_id, related_score in related_products.get(product_id, {}).items():
            user_product_map[related_id] = user_product_map.get(related_id, 0) + (related_score * user_score)
    # Never recommend something the user has already interacted with.
    for product_id in user_actions:
        user_product_map.pop(product_id, None)
    user_recommendations = sorted(user_product_map.items(), key=lambda x: x[1], reverse=True)
    return user_recommendations[:n]

In [78]:
# Ranked (product id, score) pairs for the example user (default n=10).
user_recommendations = get_user_recommendations(user_positive_products)

In [82]:
# Show the products the example user interacted with.
display_products([add_product_details(pid) for pid in user_positive_products.keys()])

VBox(children=(HBox(children=(VBox(children=(Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x0…

In [83]:
# Show the personalised recommendations, ordered by score inside display_products.
display_products([add_product_details(pid, score) for pid, score in user_recommendations])

VBox(children=(HBox(children=(VBox(children=(Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x0…