In [None]:
import gzip
import json
import pandas as pd
from pprint import pprint
from tqdm import tqdm
import numpy as np

import gensim
from gensim.parsing.preprocessing import remove_stopwords
from gensim.utils import simple_preprocess

import pickle
from numpy.linalg import norm

In [None]:
# Ratings restricted to products with at least 15 reviews. Use the processed CSV dumped earlier rather than reading the whole data again.
df = pd.read_csv('office.csv', index_col=0)

In [None]:
# (number of rating rows, number of distinct users, number of distinct products)
df.shape[0], df.user_id.nunique(), df.asin.nunique()

Over 90K users, 10k products with 0.58M ratings. We will filter our metadata to these 10k asins only.

In [None]:
# Distinct product ids (asins) present in the ratings frame, kept as a set for fast membership tests.
asins_of_interest = set(df.asin.unique())

I will use the product description text as a basis for establishing product profiles. No user/rating data is used at this stage.

In [None]:
# Build {asin: description text} for the products of interest that have a non-empty description.
# n_empty counts the products of interest whose description is missing or blank.
with gzip.open(r"F:\work\is590ml_final\data\meta_Office_Products.json.gz", 'rt', encoding='utf-8') as f:
    corpus = {}
    n_empty = 0
    for line in f:
        prod = json.loads(line)
        asin = prod.get('asin')
        # Filter on the asin first: products outside the ratings data should neither
        # be processed nor be counted as "empty".
        if asin not in asins_of_interest:
            continue
        # The description is joined as a sequence of text fragments
        # (presumably a list of strings in the metadata; TODO confirm).
        desc = ' '.join(prod.get('description', '')).strip()
        if desc:
            corpus[asin] = desc
        else:
            n_empty += 1


The above cell loads all descriptions into corpus dictionary keyed by the asin. We note that some products do not have a description. For the rest of the analysis these products are ignored for recommendations.

In [None]:
# Fraction of the products of interest that have a usable description.
len(corpus)/len(asins_of_interest)

Only 9% of products do not have a description

In [None]:
def iter_file():
    """Lazily yield one parsed JSON record per line of the gzipped metadata file."""
    with gzip.open(r"F:\work\is590ml_final\data\meta_Office_Products.json.gz", 'rt', encoding='utf-8') as handle:
        yield from map(json.loads, handle)

In [None]:
# Creates the generator only; nothing is read until it is iterated.
# NOTE(review): `a` is not used by any later cell shown here; looks like exploration leftover.
a = iter_file()

In [None]:
# Second pass over the metadata: collect the title and the "also_buy" list for the products
# of interest that have a non-empty description.
# n_empty counts the products of interest whose description is missing or blank.
with gzip.open(r"F:\work\is590ml_final\data\meta_Office_Products.json.gz", 'rt', encoding='utf-8') as f:
    titles = {}
    also_buy = {}
    n_empty = 0
    for line in f:
        prod = json.loads(line)
        asin = prod.get('asin')
        # Filter on the asin first so products outside the ratings data are skipped
        # cheaply and never counted as "empty".
        if asin not in asins_of_interest:
            continue
        desc = ' '.join(prod.get('description', '')).strip()
        if desc:
            titles[asin] = prod['title']
            # Products without an 'also_buy' field get an empty list.
            also_buy[asin] = prod.get('also_buy', [])
        else:
            n_empty += 1

In [None]:
# with open('review_corpus.pickle', 'wb') as f:
#     pickle.dump(corpus, f)

In [None]:
def read_corpus(corpus):
    """Yield one TaggedDocument per product: the tokenised description, tagged with its asin.

    `corpus` is a mapping of asin -> description text.
    """
    tokenize = gensim.utils.simple_preprocess
    for asin in corpus:
        yield gensim.models.doc2vec.TaggedDocument(tokenize(corpus[asin]), [asin])

In [None]:
# Materialise the generator into a list: the documents are iterated several times below
# (vocabulary build, training, sanity check, profile inference).
train_corpus = list(read_corpus(corpus))

In [None]:
# Peek at the first two tagged documents.
train_corpus[:2]

We want to create an embedding for each product by treating each product description as a document. The traditional way of doing this is TF-IDF or LSI. However, since we are dealing with products, I have attempted to use Doc2Vec here (an offshoot of Word2Vec). The advantage of Doc2Vec is that we get an embedding of the whole document at once (unlike Word2Vec, which embeds individual words), with the nice property that documents about the same topics have embeddings that are close to each other (just as related words do in Word2Vec). This way, the profiles of similar products will be close to each other. As a first pass, I choose 75 dimensions for the embedding and ignore words which do not appear at least twice in the corpus.

Since Doc2Vec is based on Word2Vec, it is actually important that stopwords are not removed.

In [None]:
# 75-dimensional document vectors, min_count=2 for the vocabulary, 40 training epochs.
# NOTE(review): no seed is passed, so the learned vectors will presumably differ between runs; confirm whether reproducibility matters here.
model = gensim.models.doc2vec.Doc2Vec(vector_size=75, min_count=2, epochs=40)

In [None]:
# Build the model vocabulary from the tagged documents before training.
model.build_vocab(train_corpus)

Training the doc2vec model. This should not take long if BLAS is installed. We have around 9K documents with around 50 words each.

In [None]:
# Train on the full corpus, reusing the document count and epoch setting stored on the model.
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

Sanity check. I now have a model that maps a product to an embedding. The embedding inferred for a document (product) should be closest to that same document rather than to other documents (products). However, given the model-building mechanism of doc2vec, this might not always be the case. As a sanity check, I measure how often it is true.

In [None]:
# Sanity check: re-infer a vector for every training document and record
#   ranks       - position of the document's own asin in the full similarity ranking (0 = best)
#   first_ranks - [own asin, best-matching asin, similarity of that match]
ranks = []
first_ranks = []
n_docs = len(model.docvecs)
for doc in tqdm(train_corpus):
    own_asin = doc.tags[0]
    inferred_vector = model.infer_vector(doc.words)
    sims = model.docvecs.most_similar([inferred_vector], topn=n_docs)
    ranked_asins = [asin for asin, _ in sims]
    ranks.append(ranked_asins.index(own_asin))
    best_asin, best_sim = sims[0]
    first_ranks.append([own_asin, best_asin, best_sim])

In [None]:
# Each entry is [asin, best-matching asin, similarity of that match].
first_ranks[:10]

In [None]:
# Fraction of documents whose top-ranked match is NOT their own asin,
# i.e. the miss rate; the success rate is 1 minus this value.
sum(1 for i in first_ranks if i[0] != i[1])/len(first_ranks)

Our embedding model is 85% successful in distinguishing documents. Frankly, this is way better than I expected given how sparse the description is for many products. Also, I might need to do some CV to figure out the ideal embedding dimension and number of training epochs. For now, we collect the product profiles, i.e. their 75-dimensional embeddings.

In [None]:
# Product profile = vector inferred from the product's tokenised description, keyed by asin.
product_profiles = {doc.tags[0]: model.infer_vector(doc.words) for doc in train_corpus}

We have the product profiles. Now we need to represent individual user preferences or the user profile. Since we do not have any background data on the user, we will model the user based on the ratings given.

A rating greater than 3 for a product implies that the user liked the product, so the user profile is oriented towards the product.
A rating less than 3 implies that the user dislikes the product, so the user profile is oriented away from the product.
A rating of 3 expresses no particular preference and does not influence the user profile.

With these assumptions, I can model the user preferences in the same vector space as the product embedding. User preferences are the weighted sum of their purchased product profiles with centered ratings as the weights. No normalization is done as cosine similarity is going to be used to align products with user preferences.


In [None]:
# center the ratings to use as weight
df.loc[:, 'rating_weight'] = df.rating - 3

In [None]:
# Loop through each (user, asin, rating) row and accumulate the user profile as the
# rating-weighted sum of product profiles.
# If a product does not have a description, it does not get a product profile and does not
# contribute to user profiles (a zero vector is added instead).

# The zero vector must have the same dimensionality as the Doc2Vec embeddings; taking it
# from the model avoids a shape mismatch when the model's vector_size is changed.
zero_profile = np.zeros(model.vector_size)

user_profiles = {}
for row in df.itertuples():
    running = user_profiles.get(row.user_id, zero_profile)
    contribution = product_profiles.get(row.asin, zero_profile) * row.rating_weight
    # `running + contribution` allocates a new array, so zero_profile itself is never mutated.
    user_profiles[row.user_id] = running + contribution

In [None]:
# Spot-check one user's accumulated profile vector.
user_profiles['A398INYG0ZBUZB']

# manual testing

In [None]:
# user under test; the purchase listing in the next cell filters the ratings by this id
user_id = 'A1NK4TLIMODCTN'


In [None]:
# Print the ratings given by the user under test, for products whose title is known.
# Products without an entry in `titles` are skipped with an explicit membership check
# rather than a bare `except`, so unrelated errors are no longer silently swallowed.
for row in df.loc[df.user_id == user_id].itertuples():
    if row.asin in titles:
        print(row.asin, titles[row.asin], row.rating)

In [None]:
# Print up to 25 recommendations from our model for the user under test.

# User profile: looked up through `user_id` so this cell always describes the same user
# as the purchase listing above.
u = user_profiles[user_id]

for asin, similarity in model.docvecs.most_similar([u], topn=25):
    title = titles[asin]

    # Some titles are not clean text but raw html. To avoid cluttering the output,
    # suppress them using a simple length check.
    if len(title) > 250:
        continue

    print(asin, title, similarity)

This has been more successful than I expected it to be. The user preferred a Brother wireless printer, and our recommender has successfully pointed out related printers (even trying to upsell higher-end models). More impressively, it has recommended toner as well. Similarly, I see a lot of stationery recommendations based on the user's purchases. Especially impressive is the Noodler's ink recommendation, since the user has only bought one fountain pen.

This is impressive for a content based recommender, because all the product semantics were derived from the description only. One can certainly see how this avoids the cold start problem. If the description is detailed enough, this recommender can certainly pick it up. 

# Metrics

In [None]:
# Keep only users with more than 4 reviews (i.e. at least 5).
df2 = df.groupby('user_id').filter(lambda x: len(x) > 4)

We withhold 20% of each user's 5-star ratings as test data.

In [None]:
# Hold out a random 20% sample (fixed random_state) of each user's ratings above 4.
five_star = df2.query('rating > 4')
test_set_5star = five_star.groupby('user_id').apply(
    lambda grp: grp.sample(frac=0.2, random_state=0)
)

In [None]:
# Drop index level 0 in place, keeping the remaining index level.
# NOTE(review): level 0 is presumably the user_id group key added by groupby().apply(), which would leave the original row labels as the index; confirm.
# NOTE(review): because this mutates in place, re-running this cell on its own would reset the index a second time; re-run the sampling cell first.
test_set_5star.reset_index(level=0, drop=True, inplace=True)

In [None]:
# Training set = every row of df2 whose index label was not held out in test_set_5star.
# (The name `test_set` used here previously is not defined anywhere in this notebook.)
# NOTE(review): assumes the index labels of df2 are unique; confirm.
train_set_5star = df2.loc[df2.index.difference(test_set_5star.index)].copy()

In [None]:
# Centered rating used as the weight when accumulating user profiles (rating minus 3).
train_set_5star.loc[:, 'rating_weight'] = train_set_5star.rating - 3

In [None]:
# Loop through each (user, asin, rating) row of the training set and accumulate the user
# profile as the rating-weighted sum of product profiles.
# If a product does not have a description, it does not get a product profile and does not
# contribute to user profiles (a zero vector is added instead).

# Take the dimensionality from the model instead of repeating the literal 75, so the
# profiles stay compatible if the Doc2Vec vector_size is changed.
zero_profile_train = np.zeros(model.vector_size)

user_profiles_train = {}
for row in train_set_5star.itertuples():
    running = user_profiles_train.get(row.user_id, zero_profile_train)
    contribution = product_profiles.get(row.asin, zero_profile_train) * row.rating_weight
    # The sum allocates a new array, so the shared zero vector is never mutated.
    user_profiles_train[row.user_id] = running + contribution

In [None]:
# Number of users that have a training profile.
len(user_profiles_train)

Let us start by giving the top 20 recommendations for each user based on their training user profile which withholds 20% of their 5 star reviews.

In [None]:
# Distinct products each user rated in the training data, used to exclude them from the
# recommendations. Built from `train_set_5star`, the frame the training user profiles were
# computed from (the name `train_set` is not defined anywhere in this notebook).
user_purchases = train_set_5star.groupby('user_id').asin.unique()

In [None]:
# Top-20 recommendations per user, excluding products already in the user's training data.
# 220 candidates are requested so that 20 usually remain after filtering; a user with more
# than 200 of their own products among the candidates gets fewer than 20.
reco = {}
for u, up in tqdm(user_profiles_train.items()):
    # A set gives constant-time membership tests for the filter below.
    purchases = set(user_purchases.loc[u])
    candidates = model.docvecs.most_similar([up], topn=220)
    unseen = [cand for cand in candidates if cand[0] not in purchases]
    reco[u] = unseen[:20]
                   
    
    

In [None]:
def apk(actual, predicted, k=20):
    """
    Average precision at k between a set of relevant items and a ranked prediction list.

    Only the first `k` predictions are considered. Every prediction that is relevant
    and has not already appeared earlier in the list counts as a hit, and contributes
    the precision at its (1-based) rank. The sum is normalised by min(len(actual), k).

    Parameters
    ----------
    actual : list
             Relevant elements that should be predicted (order doesn't matter)
    predicted : list
                Predicted elements, best first (order does matter)
    k : int, optional
        The maximum number of predicted elements to score

    Returns
    -------
    score : double
            The average precision at k; 0.0 when `actual` is empty
    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0
    for rank, item in enumerate(top_k, start=1):
        # A repeated prediction must not be rewarded twice, hence the look-back
        # over the items ranked before this one.
        if item in actual and item not in top_k[:rank - 1]:
            hits += 1.0
            precision_sum += hits / rank

    if not actual:
        return 0.0

    return precision_sum / min(len(actual), k)


In [None]:
# Average precision at 20 for every user that has held-out rows in the test set.
ap_scores = {}
test_users = test_set_5star.user_id.unique()
for u in tqdm(test_users):
    held_out = test_set_5star.loc[test_set_5star.user_id == u, 'asin'].tolist()
    ranked = [asin for asin, _ in reco[u]]
    ap_scores[u] = apk(held_out, ranked, k=20)

In [None]:
# Mean of the per-user average precision at 20.
sum(ap_scores.values())/len(ap_scores)

# predict ratings for test set

In [None]:
def predicted_rating(user, product):
    """Predict a user's rating for a product from their most similar rated product.

    The prediction is the rating the user gave, in the training set, to the product
    whose profile is most similar to the target product's profile.

    Parameters
    ----------
    user : user id to predict for
    product : asin of the product to predict a rating for

    Returns
    -------
    The training rating of the user's most similar rated product.

    Raises
    ------
    KeyError
        If `product` has no profile, or if none of the user's training products has
        a profile (nothing to compare against).
    """
    target_profile = product_profiles[product]

    # `train_set_5star` is the training frame defined in this notebook; the name
    # `train_set` used here previously is never defined.
    history = train_set_5star.loc[train_set_5star.user_id == user]

    # Keep only the rated products that have a description-based profile.
    rated = [(product_profiles[row.asin], row.rating)
             for row in history.itertuples()
             if row.asin in product_profiles]

    if not rated:
        # Raise the exception type the prediction loop already skips, instead of
        # failing later with a TypeError on `ratings[None]`.
        raise KeyError(f'user {user!r} has no training ratings on products with a profile')

    profiles = [profile for profile, _ in rated]
    ratings = [rating for _, rating in rated]
    return ratings[most_similar_to(target_profile, profiles)]
            
    
    
    

In [None]:
def most_similar_to(vec, vec_list):
    """Return the index of the vector in `vec_list` with the highest cosine similarity to `vec`.

    Parameters
    ----------
    vec : 1-D array-like query vector
    vec_list : sequence of 1-D array-likes with the same length as `vec`

    Returns
    -------
    int or None
        Index of the most similar vector (the first one on ties), or None when
        `vec_list` is empty.

    Notes
    -----
    A pair involving a zero-norm vector is given similarity 0. Every candidate is
    eligible, including one whose similarity is exactly -1.
    """
    if len(vec_list) == 0:
        return None

    candidates = np.asarray(vec_list, dtype=float)
    query = np.asarray(vec, dtype=float)

    dots = candidates @ query
    denom = norm(candidates, axis=1) * norm(query)
    # Divide only where the denominator is non-zero; the remaining entries keep the 0 from `out`.
    sims = np.divide(dots, denom, out=np.zeros_like(dots), where=denom > 0)

    # argmax returns the first maximum, matching a strict ">" scan from the left.
    return int(np.argmax(sims))

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Preview the held-out rows (`test_set` is not defined in this notebook; the held-out frame is `test_set_5star`).
test_set_5star.head()

In [None]:
# Predict a rating for every held-out row, keyed by the row's index label.
# predicted_rating raises KeyError when the product has no profile; such rows are skipped.
# Iterates `test_set_5star`, the held-out frame defined in this notebook (`test_set` is never defined).
pred_ratings = {}
for row in tqdm(test_set_5star.itertuples(), total=len(test_set_5star)):
    try:
        pred_ratings[row.Index] = predicted_rating(row.user_id, row.asin)
    except KeyError:
        continue

In [None]:
# Convert the {row label: predicted rating} dict into a Series indexed by row label.
pred_ratings = pd.Series(pred_ratings)

In [None]:
# Root-mean-square error over the held-out rows that received a prediction:
# RMSE = sqrt(mean((actual - predicted)^2)).
# The actual ratings are selected by the prediction index so both sides align row by row.
# NOTE(review): assumes the index labels of test_set_5star are unique; confirm.
actual_ratings = test_set_5star.loc[pred_ratings.index, 'rating']
rmse = np.sqrt(np.mean(np.square(actual_ratings - pred_ratings)))