# Content-based recommendation

In [6]:
# Load data generated in Session 1 or the provided data splits (see Absalon, W7 Lab)
import pandas as pd

df_train = pd.read_pickle("train_dataframe.pkl")
df_test = pd.read_pickle("test_dataframe.pkl")

# Exercise 1
In this exercise, you will build user profiles using TF-IDF vectors and use them to get the recommended items.

Based on the TF-IDF vectors obtained in the Exercise 2 from Week 09, represent each user in the same vector space. Amongst other feasible solutions, you can represent a user (creating  a user profile) by computing the weighted mean of the items vectors, from the items that have been rated by the users in the training set. Reflect on this way of creating the user profile; is there a better way to make use of low ratings?

For all users, compute the cosine similarity with each product that they have not rated in the training set  (**unobserved ratings**). Take the top-5 items with the highest cosine similarity as the top-5 recommended items. 

What are the top-5 recommended items for user `A39WWMBA0299ZF`? Print out the top-5 items for said user and their similarity score, rounded to three decimal places.  

In [119]:
import os
import pickle 
import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from pprint import pprint
from typing import Dict, Any, List

# Load TF-IDF
with open('W09_tfidf.pickle', 'rb') as handle:
    X_tfidf = pickle.load(handle)
with open('W09_map_asin_id.pickle', 'rb') as handle:
    map_asin_id = pickle.load(handle)


In [120]:
df_train.reviewerID.unique().shape

(981,)

In [121]:
df_test.reviewerID.unique().shape

(949,)

In [122]:
def get_top_k_user_i(predictions: Dict[str, Dict[str,float]],
                     user_id: str, 
                     k: int) -> List[Any]:
    """Retrieve the top-K recommended items for a given user.
    Args:
        predictions: A dictionary containing the similarities 
            between users and items (e.g., keys are user ids, 
            values are dictionaries containing the similary 
            of the corresponding user to each item)
        user_id(str): The user we want to extract top-K recommendations for 
        k(int): The number of recommendation to output for each user.
    Returns:
    List of recommended items  
    """
    return sorted(predictions[user_id], key=lambda x: x[1], reverse=True)[:k]


In [123]:
rated_item_id_in_TFIDF

[5, 6, 13]

In [124]:
X_tfidf[rated_item_id_in_TFIDF]

<3x471 sparse matrix of type '<class 'numpy.float64'>'
	with 28 stored elements in Compressed Sparse Row format>

In [125]:
# calculating the user profile for each user
list_of_users_in_train_set = df_train.reviewerID.unique().tolist()
user_profiles = {}
unrated_user_items = {}
for user in list_of_users_in_train_set:
    ratings_given_by_user = df_train[df_train.reviewerID==user]
    items_rated_by_user = ratings_given_by_user.asin.to_numpy()

    rated_item_id_in_TFIDF = [map_asin_id[item] for item in items_rated_by_user]
    
    tfidf_embedding_for_rated_items = np.array(X_tfidf[rated_item_id_in_TFIDF].toarray())

    # center the tfidf features using per feature mean
    #means = tfidf_embedding_for_rated_items.mean(axis=0)    
    mean_centered_tfidf_matrix = tfidf_embedding_for_rated_items #- means[np.newaxis, :]
    
    rating_values_for_items = ratings_given_by_user.overall.to_numpy()
    user_profile = np.mean(mean_centered_tfidf_matrix * rating_values_for_items[:, np.newaxis], axis=0)
    user_profiles[user] = user_profile
    unrated_user_items[user] = (set(df_train.asin.tolist()) | set(df_test.asin.tolist())).difference(set(items_rated_by_user.tolist()))
    
    

In [126]:
per_user_recommendation_predictions = {}
for user in user_profiles.keys():
    user_profile, unrated_items = user_profiles[user], unrated_user_items[user]
    unrated_items_tfidf_id = [map_asin_id[item] for item in unrated_items]
    result = cosine_similarity(user_profile.reshape(1,-1), X_tfidf[unrated_items_tfidf_id])
    per_user_recommendation_predictions[user] = sorted(list(zip(unrated_items,result[0])), key=lambda x: x[1], reverse=True)
    

In [127]:
K = 5
user_id = 'A39WWMBA0299ZF' 
topk = get_top_k_user_i(per_user_recommendation_predictions, user_id, K) #fill this
print(f"Top-{K} recommended items for user '{user_id}':")
pprint(topk)

Top-5 recommended items for user 'A39WWMBA0299ZF':
[('B019FWRG3C', 0.41890779509762005),
 ('B00W259T7G', 0.1904341366115),
 ('B00IJHY54S', 0.0877187842318),
 ('B0006O10P4', 0.08135119741224563),
 ('B00006L9LC', 0.07867821553522243)]


# Exercise 2



In this exercise, you will evaluate the content-based recommender system in Exercise 1.

Compute the hit rate for the content-based recommender system from Exercise 1. Evaluate the hit rate based on the top-5, top-10 and top-20 recommendations, averaged over the total number of users. Round your final answer to 3 decimal places. Remember that, as we are evaluating the system, you should compute the hit rate over the **test set**. How well/bad does this content-based approach perform compared to the collaborative filtering approaches?

In [128]:
def calculate_hit_rate(recommendations_per_user, df_test, k):
    accum_hits = 0
    for user, recommendations in recommendations_per_user.items():
        for i in range(min(len(recommendations),k)):
            item_name = recommendations[i][0]
            test_rating = df_test[(df_test.asin==item_name) & (df_test.reviewerID == user) & (df_test.overall >= 3)]
            if test_rating.shape[0] > 0:
                accum_hits += 1
                break
    
    return accum_hits/len(recommendations_per_user.keys())

In [129]:
hit5 = calculate_hit_rate(per_user_recommendation_predictions, df_test, 5)
hit10 = calculate_hit_rate(per_user_recommendation_predictions, df_test, 10)
hit20 = calculate_hit_rate(per_user_recommendation_predictions, df_test, 20)

In [130]:
print(f"Hit@5: {hit5}")
print(f"Hit@10: {hit10}")
print(f"Hit@20: {hit20}")

Hit@5: 0.4057084607543323
Hit@10: 0.4332313965341488
Hit@20: 0.49337410805300713


# Exercise 3

In this exercise, you will create a content-based recommender system based on word2vec embeddings and evaluate its performance with hit rate.

Repeat Exercise 1 and 2, this time representing the items and users in a word2vec vector space. You may use the gensim library and download the 300-dimension embeddings from Google. Source: https://radimrehurek.com/gensim/models/word2vec.html#pretrained-models

Remember to follow the same preprocessing pipeline as instructed in Lab W9, skipping the stemming step. Think on why we should not perform stemming when working with word2vec embeddings.

In [None]:
#uncomment and run the following line to install gensim
#!pip install gensim

In [63]:
import gensim.downloader
word2vec_vectors = gensim.downloader.load('word2vec-google-news-300')

meta_file = 'meta_All_Beauty.json'

import os
import sys
sys.path.append('../')
import pickle
import pandas as pd

# Load the metadata (items)
item_metadata = pd.read_json("meta_All_Beauty.json", lines=True)
all_rated_items_set = set(df_train.asin.tolist() + df_test.asin.tolist())

# Discard items that weren't rated by our subset of users
filtered_item_metadata = item_metadata[item_metadata.asin.isin(all_rated_items_set)].drop_duplicates(['asin', 'title'])
item_titles = list(filtered_item_metadata[["asin", "title"]].itertuples(index=False, name=None))

# YOUR CODE FROM WEEK 09 HERE


In [104]:
item_titles

[('B0000530HU', 'Aqua Velva After Shave, Classic Ice Blue, 7 Ounce'),
 ('B00006L9LC', 'Citre Shine Moisture Burst Shampoo - 16 fl oz'),
 ('B00021DJ32', 'NARS Blush, Taj Mahal'),
 ('B0002JHI1I',
  'Avalon Organics Wrinkle Therapy CoQ10 Cleansing Milk, 8.50 oz'),
 ('B0006O10P4', 'ZUM Zum Bar Anise Lavender, 3 Ounce'),
 ('B0009RF9DW',
  'Yardley By Yardley Of London Unisexs Lay It On Thick Hand &amp; Foot Cream 5.3 Oz'),
 ('B000FI4S1E',
  'Fruits &amp; Passion Blue Refreshing Shower Gel - 6.7 fl. oz.'),
 ('B000FOI48G', 'Waterpik Ultra Water Flosser'),
 ('B000FTYALG', 'Aqua Velva After Shave, Classic Ice Blue, 3.5 Ounce'),
 ('B000GLRREU', 'Waterpik Ultra Water Flosser'),
 ('B000LIBUBY', 'Fresh Eau de Parfum, Sugar Lemon, 3.4 oz'),
 ('B000NKJIXM',
  'Crest Pro-health Multi-Protection Rinse, Cool Wintergreen, 33.8 Fluid Ounce'),
 ('B000PKKAGO', "Philips Norelco arcitec 1090 Men's Shaving System"),
 ('B000URXP6E', 'Bonne Bell Smackers Bath and Body Starburst Collection'),
 ('B000V5Z4J6', 'Phi

In [105]:
#!pip install nltk

In [106]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import string


[nltk_data] Downloading package punkt to
[nltk_data]     /home/danielpenchev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/danielpenchev/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/danielpenchev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [107]:
def preprocessing(text):
    lower_case_corpus = text.lower()
    tokenized_corpus = word_tokenize(lower_case_corpus)
    stops_words_english = set(stopwords.words('english')) | set(string.punctuation)
    return " ".join([token for token in tokenized_corpus if token not in stops_words_english])

processed_titles = [(item, preprocessing(title)) for item, title in item_titles]


In [108]:
processed_titles

[('B0000530HU', 'aqua velva shave classic ice blue 7 ounce'),
 ('B00006L9LC', 'citre shine moisture burst shampoo 16 fl oz'),
 ('B00021DJ32', 'nars blush taj mahal'),
 ('B0002JHI1I',
  'avalon organics wrinkle therapy coq10 cleansing milk 8.50 oz'),
 ('B0006O10P4', 'zum zum bar anise lavender 3 ounce'),
 ('B0009RF9DW',
  'yardley yardley london unisexs lay thick hand amp foot cream 5.3 oz'),
 ('B000FI4S1E', 'fruits amp passion blue refreshing shower gel 6.7 fl oz'),
 ('B000FOI48G', 'waterpik ultra water flosser'),
 ('B000FTYALG', 'aqua velva shave classic ice blue 3.5 ounce'),
 ('B000GLRREU', 'waterpik ultra water flosser'),
 ('B000LIBUBY', 'fresh eau de parfum sugar lemon 3.4 oz'),
 ('B000NKJIXM',
  'crest pro-health multi-protection rinse cool wintergreen 33.8 fluid ounce'),
 ('B000PKKAGO', "philips norelco arcitec 1090 men 's shaving system"),
 ('B000URXP6E', 'bonne bell smackers bath body starburst collection'),
 ('B000V5Z4J6', 'philips sonicare uv sanitizer'),
 ('B000VUXCGI', "mag

In [109]:
# Represent items in the vector space by taking an average of the word embedding of the tokens in the item title
# Tokens that are (out-of-vocabulary) OOV may be skipped

title_to_embedding = {}
for item, title in processed_titles:
    in_vocab_words = 0
    sentence_embedding = np.zeros((300,))
    for word in title.split(" "):
        if word not in word2vec_vectors.key_to_index:
            continue
        sentence_embedding += word2vec_vectors[word]
        in_vocab_words+=1
    
    sentence_embedding /= max(in_vocab_words,1)
    title_to_embedding[item] = sentence_embedding

In [110]:
# Compute user profiles


# calculating the user profile for each user
list_of_users_in_train_set = df_train.reviewerID.unique().tolist()
user_profiles = {}
unrated_user_items = {}
for user in list_of_users_in_train_set:
    ratings_given_by_user = df_train[df_train.reviewerID==user]
    items_rated_by_user = ratings_given_by_user.asin.to_numpy()

    item_embeddings = np.array([title_to_embedding[item] for item in items_rated_by_user])

    # center the tfidf features using per feature mean
    #means = item_embeddings.mean(axis=0)    
    mean_centered_item_embeddings= item_embeddings #- means[np.newaxis, :]
    
    rating_values_for_items = ratings_given_by_user.overall.to_numpy()
    user_profile = np.mean(mean_centered_item_embeddings * rating_values_for_items[:, np.newaxis], axis=0)
    user_profiles[user] = user_profile
    unrated_user_items[user] = (set(df_train.asin.tolist()) | set(df_test.asin.tolist())).difference(set(items_rated_by_user.tolist()))
    
    

In [111]:
per_user_recommendation_predictions = {}
for user in user_profiles.keys():
    user_profile, unrated_items = user_profiles[user], unrated_user_items[user]
    unrated_items_embeddings = [title_to_embedding[item] for item in unrated_items]
    result = cosine_similarity(user_profile.reshape(1,-1), unrated_items_embeddings)
    per_user_recommendation_predictions[user] = sorted(list(zip(unrated_items,result[0])), key=lambda x: x[1], reverse=True)
    

In [112]:
# NEW CODE HERE 

K = 5
user_id = 'A39WWMBA0299ZF'
topk = get_top_k_user_i(per_user_recommendation_predictions, user_id, K) #fill this
print(f"Top-{K} recommended items for user '{user_id}':")
pprint(topk)

Top-5 recommended items for user 'A39WWMBA0299ZF':
[('B000LIBUBY', 0.7818762363300937),
 ('B019FWRG3C', 0.7803233280348185),
 ('B000W0C07Y', 0.7674191081068749),
 ('B0012XPRO8', 0.7576563059459733),
 ('B00HLXEXDO', 0.7472139124796926)]


In [115]:
# Get top-K for all users in the test set 

top_5 = {}

test_users = df_test.reviewerID.unique().tolist()

for user_id in test_users:
    top_5[user_id] = get_top_k_user_i(per_user_recommendation_predictions, user_id, 5)
print("Hit Rate (top-5): {:.3f}".format(calculate_hit_rate(top_5, df_test, 5)))
top_10 = {}
for user_id in test_users:
    top_10[user_id] = get_top_k_user_i(per_user_recommendation_predictions, user_id, 10)
print("Hit Rate (top-10): {:.3f}".format(calculate_hit_rate(top_10, df_test, 10)))
top_20 = {}
for user_id in test_users:
    top_20[user_id] = get_top_k_user_i(per_user_recommendation_predictions, user_id, 20)
print("Hit Rate (top-20): {:.3f}".format(calculate_hit_rate(top_20, df_test, 20)))

Hit Rate (top-5): 0.412
Hit Rate (top-10): 0.427
Hit Rate (top-20): 0.490
