# Content-based recommendation

In [1]:
# Load data generated in Session 1 or the provided data splits (see Absalon, W7 Lab)
import pandas as pd

df_train = pd.read_pickle("train_dataframe.pkl")
df_test = pd.read_pickle("test_dataframe.pkl")

# Exercise 1
In this exercise, you will build user profiles using TF-IDF vectors and use them to get the recommended items.

Based on the TF-IDF vectors obtained in the Exercise 2 from Week 09, represent each user in the same vector space. Amongst other feasible solutions, you can represent a user (creating  a user profile) by computing the weighted mean of the items vectors, from the items that have been rated by the users in the training set. Reflect on this way of creating the user profile; is there a better way to make use of low ratings?

For all users, compute the cosine similarity with each product that they have not rated in the training set  (**unobserved ratings**). Take the top-5 items with the highest cosine similarity as the top-5 recommended items. 

What are the top-5 recommended items for user `A39WWMBA0299ZF`? Print out the top-5 items for said user and their similarity score, rounded to three decimal places.  

In [2]:
import os
import pickle 
import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from pprint import pprint
from typing import Dict, Any, List

# Load TF-IDF
with open('W09_tfidf.pickle', 'rb') as handle:
    X_tfidf = pickle.load(handle)
with open('W09_map_asin_id.pickle', 'rb') as handle:
    map_asin_id = pickle.load(handle)


In [7]:
X_tfidf

<84x471 sparse matrix of type '<class 'numpy.float64'>'
	with 763 stored elements in Compressed Sparse Row format>

In [None]:
def get_top_k_user_i(predictions: Dict[str, Dict[str,float]],
                     user_id: str, 
                     k: int) -> List[Any]:
    """Retrieve the top-K recommended items for a given user.
    Args:
        predictions: A dictionary containing the similarities 
            between users and items (e.g., keys are user ids, 
            values are dictionaries containing the similary 
            of the corresponding user to each item)
        user_id(str): The user we want to extract top-K recommendations for 
        k(int): The number of recommendation to output for each user.
    Returns:
    List of recommended items  
    """
    top_k = []
    
    
    # YOUR CODE HERE
    
    return top_k

In [11]:
map_asin_id

{'B0000530HU': 0,
 'B00006L9LC': 1,
 'B00021DJ32': 2,
 'B0002JHI1I': 3,
 'B0006O10P4': 4,
 'B0009RF9DW': 5,
 'B000FI4S1E': 6,
 'B000FOI48G': 7,
 'B000FTYALG': 8,
 'B000GLRREU': 9,
 'B000LIBUBY': 10,
 'B000NKJIXM': 11,
 'B000PKKAGO': 12,
 'B000URXP6E': 13,
 'B000V5Z4J6': 14,
 'B000VUXCGI': 15,
 'B000VV1YOY': 16,
 'B000W0C07Y': 17,
 'B000WR2HB6': 18,
 'B000WYJTZG': 19,
 'B000X2FPXC': 20,
 'B000X7ST9Y': 21,
 'B000YB70PS': 22,
 'B0010ZBORW': 23,
 'B00112DRHY': 24,
 'B0011FYB5I': 25,
 'B00120VWTK': 26,
 'B00126LYJM': 27,
 'B0012XPRO8': 28,
 'B0012Y0ZG2': 29,
 'B0013NB7DW': 30,
 'B0014SQQ3M': 31,
 'B00155Z6V2': 32,
 'B00157OBRU': 33,
 'B0017TZD7S': 34,
 'B001E5PLCM': 35,
 'B001E96LUO': 36,
 'B001ET7FZE': 37,
 'B001F51RAG': 38,
 'B001LNODUS': 39,
 'B001OHV1H4': 40,
 'B001QY8QXM': 41,
 'B002GP80EU': 42,
 'B002RZZXYE': 43,
 'B004CALFE4': 44,
 'B004KEJ65C': 45,
 'B006IB5T4W': 46,
 'B006WYJM8Y': 47,
 'B007R6UXNY': 48,
 'B007V6JNE0': 49,
 'B008YQM4A6': 50,
 'B0091OCA86': 51,
 'B00AKP21KM': 52,
 'B

In [20]:
np.array(X_tfidf[:,:10].toarray())

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.       

In [38]:
# calculating the user profile for each user
list_of_users_in_train_set = df_train.reviewerID.unique().tolist()
user_profiles = {}
for user in list_of_users_in_train_set:
    ratings_given_by_user = df_train[df_train.reviewerID==user].overall.to_numpy()
    items_rated = df_train[df_train.reviewerID==user].asin.to_numpy()
    
    items_not_rated_by_user = [map_asin_id[item] for item in set(df_train.asin.tolist()).difference(set(items_rated.tolist()))]
    
    item_id_in_TFIDF = [map_asin_id[item] for item in items_rated]
    tfidf_matrix_for_rated_items = np.array(X_tfidf[item_id_in_TFIDF].toarray())
    #    user_profile_matrix = np.hstack((tfidf_matrix_for_rated_items, np.array([ratings_given_by_user]).T))
    means = tfidf_matrix_for_rated_items.mean(axis=0)    
    mean_centered_tfidf_matrix = tfidf_matrix_for_rated_items - means[np.newaxis, :]
    
    user_profile = (mean_centered_tfidf_matrix * ratings_given_by_user[:, np.newaxis]) / ratings_given_by_user.shape[0]
    user_profiles[user] = (user_profile, items_not_rated_by_user)
    

In [32]:
user_profiles

{'A105A034ZG9EHO': array([[ 0.        ,  0.        ,  0.        , ...,  0.63787537,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ..., -0.31893768,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ..., -0.31893768,
          0.        ,  0.        ]]),
 'A10JB7YPWZGRF4': array([[ 0.        ,  0.        ,  0.        , ...,  0.63787537,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ..., -0.31893768,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ..., -0.31893768,
          0.        ,  0.        ]]),
 'A10M2MLE2R0L6K': array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 'A10P0NAKKRYKTZ': array([[ 0.        ,  0.        ,  0.        , ...,  0.63787537,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ..., -0.31893768,
          0.        ,  0.

In [None]:
user_profiles

In [40]:
[cosine_similarity(profile, X_tfidf[unobserved_movies_ids]) for profile, unobserved_movies_ids in user_profiles.items()]

ValueError: shape mismatch: objects cannot be broadcast to a single shape.  Mismatch is between arg 0 with shape (3, 471) and arg 1 with shape (56,).

In [None]:
K = 5
user_id = 'A39WWMBA0299ZF' 
topk = get_top_k_user_i( ) #fill this
print(f"Top-{K} recommended items for user '{user_id}':")
pprint(topk)

# Exercise 2



In this exercise, you will evaluate the content-based recommender system in Exercise 1.

Compute the hit rate for the content-based recommender system from Exercise 1. Evaluate the hit rate based on the top-5, top-10 and top-20 recommendations, averaged over the total number of users. Round your final answer to 3 decimal places. Remember that, as we are evaluating the system, you should compute the hit rate over the **test set**. How well/bad does this content-based approach perform compared to the collaborative filtering approaches?

In [None]:
q   
# YOUR CODE HERE

# Exercise 3

In this exercise, you will create a content-based recommender system based on word2vec embeddings and evaluate its performance with hit rate.

Repeat Exercise 1 and 2, this time representing the items and users in a word2vec vector space. You may use the gensim library and download the 300-dimension embeddings from Google. Source: https://radimrehurek.com/gensim/models/word2vec.html#pretrained-models

Remember to follow the same preprocessing pipeline as instructed in Lab W9, skipping the stemming step. Think on why we should not perform stemming when working with word2vec embeddings.

In [None]:
#uncomment and run the following line to install gensim
# !pip install gensim

In [None]:
import gensim.downloader
word2vec_vectors = gensim.downloader.load('word2vec-google-news-300')


# YOUR CODE FROM WEEK 09 HERE




In [None]:
# NEW CODE HERE 

# Represent items in the vector space by taking an average of the word embedding of the tokens in the item title
# Tokens that are (out-of-vocabulary) OOV may be skipped

# Compute user profiles

# Cosine similarities 


K = 5
user_id = 'A39WWMBA0299ZF'
topk = get_top_k_user_i( )
print(f"Top-{K} recommended items for user '{user_id}':")
pprint(topk)

In [None]:
# Get top-K for all users in the test set 

top_5 = {}
for user_id in test_users:
    top_5[user_id] = get_top_k_user_i( )
print("Hit Rate (top-5): {:.3f}".format(hit_rate(top_5, df_test)))
top_10 = {}
for user_id in test_users:
    top_10[user_id] = get_top_k_user_i( )
print("Hit Rate (top-10): {:.3f}".format(hit_rate(top_10, df_test)))
top_20 = {}
for user_id in test_users:
    top_20[user_id] = get_top_k_user_i( )
print("Hit Rate (top-20): {:.3f}".format(hit_rate(top_20, df_test)))