# Recommender system
In this notebook, we will leverage the trained Weighted Matrix Factorization model to make recommendations. 


In [2]:
from src.wmfact import WeightedMatrixFactorization
import numpy as np
import pandas as pd

# Restore the trained Weighted Matrix Factorization model from disk.
# The user and item embeddings are obtained later via model.get_embeddings().
model = WeightedMatrixFactorization.load(
    './models/wmf_wals_nlat270_niter16_lambdareg0.02.pkl'
)

# Read the inputs shared by the recommenders defined further down:
#   feedbacks - the feedback matrix that was used to train the model
#   movies    - information about each movie
feedbacks = np.load('./data/feedbacks.npy')
movies = pd.read_csv('data/movies.csv')

In [4]:
# Unpack the two embedding matrices and report only their shapes; leaving
# `model.get_embeddings()` as the cell's last expression would echo both
# full arrays into the notebook output.
# NOTE(review): the recorded output of this cell is `SystemError: unknown opcode`.
# That presumably means the saved model was serialized under a different Python
# version than the one running this kernel -- confirm and re-export the model.
users_embedding, items_embedding = model.get_embeddings()
print(users_embedding.shape, items_embedding.shape)

SystemError: unknown opcode

### The cosine similarity metric
Cosine similarity measures the cosine of the angle between two non-zero vectors in a multi-dimensional space. It captures how closely the two vectors point in the same direction, regardless of their magnitude.

$$ cosine\_similarity(A, B) = \frac{A \cdot B}{||A|| ||B||} $$

- $A \cdot B$ is the dot-product of the two vectors;
- $||A||$, $||B||$ are the Euclidean norms (magnitudes) of the two vectors.

The cosine similarity returns a value in $[-1,1]$; a value closer to 1 indicates greater similarity.

In [5]:
def cosine_similarity( vector1, vector2 ):
    """
    Calculate the cosine similarity between two vectors.
    Formula: similarity = (vector1 . vector2) / (||vector1|| * ||vector2||)

    The cosine is undefined when either vector has zero norm. In that case
    0.0 is returned instead of NaN: a NaN score would be placed first by a
    descending `np.argsort(...)[::-1]` ranking and silently become the top
    recommendation.

    Parameters:
    - vector1 (numpy.ndarray): The first vector.
    - vector2 (numpy.ndarray): The second vector.

    Returns:
    - similarity (float): The cosine similarity between the two vectors,
      or 0.0 if at least one of them has zero norm.
    """

    norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)

    # guard against 0/0 for zero-norm vectors
    if norm_product == 0:
        return 0.0

    return np.dot(vector1, vector2) / norm_product

### Content-based filtering
- Retrieve the user embedding vector corresponding to the user_id from the trained model. This vector represents the user's preferences in the latent space learned by the model.
- Determine the indices of items that the user has not rated. These indices represent the items that are potentially eligible for recommendation to the user.
- Calculate the cosine similarity between the user embedding vector and the embedding vectors of the unrated items. Cosine similarity measures the cosine of the angle between two vectors and quantifies how similar they are in direction.
- Sort the computed similarity scores in descending order to identify the items that are most similar to the user's preferences. These items are potential candidates for recommendation.
- Select the top-N items with the highest similarity scores to recommend to the user. These items are considered the most relevant or similar to the user's preferences.
- Return the top-N recommended items along with their similarity scores in a pandas DataFrame, where each row represents an item and includes information such as the item's title and the computed similarity percentage.

In [None]:
def contentbased_filtering(user_id, model, top_n=10):
    """
    Get the top-n items for a user using content-based filtering.

    Every item the user has not rated is scored by the cosine similarity
    between the user's embedding and the item's embedding; the `top_n`
    best-scoring items are returned, best first.

    Relies on the notebook-level `feedbacks` array (NaN marks an item the user
    has not rated) and `movies` DataFrame, and on the `cosine_similarity` helper.
    NOTE(review): assumes row i of `movies` describes column i of `feedbacks`
    -- confirm against the data preparation step.

    Parameters:
    - user_id (int): The user id (row index into `feedbacks` and the user embeddings).
    - model (WeightedMatrixFactorization): The trained model.
    - top_n (int): The number of items to recommend.

    Returns:
    - top_k_movies (pandas.DataFrame): The rows of `movies` for the top-n items,
      with an added 'similarity (%)' column (cosine similarity as a percentage,
      rounded to 2 decimal places).
    """

    # get the user and item embeddings:
    users_embedding, items_embedding = model.get_embeddings()

    # get the user embedding for the user_id:
    user_embedding = users_embedding[user_id]

    # indices of the items that the user has not rated:
    idx_not_rated = np.where(np.isnan(feedbacks[user_id]))[0]

    # similarity between this user and each non-rated item;
    # entry j of `similarities` belongs to item idx_not_rated[j]:
    similarities = np.array([
        cosine_similarity(
            vector1 = user_embedding,
            vector2 = items_embedding[idx]
        ) for idx in idx_not_rated
    ])

    # rank on the raw scores so that rounding cannot create artificial ties:
    top_k = np.argsort(similarities)[::-1][:top_n]

    # `top_k` holds positions inside `idx_not_rated`, not item indices, so it
    # must be mapped back before it is used to look up rows of `movies`:
    top_k_items = idx_not_rated[top_k]

    top_k_movies = movies.iloc[top_k_items].copy()
    top_k_movies['similarity (%)'] = np.round(similarities[top_k] * 100, 2)
    return top_k_movies

As an example, we can now recommend the top 10 movies for the user with id = 475:

In [None]:
# Top-10 recommendations for user 475, scored against the user's own embedding.
top_k_movies = contentbased_filtering(
    user_id=475,
    model=model,
    top_n=10,
)
print(top_k_movies)

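### Collaborative filtering
- Retrieve the target user's embedding vector and the indices of the items the user has not rated.
- Compute the cosine similarity between the target user and every user, and keep the most similar users (roughly the top 10%, excluding the target user) as neighbours.
- Sum the neighbours' embedding vectors into a single aggregated preference vector.
- Rank the unrated items by the cosine similarity between this aggregated vector and each item embedding, and return the top-N items together with their similarity scores.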

In [9]:
def collaborative_filtering(user_id, model, top_n=10, neighbor_fraction=0.10):
    """
    Get the top-n items for a user from the preferences of similar users.

    The users whose embeddings are most similar (cosine similarity) to the
    target user's are selected as neighbours, their embeddings are summed into
    one aggregated preference vector, and every item the target user has not
    rated is scored by its cosine similarity to that vector.

    Relies on the notebook-level `feedbacks` array (NaN marks an item the user
    has not rated) and `movies` DataFrame, and on the `cosine_similarity` helper.
    NOTE(review): assumes row i of `movies` describes column i of `feedbacks`
    -- confirm against the data preparation step.

    Parameters:
    - user_id (int): The user id (row index into `feedbacks` and the user embeddings).
    - model (WeightedMatrixFactorization): The trained model.
    - top_n (int): The number of items to recommend.
    - neighbor_fraction (float): Fraction of all users considered when picking
      the neighbourhood; the target user counts towards it and is then removed.

    Returns:
    - recommended_items (pandas.DataFrame): The rows of `movies` for the top-n
      items, best first, with an added 'similarity' column (raw cosine similarity).
    """

    users_embedding, items_embedding = model.get_embeddings()
    user_embedding = users_embedding[user_id]
    n_users = users_embedding.shape[0]

    # indices of the items that the user has not rated:
    idx_not_rated = np.where(np.isnan(feedbacks[user_id]))[0]

    # similarity between this user and all the users (including itself):
    user_similarities = np.array([
        cosine_similarity(
            vector1 = users_embedding[idx],
            vector2 = user_embedding
        ) for idx in range(n_users)
    ])

    # Rank users from most to least similar and remove the target user by id.
    # Dropping "whoever ranks first" is not safe: ties or floating-point error
    # can put another user ahead of the target. The modulo keeps the comparison
    # valid when a negative (from-the-end) user_id is given.
    ranked_users = np.argsort(user_similarities)[::-1]
    ranked_users = ranked_users[ranked_users != user_id % n_users]

    # Same neighbourhood size as "take the top fraction, then drop the target",
    # but never smaller than one neighbour.
    n_neighbors = max(1, int(neighbor_fraction * n_users) - 1)
    neighbors = ranked_users[:n_neighbors]

    if len(neighbors) > 0:
        # aggregate the preferences of the similar users:
        aggregated_preferences = np.sum(users_embedding[neighbors], axis=0)
    else:
        # no other user exists: fall back to the user's own preferences
        # rather than scoring items against an all-zero vector
        aggregated_preferences = user_embedding

    # similarity between the aggregated preferences and each non-rated item;
    # entry j of `item_similarities` belongs to item idx_not_rated[j]:
    item_similarities = np.array([
        cosine_similarity(
            vector1 = aggregated_preferences,
            vector2 = items_embedding[idx]
        ) for idx in idx_not_rated
    ])

    # best-scoring positions first, mapped back to actual item indices:
    top_positions = np.argsort(item_similarities)[::-1][:top_n]
    recommended_item_indices = idx_not_rated[top_positions]

    # a dataframe with the recommended items and their similarities:
    recommended_items = movies.iloc[recommended_item_indices].copy()
    recommended_items['similarity'] = item_similarities[top_positions]

    return recommended_items

# Top-20 recommendations for user 475, driven by the most similar users.
top_k_movies = collaborative_filtering(user_id=475, model=model, top_n=20)
print(top_k_movies[['title', 'similarity']])

                                      title  similarity
173          Raiders of the Lost Ark (1981)    0.618102
63         Shawshank Redemption, The (1994)    0.589655
78                     Fugitive, The (1993)    0.582490
49                         Star Wars (1977)    0.579519
171         Empire Strikes Back, The (1980)    0.579427
175                           Aliens (1986)    0.569882
356  One Flew Over the Cuckoo's Nest (1975)    0.567194
11               Usual Suspects, The (1995)    0.565332
97         Silence of the Lambs, The (1991)    0.560144
180               Return of the Jedi (1983)    0.550040
143                         Die Hard (1988)    0.549425
194                  Terminator, The (1984)    0.548943
21                        Braveheart (1995)    0.545214
317                 Schindler's List (1993)    0.545053
650                            Glory (1989)    0.544800
95        Terminator 2: Judgment Day (1991)    0.541405
30                      Crimson Tide (1995)    0