# Recommendation System Using Weighted Matrix Factorization Model
In this notebook, we will leverage the trained Weighted Matrix Factorization model to make recommendations. In particular, we will use two different approaches: Collaborative Filtering and Content-Based Filtering.


In [3]:
from src.wmfact import WeightedMatrixFactorization
import numpy as np
import pandas as pd

model = WeightedMatrixFactorization.load('./models/wmf_wals_20240206102010.pkl') # the trained model
#usr_emb, item_emb = model.get_embeddings() # the user and item embeddings

feedbacks = np.load('./data/feedbacks.npy') # the feedbacks matrix that was used to train the model
movies = pd.read_csv('data/movies.csv') # contains info about each movie 

### The cosine similarity metric
Cosine similarity is a metric that measures the cosine of the angle between two non-zero vectors in a multi-dimensional space; it computes the cosine of the angle between the two vectors, representing the direction of the vectors regardless of their magnitude.

$$ cosine\_similarity(A, B) = \frac{A \cdot B}{||A|| ||B||} $$

- $A \cdot B$ is the dot-product of the two vectors;
- $||A||$, $||B||$ are the Euclidian norms (magnitudes) of the two vectors.

The cosine similarity returns a value in $[-1,1]$; a value closer to 1 indicates greater similarity.

In [4]:
def cosine_similarity(vector1, vector2):
    """
    Calculate the cosine similarity between two vectors.
    Formula: similarity = (vector1 . vector2) / (||vector1|| * ||vector2||)
    
    Parameters:
    - vector1 (numpy.ndarray): The first vector.
    - vector2 (numpy.ndarray): The second vector.
    
    Returns:
    - similarity (float): The cosine similarity between the two vectors.
    """

    norm_vector1 = np.linalg.norm(vector1)
    norm_vector2 = np.linalg.norm(vector2)
    similarity = np.dot(vector1, vector2) / (norm_vector1 * norm_vector2)
    return similarity

### Content-based filtering
- Retrieve the user embedding vector corresponding to the user_id from the trained model. This vector represents the user's preferences in the latent space learned by the model.
- Determine the indices of items that the user has not rated. These indices represent the items that are potentially eligible for recommendation to the user.
- Calculate the cosine similarity between the user embedding vector and the embedding vectors of the unrated items. Cosine similarity measures the cosine of the angle between two vectors and quantifies how similar they are in direction.
- Sort the computed similarity scores in descending order to identify the items that are most similar to the user's preferences. These items are potential candidates for recommendation.
- Select the top-N items with the highest similarity scores to recommend to the user. These items are considered the most relevant or similar to the user's preferences.
- Return the top-N recommended items along with their similarity scores in a pandas DataFrame, where each row represents an item and includes information such as the item's title and the computed similarity percentage.

In [18]:
def contentbased_filtering(user_id, model, top_n=10):
    """
    Get the top-n items for a user using content-based filtering.

    Parameters:
    - user_id (int): The user id.
    - model (WeightedMatrixFactorization): The trained model.
    - top_n (int): The number of items to recommend.

    Returns:
    - top_k_movies (pandas.DataFrame): The top-k items.
    """
    users_embedding, items_embedding = model.get_embeddings()

    # get the user embedding for the user_id:
    user_embedding = users_embedding[user_id]

    # get all the items that the user has not rated (the indices of the items):
    idx_not_rated = np.where( np.isnan(
        feedbacks[user_id]
    ) )[0]

    # compute the similarities between this user and the items that he has not rated:
    similarities = np.array([ 
        cosine_similarity(
            vector1 = user_embedding,
            vector2 = items_embedding[idx]
        ) for idx in idx_not_rated
    ])

    similarities = similarities * 100
    similarities = np.round(similarities, 2)

    # order the similarities in descending order and get the top-n items (indices):
    top_k = np.argsort(similarities)[::-1][:top_n]

    # get the corresponding movies:
    top_k_movies = movies.iloc[top_k]

    # add also the similarity to the dataframe:
    top_k_movies = top_k_movies.copy()
    top_k_movies['similarity (%)'] = similarities[top_k]
    return top_k_movies

user_id = 475
top_k_movies = contentbased_filtering(user_id, model, top_n=20)
print(top_k_movies[['title', 'similarity (%)']])

                                       title  similarity (%)
135      Mr. Smith Goes to Washington (1939)           39.18
923                      White Squall (1996)           37.66
523               Great Dictator, The (1940)           34.96
353               Wedding Singer, The (1998)           34.70
144                Lawnmower Man, The (1992)           34.30
487                      Sunset Blvd. (1950)           34.20
134             2001: A Space Odyssey (1968)           33.21
1100        Six Degrees of Separation (1993)           33.19
729   Queen Margot (Reine Margot, La) (1994)           32.86
43                  Dolores Claiborne (1994)           32.55
172               Princess Bride, The (1987)           32.32
601             American in Paris, An (1951)           32.29
71                          Mask, The (1994)           32.20
469                         Tombstone (1993)           32.10
154                     Dirty Dancing (1987)           32.06
416                  Par

### Collaborative filtering

In [41]:
def collaborative_filtering(user_id, model, top_n=10):
    """ 
    
    """

    users_embedding, items_embedding = model.get_embeddings()
    user_embedding = users_embedding[user_id]

    idx_not_rated = np.where( np.isnan(
    feedbacks[user_id]) 
    )[0]

    # similarity between this user and all the other users:
    similarities = np.array([ 
    cosine_similarity(
        vector1 = users_embedding[idx],
        vector2 = user_embedding
    ) for idx in range(users_embedding.shape[0])
    ])

    sorted_indices = np.argsort(similarities)[::-1][:int(0.10*len(similarities))]
    sorted_indices = sorted_indices[1:]  # Exclude the target user

    # Aggregate preferences of similar users
    aggregated_preferences = np.sum(users_embedding[sorted_indices], axis=0)

    # Calculate cosine similarity between the target user group and items based on content features
    similarities = np.array([ 
    cosine_similarity(
        vector1 = aggregated_preferences,
        vector2 = items_embedding[idx]
    ) for idx in idx_not_rated
    ])

    # sort items and similarities together according to content-based similarity in descending order
    sorted_indices = np.argsort(similarities)[::-1][:top_n]
    recommended_item_indices = idx_not_rated[sorted_indices]
    recommended_items_similarities = similarities[sorted_indices]

    # a dataframe with the recommended items and their similarities:
    recommended_items = movies.iloc[recommended_item_indices]
    recommended_items = recommended_items.copy()
    recommended_items['similarity'] = recommended_items_similarities

    return recommended_items


In [43]:
top_k_movies = collaborative_filtering(475, model, 20)
print(top_k_movies[['title', 'similarity']])

                                      title  similarity
173          Raiders of the Lost Ark (1981)    0.605878
63         Shawshank Redemption, The (1994)    0.559251
78                     Fugitive, The (1993)    0.553518
171         Empire Strikes Back, The (1980)    0.551471
175                           Aliens (1986)    0.549505
49                         Star Wars (1977)    0.543991
356  One Flew Over the Cuckoo's Nest (1975)    0.540971
11               Usual Suspects, The (1995)    0.539903
97         Silence of the Lambs, The (1991)    0.539591
143                         Die Hard (1988)    0.531950
194                  Terminator, The (1984)    0.529334
21                        Braveheart (1995)    0.519917
27                         Apollo 13 (1995)    0.519707
650                            Glory (1989)    0.513006
180               Return of the Jedi (1983)    0.512904
195               Dead Poets Society (1989)    0.512353
317                 Schindler's List (1993)    0