# Recommender system
In this notebook, we will leverage the trained Weighted Matrix Factorization model to make recommendations. 


In [2]:
from src.wmfact import WeightedMatrixFactorization
import numpy as np
import pandas as pd

# Load the model
model = WeightedMatrixFactorization.load('./models/wmf_wals_nlat270_niter12_lambdareg0.02.pkl')

# Load the data
feedbacks = np.load('./data/feedbacks.npy') # the feedbacks matrix that was used to train the model
movies = pd.read_csv('data/raw/movies.csv') # contains info about each movie 

### Cosine similarity
Cosine similarity is a metric used to measure the similarity between two vectors in a multi-dimensional space. It calculates the cosine of the angle between the vectors, indicating the direction of similarity regardless of their magnitude.

The formula for cosine similarity between vectors $A$ and $B$ is:

$$ cosine\_similarity(A, B) = \frac{A \cdot B}{||A|| \times ||B||} $$

- $A \cdot B$ is the dot-product of the two vectors;
- $||A||$ and $||B||$ are the Euclidian norms (magnitudes) of the two vectors.

The cosine similarity value lies in the range $[−1,1]$, where a value closer to $1$ indicates higher similarity.

In [3]:
def cosine_similarity(vector1: np.ndarray, vector2: np.ndarray) -> float:
    """
    Calculate the cosine similarity between two vectors.
    Formula: similarity = (vector1 . vector2) / (||vector1|| * ||vector2||)
    
    Parameters:
    - vector1 (numpy.ndarray): The first vector.
    - vector2 (numpy.ndarray): The second vector.
    
    Returns:
    - similarity (float): The cosine similarity between the two vectors.

    Raises:
    - ValueError: If one of the input vectors has a norm of 0.

    Usage:
    >>> similarity = cosine_similarity(vector1, vector2)
    """
    norm_vector1 = np.linalg.norm(vector1)
    norm_vector2 = np.linalg.norm(vector2)
    
    if norm_vector1 == 0 or norm_vector2 == 0:
        raise ValueError("One of the vectors has a norm of 0.")
    
    similarity = np.dot(vector1, vector2) / (norm_vector1 * norm_vector2)
    return similarity

### Content-based filtering

In [4]:
def contentbased_filtering(user_id: int, model: WeightedMatrixFactorization, top_n: int=10) -> pd.DataFrame:
    """
    Get the top-n items for a user using content-based filtering.

    Parameters:
    - user_id (int): The user ID.
    - model (WeightedMatrixFactorization): The trained model.
    - top_n (int): The number of items to recommend. Default is 10.

    Returns:
    - top_k_movies (pandas.DataFrame): The top-k items recommended for the user.

    Raises:
    - ValueError: If the provided user ID is invalid or if it's not present in the model.
    - ValueError: If the top-n parameter is less than or equal to 0.

    Usage:
    >>> top_k_movies = contentbased_filtering(user_id, model, top_n=10)
    """
    
    if not isinstance(user_id, int):
        raise ValueError("User ID must be an integer.")
    
    if user_id < 0 or user_id >= model.n_users:
        raise ValueError("Invalid user ID. User ID must be within the range of existing users.")

    if not isinstance(top_n, int) or top_n <= 0:
        raise ValueError("Top-n parameter must be a positive integer.")

    # get the user and item embeddings:
    users_embedding, items_embedding = model.get_embeddings()

    # get the user embedding for the user_id:
    user_embedding = users_embedding[user_id]

    # get all the items that the user has not rated (the indices of the items):
    idx_not_rated = np.where( 
        np.isnan( feedbacks[user_id] )
    )[0]

    # compute the similarities between this user and the items that he has not rated:
    similarities = np.array([ 
        cosine_similarity(
            vector1 = user_embedding,           # the user embedding
            vector2 = items_embedding[idx]      # the item embedding of the idx-th item
        ) for idx in idx_not_rated              # iterate over non-rated items
    ])

    similarities = similarities * 100               # convert to %
    similarities = np.round( similarities, 2 )      # round 2 decimal places
    top_k = np.argsort(similarities)[::-1][:top_n]  # get the indices of the top-n items
    top_k_movies = movies.iloc[top_k]               # get the corresponding movies
    top_k_movies = top_k_movies.copy()
    top_k_movies['similarity (%)'] = similarities[top_k] # add the similarities to dataframe
    return top_k_movies     

We can now recommend top 10 movies for the user with id = 475 using **Content Based** filtering:

In [5]:
top_k_movies = contentbased_filtering( user_id = 475, model = model, top_n = 10 )
print(top_k_movies)

      movie_id                                   title  similarity (%)
923        924                     White Squall (1996)           40.32
135        136     Mr. Smith Goes to Washington (1939)           39.60
353        354              Wedding Singer, The (1998)           36.73
729        730  Queen Margot (Reine Margot, La) (1994)           36.57
144        145               Lawnmower Man, The (1992)           36.39
154        155                    Dirty Dancing (1987)           36.17
1100      1101        Six Degrees of Separation (1993)           36.04
523        524              Great Dictator, The (1940)           35.74
487        488                     Sunset Blvd. (1950)           35.64
669        670                   Body Snatchers (1993)           35.26


### Collaborative filtering

In [6]:
def collaborative_filtering(user_id: int, model: WeightedMatrixFactorization, top_n: int=10) -> pd.DataFrame:
    """
    Perform collaborative filtering to recommend items to a user.

    Parameters:
    - user_id (int): The ID of the user for whom recommendations are to be generated.
    - model (WeightedMatrixFactorization): The trained recommendation model.
    - top_n (int): The number of top recommendations to return. Default is 10.

    Returns:
    - recommended_items (pd.DataFrame): A DataFrame containing the top recommended items and their similarities.

    Raises:
    - ValueError: If the provided user ID is invalid or if it's not present in the model.
    - ValueError: If the top-n parameter is less than or equal to 0.

    Usage:
    >>> recommended_items = collaborative_filtering(user_id, model, top_n=10)
    """

    if not isinstance(user_id, int):
        raise ValueError("The user ID must be an integer.")
    if user_id < 0 or user_id >= feedbacks.shape[0]:
        raise ValueError(f"The user ID {user_id} is invalid.")
    if top_n <= 0:
        raise ValueError("The top-n parameter must be greater than 0.")
    

    users_embedding, items_embedding = model.get_embeddings()
    user_embedding = users_embedding[user_id]

    idx_not_rated = np.where( np.isnan(
    feedbacks[user_id]) 
    )[0]

    # similarity between this user and all the other users:
    similarities = np.array([ 
    cosine_similarity(
        vector1 = users_embedding[idx],
        vector2 = user_embedding
    ) for idx in range(users_embedding.shape[0])
    ])

    sorted_indices = np.argsort(similarities)[::-1][:int(0.10*len(similarities))]
    sorted_indices = sorted_indices[1:]  # exclude the target user

    # Aggregate preferences of similar users
    aggregated_preferences = np.sum( users_embedding[sorted_indices], axis=0 )

    # Calculate cosine similarity between the target user group and items based on content features
    similarities = np.array([ 
        cosine_similarity(
            vector1 = aggregated_preferences,
            vector2 = items_embedding[idx]
        ) for idx in idx_not_rated  # iterate over non-rated items
    ])

    # sort items and similarities together according to content-based similarity in descending order
    sorted_indices = np.argsort(similarities)[::-1][:top_n]
    recommended_item_indices = idx_not_rated[sorted_indices]
    recommended_items_similarities = similarities[sorted_indices]

    # a dataframe with the recommended items and their similarities:
    recommended_items = movies.iloc[recommended_item_indices]
    recommended_items = recommended_items.copy()
    recommended_items['similarity'] = recommended_items_similarities

    return recommended_items

We can now recommend top 10 movies for the user with id = 475 using **Collaborative** filtering:

In [7]:
top_k_movies = collaborative_filtering(475, model, 20)
print(top_k_movies[['title', 'similarity']])

                                      title  similarity
173          Raiders of the Lost Ark (1981)    0.620386
63         Shawshank Redemption, The (1994)    0.590658
78                     Fugitive, The (1993)    0.582695
171         Empire Strikes Back, The (1980)    0.580510
49                         Star Wars (1977)    0.577964
175                           Aliens (1986)    0.570486
356  One Flew Over the Cuckoo's Nest (1975)    0.570391
11               Usual Suspects, The (1995)    0.569235
97         Silence of the Lambs, The (1991)    0.565038
143                         Die Hard (1988)    0.553481
194                  Terminator, The (1984)    0.551391
650                            Glory (1989)    0.548086
180               Return of the Jedi (1983)    0.547446
317                 Schindler's List (1993)    0.545705
21                        Braveheart (1995)    0.545465
190                          Amadeus (1984)    0.542731
95        Terminator 2: Judgment Day (1991)    0