In [211]:
import pandas as pd
import math
from statistics import median
import numpy as np

# Assignment requests

### (a) Download the MovieLens 100K rating dataset from https://grouplens.org/datasets/movielens/ (the small dataset recommended for education and development). Read the dataset, display the first few rows to understand it, and display the count of ratings (rows) in the dataset to be sure that you download it correctly.

In [212]:
ratings = pd.read_csv("ml-latest-small/ratings.csv")
# Show the first 10 rows of the dataframe
print(ratings.head(10).to_string(index=False))

 userId  movieId  rating  timestamp
      1        1     4.0  964982703
      1        3     4.0  964981247
      1        6     4.0  964982224
      1       47     5.0  964983815
      1       50     5.0  964982931
      1       70     3.0  964982400
      1      101     5.0  964980868
      1      110     4.0  964982176
      1      151     5.0  964984041
      1      157     5.0  964984100


In [213]:
n_rows_ratings = ratings.shape[0]
print("Number of rows: " + str(n_rows_ratings))

Number of rows: 100836


In [214]:
# Check NaN values
nan_values_ratings = ratings.isna().sum()
ratings_nan_counts = pd.DataFrame({'Column': nan_values_ratings.index, 'Number of NaN': nan_values_ratings.values})
# Stampa il DataFrame
print(ratings_nan_counts.to_string(index=False))

   Column  Number of NaN
   userId              0
  movieId              0
   rating              0
timestamp              0


In [215]:
num_users = ratings["userId"].nunique()
num_users

610

In [216]:
movies = pd.read_csv("ml-latest-small/movies.csv")
# Show the first 10 rows of the dataframe
print(movies.head(10).to_string(index=False))

 movieId                              title                                      genres
       1                   Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy
       2                     Jumanji (1995)                  Adventure|Children|Fantasy
       3            Grumpier Old Men (1995)                              Comedy|Romance
       4           Waiting to Exhale (1995)                        Comedy|Drama|Romance
       5 Father of the Bride Part II (1995)                                      Comedy
       6                        Heat (1995)                       Action|Crime|Thriller
       7                     Sabrina (1995)                              Comedy|Romance
       8                Tom and Huck (1995)                          Adventure|Children
       9                Sudden Death (1995)                                      Action
      10                   GoldenEye (1995)                   Action|Adventure|Thriller


In [217]:
n_rows_movies = movies.shape[0]
print("Number of rows: " + str(n_rows_movies))

Number of rows: 9742


In [218]:
nan_values_movies = movies.isna().sum()
df_nan_counts_movies = pd.DataFrame({'Column': nan_values_movies.index, 'Number of NaN': nan_values_movies.values})
print(df_nan_counts_movies.to_string(index=False))

 Column  Number of NaN
movieId              0
  title              0
 genres              0


In [219]:
num_films = movies["movieId"].nunique()
num_films

9742

### (b) Implement the user-based collaborative filtering approach, using the Pearson correlation function for computing similarities between users, and

$$sim(a, b) = \frac{\sum_{p \in I} (r_{a,p} - \bar{r_a})(r_{b,p} - \bar{r_b})}{\sqrt{\sum_{p \in I}(r_{a,p} - \bar{r_a})^2}\sqrt{\sum_{p \in I}(r_{b,p} - \bar{r_b})^2}}$$

In [220]:
def pearsonCorrelation(user1, user2):
    """
    Calculates the Pearson correlation coefficient between two users based on their ratings.

    Args:
        user1 (int): The ID of the first user.
        user2 (int): The ID of the second user.

    Returns:
        float: The Pearson correlation coefficient between the two users.
    """
    num, den1, den2 = 0.0, 0.0, 0.0
    # Fetch ratings of user1 and calculate mean rating of user1
    ratings_user1 = ratings[ratings['userId'] == user1]
    rmean_user1 = ratings_user1['rating'].mean()
    # Fetch ratings of user1 and calculate mean rating of user2
    ratings_user2 = ratings[ratings['userId'] == user2]
    rmean_user2 = ratings_user2['rating'].mean()

    # set of movieIds evaluated by user1
    items_user1 = set(ratings[ratings['userId'] == user1]['movieId'])
    # set of movieIds evaluated by user2
    items_user2 = set(ratings[ratings['userId'] == user2]['movieId'])
    # set of movieIds evaluated by both user1 and user2 (intersection)
    common_items = items_user1.intersection(items_user2)

    # Calculate Pearson correlation for common items
    for p in common_items:
        # Fetch ratings of user1 and user2 for the common item
        r_1p = (ratings[(ratings['userId'] == user1) & (ratings['movieId'] == p)]['rating'].values)[0]
        r_2p = (ratings[(ratings['userId'] == user2) & (ratings['movieId'] == p)]['rating'].values)[0]
        num += (r_1p - rmean_user1)*(r_2p - rmean_user2)
        den1 += pow((r_1p - rmean_user1), 2)
        den2 += pow((r_2p - rmean_user2), 2)
    den = ((math.sqrt(den1))*(math.sqrt(den2)))
    # Check for division by zero
    if den == 0.0:
        return 0.0
    # Calculate Pearson correlation coefficient
    sim = num/den
    return sim

### (c) the prediction function presented in class for predicting movies scores.

$$pred(a, p) = \bar{r_a} + \frac{\sum_{b \in N} sim(a, b) * (r_{b,p} - \bar{r_b})} {\sum_{b \in N}sim(a, b)}$$

In [221]:
def create_user_movie_rating_matrix():
    """
    Creates a user-movie rating matrix.

    Returns:
        DataFrame: A matrix where rows represent users, columns represent movies,
                   and each cell contains the rating given by the user to the movie.
    """
    # Create a pivot table with user ratings for each movie
    user_movie_ratings_matrix = pd.pivot_table(ratings, values='rating', index='userId', columns='movieId')
    # Get unique movie IDs
    all_movie_ids = movies['movieId'].unique()
    # Reindex the matrix columns with all movie IDs
    user_movie_ratings_matrix = user_movie_ratings_matrix.reindex(columns=all_movie_ids)
    return user_movie_ratings_matrix  # Return the user-movie rating matrix

In [222]:
def compute_user_similarities(user):
    """
    Computes the Pearson correlation-based similarities between the given user and all other users in the ratings dataset.

    Args:
        user (int): The ID of the user for whom similarities are to be computed.

    Returns:
        dict: A dictionary containing the similarities between the given user and all other users,
              where the keys are the IDs of other users and the values are the similarity coefficients.
    """
    # Get the total number of unique users
    num_users = ratings["userId"].nunique()
    # Create a list of all user IDs except the given user
    user_list = list(range(1, num_users+1, 1))
    user_list.remove(user)
    # Initialize a dictionary to store user similarities
    user_similarities = {}
    for other_user in user_list:
        if other_user != user:
            # Calculate Pearson correlation between the given user and other users
            similarity = pearsonCorrelation(user, other_user)
            # Store the similarity coefficient in the dictionary
            user_similarities[other_user] = similarity
    return user_similarities

In [223]:
def generatePrediction(user1, item, user_movie_ratings_matrix, user_similarity_dict):
    """
    Generates a prediction for the rating that the given user would give to the specified item.

    Args:
        user1 (int): The ID of the user for whom the prediction is generated.
        item (int): The ID of the item (movie) for which the prediction is generated.
        user_movie_ratings_matrix (matrix): The user-movie ratings matrix.
        user_similarity_dict (dict): A dictionary containing similarities between users.

    Returns:
        float: The predicted rating for the specified item by the given user.
    """
    # Check if the movie has already been rated by the user
    if not np.isnan(user_movie_ratings_matrix.at[user1, item]):
        return user_movie_ratings_matrix.at[user1, item]
    
    num, den = 0.0, 0.0
    # Calculate the average of the user's ratings
    rmean_user1 = user_movie_ratings_matrix.loc[user1].mean()
    # Find users who have rated the item
    users_who_rated_item = user_movie_ratings_matrix[item].dropna().index.unique()
    for u in users_who_rated_item:
        rmean_u = user_movie_ratings_matrix.loc[u].mean()
        r_up = user_movie_ratings_matrix.at[u, item]
        similarity = user_similarity_dict[u]
        num += similarity * (r_up - rmean_u)
        den += abs(similarity)
    if den == 0.0:
        pred = rmean_user1
    else:
        pred = rmean_user1 + (num / den)
    return round(pred,2)

### (d) Select a user from the dataset, and for this user, show the 10 most similar users

In [224]:
def kMostSimilarUsers(user, k):
    """
    Finds the top k most similar users to the given user based on Pearson correlation coefficients.

    Args:
        user (int): The ID of the user for whom similar users are to be found.
        k (int): The number of similar users to return.

    Returns:
        tuple: A tuple containing two lists:
               - The IDs of the top k most similar users to the given user.
               - The corresponding Pearson correlation coefficients between the given user and each of the top k similar users.
    """
    users_similarities = []
    num_users = ratings["userId"].nunique()
    # Iterate through all users and calculate Pearson correlation between the given user and each other user
    for u in range(1,num_users):
        # Exclude the given user and 
        if u != user:
            sim = pearsonCorrelation(user, u)
            users_similarities.append((u,sim))
    # Sort users by similarity coefficient in descending order
    sorted_users_similarities = sorted(users_similarities, reverse=True, key=lambda x: x[1])
    # Extract the top k similar users and their corresponding similarity coefficients
    top_k_users = [pair[0] for pair in sorted_users_similarities[:k]]
    top_k_sim = [pair[1] for pair in sorted_users_similarities[:k]]
    print(f"The top {k} most similar users to the user {user} are:")
    i = 1
    for user_id, similarity in enumerate(zip(top_k_users, top_k_sim)):
        print(f"{i}. User ID: {user_id}, Similarity: {similarity}")
        i += 1

### and the 10 most relevant movies that the recommender suggests.

In [225]:
def get_unrated_movie_ids(userId):
    """
    Finds the movie IDs that have not been rated by the specified user.

    Args:
        userId (int): The ID of the user.

    Returns:
        list: A list of movie IDs that have not been rated by the specified user.
    """
    # Get all unique movie IDs present in the ratings DataFrame
    all_movie_ids = movies['movieId'].tolist()
    # Get the movie IDs rated by the specified userId
    rated_movie_ids = ratings[ratings['userId'] == userId]['movieId'].tolist()
    # Find the movie IDs not rated by the userId
    unrated_movie_ids = [movie_id for movie_id in all_movie_ids if movie_id not in rated_movie_ids]
    return unrated_movie_ids

In [226]:
def kMostRelevantMovies(user, k):
    """
    Finds the top k most relevant unrated movies for the specified user based on predicted ratings.

    Args:
        user (int): The ID of the user for whom relevant movies are to be found.
        k (int): The number of relevant movies to return.

    Returns:
        list: A list of tuples containing the IDs and predicted ratings of the top k most relevant movies,
              sorted by predicted rating in descending order.
    """
    # Create user-movie rating matrix
    user_movie_ratings_matrix = create_user_movie_rating_matrix()
    # Get unrated movie IDs for the user
    unrated_movies = get_unrated_movie_ids(user)
    # Initialize a list to store predictions
    predictions = []
    # Compute user similarities
    user_similarities_dict = compute_user_similarities(user)
    # Generate predictions for unrated movies
    for m in unrated_movies:
        pred = generatePrediction(user, m, user_movie_ratings_matrix,user_similarities_dict)
        predictions.append((m, pred)) 
    # Sort predictions by predicted rating in descending order
    sorted_predictions = sorted(predictions, reverse=True, key=lambda x: x[1])
    # Select top k predictions
    top_k_predictions = sorted_predictions[:k]
    print(f"The top {k} movies recommended for the user {user} are:")
    i = 1
    for movie_id, pred in top_k_predictions:
        movie_title = movies.loc[movies['movieId'] == movie_id, 'title'].values[0]
        print(f"{i}. MovieID: {movie_id}, Title: {movie_title}, Score: {pred}")
        i += 1

### (e) Design and implement a new similarity function for computing similarities between users. Explain why this similarity function is useful for the collaborative filtering approach. Hint: Exploiting ideas from related works are highly encouraged.

#### Constrained Pearson Correlation Coefficient (CPCC)
$$sim(a, b)^{CPCC} = \frac{\sum_{p \in I} (r_{a,p} - r_{med})(r_{b,p} - r_{med})}{\sqrt{\sum_{p \in I}(r_{a,p} - r_{med})^2}\sqrt{\sum_{p \in I}(r_{b,p} - r_{med})^2}}$$

In [227]:
def constrainedPearsonCorrelation(user1, user2):
    num, den1, den2 = 0.0, 0.0, 0.0
    possible_ratings = ratings['rating'].unique().tolist()
    possible_ratings.sort()
    median_value = median(possible_ratings)

    # set of movieIds evaluated by user1
    items_user1 = set(ratings[ratings['userId'] == user1]['movieId'])
    # set of movieIds evaluated by user2
    items_user2 = set(ratings[ratings['userId'] == user2]['movieId'])
    # set of movieIds evaluated by both user1 and user2 (intersection)
    common_items = items_user1.intersection(items_user2)

    for p in common_items:
        r_1p = (ratings[(ratings['userId'] == user1) & (ratings['movieId'] == p)]['rating'].values)[0]
        r_2p = (ratings[(ratings['userId'] == user2) & (ratings['movieId'] == p)]['rating'].values)[0]
        num += (r_1p - median_value)*(r_2p - median_value)
        den1 += pow((r_1p - median_value), 2)
        den2 += pow((r_2p - median_value), 2)
    den = ((math.sqrt(den1))*(math.sqrt(den2)))
    if den == 0.0:
        return 0.0
    sim = num/den
    return sim

#### Weighted Pearson Correlation Coefficient (WPCC)
$$ sim(a,b)^{WPCC}=\begin{cases} sim(a,b)^{PCC}*\frac{|I|}{H}, & |I|\le{H} \\ sim(a,b)^{PCC}, & otherwise
\end{cases} $$
##### where I represents the set of common rating items by user u and v and H is an experimental value and it is set 50 

In [228]:
def weightedPearsonCorrelation(user1, user2):
    # set of movieIds evaluated by user1
    items_user1 = set(ratings[ratings['userId'] == user1]['movieId'])
    # set of movieIds evaluated by user2
    items_user2 = set(ratings[ratings['userId'] == user2]['movieId'])
    # set of movieIds evaluated by both user1 and user2 (intersection)
    common_items = items_user1.intersection(items_user2)
    
    similarity = pearsonCorrelation(user1, user2)
    I = len(common_items)
    H = 50
    if I <= H:
        sim = similarity * (I/H)
    else:
        sim = similarity
    return sim

#### Sigmoid Function based Pearson correlation coefficient (SPCC) 
$$sim(a, b)^{SPCC} = sim(a, b)^{PCC} * \frac{1}{1 + \exp{-(\frac{|I|}{2})}}$$

In [229]:
def sigmoidFunctionBasedPearsonCorrelation(user1, user2):
    similarity = pearsonCorrelation(user1,user2)
    # set of movieIds evaluated by user1
    items_user1 = set(ratings[ratings['userId'] == user1]['movieId'])
    # set of movieIds evaluated by user2
    items_user2 = set(ratings[ratings['userId'] == user2]['movieId'])
    # set of movieIds evaluated by both user1 and user2 (intersection)
    common_items = items_user1.intersection(items_user2)
    I = len(common_items)
    sim = similarity * (1/(1+math.exp(-(I/2))))
    return sim

####  Adjusted Cosine Measure
$$sim(a, b)^{ACOS} = \frac{\sum_{p \in P} (r_{a,p} - \bar{r_a})(r_{b,p} - \bar{r_b})}{\sqrt{\sum_{p \in P}(r_{a,p} - \bar{r_a})^2}\sqrt{\sum_{p \in P}(r_{b,p} - \bar{r_b})^2}}$$
##### where $P$ is the set of all items. If user $u$ has not rated the item $p∈P$, the rating $r_{u,p}$ is zero.

In [230]:
def adjustedCosineMeasure(user1, user2):
    num, den1, den2 = 0.0, 0.0, 0.0
    ratings_user1 = ratings[ratings['userId'] == user1]
    rmean_user1 = ratings_user1['rating'].mean()
    ratings_user2 = ratings[ratings['userId'] == user2]
    rmean_user2 = ratings_user2['rating'].mean()

    all_items = movies['movieId'].tolist()
    
    # set of movieIds evaluated by user1
    items_user1 = set(ratings[ratings['userId'] == user1]['movieId'])
    # set of movieIds evaluated by user2
    items_user2 = set(ratings[ratings['userId'] == user2]['movieId'])

    for p in all_items:
        if p not in items_user1:
            r_1p = 0.0
        else:
            r_1p = (ratings[(ratings['userId'] == user1) & (ratings['movieId'] == p)]['rating'].values)[0]
        if p not in items_user2:
            r_2p = 0.0
        else:
            r_2p = (ratings[(ratings['userId'] == user2) & (ratings['movieId'] == p)]['rating'].values)[0]
        num += (r_1p - rmean_user1)*(r_2p - rmean_user2)
        den1 += pow((r_1p - rmean_user1), 2)
        den2 += pow((r_2p - rmean_user2), 2)
    den = ((math.sqrt(den1))*(math.sqrt(den2)))
    if den == 0.0:
        return 0.0
    sim = num/den
    return sim

#### Jaccard
$$sim(a, b)^{Jaccard} = \frac{|I_a| \cap {|I_b|}}{|I_a| \cup {|I_b|}}$$

In [231]:
def jaccard(user1, user2):
    # set of movieIds evaluated by user1
    items_user1 = set(ratings[ratings['userId'] == user1]['movieId'])
    # set of movieIds evaluated by user2
    items_user2 = set(ratings[ratings['userId'] == user2]['movieId'])

    intersection = len(items_user1.intersection(items_user2))
    union = len(items_user1.union(items_user2))
    return intersection / union

#### Mean Squared Difference (MSD)
$$sim(a, b)^{MSD} = 1 - \frac{\sum_{p \in I} (r_{a,p} - r_{b,p})^2}{|I|}$$

In [232]:
def meanSquaredDifference(user1, user2):
    num = 0.0
    # set of movieIds evaluated by user1
    items_user1 = set(ratings[ratings['userId'] == user1]['movieId'])
    # set of movieIds evaluated by user2
    items_user2 = set(ratings[ratings['userId'] == user2]['movieId'])
    # set of movieIds evaluated by both user1 and user2 (intersection)
    common_items = items_user1.intersection(items_user2)
    I = len(common_items)

    for p in common_items:    
        r_1p = (ratings[(ratings['userId'] == user1) & (ratings['movieId'] == p)]['rating'].values)[0]
        r_2p = (ratings[(ratings['userId'] == user2) & (ratings['movieId'] == p)]['rating'].values)[0]
        num += pow((r_1p - r_2p), 2)
    
    if I != 0:
        sim = 1 - (num / I)
    else:
        sim = 0.0
    return sim

####  Jaccard and MSD can be combined to form a new metric. 
$$sim(a, b)^{JMSD} = sim(a, b)^{Jaccard}*sim(a, b)^{MSD}$$

In [233]:
def JaccardAndMSD(user1, user2):
    return jaccard(user1,user2)*meanSquaredDifference(user1,user2)

# Experiments

In [234]:
user_movie_ratings_matrix = create_user_movie_rating_matrix()
user_movie_ratings_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [235]:
corr = pearsonCorrelation(1,21)
print(corr)

0.08648178161396775


In [236]:
print(generatePrediction(24,1987,create_user_movie_rating_matrix(),compute_user_similarities(1)))

2.78


In [237]:
kMostSimilarUsers(3, 10)

The top 10 most similar users to the user 3 are:
1. User ID: 0, Similarity: (14, 1.0)
2. User ID: 1, Similarity: (34, 1.0)
3. User ID: 2, Similarity: (55, 1.0)
4. User ID: 3, Similarity: (65, 1.0)
5. User ID: 4, Similarity: (73, 1.0)
6. User ID: 5, Similarity: (75, 1.0)
7. User ID: 6, Similarity: (80, 1.0)
8. User ID: 7, Similarity: (82, 1.0)
9. User ID: 8, Similarity: (98, 1.0)
10. User ID: 9, Similarity: (121, 1.0)


In [238]:
unrated_movies = get_unrated_movie_ids(1)

In [239]:
kMostRelevantMovies(1,10)

The top 10 movies recommended for the user 1 are:
1. MovieID: 5105, Title: Don't Look Now (1973), Score: 7.79
2. MovieID: 6967, Title: Dead of Night (1945), Score: 7.79
3. MovieID: 7114, Title: Collector, The (1965), Score: 7.79
4. MovieID: 7742, Title: Baxter (1989), Score: 7.79
5. MovieID: 175475, Title: The Emoji Movie (2017), Score: 7.57
6. MovieID: 184641, Title: Fullmetal Alchemist 2018 (2017), Score: 7.57
7. MovieID: 168712, Title: Fifty Shades Darker (2017), Score: 7.46
8. MovieID: 3604, Title: Gypsy (1962), Score: 7.34
9. MovieID: 97024, Title: Rust and Bone (De rouille et d'os) (2012), Score: 7.14
10. MovieID: 40491, Title: Match Factory Girl, The (Tulitikkutehtaan tyttö) (1990), Score: 7.12


In [240]:
# Crea un dizionario contenente i risultati delle funzioni
results = {
    "Constrained Pearson": [constrainedPearsonCorrelation(1, 21), constrainedPearsonCorrelation(18, 590), constrainedPearsonCorrelation(345, 96)],
    "Weighted Pearson": [weightedPearsonCorrelation(1, 21), weightedPearsonCorrelation(18, 590), weightedPearsonCorrelation(345, 96)],
    "Sigmoid Function Based": [sigmoidFunctionBasedPearsonCorrelation(1, 21), sigmoidFunctionBasedPearsonCorrelation(18, 590), sigmoidFunctionBasedPearsonCorrelation(345, 96)],
    "Adjusted Cosine": [adjustedCosineMeasure(1, 21), adjustedCosineMeasure(118, 590), adjustedCosineMeasure(345, 96)],
    "Jaccard": [jaccard(1, 21), jaccard(18, 590), jaccard(345, 96)],
    "Mean Squared Difference": [meanSquaredDifference(1, 21), meanSquaredDifference(18, 590), meanSquaredDifference(345, 96)],
    "Jaccard and MSD": [JaccardAndMSD(1, 21), JaccardAndMSD(18, 590), JaccardAndMSD(345, 96)]
}

# Crea il DataFrame utilizzando il dizionario e specifica gli indici
df = pd.DataFrame(results, index=[(1, 21),(18, 590),(345, 96)])

# Stampa il DataFrame
df

Unnamed: 0,Constrained Pearson,Weighted Pearson,Sigmoid Function Based,Adjusted Cosine,Jaccard,Mean Squared Difference,Jaccard and MSD
"(1, 21)",0.659342,0.081293,0.086482,0.967157,0.074841,-0.760638,-0.056927
"(18, 590)",0.813353,0.572172,0.572172,0.960133,0.223881,0.488889,0.109453
"(345, 96)",0.925701,0.057289,0.630754,0.992803,0.029412,0.4375,0.012868
