In [54]:
import pandas as pd
import math
import numpy as np

In [55]:
ratings = pd.read_csv("ml-latest-small/ratings.csv")
n_rows = ratings.shape[0]

print("Number of rows: " + str(n_rows))
# Show the first 10 rows of the dataframe
print(ratings.head(10).to_string(index=False))

Number of rows: 100836
 userId  movieId  rating  timestamp
      1        1     4.0  964982703
      1        3     4.0  964981247
      1        6     4.0  964982224
      1       47     5.0  964983815
      1       50     5.0  964982931
      1       70     3.0  964982400
      1      101     5.0  964980868
      1      110     4.0  964982176
      1      151     5.0  964984041
      1      157     5.0  964984100


In [56]:
movies = pd.read_csv("ml-latest-small/movies.csv")
n_rows = movies.shape[0]

print("Number of rows: " + str(n_rows))
# Show the first 10 rows of the dataframe
print(movies.head(10).to_string(index=False))

Number of rows: 9742
 movieId                              title                                      genres
       1                   Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy
       2                     Jumanji (1995)                  Adventure|Children|Fantasy
       3            Grumpier Old Men (1995)                              Comedy|Romance
       4           Waiting to Exhale (1995)                        Comedy|Drama|Romance
       5 Father of the Bride Part II (1995)                                      Comedy
       6                        Heat (1995)                       Action|Crime|Thriller
       7                     Sabrina (1995)                              Comedy|Romance
       8                Tom and Huck (1995)                          Adventure|Children
       9                Sudden Death (1995)                                      Action
      10                   GoldenEye (1995)                   Action|Adventure|Thriller


### Functions implemented for assignment 1

In [57]:
def create_user_movie_rating_matrix():
    """
    Creates a user-movie rating matrix.

    Returns:
        DataFrame: A matrix where rows represent users, columns represent movies,
                   and each cell contains the rating given by the user to the movie.
    """
    # Create a pivot table with user ratings for each movie
    user_movie_ratings_matrix = pd.pivot_table(ratings, values='rating', index='userId', columns='movieId')
    # Get unique movie IDs
    all_movie_ids = movies['movieId'].unique()
    # Reindex the matrix columns with all movie IDs
    user_movie_ratings_matrix = user_movie_ratings_matrix.reindex(columns=all_movie_ids)
    return user_movie_ratings_matrix  # Return the user-movie rating matrix

In [58]:
def get_ratings_dict():
    """
    Creates a dictionary to store movie ratings for each user.

    Returns:
        dict: A dictionary containing movie ratings for each user.
              Keys are user IDs, and values are lists of tuples (movieId, rating).
    """
    # Dictionary to store the pairs (movieId,ratings) for each user
    user_data = {}
    # Populate the dictionary
    for _, row in ratings.iterrows():
        userId = row['userId']
        movieId = int(row['movieId'])
        rating = row['rating']
        # Check if userId is already present in the dictionary
        if userId in user_data:
            user_data[userId].append((movieId, rating))
        else:
            # If userId is not present, create a new list with the tuple (movieId, rating)
            user_data[userId] = [(movieId, rating)]
    
    return user_data

In [59]:
def pearsonCorrelation(user1, user2, user_data):
    """
    Calculates the Pearson correlation coefficient between two users based on their ratings.

    Args:
        user1 (int): The ID of the first user.
        user2 (int): The ID of the second user.
        user_data (dict): A dictionary containing movieId-ratings pairs for each user.

    Returns:
        float: The Pearson correlation coefficient between the two users.
    """
    num, den1, den2 = 0.0, 0.0, 0.0
    
    # Fetch ratings of user1 and calculate mean rating of user1
    ratings_user1 = user_data[user1]
    user1_ratings = np.array([rating for _, rating in ratings_user1])
    rmean_user1 = np.mean(user1_ratings)
    
    # Fetch ratings of user2 and calculate mean rating of user2
    ratings_user2 = user_data[user2]
    user2_ratings = np.array([rating for _, rating in ratings_user2])
    rmean_user2 = np.mean(user2_ratings)

    # set of movieIds evaluated by user1
    items_user1 = {movieId for movieId, _ in ratings_user1}
    # set of movieIds evaluated by user2
    items_user2 = {movieId for movieId, _ in ratings_user2}
    # set of movieIds evaluated by both user1 and user2 (intersection)
    common_items = items_user1 & items_user2

    # Calculate Pearson correlation for common items
    for p in common_items:
        # Fetch ratings of user1 and user2 for the common item
        r_1p = next(r for m, r in ratings_user1 if m == p)
        r_2p = next(r for m, r in ratings_user2 if m == p)
        num += (r_1p - rmean_user1) * (r_2p - rmean_user2)
        den1 += (r_1p - rmean_user1) ** 2
        den2 += (r_2p - rmean_user2) ** 2
    
    # Calculate denominator
    den = np.sqrt(den1 * den2) if den1 != 0 and den2 != 0 else 0.0
    
    # Check for division by zero
    if den == 0.0:
        return 0.0
    
    # Calculate Pearson correlation coefficient
    sim = num / den
    return sim


In [60]:
def compute_all_user_similarities():
    """
    Computes similarities between all pairs of users based on their movie ratings.

    Returns:
        dict: A dictionary of dictionaries containing similarities between all pairs of users.
              Keys are user IDs, and values are dictionaries where keys are other user IDs
              and values are similarity scores.
    """
    num_users = ratings["userId"].nunique()
    similarities_dict = {}
    # Get movie ratings for each user
    user_data = get_ratings_dict()
    # Iterate through all user pairs
    for user1 in range(1,num_users+1):
        similarities_dict[user1] = {}
        for user2 in range(user1,num_users+1):
            if not user2 in similarities_dict:
                similarities_dict[user2] = {}
            if user1 != user2:
                # Calculate similarity between user1 and user2
                sim = pearsonCorrelation(user1,user2, user_data)
                # Store similarity for both user1 and user2 in the dictionary
                similarities_dict[user1][user2] = sim
                similarities_dict[user2][user1] = sim
    return similarities_dict 

In [61]:
def generatePrediction(user, item, user_movie_ratings_matrix, similarities_dict, rmean_user, topK_simlar_users):
    """
    Generates a prediction for a user's rating on a specific item (movie) based on collaborative filtering.

    Args:
        user (int): The ID of the user.
        item (int): The ID of the item (movie).
        user_movie_ratings_matrix (DataFrame): A DataFrame containing user-item ratings.
        similarities_dict (dict): A dictionary containing similarities between users.
        rmean_user (float): The mean rating of the user.
        topK_similar_users (list): A list of IDs of top K similar users to the target user.

    Returns:
        float: The predicted rating for the user on the specified item.
    """
    # Check if the movie has already been rated by the user
    if not np.isnan(user_movie_ratings_matrix.at[user, item]):
        return user_movie_ratings_matrix.at[user, item]
    
    num, den = 0.0, 0.0
    # Iterate through top K similar users
    for u in topK_simlar_users:
        if not np.isnan(user_movie_ratings_matrix.at[u, item]):        
            rmean_u = user_movie_ratings_matrix.loc[u].mean()   # Mean rating of the similar user u
            r_up = user_movie_ratings_matrix.at[u, item]    # Rating of item by user u
            similarity = similarities_dict[user][u]     # Similarity between target user and user u
            num += similarity * (r_up - rmean_u)
            den += abs(similarity)
    # Calculate the predicted rating
    if den == 0.0:
        # Use the mean rating of the target user if no similar users have rated the item
        pred = rmean_user
    else:
        # Calculate the prediction using the collaborative filtering formula
        pred = rmean_user + (num / den)
    return pred

In [62]:
def get_unrated_movie_ids(userId):
    """
    Finds the movie IDs that have not been rated by the specified user.

    Args:
        userId (int): The ID of the user.

    Returns:
        list: A list of movie IDs that have not been rated by the specified user.
    """
    # Get all unique movie IDs present in the ratings DataFrame
    all_movie_ids = movies['movieId'].tolist()
    # Get the movie IDs rated by the specified userId
    rated_movie_ids = ratings[ratings['userId'] == userId]['movieId'].tolist()
    # Find the movie IDs not rated by the userId
    unrated_movie_ids = [movie_id for movie_id in all_movie_ids if movie_id not in rated_movie_ids]
    return unrated_movie_ids

In [63]:
def get_all_unrated_movies(users):
    """
    Gets the set of all unrated movies among a list of users.

    Args:
        users (list): A list of user IDs.

    Returns:
        list: A list containing IDs of all unrated movies among the given users.
    """
    # Initialize a set to store all unrated movies across all users
    union_unrated_movies = set()
    # Iterate through each user in the list
    for user in users:
        # get the movies not rated by that user
        user_unrated_movies = get_unrated_movie_ids(user)
        # Update the set to store all unrated movies
        union_unrated_movies.update(user_unrated_movies)
    # Convert the set to a list and return it
    return list(union_unrated_movies)

In [64]:
def kMostSimilarUsers(user, k, similarities_dict):
    """
    Gets the top k most similar users to a given user based on similarity coefficients.

    Args:
        user (int): The ID of the target user.
        k (int): The number of most similar users to retrieve.
        similarities_dict (dict): A dictionary containing similarities between users.

    Returns:
        list: A list of IDs of the top k most similar users to the target user.
    """
    # Retrieve similarity coefficients of the target user with all other users
    user_similarities = similarities_dict[user]
    # Sort users by similarity coefficient in descending order
    sorted_user_similarities = sorted(user_similarities, key=user_similarities.get, reverse=True)
    # Extract the top k similar users
    top_k_users = sorted_user_similarities[:k]
    return top_k_users

### (a) For producing group recommendation, we will use the user-based collaborative filtering approach as this implemented in Assignment 1. Specifically, for producing group recommendations, we will first compute the movies recommendations for each user in the group, and then we will aggregate the lists of the individual users, so as to produce a single list of movies for the group. You will implement two well established aggregation methods for producing the group recommendations.

### The first aggregation approach is the <em>average method</em>. The main idea behind this approach is that all members are considered equals. So, the rating of an item for a group of users will be given be averaging the scores of an item across all group members.

In [65]:
def averageMethod(group, item, user_movie_ratings_matrix, users_similarities_dict):
    """
    Computes the predicted rating for an item using the average method.

    Args:
        group (list): A list of user IDs forming the group.
        item (int): The ID of the item (movie) for which the prediction is generated.
        users_similarities_dict (dict): A dictionary of dictionaries containing similarities between users.

    Returns:
        float: The predicted rating for the specified item based on the average rating of the group members.
    """
    scores_sum = 0.0
    # Generate predictions for each user in the group and sum up the scores
    for user in group:
        rmean_user = user_movie_ratings_matrix.loc[user].mean()
        topK_simlar_users = kMostSimilarUsers(user, 50, users_similarities_dict)
        rating = generatePrediction(user, item,user_movie_ratings_matrix,users_similarities_dict,rmean_user,topK_simlar_users)
        scores_sum += rating
    # Calculate the average prediction
    pred = scores_sum / len(group)
    return round(pred,2)

### The second aggregation method is the <em>least misery method</em>, where one member can act as a veto for the rest of the group. In this case, the rating of an item for a group of users is computed as the minimum score assigned to that item in all group members recommendations.

In [66]:
def leastMiseryMethod(group, item, user_movie_ratings_matrix, users_similarities_dict):
    """
    Computes the predicted rating for an item using the least misery method.

    Args:
        group (list): A list of user IDs forming the group.
        item (int): The ID of the item (movie) for which the prediction is generated.
        users_similarities_dict (dict): A dictionary of dictionaries containing similarities between users.

    Returns:
        float: The predicted rating for the specified item based on the least misery method.
    """
    # Initialize list to store ratings
    scores = []
    # Generate predictions for each user in the group and store the ratings
    for user in group:
        rmean_user = user_movie_ratings_matrix.loc[user].mean()
        topK_simlar_users = kMostSimilarUsers(user, 50, users_similarities_dict)
        rating = generatePrediction(user, item, user_movie_ratings_matrix, users_similarities_dict,rmean_user,topK_simlar_users)
        scores.append(rating)
    # Choose the lowest rating as the prediction
    pred = min(scores)
    return round(pred,2)

### Produce a group of 3 users, and for this group, show the top-10 recommendations, i.e., the 10 movies with the highest prediction scores that (i) the average method suggests, and (ii) the least misery method suggest. Use the MovieLens 100K rating dataset.

In [67]:
def topKMoviesAverage(group,k):
    """
    Finds the top k movies with the highest average predicted ratings for the specified group of users.

    Args:
        group (list): A list of user IDs forming the group.
        k (int): The number of top movies to return.

    Returns:
        list: A list of tuples containing the IDs and predicted ratings of the top k movies
              with the highest average predicted ratings, sorted by predicted rating in descending order.
    """
    # Create user-movie rating matrix
    user_movie_ratings_matrix = create_user_movie_rating_matrix()
    # Get unrated movie IDs for the user
    unrated_movies = get_all_unrated_movies(group)
    # Initialize list to store predictions and a dictionary to store user similarities
    predictions = []
    users_similarities_dict = compute_all_user_similarities()
    # Generate predictions for each movie
    for m in unrated_movies:
        pred = averageMethod(group, m, user_movie_ratings_matrix, users_similarities_dict)
        predictions.append((m, pred)) 
    # Sort predictions by predicted rating in descending order
    sorted_predictions = sorted(predictions, reverse=True, key=lambda x: x[1])
    if len(sorted_predictions) <= k:
        k = len(sorted_predictions)
    # Select top k predictions
    top_k_predictions = sorted_predictions[:k]
    print(f"The top {k} movies recommended for the group {group} are:")
    i = 1
    for movie_id, pred in top_k_predictions:
        movie_title = movies.loc[movies['movieId'] == movie_id, 'title'].values[0]
        print(f"{i}. MovieID: {movie_id}, Title: {movie_title}, Score: {pred}")
        i += 1

In [68]:
def topKMoviesLeastMisery(group,k):
    """
    Finds the top k movies with the highest predicted ratings using the least misery method for the specified group of users.

    Args:
        group (list): A list of user IDs forming the group.
        k (int): The number of top movies to return.

    Returns:
        list: A list of tuples containing the IDs and predicted ratings of the top k movies
              with the highest predicted ratings using the least misery method, sorted by predicted rating in descending order.
    """
    # Create user-movie rating matrix
    user_movie_ratings_matrix = create_user_movie_rating_matrix()
    # Get unrated movie IDs for the user
    unrated_movies = get_all_unrated_movies(group)
    # Initialize list to store predictions and a dictionary to store user similarities
    predictions = []
    users_similarities_dict = compute_all_user_similarities()
    # Generate predictions for each movie using the least misery method
    for m in unrated_movies:
        pred = leastMiseryMethod(group, m, user_movie_ratings_matrix, users_similarities_dict)
        predictions.append((m, pred)) 
    sorted_predictions = sorted(predictions, reverse=True, key=lambda x: x[1])
    if len(sorted_predictions) <= k:
        k = len(sorted_predictions)
    top_k_predictions = sorted_predictions[:k]
    print(f"The top {k} movies recommended for the group {group} are:")
    i = 1
    for movie_id, pred in top_k_predictions:
        movie_title = movies.loc[movies['movieId'] == movie_id, 'title'].values[0]
        print(f"{i}. MovieID: {movie_id}, Title: {movie_title}, Score: {pred}")
        i += 1

### (b) The methods employed in part (a) of Assignment 2, do not consider any disagreements between the users in the group. In part (b) of Assignment 2, define a way for counting the disagreements between the users in a group, and propose a method that takes disagreements into account when computing suggestions for the group.

In [69]:
def weightedAverageMethod(group, item, user_movie_ratings_matrix, users_similarities_dict):
    """
    Calculates the predicted rating for an item using the weighted average method based on group ratings.

    Args:
        group (list): A list of user IDs forming the group.
        item (int): The ID of the item (movie) for which the rating is to be predicted.
        user_movie_ratings_matrix (DataFrame): A DataFrame containing user-item ratings.
        users_similarities_dict (dict): A dictionary containing similarities between users.

    Returns:
        float: The predicted rating for the item using the weighted average method.
    """
    group_size = len(group)
    sum_disagreements = 0
    # Dictionary to store predicted ratings for each user in the group
    ratings = {}
    weighted_sum = 0
    # Calculate predicted ratings for each user in the group
    for user in group:
        # Calculate the mean rating of the user
        rmean_user = user_movie_ratings_matrix.loc[user].mean()
        # Get top K similar users
        topK_simlar_users = kMostSimilarUsers(user, 50, users_similarities_dict)
        rating = generatePrediction(user, item, user_movie_ratings_matrix, users_similarities_dict, rmean_user,topK_simlar_users)
        ratings[user] = rating
        # Accumulate the sum of disagreements
        sum_disagreements += (1/group_size) * rating
    # Dictionary to store disagreement for each user
    user_disagreements = {}
    disagreements_sum = 0
    # Calculate disagreement for each user
    for user in group:
        user_disagreement = abs(ratings[user] - sum_disagreements)
        user_disagreements[user] = user_disagreement
        # Accumulate the weighted sum of ratings
        weighted_sum += user_disagreement * ratings[user]
        # Accumulate the sum of disagreements
        disagreements_sum += user_disagreement
    # Check if the sum of disagreements is zero to avoid division by zero
    if sum_disagreements == 0:
        return averageMethod(group, item, user_movie_ratings_matrix, users_similarities_dict)
    else:
        return round(weighted_sum/disagreements_sum,2)

In [70]:
def topKMoviesWeightedAverage(group,k):
    """
    Finds the top k movies with the highest predicted ratings using the least misery method for the specified group of users.

    Args:
        group (list): A list of user IDs forming the group.
        k (int): The number of top movies to return.

    Returns:
        list: A list of tuples containing the IDs and predicted ratings of the top k movies
              with the highest predicted ratings using the least misery method, sorted by predicted rating in descending order.
    """
    # Create user-movie rating matrix
    user_movie_ratings_matrix = create_user_movie_rating_matrix()
    # Get unrated movie IDs for the user
    unrated_movies = get_all_unrated_movies(group)
    # Initialize list to store predictions and a dictionary to store user similarities
    predictions = []
    users_similarities_dict = compute_all_user_similarities()
    # Generate predictions for each movie using the least misery method
    for m in unrated_movies:
        pred = weightedAverageMethod(group, m, user_movie_ratings_matrix, users_similarities_dict)
        predictions.append((m, pred)) 
    sorted_predictions = sorted(predictions, reverse=True, key=lambda x: x[1])
    if len(sorted_predictions) <= k:
        k = len(sorted_predictions)
    top_k_predictions = sorted_predictions[:k]
    print(f"The top {k} movies recommended for the group {group} are:")
    i = 1
    for movie_id, pred in top_k_predictions:
        movie_title = movies.loc[movies['movieId'] == movie_id, 'title'].values[0]
        print(f"{i}. MovieID: {movie_id}, Title: {movie_title}, Score: {pred}")
        i += 1

# Experiments

In [74]:
topKMoviesAverage([59,125,471],10)

The top 10 movies recommended for the group [59, 125, 471] are:
1. MovieID: 1327, Title: Amityville Horror, The (1979), Score: 5.79
2. MovieID: 2700, Title: South Park: Bigger, Longer and Uncut (1999), Score: 5.71
3. MovieID: 1350, Title: Omen, The (1976), Score: 5.66
4. MovieID: 2513, Title: Pet Sematary (1989), Score: 5.61
5. MovieID: 1248, Title: Touch of Evil (1958), Score: 5.46
6. MovieID: 1231, Title: Right Stuff, The (1983), Score: 5.44
7. MovieID: 3972, Title: Legend of Drunken Master, The (Jui kuen II) (1994), Score: 5.39
8. MovieID: 1924, Title: Plan 9 from Outer Space (1959), Score: 5.36
9. MovieID: 2583, Title: Cookie's Fortune (1999), Score: 5.22
10. MovieID: 3543, Title: Diner (1982), Score: 5.17


In [75]:
topKMoviesLeastMisery([59,125,471],10)

The top 10 movies recommended for the group [59, 125, 471] are:
1. MovieID: 1248, Title: Touch of Evil (1958), Score: 5.25
2. MovieID: 1231, Title: Right Stuff, The (1983), Score: 4.99
3. MovieID: 2700, Title: South Park: Bigger, Longer and Uncut (1999), Score: 4.91
4. MovieID: 6787, Title: All the President's Men (1976), Score: 4.87
5. MovieID: 306, Title: Three Colors: Red (Trois couleurs: Rouge) (1994), Score: 4.86
6. MovieID: 750, Title: Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964), Score: 4.86
7. MovieID: 1278, Title: Young Frankenstein (1974), Score: 4.83
8. MovieID: 3972, Title: Legend of Drunken Master, The (Jui kuen II) (1994), Score: 4.78
9. MovieID: 1199, Title: Brazil (1985), Score: 4.72
10. MovieID: 1230, Title: Annie Hall (1977), Score: 4.72


In [76]:
topKMoviesWeightedAverage([59,125,471],10)

The top 10 movies recommended for the group [59, 125, 471] are:
1. MovieID: 1231, Title: Right Stuff, The (1983), Score: 5.55
2. MovieID: 3972, Title: Legend of Drunken Master, The (Jui kuen II) (1994), Score: 5.55
3. MovieID: 2700, Title: South Park: Bigger, Longer and Uncut (1999), Score: 5.52
4. MovieID: 1248, Title: Touch of Evil (1958), Score: 5.46
5. MovieID: 1327, Title: Amityville Horror, The (1979), Score: 5.46
6. MovieID: 140174, Title: Room (2015), Score: 5.31
7. MovieID: 1350, Title: Omen, The (1976), Score: 5.26
8. MovieID: 306, Title: Three Colors: Red (Trois couleurs: Rouge) (1994), Score: 5.25
9. MovieID: 168492, Title: Call Me by Your Name (2017), Score: 5.25
10. MovieID: 56715, Title: Wristcutters: A Love Story (2006), Score: 5.25
