In [9]:
import pandas as pd
import math
import numpy as np

In [10]:
ratings = pd.read_csv("ml-latest-small/ratings.csv")
n_rows = ratings.shape[0]

print("Number of rows: " + str(n_rows))
# Show the first 10 rows of the dataframe
print(ratings.head(10).to_string(index=False))

Number of rows: 100836
 userId  movieId  rating  timestamp
      1        1     4.0  964982703
      1        3     4.0  964981247
      1        6     4.0  964982224
      1       47     5.0  964983815
      1       50     5.0  964982931
      1       70     3.0  964982400
      1      101     5.0  964980868
      1      110     4.0  964982176
      1      151     5.0  964984041
      1      157     5.0  964984100


In [11]:
movies = pd.read_csv("ml-latest-small/movies.csv")
n_rows = movies.shape[0]

print("Number of rows: " + str(n_rows))
# Show the first 10 rows of the dataframe
print(movies.head(10).to_string(index=False))

Number of rows: 9742
 movieId                              title                                      genres
       1                   Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy
       2                     Jumanji (1995)                  Adventure|Children|Fantasy
       3            Grumpier Old Men (1995)                              Comedy|Romance
       4           Waiting to Exhale (1995)                        Comedy|Drama|Romance
       5 Father of the Bride Part II (1995)                                      Comedy
       6                        Heat (1995)                       Action|Crime|Thriller
       7                     Sabrina (1995)                              Comedy|Romance
       8                Tom and Huck (1995)                          Adventure|Children
       9                Sudden Death (1995)                                      Action
      10                   GoldenEye (1995)                   Action|Adventure|Thriller


### Functions implemented for assignment 1

In [12]:
def create_user_movie_rating_matrix():
    """
    Creates a user-movie rating matrix.

    Returns:
        DataFrame: A matrix where rows represent users, columns represent movies,
                   and each cell contains the rating given by the user to the movie.
    """
    # Create a pivot table with user ratings for each movie
    user_movie_ratings_matrix = pd.pivot_table(ratings, values='rating', index='userId', columns='movieId')
    # Get unique movie IDs
    all_movie_ids = movies['movieId'].unique()
    # Reindex the matrix columns with all movie IDs
    user_movie_ratings_matrix = user_movie_ratings_matrix.reindex(columns=all_movie_ids)
    return user_movie_ratings_matrix  # Return the user-movie rating matrix

In [13]:
def pearsonCorrelation(user1, user2):
    """
    Calculates the Pearson correlation coefficient between two users based on their ratings.

    Args:
        user1 (int): The ID of the first user.
        user2 (int): The ID of the second user.

    Returns:
        float: The Pearson correlation coefficient between the two users.
    """
    num, den1, den2 = 0.0, 0.0, 0.0
    # Fetch ratings of user1 and calculate mean rating of user1
    ratings_user1 = ratings[ratings['userId'] == user1]
    rmean_user1 = ratings_user1['rating'].mean()
    # Fetch ratings of user1 and calculate mean rating of user2
    ratings_user2 = ratings[ratings['userId'] == user2]
    rmean_user2 = ratings_user2['rating'].mean()

    # set of movieIds evaluated by user1
    items_user1 = set(ratings[ratings['userId'] == user1]['movieId'])
    # set of movieIds evaluated by user2
    items_user2 = set(ratings[ratings['userId'] == user2]['movieId'])
    # set of movieIds evaluated by both user1 and user2 (intersection)
    common_items = items_user1.intersection(items_user2)

    # Calculate Pearson correlation for common items
    for p in common_items:
        # Fetch ratings of user1 and user2 for the common item
        r_1p = (ratings[(ratings['userId'] == user1) & (ratings['movieId'] == p)]['rating'].values)[0]
        r_2p = (ratings[(ratings['userId'] == user2) & (ratings['movieId'] == p)]['rating'].values)[0]
        num += (r_1p - rmean_user1)*(r_2p - rmean_user2)
        den1 += pow((r_1p - rmean_user1), 2)
        den2 += pow((r_2p - rmean_user2), 2)
    den = ((math.sqrt(den1))*(math.sqrt(den2)))
    # Check for division by zero
    if den == 0.0:
        return 0.0
    # Calculate Pearson correlation coefficient
    sim = num/den
    return sim

In [14]:
def compute_user_similarities(user):
    """
    Computes the Pearson correlation-based similarities between the given user and all other users in the ratings dataset.

    Args:
        user (int): The ID of the user for whom similarities are to be computed.

    Returns:
        dict: A dictionary containing the similarities between the given user and all other users,
              where the keys are the IDs of other users and the values are the similarity coefficients.
    """
    # Get the total number of unique users
    num_users = ratings["userId"].nunique()
    # Create a list of all user IDs except the given user
    user_list = list(range(1, num_users+1, 1))
    user_list.remove(user)
    # Initialize a dictionary to store user similarities
    user_similarities = {}
    for other_user in user_list:
        if other_user != user:
            # Calculate Pearson correlation between the given user and other users
            similarity = pearsonCorrelation(user, other_user)
            # Store the similarity coefficient in the dictionary
            user_similarities[other_user] = similarity
    return user_similarities

In [15]:
def generatePrediction(user1, item, user_movie_ratings_matrix, user_similarity_dict):
    """
    Generates a prediction for the rating that the given user would give to the specified item.

    Args:
        user1 (int): The ID of the user for whom the prediction is generated.
        item (int): The ID of the item (movie) for which the prediction is generated.
        user_movie_ratings_matrix (matrix): The user-movie ratings matrix.
        user_similarity_dict (dict): A dictionary containing similarities between users.

    Returns:
        float: The predicted rating for the specified item by the given user.
    """
    # Check if the movie has already been rated by the user
    if not np.isnan(user_movie_ratings_matrix.at[user1, item]):
        return user_movie_ratings_matrix.at[user1, item]
    
    num, den = 0.0, 0.0
    # Calculate the average of the user's ratings
    rmean_user1 = user_movie_ratings_matrix.loc[user1].mean()
    # Find users who have rated the item
    users_who_rated_item = user_movie_ratings_matrix[item].dropna().index.unique()
    for u in users_who_rated_item:
        rmean_u = user_movie_ratings_matrix.loc[u].mean()
        r_up = user_movie_ratings_matrix.at[u, item]
        similarity = user_similarity_dict[u]
        num += similarity * (r_up - rmean_u)
        den += abs(similarity)
    if den == 0.0:
        pred = rmean_user1
    else:
        pred = rmean_user1 + (num / den)
    return pred

### (a) For producing group recommendation, we will use the user-based collaborative filtering approach as this implemented in Assignment 1. Specifically, for producing group recommendations, we will first compute the movies recommendations for each user in the group, and then we will aggregate the lists of the individual users, so as to produce a single list of movies for the group. You will implement two well established aggregation methods for producing the group recommendations.

### The first aggregation approach is the <em>average method</em>. The main idea behind this approach is that all members are considered equals. So, the rating of an item for a group of users will be given be averaging the scores of an item across all group members.

In [16]:
def averageMethod(group, item, users_similarities_dict):
    """
    Computes the predicted rating for an item using the average method.

    Args:
        group (list): A list of user IDs forming the group.
        item (int): The ID of the item (movie) for which the prediction is generated.
        users_similarities_dict (dict): A dictionary of dictionaries containing similarities between users.

    Returns:
        float: The predicted rating for the specified item based on the average rating of the group members.
    """
    # Create user-movie rating matrix
    user_movie_ratings_matrix = create_user_movie_rating_matrix()
    scores_sum = 0.0
    # Generate predictions for each user in the group and sum up the scores
    for user in group:
        user_dict = users_similarities_dict[user]
        rating = generatePrediction(user, item,user_movie_ratings_matrix,user_dict)
        scores_sum += rating
    # Calculate the average prediction
    pred = scores_sum / len(group)
    return round(pred,2)

### The second aggregation method is the <em>least misery method</em>, where one member can act as a veto for the rest of the group. In this case, the rating of an item for a group of users is computed as the minimum score assigned to that item in all group members recommendations.

In [17]:
def leastMiseryMethod(group, item, users_similarities_dict):
    """
    Computes the predicted rating for an item using the least misery method.

    Args:
        group (list): A list of user IDs forming the group.
        item (int): The ID of the item (movie) for which the prediction is generated.
        users_similarities_dict (dict): A dictionary of dictionaries containing similarities between users.

    Returns:
        float: The predicted rating for the specified item based on the least misery method.
    """
    # Create user-movie rating matrix
    user_movie_ratings_matrix = create_user_movie_rating_matrix()
    # Initialize list to store ratings
    scores = []
    # Generate predictions for each user in the group and store the ratings
    for user in group:
        user_dict = users_similarities_dict[user]
        rating = generatePrediction(user, item,user_movie_ratings_matrix,user_dict)
        scores.append(rating)
    # Choose the lowest rating as the prediction
    pred = min(scores)
    return round(pred,2)

### Produce a group of 3 users, and for this group, show the top-10 recommendations, i.e., the 10 movies with the highest prediction scores that (i) the average method suggests, and (ii) the least misery method suggest. Use the MovieLens 100K rating dataset.

In [18]:
def topKMoviesAverage(group,k):
    """
    Finds the top k movies with the highest average predicted ratings for the specified group of users.

    Args:
        group (list): A list of user IDs forming the group.
        k (int): The number of top movies to return.

    Returns:
        list: A list of tuples containing the IDs and predicted ratings of the top k movies
              with the highest average predicted ratings, sorted by predicted rating in descending order.
    """
    # Get all movie IDs
    all_movie_ids = movies['movieId'].tolist()
    # Initialize list to store predictions and a dictionary to store user similarities
    predictions = []
    users_similarities_dict = {}
    # Compute user similarities for each user in the group
    for user in group:
        users_similarities_dict[user] = compute_user_similarities(user)
    # Generate predictions for each movie
    for m in all_movie_ids:
        pred = averageMethod(group, m, users_similarities_dict)
        predictions.append((m, pred)) 
    # Sort predictions by predicted rating in descending order
    sorted_predictions = sorted(predictions, reverse=True, key=lambda x: x[1])
    # Select top k predictions
    top_k_predictions = sorted_predictions[:k]
    for movie_id, pred in top_k_predictions:
        movie_title = movies.loc[movies['movieId'] == movie_id, 'title'].values[0]
        print(f"MovieID: {movie_id}, Titolo: {movie_title}, Punteggio previsto: {pred}")

In [19]:
def topKMoviesLeastMisery(group,k):
    """
    Finds the top k movies with the highest predicted ratings using the least misery method for the specified group of users.

    Args:
        group (list): A list of user IDs forming the group.
        k (int): The number of top movies to return.

    Returns:
        list: A list of tuples containing the IDs and predicted ratings of the top k movies
              with the highest predicted ratings using the least misery method, sorted by predicted rating in descending order.
    """
    # Get all movie IDs
    all_movie_ids = movies['movieId'].tolist()
    # Initialize list to store predictions and a dictionary to store user similarities
    predictions = []
    users_similarities_dict = {}
    # Compute user similarities for each user in the group
    for user in group:
        users_similarities_dict[user] = compute_user_similarities(user)
    # Generate predictions for each movie using the least misery method
    for m in all_movie_ids:
        pred = leastMiseryMethod(group, m, users_similarities_dict)
        predictions.append((m, pred)) 
    sorted_predictions = sorted(predictions, reverse=True, key=lambda x: x[1])
    top_k_predictions = sorted_predictions[:k]
    for movie_id, pred in top_k_predictions:
        movie_title = movies.loc[movies['movieId'] == movie_id, 'title'].values[0]
        print(f"MovieID: {movie_id}, Titolo: {movie_title}, Punteggio previsto: {pred}")

### (b) The methods employed in part (a) of Assignment 2, do not consider any disagreements between the users in the group. In part (b) of Assignment 2, define a way for counting the disagreements between the users in a group, and propose a method that takes disagreements into account when computing suggestions for the group.

In [45]:
def weightedAverageMethod(group, item, users_similarities_dict):
    group_size = len(group)
    sum_disagreements = 0
    ratings = {}
    user_movie_rating_matrix = create_user_movie_rating_matrix()
    weighted_sum = 0
    # Calculate predicted ratings for each user in the group
    for user in group:
        rating = generatePrediction(user, item, user_movie_rating_matrix, users_similarities_dict)
        ratings[user] = rating
        sum_disagreements += (1/group_size) * rating
    
    user_disagreements = {}
    disagreements_sum = 0
    # Calculate disagreement for each user
    for user in group:
        user_disagreement = abs(ratings[user] - sum_disagreements)
        user_disagreements[user] = user_disagreement
        weighted_sum += user_disagreement * ratings[user]
        disagreements_sum += user_disagreement

    if sum_disagreements == 0:
        return averageMethod(group, item, users_similarities_dict)
    else:
        return round(weighted_sum/disagreements_sum,2)

In [46]:
def topKMoviesWeightedAverage(group,k):
    """
    Finds the top k movies with the highest predicted ratings using the least misery method for the specified group of users.

    Args:
        group (list): A list of user IDs forming the group.
        k (int): The number of top movies to return.

    Returns:
        list: A list of tuples containing the IDs and predicted ratings of the top k movies
              with the highest predicted ratings using the least misery method, sorted by predicted rating in descending order.
    """
    # Get all movie IDs
    all_movie_ids = movies['movieId'].tolist()
    # Initialize list to store predictions and a dictionary to store user similarities
    predictions = []
    users_similarities_dict = {}
    # Compute user similarities for each user in the group
    for user in group:
        users_similarities_dict[user] = compute_user_similarities(user)
    # Generate predictions for each movie using the least misery method
    for m in all_movie_ids:
        pred = weightedAverageMethod(group, m, users_similarities_dict)
        predictions.append((m, pred)) 
    sorted_predictions = sorted(predictions, reverse=True, key=lambda x: x[1])
    top_k_predictions = sorted_predictions[:k]
    for movie_id, pred in top_k_predictions:
        movie_title = movies.loc[movies['movieId'] == movie_id, 'title'].values[0]
        print(f"MovieID: {movie_id}, Titolo: {movie_title}, Punteggio previsto: {pred}")

# Experiments

In [22]:
user_dict1 = compute_user_similarities(1)
user_dict4 = compute_user_similarities(4)
user_dict9 = compute_user_similarities(9)
users_dict = {1:user_dict1,4:user_dict4,9:user_dict9}
averageMethod([1,4,9],50,users_dict)
leastMiseryMethod([1,4,9],50,users_dict)
weightedAverageMethod([1,4,9],50,users_dict)

4.1

In [23]:
topKMoviesAverage([1,18,23],10)

MovieID: 5105, Titolo: Don't Look Now (1973), Punteggio previsto: 7.34
MovieID: 6967, Titolo: Dead of Night (1945), Punteggio previsto: 7.34
MovieID: 7114, Titolo: Collector, The (1965), Punteggio previsto: 7.34
MovieID: 7742, Titolo: Baxter (1989), Punteggio previsto: 7.34
MovieID: 8620, Titolo: Exterminating Angel, The (Ángel exterminador, El) (1962), Punteggio previsto: 6.42
MovieID: 2068, Titolo: Fanny and Alexander (Fanny och Alexander) (1982), Punteggio previsto: 6.29
MovieID: 7564, Titolo: Kwaidan (Kaidan) (1964), Punteggio previsto: 6.27
MovieID: 6140, Titolo: Tenebre (1982), Punteggio previsto: 5.92
MovieID: 25947, Titolo: Unfaithfully Yours (1948), Punteggio previsto: 5.92
MovieID: 8727, Titolo: Day of the Locust, The (1975), Punteggio previsto: 5.84


In [24]:
topKMoviesLeastMisery([1,18,23],10)

MovieID: 5105, Titolo: Don't Look Now (1973), Punteggio previsto: 7.07
MovieID: 6967, Titolo: Dead of Night (1945), Punteggio previsto: 7.07
MovieID: 7114, Titolo: Collector, The (1965), Punteggio previsto: 7.07
MovieID: 7742, Titolo: Baxter (1989), Punteggio previsto: 7.07
MovieID: 2068, Titolo: Fanny and Alexander (Fanny och Alexander) (1982), Punteggio previsto: 5.73
MovieID: 25947, Titolo: Unfaithfully Yours (1948), Punteggio previsto: 5.66
MovieID: 8727, Titolo: Day of the Locust, The (1975), Punteggio previsto: 5.57
MovieID: 3379, Titolo: On the Beach (1959), Punteggio previsto: 5.51
MovieID: 8620, Titolo: Exterminating Angel, The (Ángel exterminador, El) (1962), Punteggio previsto: 5.5
MovieID: 7564, Titolo: Kwaidan (Kaidan) (1964), Punteggio previsto: 5.32


In [48]:
topKMoviesWeightedAverage([1,18,23],10)

  return weighted_sum/disagreements_sum


KeyboardInterrupt: 