In [105]:
import pandas as pd
import math
import numpy as np

In [106]:
# Load the ratings table (columns shown in the preview below: userId, movieId, rating, timestamp).
ratings = pd.read_csv("ml-latest-small/ratings.csv")
n_rows = len(ratings)

print(f"Number of rows: {n_rows}")
# Preview the first 10 records, hiding the positional index
print(ratings.head(10).to_string(index=False))

Number of rows: 100836
 userId  movieId  rating  timestamp
      1        1     4.0  964982703
      1        3     4.0  964981247
      1        6     4.0  964982224
      1       47     5.0  964983815
      1       50     5.0  964982931
      1       70     3.0  964982400
      1      101     5.0  964980868
      1      110     4.0  964982176
      1      151     5.0  964984041
      1      157     5.0  964984100


In [107]:
# Load the movie catalogue (columns shown in the preview below: movieId, title, genres).
movies = pd.read_csv("ml-latest-small/movies.csv")
n_rows = len(movies)

print(f"Number of rows: {n_rows}")
# Preview the first 10 records, hiding the positional index
print(movies.head(10).to_string(index=False))

Number of rows: 9742
 movieId                              title                                      genres
       1                   Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy
       2                     Jumanji (1995)                  Adventure|Children|Fantasy
       3            Grumpier Old Men (1995)                              Comedy|Romance
       4           Waiting to Exhale (1995)                        Comedy|Drama|Romance
       5 Father of the Bride Part II (1995)                                      Comedy
       6                        Heat (1995)                       Action|Crime|Thriller
       7                     Sabrina (1995)                              Comedy|Romance
       8                Tom and Huck (1995)                          Adventure|Children
       9                Sudden Death (1995)                                      Action
      10                   GoldenEye (1995)                   Action|Adventure|Thriller


In [108]:
def create_user_movie_rating_matrix():
    """
    Builds the user x movie rating matrix from the global `ratings` and `movies` frames.

    Returns:
        DataFrame: One row per userId found in `ratings`, one column per movieId
                   found in `movies`. A cell holds the user's rating for that movie,
                   or NaN when `ratings` has no entry for the pair.
    """
    # Columns follow the movie catalogue, so movies nobody rated still get a (NaN) column
    catalogue_ids = movies['movieId'].unique()
    return (
        ratings
        .pivot_table(values='rating', index='userId', columns='movieId')
        .reindex(columns=catalogue_ids)
    )

In [109]:
def pearsonCorrelation(user1, user2):
    """
    Calculates the Pearson correlation coefficient between two users based on their ratings.

    Each user's deviations are measured from that user's mean over ALL of their
    ratings in the global `ratings` frame, and the sums run over the movies rated
    by both users.

    Args:
        user1 (int): The ID of the first user.
        user2 (int): The ID of the second user.

    Returns:
        float: The Pearson correlation coefficient between the two users, or 0.0
               when they share no rated movie or when the denominator is zero
               (one of the users has no deviation from their mean on the shared movies).
    """
    # Filter the ratings table once per user instead of once per common movie
    rows_user1 = ratings[ratings['userId'] == user1]
    rows_user2 = ratings[ratings['userId'] == user2]
    rmean_user1 = rows_user1['rating'].mean()
    rmean_user2 = rows_user2['rating'].mean()

    # Index each user's ratings by movieId; if a (user, movie) pair is repeated,
    # keep the first occurrence only
    by_movie_user1 = rows_user1.drop_duplicates('movieId').set_index('movieId')['rating']
    by_movie_user2 = rows_user2.drop_duplicates('movieId').set_index('movieId')['rating']

    # Movies rated by both users
    common_items = by_movie_user1.index.intersection(by_movie_user2.index)
    if len(common_items) == 0:
        return 0.0

    # Deviations from each user's overall mean, aligned on the common movies
    dev_user1 = by_movie_user1.loc[common_items].to_numpy(dtype=float) - rmean_user1
    dev_user2 = by_movie_user2.loc[common_items].to_numpy(dtype=float) - rmean_user2

    num = float(np.dot(dev_user1, dev_user2))
    den = math.sqrt(float(np.dot(dev_user1, dev_user1))) * math.sqrt(float(np.dot(dev_user2, dev_user2)))
    # Check for division by zero
    if den == 0.0:
        return 0.0
    return num / den

In [110]:
def compute_user_similarities(user):
    """
    Computes the Pearson correlation-based similarities between the given user and all other users in the ratings dataset.

    The other users are taken from the user IDs actually present in the global
    `ratings` frame (in order of first appearance), so the IDs do not need to be
    the contiguous range 1..N.

    Args:
        user (int): The ID of the user for whom similarities are to be computed.

    Returns:
        dict: A dictionary containing the similarities between the given user and all other users,
              where the keys are the IDs of other users and the values are the similarity coefficients.
    """
    user_similarities = {}
    # .tolist() yields plain Python ints, so the dictionary keys are ordinary ints
    for other_user in ratings["userId"].unique().tolist():
        if other_user == user:
            continue
        # Pearson correlation between the given user and this other user
        user_similarities[other_user] = pearsonCorrelation(user, other_user)
    return user_similarities

In [111]:
def generatePrediction(user1, item, user_movie_ratings_matrix, user_similarity_dict):
    """
    Predicts the rating `user1` would give to `item`.

    If the matrix already holds a rating for (user1, item), that rating is returned
    as-is. Otherwise the prediction is user1's mean rating plus the similarity-weighted
    average of how far every other rater of `item` deviated from their own mean.

    Args:
        user1 (int): The ID of the user for whom the prediction is generated.
        item (int): The ID of the item (movie) for which the prediction is generated.
        user_movie_ratings_matrix (DataFrame): The user-movie ratings matrix (NaN = not rated).
        user_similarity_dict (dict): Similarity of every other user to `user1`, keyed by user ID.

    Returns:
        float: The stored rating when present, otherwise the predicted rating. When the
               absolute similarities of the raters sum to zero, user1's mean rating is returned.
    """
    # Short-circuit: an existing rating is returned unchanged
    known_rating = user_movie_ratings_matrix.at[user1, item]
    if not np.isnan(known_rating):
        return known_rating

    # Baseline: the target user's mean rating
    target_mean = user_movie_ratings_matrix.loc[user1].mean()

    weighted_deviation = 0.0
    weight_total = 0.0
    # Every user with a non-NaN rating for this item contributes
    raters = user_movie_ratings_matrix[item].dropna().index.unique()
    for rater in raters:
        rater_mean = user_movie_ratings_matrix.loc[rater].mean()
        rater_rating = user_movie_ratings_matrix.at[rater, item]
        weight = user_similarity_dict[rater]
        weighted_deviation += weight * (rater_rating - rater_mean)
        weight_total += abs(weight)

    # No usable neighbour weight: fall back to the baseline
    if weight_total == 0.0:
        return target_mean
    return target_mean + (weighted_deviation / weight_total)

In [112]:
def get_unrated_movie_ids(userId):
    """
    Finds the movie IDs that have not been rated by the specified user.

    Args:
        userId (int): The ID of the user.

    Returns:
        list: The movie IDs from the global `movies` frame, in catalogue order,
              for which the global `ratings` frame has no entry by this user.
    """
    # All movie IDs listed in the movies DataFrame
    all_movie_ids = movies['movieId'].tolist()
    # Movie IDs rated by the user, held in a set so each membership test is O(1)
    rated_movie_ids = set(ratings.loc[ratings['userId'] == userId, 'movieId'].tolist())
    return [movie_id for movie_id in all_movie_ids if movie_id not in rated_movie_ids]

def get_all_unrated_movies(group):
    """
    Returns the union of the unrated movies of every user in `group`.

    Args:
        group (iterable): User IDs.

    Returns:
        list: Movie IDs that at least one user of the group has not rated (no duplicates).
    """
    pooled_ids = set()
    for member in group:
        # Merge this member's unrated movies into the running union
        pooled_ids.update(get_unrated_movie_ids(member))
    return list(pooled_ids)

# Assignment 3

In [113]:
def generatePredictionOnUnratedMovies(user, unrated_movies, ratings_matrix, similarity_dict):
    """
    Runs generatePrediction for `user` on every movie in `unrated_movies`.

    Args:
        user (int): The ID of the user to predict for.
        unrated_movies (iterable): Movie IDs to score.
        ratings_matrix (DataFrame): The user-movie ratings matrix, passed through to generatePrediction.
        similarity_dict (dict): Similarities of the other users to `user`, passed through to generatePrediction.

    Returns:
        list: (movie_id, predicted_rating) tuples, in the same order as `unrated_movies`.
    """
    return [
        (movie_id, generatePrediction(user, movie_id, ratings_matrix, similarity_dict))
        for movie_id in unrated_movies
    ]

In [114]:
# def topKMoviesHybrid(k, alfa, predictions, tabu_movies):
#     # List to store the hybrid predictions
#     hybrid_predictions = []

#     # Iterate over each movie
#     for i in range(len(predictions[0])):
#         # Calculate the average rating for each movie
#         avg_rating = sum(prediction[i][1] for prediction in predictions) / len(predictions)
        
#         # Calculate the minimum rating for each movie
#         min_rating = min(prediction[i][1] for prediction in predictions)

#         # Build the list of tuples (movie_id, hybrid_rating, individual_ratings)
#         movie_id = predictions[0][i][0]
#         individual_ratings = [prediction[i][1] for prediction in predictions]
#         if movie_id not in tabu_movies:
#             hybrid_rating = (1 - alfa) * avg_rating + alfa * min_rating
#             hybrid_predictions.append((movie_id, hybrid_rating, individual_ratings))
    
#     if(len(hybrid_predictions) < k):
#         k = len(hybrid_predictions)

#     # Sort the list based on the hybrid rating (in descending order)
#     sorted_predictions = sorted(hybrid_predictions, key=lambda x: x[1], reverse=True)
    
#     # Take only the first k elements from the sorted list
#     top_k_predictions = sorted_predictions[:k]
    
#     return top_k_predictions

def topKMoviesHybrid(k, alfa, predictions, tabu_movies):
    """
    Ranks movies for a group with a hybrid of average and least-misery aggregation.

    For each movie the score is (1 - alfa) * average + alfa * minimum of the
    users' predicted ratings.

    Args:
        k (int): Maximum number of movies to return.
        alfa (float): Weight of the minimum rating; (1 - alfa) weights the average.
        predictions (list): One list per user of (movie_id, predicted_rating) tuples.
            Position i of every user's list is read as the same movie; the movie ID
            is taken from the first user's list.
        tabu_movies (iterable): Movie IDs to exclude from the result.

    Returns:
        list: Up to k tuples (movie_id, hybrid_rating, individual_ratings), sorted by
              hybrid_rating in descending order. individual_ratings holds one predicted
              rating per user, in the order of `predictions`. This three-element layout
              is what getUsersSatisfaction (index 2) and sequentialGroupPrediction
              (three-way unpacking) read.
    """
    # Nothing to rank without at least one user's predictions
    if not predictions:
        return []

    # Set membership keeps the tabu check O(1) as the tabu list grows
    tabu = set(tabu_movies)
    n_users = len(predictions)

    hybrid_predictions = []
    for i, (movie_id, _) in enumerate(predictions[0]):
        if movie_id in tabu:
            continue

        # Every user's predicted rating for this movie
        individual_ratings = [prediction[i][1] for prediction in predictions]
        avg_rating = sum(individual_ratings) / n_users
        min_rating = min(individual_ratings)

        hybrid_rating = (1 - alfa) * avg_rating + alfa * min_rating
        hybrid_predictions.append((movie_id, hybrid_rating, individual_ratings))

    # Highest hybrid rating first; slicing already copes with fewer than k candidates
    hybrid_predictions.sort(key=lambda entry: entry[1], reverse=True)
    return hybrid_predictions[:k]

In [115]:
def getUsersSatisfaction(group, group_top_k, user_top_k):
    """
    Computes each user's satisfaction with the group recommendation.

    Satisfaction of user i is the sum of user i's predicted ratings over the group's
    top-k movies divided by the sum of the ratings in user i's own top-k list.

    Args:
        group (list): User IDs; only its length is used, to decide how many satisfactions to return.
        group_top_k (list): Tuples whose element at index 2 is the list of per-user
            ratings for that movie, ordered like `group`.
        user_top_k (list): For each user (ordered like `group`), a list of
            (movie_id, rating) tuples with that user's own top-k movies.

    Returns:
        list: One satisfaction value per user, ordered like `group`. A user's
              satisfaction is 0.0 when `group_top_k` is empty or when the sum of
              that user's own top-k ratings is zero.
    """
    n_users = len(group)

    # Best achievable total for each user: the sum over their personal top-k
    usersListSat = [sum(rating for _, rating in elem) for elem in user_top_k]

    if group_top_k:
        # Transpose the per-movie rating lists into per-user columns and sum each column
        groupListSat = [sum(column) for column in zip(*(elem[2] for elem in group_top_k))]
    else:
        # No movie was recommended (e.g. every candidate is tabu): nobody gains anything
        groupListSat = [0.0] * n_users

    satisfactions = []
    for i in range(n_users):
        den = usersListSat[i]
        # Guard against a zero denominator instead of dividing by it
        satisfactions.append(groupListSat[i] / den if den != 0 else 0.0)
    return satisfactions

In [116]:
# def sequentialGroupPrediction(group, k, n_iter):
#     # union of unrated movies
#     unrated_movies = get_all_unrated_movies(group)
#     #ratings matrix
#     ratings_matrix = create_user_movie_rating_matrix()
#     # Compute user similarities for each user in the group
#     users_similarities_dict = {}        
#     predictions = []
#     user_top_k = []   #lista di coppie (item,voto)
#     for user in group:
#         users_similarities_dict[user] = compute_user_similarities(user)
#         user_predictions = generatePredictionOnUnratedMovies(user, unrated_movies,ratings_matrix,users_similarities_dict[user])
#         predictions.append(user_predictions)
#         sorted_predictions = sorted(user_predictions, key=lambda x: x[1], reverse=True)
#         top_k = sorted_predictions[:k]
#         user_top_k.append(top_k)
        
#     alfa = 0
#     tabu_movies = []
#     # lista dei top k film per il gruppo
#     group_top_k = [] 
#     for i in range(0,n_iter):
#         group_top_k = topKMoviesHybrid(k, alfa, predictions, tabu_movies)
#         users_satisfactions = getUsersSatisfaction(group, group_top_k, user_top_k)
#         print("Iteration: ", i)
#         print("Alfa: ", alfa)
#         for movie, avg_rating, _ in group_top_k:
#             print("Movie ID:", movie, ", Score:", avg_rating)
#             tabu_movies.append(movie)
#         for i in range(0, len(group)):
#             print("User ID: ", group[i], ", Satisfaction:", users_satisfactions[i])
#         alfa = max(users_satisfactions) - min(users_satisfactions)
def sequentialGroupPrediction(group, k, n_iter):
    """
    Runs `n_iter` rounds of sequential group recommendation and prints each round.

    Per-user predictions are computed once. In every round the group's top-k movies
    are chosen by topKMoviesHybrid with the current `alfa`, the chosen movies are
    added to a tabu list so later rounds skip them, and `alfa` for the next round is
    set to the gap between the most and the least satisfied user of the current round.

    Args:
        group (list): User IDs forming the group.
        k (int): Number of movies recommended per round (and size of each user's own top list).
        n_iter (int): Number of rounds.

    Returns:
        None. Results are printed. Each entry returned by topKMoviesHybrid is
        unpacked here as (movie_id, score, individual_ratings).
    """
    # Union of the movies left unrated by at least one member of the group
    unrated_movies = get_all_unrated_movies(group)
    # User x movie ratings matrix
    ratings_matrix = create_user_movie_rating_matrix()

    users_similarities_dict = {}
    user_predictions_dict = {}  # predictions for each user
    user_top_k_dict = {}        # personal top-k for each user
    for user in group:
        if user in user_predictions_dict:
            # Same user listed twice: reuse what was already computed
            continue
        users_similarities_dict[user] = compute_user_similarities(user)
        user_predictions = generatePredictionOnUnratedMovies(
            user, unrated_movies, ratings_matrix, users_similarities_dict[user]
        )
        user_predictions_dict[user] = user_predictions
        sorted_predictions = sorted(user_predictions, key=lambda x: x[1], reverse=True)
        user_top_k_dict[user] = sorted_predictions[:k]

    # Both lists are indexed like `group` and do not change between rounds,
    # so they are built once, outside the loop
    predictions = [user_predictions_dict[user] for user in group]
    user_top_k = [user_top_k_dict[user] for user in group]

    alfa = 0
    tabu_movies = []
    for iteration in range(n_iter):
        # Top-k movies for the group in this round
        group_top_k = topKMoviesHybrid(k, alfa, predictions, tabu_movies)
        users_satisfactions = getUsersSatisfaction(group, group_top_k, user_top_k)
        print("Iteration: ", iteration)
        print("Alfa: ", alfa)
        for movie, avg_rating, _ in group_top_k:
            print("Movie ID:", movie, ", Score:", avg_rating)
            # Never recommend the same movie again in a later round
            tabu_movies.append(movie)
        for user, satisfaction in zip(group, users_satisfactions):
            print("User ID: ", user, ", Satisfaction:", satisfaction)
        # Larger disagreement -> more weight on the least-satisfied user next round
        alfa = (max(users_satisfactions) - min(users_satisfactions)) if users_satisfactions else 0


In [117]:
# Group of three users: recommend 10 movies per round, for 3 sequential rounds
users = [414, 474, 599]
sequentialGroupPrediction(group=users, k=10, n_iter=3)

Iteration:  0
Alfa:  0
Movie ID: 5105 , Score: 6.568890099610708
Movie ID: 6967 , Score: 6.568890099610708
Movie ID: 7114 , Score: 6.568890099610708
Movie ID: 7742 , Score: 6.568890099610708
Movie ID: 3604 , Score: 6.117294107066439
Movie ID: 97024 , Score: 5.922892562664894
Movie ID: 138186 , Score: 5.905043481723264
Movie ID: 107013 , Score: 5.905043481723264
Movie ID: 173307 , Score: 5.905043481723264
Movie ID: 86068 , Score: 5.905043481723264
User ID:  414 , Satisfaction: 1.0
User ID:  474 , Satisfaction: 0.9948250019038323
User ID:  599 , Satisfaction: 1.0
Iteration:  1
Alfa:  0.005174998096167727
Movie ID: 160872 , Score: 5.902444229769408
Movie ID: 40491 , Score: 5.896267336685065
Movie ID: 3567 , Score: 5.50172188213961
Movie ID: 156605 , Score: 5.396267336685065
Movie ID: 132333 , Score: 5.294350637504416
Movie ID: 5490 , Score: 5.294350637504416
Movie ID: 6818 , Score: 5.275850530854655
Movie ID: 25947 , Score: 5.150240886071457
Movie ID: 167772 , Score: 5.114694855112583
Mov