In [19]:
import pandas as pd
import numpy as np

In [20]:
ratings = pd.read_csv("ml-latest-small/ratings.csv")
n_rows = ratings.shape[0]

print("Number of rows: " + str(n_rows))
# Show the first 10 rows of the dataframe
print(ratings.head(10).to_string(index=False))

Number of rows: 100836
 userId  movieId  rating  timestamp
      1        1     4.0  964982703
      1        3     4.0  964981247
      1        6     4.0  964982224
      1       47     5.0  964983815
      1       50     5.0  964982931
      1       70     3.0  964982400
      1      101     5.0  964980868
      1      110     4.0  964982176
      1      151     5.0  964984041
      1      157     5.0  964984100


In [21]:
movies = pd.read_csv("ml-latest-small/movies.csv")
n_rows = movies.shape[0]

print("Number of rows: " + str(n_rows))
# Show the first 10 rows of the dataframe
print(movies.head(10).to_string(index=False))

Number of rows: 9742
 movieId                              title                                      genres
       1                   Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy
       2                     Jumanji (1995)                  Adventure|Children|Fantasy
       3            Grumpier Old Men (1995)                              Comedy|Romance
       4           Waiting to Exhale (1995)                        Comedy|Drama|Romance
       5 Father of the Bride Part II (1995)                                      Comedy
       6                        Heat (1995)                       Action|Crime|Thriller
       7                     Sabrina (1995)                              Comedy|Romance
       8                Tom and Huck (1995)                          Adventure|Children
       9                Sudden Death (1995)                                      Action
      10                   GoldenEye (1995)                   Action|Adventure|Thriller


In [22]:
def create_user_movie_rating_matrix():
    """
    Creates a user-movie rating matrix.

    Returns:
        DataFrame: A matrix where rows represent users, columns represent movies,
                   and each cell contains the rating given by the user to the movie.
    """
    # Create a pivot table with user ratings for each movie
    user_movie_ratings_matrix = pd.pivot_table(ratings, values='rating', index='userId', columns='movieId')
    # Get unique movie IDs
    all_movie_ids = movies['movieId'].unique()
    # Reindex the matrix columns with all movie IDs
    user_movie_ratings_matrix = user_movie_ratings_matrix.reindex(columns=all_movie_ids)
    return user_movie_ratings_matrix  # Return the user-movie rating matrix

In [23]:
def get_ratings_dict():
    # Dizionario per memorizzare le coppie movieId,ratings per ogni utente
    user_data = {}
    # Popolare il dizionario
    for _, row in ratings.iterrows():
        userId = row['userId']
        movieId = int(row['movieId'])
        rating = row['rating']
        # Verificare se userId è già presente nel dizionario
        if userId in user_data:
            user_data[userId].append((movieId, rating))
        else:
            # Se userId non è presente, creare una nuova lista con la coppia movieId,rating
            user_data[userId] = [(movieId, rating)]
    
    return user_data

In [24]:
def pearsonCorrelation(user1, user2, user_data):
    """
    Calculates the Pearson correlation coefficient between two users based on their ratings.

    Args:
        user1 (int): The ID of the first user.
        user2 (int): The ID of the second user.
        user_data (dict): A dictionary containing movieId-ratings pairs for each user.

    Returns:
        float: The Pearson correlation coefficient between the two users.
    """
    num, den1, den2 = 0.0, 0.0, 0.0
    
    # Fetch ratings of user1 and calculate mean rating of user1
    ratings_user1 = user_data[user1]
    user1_ratings = np.array([rating for _, rating in ratings_user1])
    rmean_user1 = np.mean(user1_ratings)
    
    # Fetch ratings of user2 and calculate mean rating of user2
    ratings_user2 = user_data[user2]
    user2_ratings = np.array([rating for _, rating in ratings_user2])
    rmean_user2 = np.mean(user2_ratings)

    # set of movieIds evaluated by user1
    items_user1 = {movieId for movieId, _ in ratings_user1}
    # set of movieIds evaluated by user2
    items_user2 = {movieId for movieId, _ in ratings_user2}
    # set of movieIds evaluated by both user1 and user2 (intersection)
    common_items = items_user1 & items_user2

    # Calculate Pearson correlation for common items
    for p in common_items:
        # Fetch ratings of user1 and user2 for the common item
        r_1p = next(r for m, r in ratings_user1 if m == p)
        r_2p = next(r for m, r in ratings_user2 if m == p)
        num += (r_1p - rmean_user1) * (r_2p - rmean_user2)
        den1 += (r_1p - rmean_user1) ** 2
        den2 += (r_2p - rmean_user2) ** 2
    
    # Calculate denominator
    den = np.sqrt(den1 * den2) if den1 != 0 and den2 != 0 else 0.0
    
    # Check for division by zero
    if den == 0.0:
        return 0.0
    
    # Calculate Pearson correlation coefficient
    sim = num / den
    return sim


In [25]:
def compute_all_user_similarities():
    similarities_dict = {}
    user_data = get_ratings_dict()
    for user1 in range(1,611):
        similarities_dict[user1] = {}
        for user2 in range(user1,611):
            if not user2 in similarities_dict:
                similarities_dict[user2] = {}
            if user1 != user2:
                sim = pearsonCorrelation(user1,user2, user_data)
                similarities_dict[user1][user2] = sim
                similarities_dict[user2][user1] = sim
    return similarities_dict 

In [26]:
def generatePrediction(user, item, user_movie_ratings_matrix, similarities_dict, rmean_user, topK_simlar_users):
    # Check if the movie has already been rated by the user
    if not np.isnan(user_movie_ratings_matrix.at[user, item]):
        return user_movie_ratings_matrix.at[user, item]
    
    num, den = 0.0, 0.0
    
    for u in topK_simlar_users:
        if not np.isnan(user_movie_ratings_matrix.at[u, item]):
            rmean_u = user_movie_ratings_matrix.loc[u].mean()
            r_up = user_movie_ratings_matrix.at[u, item]
            similarity = similarities_dict[user][u]
            num += similarity * (r_up - rmean_u)
            den += abs(similarity)
    if den == 0.0:
        pred = rmean_user
    else:
        pred = rmean_user + (num / den)
    return pred

In [27]:
def get_unrated_movie_ids(userId):
    """
    Finds the movie IDs that have not been rated by the specified user.

    Args:
        userId (int): The ID of the user.

    Returns:
        list: A list of movie IDs that have not been rated by the specified user.
    """
    # Get all unique movie IDs present in the ratings DataFrame
    all_movie_ids = movies['movieId'].tolist()
    # Get the movie IDs rated by the specified userId
    rated_movie_ids = ratings[ratings['userId'] == userId]['movieId'].tolist()
    # Find the movie IDs not rated by the userId
    unrated_movie_ids = [movie_id for movie_id in all_movie_ids if movie_id not in rated_movie_ids]
    return unrated_movie_ids

# a function that, given a list of users as input, returns the union of unrated movies for each user.
def get_all_unrated_movies(group):
    unrated_movies = set()
    # for each user call the function getUnratedMovies, which returns the movies not rated by that user
    for user in group:
        user_unrated_movies = get_unrated_movie_ids(user)
        #and update the list to store all of them
        unrated_movies.update(user_unrated_movies)
    return list(unrated_movies)

In [28]:
def kMostSimilarUsers(user, k, similarities_dict):
    user_similarities = similarities_dict[user]
    # Sort users by similarity coefficient in descending order
    sorted_user_similarities = sorted(user_similarities, key=user_similarities.get, reverse=True)
    # Extract the top k similar users and their corresponding similarity coefficients
    top_k_users = sorted_user_similarities[:k]
    return top_k_users

# Assignment 3

In [29]:
def generatePredictionOnUnratedMovies(user, unrated_movies, ratings_matrix, similarities_dict):
    predictions = []
    topK_simlar_users = kMostSimilarUsers(user, 50, similarities_dict)
    # Calculate the average of the user's ratings
    rmean_user = ratings_matrix.loc[user].mean()
    for item in unrated_movies:
        pred = generatePrediction(user, item, ratings_matrix, similarities_dict, rmean_user, topK_simlar_users)
        predictions.append((item, pred))
    return predictions

In [30]:
def topKMoviesHybrid(k, alfa, predictions, excluded_movies):
    # List to store the hybrid predictions
    hybrid_predictions = []

    # Iterate over each movie
    for i in range(len(predictions[0])):
        # Calculate the average rating for each movie
        avg_rating = sum(prediction[i][1] for prediction in predictions) / len(predictions)
        
        # Calculate the minimum rating for each movie
        min_rating = min(prediction[i][1] for prediction in predictions)

        # Build the list of tuples (movie_id, hybrid_rating, individual_ratings)
        movie_id = predictions[0][i][0]
        individual_ratings = [prediction[i][1] for prediction in predictions]
        if movie_id not in excluded_movies:
            hybrid_rating = (1 - alfa) * avg_rating + alfa * min_rating
            hybrid_predictions.append((movie_id, hybrid_rating, individual_ratings))
    
    if(len(hybrid_predictions) < k):
        k = len(hybrid_predictions)

    # Sort the list based on the hybrid rating (in descending order)
    sorted_hybrid_predictions = sorted(hybrid_predictions, key=lambda x: x[1], reverse=True)
    
    # Take only the first k elements from the sorted list
    top_k_hybrid_predictions = sorted_hybrid_predictions[:k]
    
    return top_k_hybrid_predictions

In [31]:
def computeUsersSatisfaction(group, group_top_k, user_top_k):

    usersListSat = [sum(rating for _, rating in elem) for elem in user_top_k]
    groupListSat = list(map(sum, zip(*[elem[2] for elem in group_top_k])))
    satisfactions = []
    for i in range(0,len(group)):
        num = groupListSat[i]
        den = usersListSat[i]
        sat = num/den
        satisfactions.append(sat)
    return satisfactions  

In [37]:
def topKMoviesHybrid1(k, alfa, predictions, excluded_movies):
    # List to store the hybrid predictions
    hybrid_predictions = []

    # Iterate over each movie
    for i in range(len(predictions[0])):
        # Calculate the average rating for each movie
        avg_rating = sum(prediction[i][1] for prediction in predictions) / len(predictions)
        
        # Calculate the standard deviation of ratings for each movie
        ratings = [prediction[i][1] for prediction in predictions]
        std_dev_rating = np.std(ratings)

        # Build the list of tuples (movie_id, hybrid_rating, individual_ratings)
        movie_id = predictions[0][i][0]
        individual_ratings = [prediction[i][1] for prediction in predictions]
        if movie_id not in excluded_movies:
            hybrid_rating = (1 - alfa) * avg_rating + alfa * std_dev_rating
            hybrid_predictions.append((movie_id, hybrid_rating, individual_ratings))
    
    if(len(hybrid_predictions) < k):
        k = len(hybrid_predictions)

    # Sort the list based on the hybrid rating (in descending order)
    sorted_hybrid_predictions = sorted(hybrid_predictions, key=lambda x: x[1], reverse=True)
    
    # Take only the first k elements from the sorted list
    top_k_hybrid_predictions = sorted_hybrid_predictions[:k]
    
    return top_k_hybrid_predictions


In [38]:
def sequentialGroupPrediction(group, k, n_iter):
    similarities_dict = compute_all_user_similarities()
    # union of unrated movies
    unrated_movies = get_all_unrated_movies(group)
    #ratings matrix
    ratings_matrix = create_user_movie_rating_matrix()
    # Compute user similarities for each user in the group 
    predictions = []
    user_top_k = []   #lista di coppie (item,voto)
    for user in group:
        user_predictions = generatePredictionOnUnratedMovies(user, unrated_movies,ratings_matrix,similarities_dict)
        predictions.append(user_predictions)
        sorted_predictions = sorted(user_predictions, key=lambda x: x[1], reverse=True)
        top_k = sorted_predictions[:k]
        user_top_k.append(top_k)
        
    alfa = 0
    excluded_movies = []
    # lista dei top k film per il gruppo
    group_top_k = [] 
    for i in range(0,n_iter):
        group_top_k = topKMoviesHybrid1(k, alfa, predictions, excluded_movies)
        users_satisfactions = computeUsersSatisfaction(group, group_top_k, user_top_k)
        print("Iteration:",i, " alfa =",alfa)
        print(f"The top {k} movies recommended for the group {group} are:")
        i = 1
        for movie_id, pred, _ in group_top_k:
            movie_title = movies.loc[movies['movieId'] == movie_id, 'title'].values[0]
            print(f"{i}. MovieID: {movie_id}, Title: {movie_title}, Score: {pred}")
            excluded_movies.append(movie_id)
        for i in range(0, len(group)):
            print(f"User {group[i]}'s statisfaction: ", users_satisfactions[i])
        print("-------------------------------------------------------------------")
        alfa = max(users_satisfactions) - min(users_satisfactions)

In [36]:
group = [59,125,471]
sequentialGroupPrediction(group,10,3)

Iteration: 0  alfa = 0
The top 10 movies recommended for the group [59, 125, 471] are:
1. MovieID: 1327, Title: Amityville Horror, The (1979), Score: 5.791721570373908
1. MovieID: 2700, Title: South Park: Bigger, Longer and Uncut (1999), Score: 5.7140822560550015
1. MovieID: 1350, Title: Omen, The (1976), Score: 5.6638891536663545
1. MovieID: 2513, Title: Pet Sematary (1989), Score: 5.605384360481889
1. MovieID: 1248, Title: Touch of Evil (1958), Score: 5.457675224386862
1. MovieID: 1231, Title: Right Stuff, The (1983), Score: 5.438324800273521
1. MovieID: 3972, Title: Legend of Drunken Master, The (Jui kuen II) (1994), Score: 5.393851867385784
1. MovieID: 1924, Title: Plan 9 from Outer Space (1959), Score: 5.355094945391147
1. MovieID: 2583, Title: Cookie's Fortune (1999), Score: 5.218396254855086
1. MovieID: 3543, Title: Diner (1982), Score: 5.165398261006952
User 59's statisfaction:  0.9113830743121526
User 125's statisfaction:  0.9273836153306404
User 471's statisfaction:  0.806434

In [39]:
group = [59,125,471]
sequentialGroupPrediction(group,10,3)

Iteration: 0  alfa = 0
The top 10 movies recommended for the group [59, 125, 471] are:
1. MovieID: 1327, Title: Amityville Horror, The (1979), Score: 5.791721570373908
1. MovieID: 2700, Title: South Park: Bigger, Longer and Uncut (1999), Score: 5.7140822560550015
1. MovieID: 1350, Title: Omen, The (1976), Score: 5.6638891536663545
1. MovieID: 2513, Title: Pet Sematary (1989), Score: 5.605384360481889
1. MovieID: 1248, Title: Touch of Evil (1958), Score: 5.457675224386862
1. MovieID: 1231, Title: Right Stuff, The (1983), Score: 5.438324800273521
1. MovieID: 3972, Title: Legend of Drunken Master, The (Jui kuen II) (1994), Score: 5.393851867385784
1. MovieID: 1924, Title: Plan 9 from Outer Space (1959), Score: 5.355094945391147
1. MovieID: 2583, Title: Cookie's Fortune (1999), Score: 5.218396254855086
1. MovieID: 3543, Title: Diner (1982), Score: 5.165398261006952
User 59's statisfaction:  0.9113830743121526
User 125's statisfaction:  0.9273836153306404
User 471's statisfaction:  0.806434