In [17]:
import pandas as pd
import math
import numpy as np

In [18]:
# Load the user ratings table and report its size.
ratings = pd.read_csv("dataset/ratings.csv")
n_rows = len(ratings)

print(f"Number of rows: {n_rows}")
# Preview the first 10 rows, printed without the index column
print(ratings.head(n=10).to_string(index=False))

Number of rows: 100836
 userId  movieId  rating  timestamp
      1        1     4.0  964982703
      1        3     4.0  964981247
      1        6     4.0  964982224
      1       47     5.0  964983815
      1       50     5.0  964982931
      1       70     3.0  964982400
      1      101     5.0  964980868
      1      110     4.0  964982176
      1      151     5.0  964984041
      1      157     5.0  964984100


In [19]:
# Load the movie metadata table and report its size.
movies = pd.read_csv("dataset/movies.csv")
n_rows = len(movies)

print(f"Number of rows: {n_rows}")
# Preview the first 10 rows, printed without the index column
print(movies.head(n=10).to_string(index=False))

Number of rows: 9742
 movieId                              title                                      genres
       1                   Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy
       2                     Jumanji (1995)                  Adventure|Children|Fantasy
       3            Grumpier Old Men (1995)                              Comedy|Romance
       4           Waiting to Exhale (1995)                        Comedy|Drama|Romance
       5 Father of the Bride Part II (1995)                                      Comedy
       6                        Heat (1995)                       Action|Crime|Thriller
       7                     Sabrina (1995)                              Comedy|Romance
       8                Tom and Huck (1995)                          Adventure|Children
       9                Sudden Death (1995)                                      Action
      10                   GoldenEye (1995)                   Action|Adventure|Thriller


In [20]:
# Users x movies rating matrix. Columns are extended to every movieId listed in
# `movies`, so movies that nobody rated appear as all-NaN columns.
all_movie_ids = movies['movieId'].unique()
user_movie_ratings_matrix = (
    ratings
    .pivot_table(index='userId', columns='movieId', values='rating')
    .reindex(columns=all_movie_ids)
)
user_movie_ratings_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [21]:
def _first_rating_by_movie(user_ratings):
    """Map movieId -> rating for one user's rows, keeping the first row per movie."""
    rating_by_movie = {}
    for movie_id, rating in zip(user_ratings['movieId'].tolist(), user_ratings['rating'].tolist()):
        # setdefault keeps the first occurrence if a movie appears more than once
        rating_by_movie.setdefault(movie_id, rating)
    return rating_by_movie


def pearsonCorrelation(user1, user2):
    """Pearson correlation between two users, computed over their co-rated movies.

    Each user's mean is taken over ALL of that user's ratings in the global
    `ratings` frame (not only the co-rated movies); the sums in the numerator
    and denominator run over the movies rated by both users.

    Parameters
    ----------
    user1, user2 : values compared against ratings['userId'].

    Returns
    -------
    float
        The correlation, or 0.0 when the denominator is zero (no co-rated
        movies, or one user has no deviation from their mean on them).
    """
    # Filter the frame once per user instead of once per (user, movie) pair.
    ratings_user1 = ratings[ratings['userId'] == user1]
    ratings_user2 = ratings[ratings['userId'] == user2]
    rmean_user1 = ratings_user1['rating'].mean()
    rmean_user2 = ratings_user2['rating'].mean()

    # Dict lookups replace a full boolean-mask scan of `ratings` per movie.
    rating_of_user1 = _first_rating_by_movie(ratings_user1)
    rating_of_user2 = _first_rating_by_movie(ratings_user2)

    # set of movieIds evaluated by user1
    items_user1 = set(ratings_user1['movieId'])
    # set of movieIds evaluated by user2
    items_user2 = set(ratings_user2['movieId'])
    # set of movieIds evaluated by both user1 and user2 (intersection)
    common_items = items_user1.intersection(items_user2)

    num, den1, den2 = 0.0, 0.0, 0.0
    for p in common_items:
        dev1 = rating_of_user1[p] - rmean_user1
        dev2 = rating_of_user2[p] - rmean_user2
        num += dev1 * dev2
        den1 += pow(dev1, 2)
        den2 += pow(dev2, 2)

    den = math.sqrt(den1) * math.sqrt(den2)
    if den == 0.0:
        # Undefined correlation is reported as "no similarity".
        return 0.0
    return num / den

In [34]:
def all_users_similarities():
    """Compute the symmetric user-user similarity table for every user in `ratings`.

    Returns
    -------
    dict
        Nested mapping ``similarities[user_a][user_b] -> float`` containing an
        entry for every ordered pair of users. A user's similarity with
        themselves is 1.0, users with no movie in common get 0.0, and every
        other pair gets ``pearsonCorrelation(user_a, user_b)``.
    """
    all_user_ids = ratings['userId'].unique()

    # Create every user's row up front: the symmetric write below targets rows
    # of users the outer loop has not reached yet, and a row must never be
    # reset once it already holds mirrored values.
    user_similarities = {user: {} for user in all_user_ids}

    # Collect each user's rated movies in one pass instead of re-filtering the
    # whole ratings frame for every pair of users.
    movies_by_user = {
        user: set(movie_ids)
        for user, movie_ids in ratings.groupby('userId')['movieId']
    }

    for i, user1 in enumerate(all_user_ids):
        print(i)
        # Only the upper triangle (j >= i) is computed; the mirror is stored too.
        for j in range(i, len(all_user_ids)):
            user2 = all_user_ids[j]
            if i == j:
                similarity = 1.0  # A user's similarity with themselves is always 1
            elif movies_by_user[user1].isdisjoint(movies_by_user[user2]):
                similarity = 0.0  # No movie in common, similarity 0
            else:
                similarity = pearsonCorrelation(user1, user2)
            user_similarities[user1][user2] = similarity
            user_similarities[user2][user1] = similarity  # Store the symmetric entry
    return user_similarities

In [35]:
similarity_dict = all_users_similarities()
# Show only a small corner of the table: the full nested dict holds one entry
# per ordered pair of users and would flood the cell output.
{
    user: dict(list(row.items())[:5])
    for user, row in list(similarity_dict.items())[:5]
}

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


KeyboardInterrupt: 

In [7]:
def generatePrediction(user1, item, user_movie_ratings_matrix, user_similarity_dict):
    """Predict the rating of `user1` for `item` as a similarity-weighted mean deviation.

    Parameters
    ----------
    user1 : row label of the target user in `user_movie_ratings_matrix`.
    item : column label of the movie in `user_movie_ratings_matrix`.
    user_movie_ratings_matrix : DataFrame of ratings (rows = users, columns = movies),
        with NaN where no rating exists.
    user_similarity_dict : mapping indexed by user label; presumably the similarity
        between that user and `user1` - confirm against the caller.

    Returns
    -------
    The stored rating when `user1` already rated `item`. Otherwise the mean rating of
    `user1` plus the weighted average of the other raters' deviations from their own
    means; just the mean of `user1` when the absolute similarities sum to zero.
    """
    # Nothing to predict when a rating is already present.
    known_rating = user_movie_ratings_matrix.at[user1, item]
    if not np.isnan(known_rating):
        return known_rating

    target_mean = user_movie_ratings_matrix.loc[user1].mean()
    raters = user_movie_ratings_matrix[item].dropna().index.unique()

    weighted_deviation_sum = 0.0
    weight_total = 0.0
    for rater in raters:
        rater_mean = user_movie_ratings_matrix.loc[rater].mean()
        rater_score = user_movie_ratings_matrix.at[rater, item]
        weight = user_similarity_dict[rater]
        weighted_deviation_sum += weight * (rater_score - rater_mean)
        weight_total += abs(weight)

    # With no usable neighbours, fall back to the target user's own mean.
    if weight_total == 0.0:
        return target_mean
    return target_mean + (weighted_deviation_sum / weight_total)

In [10]:
def generateCompleteMatrix(user_movie_ratings_matrix):
    """Fill every missing cell of the ratings matrix with a predicted rating.

    Predictions are computed from a snapshot of the ratings as they were on
    entry, so a value predicted for one cell is never treated as an observed
    rating when predicting another cell.

    Parameters
    ----------
    user_movie_ratings_matrix : DataFrame (rows = users, columns = movies) with
        NaN where no rating exists. It is filled IN PLACE.

    Returns
    -------
    The same DataFrame object that was passed in, now without missing cells
    (as far as `generatePrediction` returns non-NaN values).
    """
    # NOTE(review): compute_user_similarities is not defined in the visible
    # cells of this notebook - confirm it exists before a fresh run.
    # Snapshot of the observed ratings; costs one extra copy of the matrix.
    observed = user_movie_ratings_matrix.copy()
    for user in observed.index:
        print(user)
        user_similarity_dict = compute_user_similarities(user)
        user_row = observed.loc[user]
        for movie in user_row[user_row.isna()].index:
            prediction = generatePrediction(user, movie, observed, user_similarity_dict)
            user_movie_ratings_matrix.loc[user, movie] = prediction
    # Return the matrix so `completed = generateCompleteMatrix(m)` is usable.
    return user_movie_ratings_matrix

In [11]:
# Fill the missing cells of the global ratings matrix (the matrix passed in is modified in place).
# NOTE(review): this is very slow on the full dataset - the recorded run was interrupted while processing user 1.
completeMatrix = generateCompleteMatrix(user_movie_ratings_matrix)

1


KeyboardInterrupt: 

### (a) To produce group recommendations, we will use the user-based collaborative filtering approach as implemented in Assignment 1. Specifically, we will first compute the movie recommendations for each user in the group, and then aggregate the lists of the individual users so as to produce a single list of movies for the group. You will implement two well-established aggregation methods for producing the group recommendations.

### The first aggregation approach is the <em>average method</em>. The main idea behind this approach is that all members are considered equal. So, the rating of an item for a group of users is given by averaging the scores of that item across all group members.

In [76]:
def averageMethod(group, item):
    """Group score for `item` under the average aggregation strategy.

    For each member of `group`, a rating for `item` is obtained from
    `generatePrediction` (using the global ratings matrix and that member's
    similarities); the group score is the arithmetic mean of those ratings.
    """
    member_scores = [
        generatePrediction(
            member,
            item,
            user_movie_ratings_matrix,
            compute_user_similarities(member),
        )
        for member in group
    ]
    return sum(member_scores, 0.0) / len(group)

In [77]:
# Average-method group score of movieId 50 for the group of users 1, 4, 9 and 18.
print(averageMethod([1,4,9,18],50))

4.317527248350769


### The second aggregation method is the <em>least misery method</em>, where one member can act as a veto for the rest of the group. In this case, the rating of an item for a group of users is computed as the minimum score assigned to that item across all group members' recommendations.

In [80]:
def leastMiseryMethod(group, item):
    """Group score for `item` under the least-misery aggregation strategy.

    For each member of `group`, a rating for `item` is obtained from
    `generatePrediction` (using the global ratings matrix and that member's
    similarities); the group score is the lowest of those ratings.
    """
    member_scores = [
        generatePrediction(
            member,
            item,
            user_movie_ratings_matrix,
            compute_user_similarities(member),
        )
        for member in group
    ]
    return min(member_scores)

In [81]:
# Least-misery group score of movieId 50 for the group of users 1, 4, 9 and 18.
print(leastMiseryMethod([1,4,9,18],50))

3.6090081951346242


### Produce a group of 3 users and, for this group, show the top-10 recommendations, i.e., the 10 movies with the highest prediction scores that (i) the average method suggests, and (ii) the least misery method suggests. Use the MovieLens 100K rating dataset.

In [82]:
def topKMovies(group, k, aggregation=None):
    """Return the `k` movies with the highest group score.

    Parameters
    ----------
    group : collection of user ids forming the group.
    k : number of (movieId, score) pairs to return.
    aggregation : optional callable ``aggregation(group, movieId) -> score``
        used to score a movie for the group. Defaults to `averageMethod`,
        which was previously hard-coded; pass `leastMiseryMethod` to rank by
        the least-misery strategy.

    Returns
    -------
    list of (movieId, score) tuples sorted by score, highest first, limited
    to the first `k` entries. Every movieId in `movies` is scored.
    """
    if aggregation is None:
        # Resolved at call time so the default keeps the original behaviour.
        aggregation = averageMethod
    all_movie_ids = movies['movieId'].tolist()
    predictions = []
    for m in all_movie_ids:
        print("m: "+str(m))
        predictions.append((m, aggregation(group, m)))
    sorted_predictions = sorted(predictions, reverse=True, key=lambda x: x[1])
    return sorted_predictions[:k]

In [84]:
# Top-10 movies for the group of users 1, 18 and 23.
# NOTE(review): scores every movie in `movies`; the recorded run was interrupted after two movies.
print(topKMovies([1,18,23],10))

m: 1
m: 2


KeyboardInterrupt: 

In [85]:
import concurrent.futures

def topKMoviesPar(group, k, aggregation=None):
    """Thread-pool variant of the top-k group recommendation.

    Parameters
    ----------
    group : collection of user ids forming the group.
    k : number of (movieId, score) pairs to return.
    aggregation : optional callable ``aggregation(group, movieId) -> score``
        used to score a movie for the group. Defaults to `averageMethod`,
        which was previously hard-coded; pass `leastMiseryMethod` to rank by
        the least-misery strategy.

    Returns
    -------
    list of (movieId, score) tuples sorted by score, highest first, limited
    to the first `k` entries. Every movieId in `movies` is scored.
    """
    if aggregation is None:
        # Resolved at call time so the default keeps the original behaviour.
        aggregation = averageMethod
    all_movie_ids = movies['movieId'].tolist()

    def process_movie(m):
        # Runs on a worker thread; progress lines may interleave across threads.
        print("m: " + str(m))
        return (m, aggregation(group, m))

    # NOTE(review): the scoring is Python/pandas work, so threads may give
    # little speed-up - measure before relying on this variant.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # executor.map yields results in the order of `all_movie_ids`.
        predictions = list(executor.map(process_movie, all_movie_ids))

    sorted_predictions = sorted(predictions, reverse=True, key=lambda x: x[1])
    return sorted_predictions[:k]