In [1]:
import pandas as pd
import math
from statistics import median

### (a) Download the MovieLens 100K rating dataset from https://grouplens.org/datasets/movielens/ (the small dataset recommended for education and development). Read the dataset, display the first few rows to understand it, and display the count of ratings (rows) in the dataset to be sure that you download it correctly.

In [2]:
ratings = pd.read_csv("dataset/ratings.csv")
n_rows = ratings.shape[0]

print("Number of rows: " + str(n_rows))
# Show the first 10 rows of the dataframe
print(ratings.head(10).to_string(index=False))

Number of rows: 100836
 userId  movieId  rating  timestamp
      1        1     4.0  964982703
      1        3     4.0  964981247
      1        6     4.0  964982224
      1       47     5.0  964983815
      1       50     5.0  964982931
      1       70     3.0  964982400
      1      101     5.0  964980868
      1      110     4.0  964982176
      1      151     5.0  964984041
      1      157     5.0  964984100


In [3]:
movies = pd.read_csv("dataset/movies.csv")
n_rows = movies.shape[0]

print("Number of rows: " + str(n_rows))
# Show the first 10 rows of the dataframe
print(movies.head(10).to_string(index=False))

Number of rows: 9742
 movieId                              title                                      genres
       1                   Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy
       2                     Jumanji (1995)                  Adventure|Children|Fantasy
       3            Grumpier Old Men (1995)                              Comedy|Romance
       4           Waiting to Exhale (1995)                        Comedy|Drama|Romance
       5 Father of the Bride Part II (1995)                                      Comedy
       6                        Heat (1995)                       Action|Crime|Thriller
       7                     Sabrina (1995)                              Comedy|Romance
       8                Tom and Huck (1995)                          Adventure|Children
       9                Sudden Death (1995)                                      Action
      10                   GoldenEye (1995)                   Action|Adventure|Thriller


### (b) Implement the user-based collaborative filtering approach, using the Pearson correlation function for computing similarities between users, and

$$sim(a, b) = \frac{\sum_{p \in P} (r_{a,p} - \bar{r_a})(r_{b,p} - \bar{r_b})}{\sqrt{\sum_{p \in P}(r_{a,p} - \bar{r_a})^2}\sqrt{\sum_{p \in P}(r_{b,p} - \bar{r_b})^2}}$$

In [4]:
num_users = ratings["userId"].nunique()
print(num_users)

610


In [5]:
def pearsonCorrelation(user1, user2):
    num, den1, den2 = 0, 0, 0
    ratings_user1 = ratings[ratings['userId'] == user1]
    rmean_user1 = ratings_user1['rating'].mean()
    ratings_user2 = ratings[ratings['userId'] == user2]
    rmean_user2 = ratings_user2['rating'].mean()

    # set of movieIds evaluated by user1
    items_user1 = set(ratings[ratings['userId'] == user1]['movieId'])
    # set of movieIds evaluated by user2
    items_user2 = set(ratings[ratings['userId'] == user2]['movieId'])
    # set of movieIds evaluated by both user1 and user2 (intersection)
    common_items = items_user1.intersection(items_user2)

    for p in common_items:
        r_1p = (ratings[(ratings['userId'] == user1) & (ratings['movieId'] == p)]['rating'].values)[0]
        r_2p = (ratings[(ratings['userId'] == user2) & (ratings['movieId'] == p)]['rating'].values)[0]
        num += (r_1p - rmean_user1)*(r_2p - rmean_user2)
        den1 += pow((r_1p - rmean_user1), 2)
        den2 += pow((r_2p - rmean_user2), 2)
    den = ((math.sqrt(den1))*(math.sqrt(den2)))
    if den == 0:
        return 0.0
    sim = num/den
    return sim

In [6]:
corr = pearsonCorrelation(3,9)

print(corr)

0.0


### (c) the prediction function presented in class for predicting movies scores.

$$pred(a,p)=\bar{r_a} + \frac{\sum_{b \in N}sim(a,b)*(r_{b,p}-\bar{r_b})}{\sum_{b \in N}sim(a,b)}$$

In [7]:
def generatePrediction(user1, item):
    #aggiungi controllo che il film non è stato valutato
    num, den = 0, 0
    ratings_user1 = ratings[ratings['userId'] == user1]
    if item in ratings_user1['movieId'].tolist():
        return "This movie has been rated by the user"
    rmean_user1 = ratings_user1['rating'].mean()
    # set of users who rated item 
    users = ratings[ratings['movieId'] == item]['userId'].unique()
    for u in users:
        ratings_u = ratings[ratings['userId'] == u]
        rmean_u = ratings_u['rating'].mean()
        r_up = (ratings_u[(ratings_u['movieId'] == item)]['rating'].values)[0]
        similarity = pearsonCorrelation(user1, u)
        num += similarity*(r_up - rmean_u)
        den += abs(similarity)
    if den == 0:
        pred = rmean_user1
    else:
        pred = rmean_user1 + (num/den)
    return pred

In [8]:
print(generatePrediction(1,2149))

3.7602052225633527


### (d) Select a user from the dataset, and for this user, show the 10 most similar users and the 10 most relevant movies that the recommender suggests.

In [9]:
def kMostSimilarUsers(user, k):
    users_similarities = []
    for u in range(1,num_users):
        if u != user:
            sim = pearsonCorrelation(user, u)
            users_similarities.append((u,sim))
    sorted_users_similarities = sorted(users_similarities, reverse=True, key=lambda x: x[1])
    top_k_users = [pair[0] for pair in sorted_users_similarities[:k]]
    top_k_sim = [pair[1] for pair in sorted_users_similarities[:k]]
    return top_k_users, top_k_sim


In [10]:
print(kMostSimilarUsers(3, 10))

([14, 34, 55, 65, 73, 75, 80, 82, 98, 121], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])


In [11]:
def get_unrated_movie_ids(userId):
    # Ottieni tutti i movieId unici presenti nel DataFrame ratings
    all_movie_ids = movies['movieId'].tolist()
    
    # Ottieni i movieId valutati dal userId specificato
    rated_movie_ids = ratings[ratings['userId'] == userId]['movieId'].tolist()
    
    # Trova i movieId non valutati dal userId
    unrated_movie_ids = [movie_id for movie_id in all_movie_ids if movie_id not in rated_movie_ids]
    
    return unrated_movie_ids

In [12]:
unrated_movies = get_unrated_movie_ids(1)
print(unrated_movies)

[2, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 34, 36, 38, 39, 40, 41, 42, 43, 44, 45, 46, 48, 49, 52, 53, 54, 55, 57, 58, 60, 61, 62, 63, 64, 65, 66, 68, 69, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 85, 86, 87, 88, 89, 92, 93, 94, 95, 96, 97, 99, 100, 102, 103, 104, 105, 106, 107, 108, 111, 112, 113, 116, 117, 118, 119, 121, 122, 123, 125, 126, 128, 129, 132, 135, 137, 140, 141, 144, 145, 146, 147, 148, 149, 150, 152, 153, 154, 155, 156, 158, 159, 160, 161, 162, 164, 165, 166, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 183, 184, 185, 186, 187, 188, 189, 190, 191, 193, 194, 195, 196, 198, 199, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 217, 218, 219, 220, 222, 224, 225, 227, 228, 229, 230, 232, 233, 234, 236, 237, 238, 239, 240, 241, 242, 243, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 261, 262, 263, 265, 266, 267, 269, 270, 27

In [13]:
def kMostRelevantMovies(user, k):
    unrated_movies = get_unrated_movie_ids(user)
    predictions = []
    for m in unrated_movies:
        print("m: "+str(m))
        pred = generatePrediction(user, m)
        predictions.append((m, pred)) 
    sorted_predictions = sorted(predictions, reverse=True, key=lambda x: x[1])
    top_k_predictions = sorted_predictions[:k]
    return top_k_predictions

In [14]:
from prediction_utils import calculate_prediction

In [15]:
from concurrent.futures import ThreadPoolExecutor

def kMostRelevantMoviesPar(user, k):
    unrated_movies = get_unrated_movie_ids(user)
    predictions = []

    # Funzione per calcolare la previsione in parallelo
    def calculate_prediction(movie_id):
        return movie_id, generatePrediction(user, movie_id)
    
    # Usa un ThreadPoolExecutor per eseguire i calcoli in parallelo
    with ThreadPoolExecutor() as executor:
        # Mappa ogni film non valutato alla sua previsione e raccogli i risultati
        futures = [executor.submit(calculate_prediction, m) for m in unrated_movies]
        for future in futures:
            movie_id, pred = future.result()
            print(movie_id)
            predictions.append((movie_id, pred))

    # Ordina le previsioni e seleziona le prime k
    sorted_predictions = sorted(predictions, reverse=True, key=lambda x: x[1])
    top_k_predictions = sorted_predictions[:k]
    return top_k_predictions


In [16]:
print(kMostRelevantMoviesPar(1,10))

2
4
5
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
34
36
38
39
40
41
42
43
44
45
46
48
49
52
53
54
55
57
58
60
61
62
63
64
65
66
68
69
71
72
73
74
75
76
77
78
79
80
81
82
83
85
86
87
88
89
92
93
94
95
96
97
99
100
102
103
104
105
106
107
108
111
112
113
116
117
118
119
121
122
123
125
126
128
129
132
135
137
140
141
144
145
146
147
148
149
150
152
153
154
155
156
158
159
160
161
162
164
165
166
168
169
170
171
172
173
174
175
176
177
178
179
180
181
183
184
185
186
187
188
189
190
191
193
194
195
196
198
199
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
217
218
219
220
222
224
225
227
228
229
230
232
233
234
236
237
238
239
240
241
242
243
246
247
248
249
250
251
252
253
254
255
256
257
258
259
261
262
263
265
266
267
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
287
288
289
290
291
292
293
294
295
298
299
300
301
302
303
304
305
306
307
308
310
311
312
313
314
315
317
318
319
320
321
322
324
325
326
327
328
329
330
331


### (e) Design and implement a new similarity function for computing similarities between users. Explain why this similarity function is useful for the collaborative filtering approach. Hint: Exploiting ideas from related works are highly encouraged.

$$sim(a, b)^{CPCC} = \frac{\sum_{p \in P} (r_{a,p} - {r_{med}})(r_{b,p} - {r_{med}})}{\sqrt{\sum_{p \in P}(r_{a,p} - {r_{med}})^2}\sqrt{\sum_{p \in P}(r_{b,p} - {r_{med}})^2}}$$

In [17]:
def constrainedPearsonCorrelation(user1, user2):
    num, den1, den2 = 0, 0, 0
    possible_ratings = ratings['rating'].unique().tolist()
    possible_ratings.sort()
    median_value = median(possible_ratings)

    # set of movieIds evaluated by user1
    items_user1 = set(ratings[ratings['userId'] == user1]['movieId'])
    # set of movieIds evaluated by user2
    items_user2 = set(ratings[ratings['userId'] == user2]['movieId'])
    # set of movieIds evaluated by both user1 and user2 (intersection)
    common_items = items_user1.intersection(items_user2)

    for p in common_items:
        r_1p = (ratings[(ratings['userId'] == user1) & (ratings['movieId'] == p)]['rating'].values)[0]
        r_2p = (ratings[(ratings['userId'] == user2) & (ratings['movieId'] == p)]['rating'].values)[0]
        num += (r_1p - median_value)*(r_2p - median_value)
        den1 += pow((r_1p - median_value), 2)
        den2 += pow((r_2p - median_value), 2)
    den = ((math.sqrt(den1))*(math.sqrt(den2)))
    if den == 0:
        return 0.0
    sim = num/den
    return sim