In [1]:
import pandas as pd
import math
from statistics import median
import numpy as np

### (a) Download the MovieLens 100K rating dataset from https://grouplens.org/datasets/movielens/ (the small dataset recommended for education and development). Read the dataset, display the first few rows to understand it, and display the count of ratings (rows) in the dataset to be sure that you download it correctly.

In [2]:
ratings = pd.read_csv("dataset/ratings.csv")
n_rows = ratings.shape[0]

print("Number of rows: " + str(n_rows))
# Show the first 10 rows of the dataframe
print(ratings.head(10).to_string(index=False))

Number of rows: 100836
 userId  movieId  rating  timestamp
      1        1     4.0  964982703
      1        3     4.0  964981247
      1        6     4.0  964982224
      1       47     5.0  964983815
      1       50     5.0  964982931
      1       70     3.0  964982400
      1      101     5.0  964980868
      1      110     4.0  964982176
      1      151     5.0  964984041
      1      157     5.0  964984100


In [3]:
movies = pd.read_csv("dataset/movies.csv")
n_rows = movies.shape[0]

print("Number of rows: " + str(n_rows))
# Show the first 10 rows of the dataframe
print(movies.head(10).to_string(index=False))

Number of rows: 9742
 movieId                              title                                      genres
       1                   Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy
       2                     Jumanji (1995)                  Adventure|Children|Fantasy
       3            Grumpier Old Men (1995)                              Comedy|Romance
       4           Waiting to Exhale (1995)                        Comedy|Drama|Romance
       5 Father of the Bride Part II (1995)                                      Comedy
       6                        Heat (1995)                       Action|Crime|Thriller
       7                     Sabrina (1995)                              Comedy|Romance
       8                Tom and Huck (1995)                          Adventure|Children
       9                Sudden Death (1995)                                      Action
      10                   GoldenEye (1995)                   Action|Adventure|Thriller


In [4]:
def create_user_movie_rating_matrix():
    user_movie_ratings_matrix = pd.pivot_table(ratings, values='rating', index='userId', columns='movieId')
    all_movie_ids = movies['movieId'].unique()
    user_movie_ratings_matrix = user_movie_ratings_matrix.reindex(columns=all_movie_ids)
    return user_movie_ratings_matrix

In [6]:
user_movie_ratings_matrix = create_user_movie_rating_matrix()
user_movie_ratings_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


### (b) Implement the user-based collaborative filtering approach, using the Pearson correlation function for computing similarities between users, and

$$sim(a, b) = \frac{\sum_{p \in I} (r_{a,p} - \bar{r_a})(r_{b,p} - \bar{r_b})}{\sqrt{\sum_{p \in I}(r_{a,p} - \bar{r_a})^2}\sqrt{\sum_{p \in I}(r_{b,p} - \bar{r_b})^2}}$$

In [7]:
num_users = ratings["userId"].nunique()
print(num_users)

610


In [8]:
def pearsonCorrelation(user1, user2):
    num, den1, den2 = 0.0, 0.0, 0.0
    ratings_user1 = ratings[ratings['userId'] == user1]
    rmean_user1 = ratings_user1['rating'].mean()
    ratings_user2 = ratings[ratings['userId'] == user2]
    rmean_user2 = ratings_user2['rating'].mean()

    # set of movieIds evaluated by user1
    items_user1 = set(ratings[ratings['userId'] == user1]['movieId'])
    # set of movieIds evaluated by user2
    items_user2 = set(ratings[ratings['userId'] == user2]['movieId'])
    # set of movieIds evaluated by both user1 and user2 (intersection)
    common_items = items_user1.intersection(items_user2)

    for p in common_items:
        r_1p = (ratings[(ratings['userId'] == user1) & (ratings['movieId'] == p)]['rating'].values)[0]
        r_2p = (ratings[(ratings['userId'] == user2) & (ratings['movieId'] == p)]['rating'].values)[0]
        num += (r_1p - rmean_user1)*(r_2p - rmean_user2)
        den1 += pow((r_1p - rmean_user1), 2)
        den2 += pow((r_2p - rmean_user2), 2)
    den = ((math.sqrt(den1))*(math.sqrt(den2)))
    if den == 0.0:
        return 0.0
    sim = num/den
    return sim

In [9]:
corr = pearsonCorrelation(1,21)

print(corr)

0.08648178161396775


In [10]:
def compute_user_similarities(user):
    user_list = list(range(1, 611, 1))
    user_list.remove(user)
    user_similarities = {}
    for other_user in user_list:
        if other_user != user:
            similarity = pearsonCorrelation(user, other_user)
            user_similarities[other_user] = similarity
    return user_similarities

In [21]:
user_similarities = compute_user_similarities(1)

### (c) the prediction function presented in class for predicting movies scores.

$$pred(a, p) = \bar{r_a} + \frac{\sum_{b \in N} sim(a, b) * (r_{b,p} - \bar{r_b})} {\sum_{b \in N}sim(a, b)}$$

In [11]:
def generatePrediction(user1, item, user_movie_ratings_matrix, user_similarity_dict):
    # Aggiungi controllo che il film non è stato valutato
    if not np.isnan(user_movie_ratings_matrix.at[user1, item]):
        return user_movie_ratings_matrix.at[user1, item]
    
    # Calcola la previsione utilizzando la matrice delle valutazioni e il dizionario delle similarità
    num, den = 0.0, 0.0
    rmean_user1 = user_movie_ratings_matrix.loc[user1].mean()
    
    users_who_rated_item = user_movie_ratings_matrix[item].dropna().index.unique()
    for u in users_who_rated_item:
        rmean_u = user_movie_ratings_matrix.loc[u].mean()
        r_up = user_movie_ratings_matrix.at[u, item]
        similarity = user_similarity_dict[u]
        num += similarity * (r_up - rmean_u)
        den += abs(similarity)
    if den == 0.0:
        pred = rmean_user1
    else:
        pred = rmean_user1 + (num / den)
    return pred

In [20]:
print(generatePrediction(1,5105,create_user_movie_rating_matrix(),compute_user_similarities(1)))

7.790948275862069


### (d) Select a user from the dataset, and for this user, show the 10 most similar users and the 10 most relevant movies that the recommender suggests.

In [187]:
def kMostSimilarUsers(user, k):
    users_similarities = []
    for u in range(1,num_users):
        if u != user:
            sim = pearsonCorrelation(user, u)
            users_similarities.append((u,sim))
    sorted_users_similarities = sorted(users_similarities, reverse=True, key=lambda x: x[1])
    top_k_users = [pair[0] for pair in sorted_users_similarities[:k]]
    top_k_sim = [pair[1] for pair in sorted_users_similarities[:k]]
    return top_k_users, top_k_sim

In [188]:
print(kMostSimilarUsers(3, 10))

([14, 34, 55, 65, 73, 75, 80, 82, 98, 121], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])


In [14]:
def get_unrated_movie_ids(userId):
    # Ottieni tutti i movieId unici presenti nel DataFrame ratings
    all_movie_ids = movies['movieId'].tolist()
    
    # Ottieni i movieId valutati dal userId specificato
    rated_movie_ids = ratings[ratings['userId'] == userId]['movieId'].tolist()
    
    # Trova i movieId non valutati dal userId
    unrated_movie_ids = [movie_id for movie_id in all_movie_ids if movie_id not in rated_movie_ids]
    
    return unrated_movie_ids

In [22]:
unrated_movies = get_unrated_movie_ids(1)

In [23]:
def kMostRelevantMovies(user, k):
    user_movie_ratings_matrix = create_user_movie_rating_matrix()
    unrated_movies = get_unrated_movie_ids(user)
    predictions = []
    user_similarities_dict = compute_user_similarities(user)
    for m in unrated_movies:
        pred = generatePrediction(user, m, user_movie_ratings_matrix,user_similarities_dict)
        predictions.append((m, pred)) 
    sorted_predictions = sorted(predictions, reverse=True, key=lambda x: x[1])
    top_k_predictions = sorted_predictions[:k]
    return top_k_predictions

In [24]:
print(kMostRelevantMovies(1,10))

[(5105, 7.790948275862069), (6967, 7.790948275862069), (7114, 7.790948275862069), (7742, 7.790948275862069), (175475, 7.571603190941842), (184641, 7.571603190941842), (168712, 7.462533156498673), (3604, 7.3393522833178), (97024, 7.144950738916256), (40491, 7.120924764890281)]


### (e) Design and implement a new similarity function for computing similarities between users. Explain why this similarity function is useful for the collaborative filtering approach. Hint: Exploiting ideas from related works are highly encouraged.

#### Constrained Pearson Correlation Coefficient (CPCC)
$$sim(a, b)^{CPCC} = \frac{\sum_{p \in I} (r_{a,p} - r_{med})(r_{b,p} - r_{med})}{\sqrt{\sum_{p \in I}(r_{a,p} - r_{med})^2}\sqrt{\sum_{p \in I}(r_{b,p} - r_{med})^2}}$$

In [193]:
def constrainedPearsonCorrelation(user1, user2):
    num, den1, den2 = 0.0, 0.0, 0.0
    possible_ratings = ratings['rating'].unique().tolist()
    possible_ratings.sort()
    print(possible_ratings)
    median_value = median(possible_ratings)
    print(median_value)

    # set of movieIds evaluated by user1
    items_user1 = set(ratings[ratings['userId'] == user1]['movieId'])
    # set of movieIds evaluated by user2
    items_user2 = set(ratings[ratings['userId'] == user2]['movieId'])
    # set of movieIds evaluated by both user1 and user2 (intersection)
    common_items = items_user1.intersection(items_user2)

    for p in common_items:
        r_1p = (ratings[(ratings['userId'] == user1) & (ratings['movieId'] == p)]['rating'].values)[0]
        r_2p = (ratings[(ratings['userId'] == user2) & (ratings['movieId'] == p)]['rating'].values)[0]
        num += (r_1p - median_value)*(r_2p - median_value)
        den1 += pow((r_1p - median_value), 2)
        den2 += pow((r_2p - median_value), 2)
    den = ((math.sqrt(den1))*(math.sqrt(den2)))
    if den == 0.0:
        return 0.0
    sim = num/den
    return sim

In [194]:
corr = constrainedPearsonCorrelation(1,21)

print(corr)

[0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]
2.75
0.6593415600314004


#### Weighted Pearson Correlation Coefficient (WPCC)
$$ sim(a,b)^{WPCC}=\begin{cases} sim(a,b)^{PCC}*\frac{|I|}{H}, & |I|\le{H} \\ sim(a,b)^{PCC}, & otherwise
\end{cases} $$
##### where I represents the set of common rating items by user u and v and H is an experimental value and it is set 50 

In [195]:
def weightedPearsonCorrelation(user1, user2):
    # set of movieIds evaluated by user1
    items_user1 = set(ratings[ratings['userId'] == user1]['movieId'])
    # set of movieIds evaluated by user2
    items_user2 = set(ratings[ratings['userId'] == user2]['movieId'])
    # set of movieIds evaluated by both user1 and user2 (intersection)
    common_items = items_user1.intersection(items_user2)
    
    similarity = pearsonCorrelation(user1, user2)
    I = len(common_items)
    H = 50
    if I <= H:
        sim = similarity * (I/H)
    else:
        sim = similarity
    return sim

In [196]:
corr = weightedPearsonCorrelation(1,21)

print(corr)

0.08129287471712968


#### Sigmoid Function based Pearson correlation coefficient (SPCC) 
$$sim(a, b)^{SPCC} = sim(a, b)^{PCC} * \frac{1}{1 + \exp{-(\frac{|I|}{2})}}$$

In [197]:
def sigmoidFunctionBasedPearsonCorrelation(user1, user2):
    similarity = pearsonCorrelation(user1,user2)
    # set of movieIds evaluated by user1
    items_user1 = set(ratings[ratings['userId'] == user1]['movieId'])
    # set of movieIds evaluated by user2
    items_user2 = set(ratings[ratings['userId'] == user2]['movieId'])
    # set of movieIds evaluated by both user1 and user2 (intersection)
    common_items = items_user1.intersection(items_user2)
    I = len(common_items)
    sim = similarity * (1/(1+math.exp(-(I/2))))
    return sim

In [198]:
corr = sigmoidFunctionBasedPearsonCorrelation(1,21)

print(corr)

0.08648178160858498


####  Adjusted Cosine Measure
$$sim(a, b)^{ACOS} = \frac{\sum_{p \in P} (r_{a,p} - \bar{r_a})(r_{b,p} - \bar{r_b})}{\sqrt{\sum_{p \in P}(r_{a,p} - \bar{r_a})^2}\sqrt{\sum_{p \in P}(r_{b,p} - \bar{r_b})^2}}$$
##### where $P$ is the set of all items. If user $u$ has not rated the item $p∈P$, the rating $r_{u,p}$ is zero.

In [199]:
def adjustedCosineMeasure(user1, user2):
    num, den1, den2 = 0.0, 0.0, 0.0
    ratings_user1 = ratings[ratings['userId'] == user1]
    rmean_user1 = ratings_user1['rating'].mean()
    ratings_user2 = ratings[ratings['userId'] == user2]
    rmean_user2 = ratings_user2['rating'].mean()

    all_items = movies['movieId'].tolist()
    
    # set of movieIds evaluated by user1
    items_user1 = set(ratings[ratings['userId'] == user1]['movieId'])
    # set of movieIds evaluated by user2
    items_user2 = set(ratings[ratings['userId'] == user2]['movieId'])

    for p in all_items:
        if p not in items_user1:
            r_1p = 0.0
        else:
            r_1p = (ratings[(ratings['userId'] == user1) & (ratings['movieId'] == p)]['rating'].values)[0]
        if p not in items_user2:
            r_2p = 0.0
        else:
            r_2p = (ratings[(ratings['userId'] == user2) & (ratings['movieId'] == p)]['rating'].values)[0]
        num += (r_1p - rmean_user1)*(r_2p - rmean_user2)
        den1 += pow((r_1p - rmean_user1), 2)
        den2 += pow((r_2p - rmean_user2), 2)
    den = ((math.sqrt(den1))*(math.sqrt(den2)))
    if den == 0.0:
        return 0.0
    sim = num/den
    return sim

In [200]:
corr = adjustedCosineMeasure(1,21)

print(corr)

0.9671568115412114


#### Jaccard
$$sim(a, b)^{Jaccard} = \frac{|I_a| \cap {|I_b|}}{|I_a| \cup {|I_b|}}$$

In [201]:
def jaccard(user1, user2):
    # set of movieIds evaluated by user1
    items_user1 = set(ratings[ratings['userId'] == user1]['movieId'])
    # set of movieIds evaluated by user2
    items_user2 = set(ratings[ratings['userId'] == user2]['movieId'])

    intersection = len(items_user1.intersection(items_user2))
    union = len(items_user1.union(items_user2))
    return intersection / union

In [202]:
corr = jaccard(1,21)

print(corr)

0.07484076433121019


#### Mean Squared Difference (MSD)
$$sim(a, b)^{MSD} = 1 - \frac{\sum_{p \in I} (r_{a,p} - r_{b,p})^2}{|I|}$$

In [203]:
def meanSquaredDifference(user1, user2):
    num = 0.0
    # set of movieIds evaluated by user1
    items_user1 = set(ratings[ratings['userId'] == user1]['movieId'])
    # set of movieIds evaluated by user2
    items_user2 = set(ratings[ratings['userId'] == user2]['movieId'])
    # set of movieIds evaluated by both user1 and user2 (intersection)
    common_items = items_user1.intersection(items_user2)
    I = len(common_items)

    for p in common_items:    
        r_1p = (ratings[(ratings['userId'] == user1) & (ratings['movieId'] == p)]['rating'].values)[0]
        r_2p = (ratings[(ratings['userId'] == user2) & (ratings['movieId'] == p)]['rating'].values)[0]
        num += pow((r_1p - r_2p), 2)
    
    if I != 0:
        sim = 1 - (num / I)
    else:
        sim = 0.0
    return sim
    

In [204]:
corr = meanSquaredDifference(1,21)

print(corr)

-0.7606382978723405


####  Jaccard and MSD can be combined to form a new metric. 
$$sim(a, b)^{JMSD} = sim(a, b)^{Jaccard}*sim(a, b)^{MSD}

In [205]:
def JaccardAndMSD(user1, user2):
    return jaccard(user1,user2)*meanSquaredDifference(user1,user2)

In [206]:
corr = JaccardAndMSD(1,21)

print(corr)

-0.05692675159235669
