# 4. Item-based Collaborative Filtering

## Data loading

In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
RANDOM_STATE = 5482

We load the modified user listening history produced by the data_generator.ipynb notebook. It only contains users who have listened to more than 5 songs, which reduces the noise introduced by users with very few listened songs.

In [4]:
df_users = pd.read_csv('data/User Listening History_modified.csv')
df_users.shape

(8332242, 3)

In [3]:
df_music = pd.read_csv('data/Million Song Dataset kaggle/Music Info.csv')

In [5]:
num_distinct_users = df_users['user_id'].nunique()
num_distinct_users

464573

In [6]:
df_music_info = df_music[['track_id', 'name', 'artist', 'energy']]
df_music_info.head()

Unnamed: 0,track_id,name,artist,energy
0,TRIOREW128F424EAF0,Mr. Brightside,The Killers,0.918
1,TRRIVDJ128F429B0E8,Wonderwall,Oasis,0.892
2,TROUVHL128F426C441,Come as You Are,Nirvana,0.826
3,TRUEIND128F93038C4,Take Me Out,Franz Ferdinand,0.664
4,TRLNZBD128F935E4D8,Creep,Radiohead,0.43


In [7]:
# Number of listening-history rows that reference each track, stored under the
# column name 'playcount' (presumably one row per user/track pair, i.e. the number
# of distinct listeners -- verify against data_generator.ipynb).
track_listen_counts = df_users.groupby('track_id').size().reset_index(name='playcount')

# The left join keeps every track of the catalogue, including tracks that never
# appear in the listening history. Only the count is legitimately 0 when it is
# missing: a blanket fillna(0) would also overwrite missing name/artist/energy
# values with 0, so the fill is restricted to 'playcount' (kept as an integer).
df_music_count = (
    df_music_info
    .merge(track_listen_counts, on='track_id', how='left')
    .assign(playcount=lambda frame: frame['playcount'].fillna(0).astype(int))
)
print(df_music_count.shape)
df_music_count.head()

(50683, 5)


Unnamed: 0,track_id,name,artist,energy,playcount
0,TRIOREW128F424EAF0,Mr. Brightside,The Killers,0.918,32.0
1,TRRIVDJ128F429B0E8,Wonderwall,Oasis,0.892,283.0
2,TROUVHL128F426C441,Come as You Are,Nirvana,0.826,0.0
3,TRUEIND128F93038C4,Take Me Out,Franz Ferdinand,0.664,17.0
4,TRLNZBD128F935E4D8,Creep,Radiohead,0.43,0.0


## Interaction Matrix

In [8]:
# Highest play count each user reached on any single track. One row per user,
# with columns 'user_id' and 'max_playcount'.
df_users_agg = (
    df_users
    .groupby('user_id')['playcount']
    .max()
    .rename('max_playcount')
    .reset_index()
)

In [9]:
df_users_rating = df_users.merge(df_users_agg, on='user_id')

We create two interaction matrices:
- Binary version: 1 indicates that the user has interacted with the song and 0 indicates that they have not.
- Weighted version: rating in the range (0,1], calculated as (user playcount of the song) / (playcount of the user's most listened song). It gives a higher rating to the songs a user has listened to the most.

In [10]:
df_users_rating['rating'] = df_users_rating['playcount'] / df_users_rating['max_playcount']
df_users_rating.head()

Unnamed: 0,track_id,user_id,playcount,max_playcount,rating
0,TRIRLYL128F42539D1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1,2,0.5
1,TRFUPBA128F934F7E1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1,2,0.5
2,TRLQPQJ128F42AA94F,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1,2,0.5
3,TRTUCUY128F92E1D24,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1,2,0.5
4,TRHDDQG12903CB53EE,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1,2,0.5


We cannot build the interaction matrix with the next line (a dense pivot table), because it would need 218 GB of RAM to create the table.

In [21]:
#df_music_pivoted = df_users.pivot(index='user_id', columns='track_id', values='rating').fillna(0)

Also, the data in the interaction matrix will be very sparse, due to users only listening to a small subset of the songs. Because of that, we will use a sparse matrix

In [None]:
from scipy.sparse import csc_matrix

We codify each user and song to a unique integer ID

In [13]:
user_codes, user_uniques = pd.factorize(df_users['user_id'])
track_codes, track_uniques = pd.factorize(df_users['track_id'])

We create the interaction matrix as a sparse CSC matrix. We use CSC and not CSR because in item-based collaborative filtering we will be accessing the data by columns, which CSC is optimized for.

In [14]:
# user_codes / track_codes were factorized from df_users, while the ratings are read
# from df_users_rating (df_users merged with the per-user maximum). The weighted
# matrix is only correct if both frames list the same (user, track) rows in the same
# order, so this is checked explicitly instead of relying on the merge keeping the
# row order.
assert len(df_users_rating) == len(df_users), "df_users_rating and df_users have a different number of rows"
assert np.array_equal(df_users_rating['user_id'].to_numpy(), df_users['user_id'].to_numpy()), "user rows are not aligned"
assert np.array_equal(df_users_rating['track_id'].to_numpy(), df_users['track_id'].to_numpy()), "track rows are not aligned"

# Rows are users, columns are tracks.
# NOTE(review): scipy sums duplicate (row, column) entries when building the matrix,
# so a (user, track) pair listed twice would yield values above 1 -- assumed not to
# happen in this dataset; TODO confirm.
interaction_shape = (len(user_uniques), len(track_uniques))
interaction_matrix_csc_binary = csc_matrix((np.ones(len(df_users)), (user_codes, track_codes)), shape=interaction_shape)
interaction_matrix_csc_weighted = csc_matrix((df_users_rating['rating'].to_numpy(), (user_codes, track_codes)), shape=interaction_shape)

In [15]:
interaction_matrix_csc_binary.shape

(464573, 30201)

In [22]:
memory_usage_bytes = (
    interaction_matrix_csc_weighted.data.nbytes +
    interaction_matrix_csc_weighted.indptr.nbytes +
    interaction_matrix_csc_weighted.indices.nbytes
)
memory_usage_mb = memory_usage_bytes / (1024 ** 2)
print(f"Disk space occupied by csc interaction matrix: {memory_usage_mb:.2f} MB")

Disk space occupied by csc interaction matrix: 95.47 MB


The sparse matrix takes only 95.47 MB, whereas the equivalent dense DataFrame would need 218 GB.

In [19]:
interaction_matrix_csc_binary[:10, :10].todense()

matrix([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [20]:
interaction_matrix_csc_weighted[:10, :10].todense()

matrix([[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 1. , 0.5],
        [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
        [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
        [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
        [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
        [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
        [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
        [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
        [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
        [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ]])

## Item based collaborative filtering model

We use cosine similarity as a similarity metric. We can precalculate the similarities.

In [21]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csc_matrix

# Item-based (track-based) cosine similarity.
# interaction_matrix_csc_weighted is users x tracks, so it is transposed to put the
# tracks on the rows; dense_output=False asks for a sparse result.
item_based_similarity = cosine_similarity(interaction_matrix_csc_weighted.T, dense_output=False)
# Store the similarity matrix in CSC format.
item_based_similarity = csc_matrix(item_based_similarity)

Only 7 seconds to compute

In [23]:
memory_usage_bytes = (
    item_based_similarity.data.nbytes +
    item_based_similarity.indptr.nbytes +
    item_based_similarity.indices.nbytes
)
memory_usage_mb = memory_usage_bytes / (1024 ** 2)
print(f"Disk space occupied by item_based_similarity matrix: {memory_usage_mb:.2f} MB")

Disk space occupied by item_based_similarity matrix: 934.39 MB


In [24]:
item_based_similarity.shape

(30201, 30201)

The size of the matrix is ($num_{songs}$ x $num_{songs}$)

In [26]:
class ItemBasedRecommender:
    """Item-based collaborative-filtering recommender with energy-aware song picking.

    Candidate tracks are scored by multiplying the item-item similarity matrix with
    the user's row of the interaction matrix. The best candidates are kept in
    ``self.recommendations`` as a list of
    ``(track_id, energy, similarity, has_been_recommended)`` tuples, ordered from the
    highest to the lowest score.

    Parameters
    ----------
    interaction_matrix : scipy sparse matrix
        Users on the rows, tracks on the columns.
    item_similarity : scipy sparse matrix
        Track x track similarity matrix.
    track_uniques : pandas.Index or numpy.ndarray
        Track id of every column of ``interaction_matrix``.
    df_music_info : pandas.DataFrame
        Track metadata with at least the columns ``track_id`` and ``energy``. It is
        indexed by ``track_id`` once at construction time, so later changes to the
        frame are not picked up by an existing recommender.
    """

    def __init__(self, interaction_matrix, item_similarity, track_uniques, df_music_info):
        self.interaction_matrix = interaction_matrix
        self.item_similarity = item_similarity
        self.track_uniques = track_uniques
        self.df_music_info = df_music_info
        # Indexing the metadata is loop-invariant work: do it once here instead of
        # on every make_recommendations / get_recommendations_info call.
        self._music_by_track = df_music_info.set_index('track_id')
        self.user_index = None
        self.recommendations = None

    def _require_recommendations(self):
        """Raise ValueError when make_recommendations has not been called yet."""
        if self.recommendations is None:
            raise ValueError("No recommendations available. Please call make_recommendations first.")

    def make_recommendations(self, user_index, n=100):
        """Compute and store the top ``n`` recommendations for one user.

        Tracks the user already interacted with are never recommended, so fewer
        than ``n`` tracks are stored when the catalogue does not contain ``n``
        unheard tracks. Any previously stored recommendations are replaced.

        Parameters
        ----------
        user_index : int
            Row of the interaction matrix belonging to the user.
        n : int, default 100
            Maximum number of recommendations to keep.

        Raises
        ------
        KeyError
            If a recommended track id is missing from ``df_music_info``.
        """
        self.user_index = user_index
        user_ratings = self.interaction_matrix[self.user_index]
        interacted = user_ratings.nonzero()[1]
        similarities = self.item_similarity.dot(user_ratings.T).toarray().ravel()
        # Mask the tracks the user already knows so they sort last.
        similarities[interacted] = -np.inf

        # argpartition raises when n exceeds the number of tracks, and n == 0 would
        # make the "[-n:]" slice select every track.
        n = min(n, similarities.size)
        if n <= 0:
            self.recommendations = []
            return

        top_n_index = np.argpartition(similarities, -n)[-n:]
        top_n_index = top_n_index[np.argsort(similarities[top_n_index])[::-1]]
        # Drop masked tracks that only made the cut because n is larger than the
        # number of tracks the user has not interacted with.
        top_n_index = top_n_index[~np.isneginf(similarities[top_n_index])]

        track_ids = self.track_uniques[top_n_index].tolist()
        # Direct track_id -> score mapping; avoids searching track_uniques once per
        # recommended track.
        similarity_by_track = dict(zip(track_ids, similarities[top_n_index]))
        df_filtered = self._music_by_track.loc[track_ids][['energy']].reset_index()

        self.recommendations = [
            (track_id, energy, similarity_by_track[track_id], False)
            for track_id, energy in df_filtered.itertuples(index=False, name=None)
        ]

    def recommend_song(self, energy, energy_margin=0.05):
        """Return the next not-yet-recommended track matching a target energy.

        The stored recommendations are scanned from the highest to the lowest score
        and the first unused track whose energy is within ``energy_margin`` of
        ``energy`` is returned. When no unused track is inside the margin, the
        unused track with the closest energy is returned instead. The returned track
        is flagged as recommended so that it is not returned again.

        Parameters
        ----------
        energy : float
            Target energy.
        energy_margin : float, default 0.05
            Maximum absolute energy difference accepted for a direct match.

        Returns
        -------
        tuple
            ``(track_id, track_energy)`` of the selected track.

        Raises
        ------
        ValueError
            If make_recommendations has not been called, or if no unused track
            could be selected.
        """
        self._require_recommendations()

        closest_track_index = None
        distance_to_energy = float('inf')

        for i, (track_id, track_energy, similarity, has_been_recommended) in enumerate(self.recommendations):
            if has_been_recommended:
                continue

            distance = abs(track_energy - energy)
            if distance <= energy_margin:
                self.recommendations[i] = (track_id, track_energy, similarity, True)
                return (track_id, track_energy)

            if distance < distance_to_energy:
                closest_track_index = i
                distance_to_energy = distance

        if closest_track_index is not None:
            # Unpack the similarity of the chosen track itself; the loop variable
            # still holds the value of the last track that was scanned.
            track_id, track_energy, similarity, _ = self.recommendations[closest_track_index]
            self.recommendations[closest_track_index] = (track_id, track_energy, similarity, True)
            return (track_id, track_energy)

        raise ValueError("All recommendations have already been recommended")

    def get_recommendations(self):
        """Return the stored ``(track_id, energy, similarity, has_been_recommended)`` tuples.

        Raises
        ------
        ValueError
            If make_recommendations has not been called.
        """
        self._require_recommendations()
        return self.recommendations

    def get_recommendations_ids(self):
        """Return the recommended track ids, best first.

        Raises
        ------
        ValueError
            If make_recommendations has not been called.
        """
        self._require_recommendations()
        return [track_id for track_id, _, _, _ in self.recommendations]

    def get_recommendations_info(self):
        """Return the metadata rows of the recommended tracks, best first.

        Raises
        ------
        ValueError
            If make_recommendations has not been called.
        """
        self._require_recommendations()
        track_ids_ordered = [track_id for track_id, _, _, _ in self.recommendations]
        return self._music_by_track.loc[track_ids_ordered].reset_index()

## Example

In [36]:
user_index = 0 #User for wich recommendations will be made

Songs the user has listened to

In [37]:
user_listened_songs = df_users[df_users['user_id'] == user_uniques[user_index]].track_id
df_music_info[df_music_info['track_id'].isin(user_listened_songs)]

Unnamed: 0,track_id,name,artist,energy
79,TRAAHSY128F147BB5C,Speed of Sound,Coldplay,0.898
796,TRDLMWP128F426BF6C,Ragged Wood,Fleet Foxes,0.685
1158,TRPUGUW128F426BF6F,He Doesn't Know Why,Fleet Foxes,0.558
1743,TRRYCBO128F932A2C7,Love Shack,The B-52's,0.902
2350,TRFUPBA128F934F7E1,Paper Gangsta,Lady Gaga,0.53
2585,TRVODUZ128F934D094,All That We Perceive,Thievery Corporation,0.834
2722,TRHDDQG12903CB53EE,Heaven's Gonna Burn Your Eyes,Thievery Corporation,0.337
5416,TRTUCUY128F92E1D24,Stacked Actors,Foo Fighters,0.934
5428,TRLXSNR128F429361D,Generator,Foo Fighters,0.924
5819,TRADVZX128F426BF79,Sun Giant,Fleet Foxes,0.0626


In [38]:
item_based_recommender = ItemBasedRecommender(interaction_matrix_csc_binary, item_based_similarity, track_uniques, df_music_info)
item_based_recommender.make_recommendations(user_index, n=20)
item_based_recommender.get_recommendations_info().head(50)

Unnamed: 0,track_id,name,artist,energy
0,TRDJRES128F933B4BA,Quiet Houses,Fleet Foxes,0.668
1,TRKOHVA128F426BF70,Heard Them Stirring,Fleet Foxes,0.47
2,TRQEMRN128F933B4B2,Sun It Rises,Fleet Foxes,0.275
3,TRKABMK128F426BF72,Meadowlarks,Fleet Foxes,0.166
4,TRGEIDA128F933B4B8,Tiger Mountain Peasant Song,Fleet Foxes,0.146
5,TRVRIBZ128F426BF71,Your Protector,Fleet Foxes,0.537
6,TRLXFKL128F426BF75,Oliver James,Fleet Foxes,0.0635
7,TRUJOHU128F424E6A6,White Winter Hymnal,Fleet Foxes,0.497
8,TRSBMHN128F426BF7E,Innocent Son,Fleet Foxes,0.0201
9,TRVCFUI128F92E1D42,Live-In Skin,Foo Fighters,0.967


In [39]:
recommended_song = item_based_recommender.recommend_song(0.4)
df_music_info[df_music_info['track_id'] == recommended_song[0]]

Unnamed: 0,track_id,name,artist,energy
8610,TRBIVWU128F92CA9D2,Skin and Bones,Foo Fighters,0.396


In [40]:
recommended_song = item_based_recommender.recommend_song(0.4)
df_music_info[df_music_info['track_id'] == recommended_song[0]]

Unnamed: 0,track_id,name,artist,energy
1272,TRKOHVA128F426BF70,Heard Them Stirring,Fleet Foxes,0.47


In [41]:
recommended_song = item_based_recommender.recommend_song(0.4)
df_music_info[df_music_info['track_id'] == recommended_song[0]]

Unnamed: 0,track_id,name,artist,energy
5152,TRUJOHU128F424E6A6,White Winter Hymnal,Fleet Foxes,0.497


## Experiments

#### Weighted vs binary matrix

We will compare the results with weighted and binary ratings

In [42]:
def compare_lists(list1, list2):
    """Print the first position at which two sequences differ.

    Only the positions present in both sequences are compared, so sequences of
    different lengths are compared up to the length of the shorter one.

    Parameters
    ----------
    list1, list2 : sequence
        Sequences to compare element by element.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    # zip stops at the shorter sequence, which avoids indexing past its end.
    for i, (item1, item2) in enumerate(zip(list1, list2)):
        if item1 != item2:
            print(f"They defer in position {i}")
            break
    else:
        print("They are the same for all positions up to the minimum length.")


# Random subset of users on which both rating schemes are compared.
random_generator = np.random.default_rng(seed=RANDOM_STATE)
random_users = random_generator.choice(np.arange(len(user_uniques)), size=20, replace=False)

# The two recommenders only differ in the interaction matrix they read the user
# ratings from. make_recommendations replaces the stored recommendations on every
# call, so each recommender is built once instead of once per user.
weighted_recommender = ItemBasedRecommender(interaction_matrix_csc_weighted, item_based_similarity, track_uniques, df_music_info)
binary_recommender = ItemBasedRecommender(interaction_matrix_csc_binary, item_based_similarity, track_uniques, df_music_info)

first_different_recommendation = []

for rand_user in random_users:
    weighted_recommender.make_recommendations(rand_user, n=200)
    weighted_recommendations = weighted_recommender.get_recommendations_ids()

    binary_recommender.make_recommendations(rand_user, n=200)
    binary_recommendations = binary_recommender.get_recommendations_ids()

    # Position of the first mismatch, or -1 when none is found. zip stops at the
    # shorter list, so lists of different lengths cannot raise an IndexError.
    first_mismatch = next(
        (
            position
            for position, (weighted_id, binary_id) in enumerate(zip(weighted_recommendations, binary_recommendations))
            if weighted_id != binary_id
        ),
        -1,
    )
    first_different_recommendation.append(first_mismatch)

print("First different recommendation for each user: (-1 = every recommendation is the same)")
print(first_different_recommendation)

First different recommendation for each user: (-1 = every recommendation is the same)
[0, 0, 0, 0, 2, 8, -1, 0, 3, 11, 0, 0, 0, 0, 0, 1, 2, 2, 3, 8]


We can see that the recommendations differ in most cases. We will analyse a case where the recommendations are not the same.

In [43]:
analized_user = random_users[0]

In [44]:
item_based_recommender = ItemBasedRecommender(interaction_matrix_csc_weighted, item_based_similarity, track_uniques, df_music_info)
item_based_recommender.make_recommendations(analized_user, n=200)
weighted_recommendations = item_based_recommender.get_recommendations_info()

item_based_recommender = ItemBasedRecommender(interaction_matrix_csc_binary, item_based_similarity, track_uniques, df_music_info)
item_based_recommender.make_recommendations(analized_user, n=200)
binary_recommendations = item_based_recommender.get_recommendations_info()

In [45]:
weighted_recommendations.head(20)

Unnamed: 0,track_id,name,artist,energy
0,TRDGUPO128F9332CF6,Down To Earth,Bungle,0.904
1,TRVSJOM12903CD2DC1,One Less Lonely Girl,Justin Bieber,0.75
2,TRYEGSH12903CD2DCE,Overboard,Justin Bieber,0.79
3,TRZEDRT12903CD2DCC,Runaway Love,Justin Bieber,0.633
4,TRCKWGF12903CD2DCD,Never Let You Go,Third Eye Blind,0.94
5,TRTKLFX12903CD2DC2,First Dance,Justin Bieber,0.52
6,TRZYAGJ128F9332CEF,Favorite Girl,Justin Bieber,0.681
7,TRLVQME128F931BAF3,Vanilla Twilight,Owl City,0.523
8,TRIQIAF128E0788692,Try a Little Tenderness,Michael Bublé,0.245
9,TRCPXID128F92D5D3C,Halo,Depeche Mode,0.508


In [46]:
binary_recommendations.head(20)

Unnamed: 0,track_id,name,artist,energy
0,TRVSJOM12903CD2DC1,One Less Lonely Girl,Justin Bieber,0.75
1,TRDGUPO128F9332CF6,Down To Earth,Bungle,0.904
2,TRYEGSH12903CD2DCE,Overboard,Justin Bieber,0.79
3,TRZEDRT12903CD2DCC,Runaway Love,Justin Bieber,0.633
4,TRCKWGF12903CD2DCD,Never Let You Go,Third Eye Blind,0.94
5,TRIQIAF128E0788692,Try a Little Tenderness,Michael Bublé,0.245
6,TRLVQME128F931BAF3,Vanilla Twilight,Owl City,0.523
7,TRTKLFX12903CD2DC2,First Dance,Justin Bieber,0.52
8,TRZYAGJ128F9332CEF,Favorite Girl,Justin Bieber,0.681
9,TRLNFKN128F931BAF2,The Tip Of The Iceberg,Owl City,0.772


We can see that, even though the order is not the same, most of the items are. Now we will calculate how many of the items recommended with the weighted ratings are also recommended using the binary ratings.

In [47]:
weighted_ids = set(weighted_recommendations['track_id'].head(50))
binary_ids = set(binary_recommendations['track_id'].head(50))
porcentaje = len(weighted_ids & binary_ids) / len(weighted_ids) * 100
print(f"Percentage of wieghted recommendations that are in binary recomendations: {porcentaje:.2f}%")

Percentage of wieghted recommendations that are in binary recomendations: 96.00%


96% of the items are the same in the first 50 recommendations of both systems. So, for this user, the use of weighted or binary rating system does not seem to have much impact. We will study the overlap of the other users of the subset

In [48]:
percentage_overlap_20 = []
percentage_overlap_50 = []
percentage_overlap_200 = []

# (cut-off, list receiving one overlap percentage per user). A cut-off of None
# keeps every recommendation that was produced (200 are requested below).
overlap_buckets = (
    (20, percentage_overlap_20),
    (50, percentage_overlap_50),
    (None, percentage_overlap_200),
)

for rand_user in random_users:
    # Recommendations computed from the weighted ratings.
    item_based_recommender = ItemBasedRecommender(interaction_matrix_csc_weighted, item_based_similarity, track_uniques, df_music_info)
    item_based_recommender.make_recommendations(rand_user, n=200)
    weighted_recommendations = item_based_recommender.get_recommendations_info()

    # Recommendations computed from the binary ratings.
    item_based_recommender = ItemBasedRecommender(interaction_matrix_csc_binary, item_based_similarity, track_uniques, df_music_info)
    item_based_recommender.make_recommendations(rand_user, n=200)
    binary_recommendations = item_based_recommender.get_recommendations_info()

    for cutoff, bucket in overlap_buckets:
        weighted_top = set(weighted_recommendations['track_id'].iloc[:cutoff])
        binary_top = set(binary_recommendations['track_id'].iloc[:cutoff])
        # Share of the weighted recommendations that the binary list also contains.
        overlap = round(len(weighted_top & binary_top) / len(weighted_top) * 100, 2)
        bucket.append(overlap)

print("Percentage of overlap between weighted and binary recommendations for each user:")
print(f"First 20 recommendations: {percentage_overlap_20}")
print(f"First 50 recommendations: {percentage_overlap_50}")
print(f"First 200 recommendations: {percentage_overlap_200}")

print(f"Average percentage of overlap for 20 first recommendations: {np.mean(percentage_overlap_20):.2f}%")
print(f"Median percentage of overlap for 20 first recommendations: {np.median(percentage_overlap_20):.2f}%")

print(f"Average percentage of overlap for 50 first recommendations: {np.mean(percentage_overlap_50):.2f}%")
print(f"Median percentage of overlap for 50 first recommendations: {np.median(percentage_overlap_50):.2f}%")

print(f"Average percentage of overlap for 200 first recommendations: {np.mean(percentage_overlap_200):.2f}%")
print(f"Median percentage of overlap for 200 first recommendations: {np.median(percentage_overlap_200):.2f}%")

Percentage of overlap between weighted and binary recommendations for each user:
First 20 recommendations: [95.0, 50.0, 70.0, 25.0, 65.0, 100.0, 100.0, 40.0, 60.0, 95.0, 85.0, 60.0, 65.0, 45.0, 40.0, 70.0, 95.0, 75.0, 95.0, 100.0]
First 50 recommendations: [96.0, 58.0, 72.0, 72.0, 94.0, 88.0, 100.0, 66.0, 74.0, 100.0, 58.0, 82.0, 80.0, 86.0, 72.0, 74.0, 78.0, 96.0, 86.0, 86.0]
First 200 recommendations: [95.5, 69.0, 80.0, 78.5, 89.0, 90.5, 100.0, 73.5, 93.0, 97.5, 85.5, 72.5, 78.0, 88.0, 55.5, 80.5, 75.0, 95.0, 80.5, 87.5]
Average percentage of overlap for 20 first recommendations: 71.50%
Median percentage of overlap for 20 first recommendations: 70.00%
Average percentage of overlap for 50 first recommendations: 80.90%
Median percentage of overlap for 50 first recommendations: 81.00%
Average percentage of overlap for 200 first recommendations: 83.22%
Median percentage of overlap for 200 first recommendations: 83.00%


We can see that, in most cases, there is not much difference between using the weighted and the binary rating method.

We will use a validation set to evaluate the performance of both matrices, using MAP@20 and NDCG@20 metrics

In [32]:
from implicit.evaluation import leave_k_out_split
from scipy.sparse import lil_matrix

In [33]:
def build_split_from_indices(matrix, test_users, test_items):
    """Split a sparse matrix into train/test parts at the given coordinates.

    Every ``(user, item)`` coordinate taken pairwise from ``test_users`` and
    ``test_items`` is copied into the test matrix and set to 0 in the train matrix.

    Parameters
    ----------
    matrix : scipy sparse matrix
        Matrix to split; it is not modified.
    test_users, test_items : iterable of int
        Row and column indices of the held-out entries.

    Returns
    -------
    tuple
        ``(train, test)``, both in CSR format and with the shape of ``matrix``.
    """
    held_out = lil_matrix(matrix.shape)
    remaining = matrix.tolil()

    for row, col in zip(test_users, test_items):
        # The value is always read from the untouched input matrix.
        held_out[row, col] = matrix[row, col]
        remaining[row, col] = 0

    return remaining.tocsr(), held_out.tocsr()

def generate_2_splits(interaction_matrix_1, interaction_matrix_2, k=1, random_state=RANDOM_STATE):
    """Split two interaction matrices using the same held-out entries.

    The first matrix is split with ``leave_k_out_split``; the non-zero coordinates
    of its test part are then reused to split the second matrix, so both test sets
    cover the same (user, item) positions.

    Parameters
    ----------
    interaction_matrix_1 : scipy sparse matrix
        Matrix handed to ``leave_k_out_split``.
    interaction_matrix_2 : scipy sparse matrix
        Matrix split at the coordinates held out from the first one.
    k : int, default 1
        Forwarded to ``leave_k_out_split`` as ``K``.
    random_state : int
        Forwarded to ``leave_k_out_split``.

    Returns
    -------
    tuple
        ``(train_matrix_1, test_matrix_1, train_matrix_2, test_matrix_2)``.
    """
    first_split = leave_k_out_split(interaction_matrix_1, K=k, random_state=random_state)
    train_matrix_1, test_matrix_1 = first_split

    # Coordinates that were moved into the first test set.
    held_out_users, held_out_items = test_matrix_1.nonzero()
    train_matrix_2, test_matrix_2 = build_split_from_indices(
        interaction_matrix_2, held_out_users, held_out_items
    )
    return train_matrix_1, test_matrix_1, train_matrix_2, test_matrix_2

Comparison metrics

In [37]:
def apk(real, predicted, k=20):
    """Average precision at ``k`` of one ranked prediction list.

    Parameters
    ----------
    real : collection
        Relevant items.
    predicted : sequence
        Predicted items, best first; only the first ``k`` are evaluated.
    k : int, default 20
        Cut-off rank.

    Returns
    -------
    float
        Sum of the precision values at every rank holding a new relevant item,
        divided by ``min(len(real), k)``; 0.0 when ``real`` is empty.
    """
    ranked = predicted[:k] if len(predicted) > k else predicted

    precision_sum = 0.0
    hit_count = 0.0
    for rank, candidate in enumerate(ranked, start=1):
        # A repeated prediction only counts the first time it appears.
        is_new_hit = candidate in real and candidate not in ranked[:rank - 1]
        if is_new_hit:
            hit_count += 1.0
            precision_sum += hit_count / rank

    if not real:
        return 0.0
    return precision_sum / min(len(real), k)


def mapk(actual_list, predicted_list, k=10):
    """Mean average precision at ``k`` over several users.

    Parameters
    ----------
    actual_list : iterable
        Relevant items of every user.
    predicted_list : iterable
        Ranked predictions of every user, paired positionally with ``actual_list``.
    k : int, default 10
        Cut-off rank forwarded to ``apk``.

    Returns
    -------
    float
        Mean of the per-user ``apk`` scores.
    """
    per_user_scores = []
    for relevant, ranked in zip(actual_list, predicted_list):
        per_user_scores.append(apk(relevant, ranked, k))
    return np.mean(per_user_scores)

In [38]:
def dcg(relevance_scores, k):
    """Discounted cumulative gain of the first ``k`` relevance scores.

    Parameters
    ----------
    relevance_scores : sequence of numbers
        Relevance of every ranked item, best-ranked first.
    k : int
        Number of leading positions taken into account.

    Returns
    -------
    float
        Sum of ``score / log2(position + 1)`` over the first ``k`` positions
        (1-based); 0.0 when there is nothing to score.
    """
    relevance_scores = np.asarray(relevance_scores, dtype=float)[:k]
    if relevance_scores.size:
        return np.sum(relevance_scores / np.log2(np.arange(2, relevance_scores.size + 2)))
    return 0.0


def ndcg(actual, predicted, k=10):
    """Normalized discounted cumulative gain at ``k`` with binary relevance.

    The ideal ranking places ``min(len(actual), k)`` relevant items at the top, so
    missing relevant items lowers the score even when every hit is ranked first.
    A repeated prediction only counts the first time it appears.

    Parameters
    ----------
    actual : sized collection
        Relevant items.
    predicted : sequence
        Predicted items, best first; only the first ``k`` are evaluated.
    k : int, default 10
        Cut-off rank.

    Returns
    -------
    float
        DCG of the predictions divided by the ideal DCG; 0.0 when there is no
        relevant item.
    """
    predicted = predicted[:k]
    relevance_scores = [
        1 if p in actual and p not in predicted[:i] else 0
        for i, p in enumerate(predicted)
    ]
    # Best achievable ranking given how many relevant items exist, not how many of
    # them happen to appear in the predictions.
    ideal_scores = [1] * min(len(actual), k)

    actual_dcg = dcg(relevance_scores, k)
    ideal_dcg = dcg(ideal_scores, k)

    return actual_dcg / ideal_dcg if ideal_dcg > 0 else 0.0


def mean_ndcg(actual_list, predicted_list, k=10):
    """Mean NDCG at ``k`` over several users.

    Parameters
    ----------
    actual_list : iterable
        Relevant items of every user.
    predicted_list : iterable
        Ranked predictions of every user, paired positionally with ``actual_list``.
    k : int, default 10
        Cut-off rank forwarded to ``ndcg``.

    Returns
    -------
    float
        Mean of the per-user ``ndcg`` scores.
    """
    per_user_scores = []
    for relevant, ranked in zip(actual_list, predicted_list):
        per_user_scores.append(ndcg(relevant, ranked, k))
    return np.mean(per_user_scores)


In [None]:
train_matrix_binary, test_matrix_binary, train_matrix_weighted, test_matrix_weighted = generate_2_splits(interaction_matrix_csc_binary, interaction_matrix_csc_weighted, k=1)

In [35]:
similarity_matrix_binary = cosine_similarity(train_matrix_binary.T, dense_output=False)
similarity_matrix_binary = csc_matrix(similarity_matrix_binary)

similarity_matrix_weighted = cosine_similarity(train_matrix_weighted.T, dense_output=False)
similarity_matrix_weighted = csc_matrix(similarity_matrix_weighted)

item_based_recommender_binary = ItemBasedRecommender(train_matrix_binary, similarity_matrix_binary, track_uniques, df_music_info)
item_based_recommender_weighted = ItemBasedRecommender(train_matrix_weighted, similarity_matrix_weighted, track_uniques, df_music_info)

In [49]:
rng = np.random.RandomState(RANDOM_STATE)
user_indices = rng.choice(train_matrix_binary.shape[0], size=num_distinct_users, replace=False)

In [50]:
real_items, predicted_items_binary, predicted_items_weighted = [], [], []

# Each recommender is paired with the list that collects its top-20 predictions.
recommender_outputs = (
    (item_based_recommender_binary, predicted_items_binary),
    (item_based_recommender_weighted, predicted_items_weighted),
)

for user_index in tqdm(user_indices):
    # Held-out tracks of this user: the non-zero columns of the test row.
    held_out_columns = test_matrix_binary[user_index].nonzero()[1]
    real_items.append(set(track_uniques[held_out_columns]))

    for recommender, predictions in recommender_outputs:
        recommender.make_recommendations(user_index, n=20)
        predictions.append(recommender.get_recommendations_ids())

100%|██████████| 464573/464573 [6:01:37<00:00, 21.41it/s]   


In [51]:
mapk_binary = mapk(real_items, predicted_items_binary, k=20)
mapk_weighted = mapk(real_items, predicted_items_weighted, k=20)

ndcg_binary = mean_ndcg(real_items, predicted_items_binary, k=20)
ndcg_weighted = mean_ndcg(real_items, predicted_items_weighted, k=20)

In [52]:
results_df = pd.DataFrame({
    'Modelo': ['Binary', 'Weighted'],
    'MAP@20': [mapk_binary, mapk_weighted],
    'NDCG@20': [ndcg_binary, ndcg_weighted]
})

results_df

Unnamed: 0,Modelo,MAP@20,NDCG@20
0,Binary,0.173308,0.217894
1,Weighted,0.157009,0.199702


In [53]:
results_df = pd.DataFrame({
    'Matrix': ['Binary', 'Weighted'],
    'MAP@20': [mapk_binary, mapk_weighted],
    'NDCG@20': [ndcg_binary, ndcg_weighted]
})

results_df

Unnamed: 0,Matrix,MAP@20,NDCG@20
0,Binary,0.173308,0.217894
1,Weighted,0.157009,0.199702
