# 7. Collaborative models comparison

## Data loading

In [1]:
import os
# Pin OpenBLAS to a single thread. This cell runs before the cell that imports
# numpy/implicit (presumably so the setting is picked up at load time — TODO confirm).
os.environ.update({"OPENBLAS_NUM_THREADS": "1"})
# Echo the effective value as a sanity check.
print(os.environ.get("OPENBLAS_NUM_THREADS"))

1


In [2]:
import pandas as pd
import numpy as np
import math
import random
import pickle
from IPython.display import display
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix, csc_matrix, lil_matrix
from implicit.als import AlternatingLeastSquares
from implicit.evaluation import leave_k_out_split
RANDOM_STATE = 123

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Input files: the user listening history (user_id / track_id / playcount are
# read from it later in this notebook) and the track metadata table.
USER_HISTORY_CSV = 'data/User Listening History_modified.csv'
MUSIC_INFO_CSV = 'data/Million Song Dataset kaggle/Music Info.csv'

df_users = pd.read_csv(USER_HISTORY_CSV)
df_music = pd.read_csv(MUSIC_INFO_CSV)

In [4]:
# Metadata columns handed to the recommenders (they look up 'energy' by 'track_id').
music_info_columns = ['track_id', 'name', 'artist', 'energy', 'duration_ms']
df_music_info = df_music[music_info_columns]

In [5]:
# Number of distinct user ids (missing ids are not counted, which is the
# pandas default made explicit). The bare name below displays it in the notebook.
num_users = df_users['user_id'].nunique(dropna=True)
num_users

464573

## Models

### Item Based

In [6]:
class ItemBasedRecommender:
    """Item-based collaborative filtering with energy-aware track selection.

    ``make_recommendations`` scores every track for one user by multiplying the
    item-item similarity matrix with that user's interaction row and keeps the
    best ``n`` tracks the user has not interacted with. ``recommend_song`` then
    hands those tracks out one at a time, preferring tracks whose ``energy`` is
    close to a requested value.
    """

    def __init__(self, interaction_matrix, item_similarity, track_uniques, df_music_info):
        """Store the inputs; nothing is computed here.

        Args:
            interaction_matrix: Sparse user x item matrix; row ``user_index``
                holds that user's interactions.
            item_similarity: Sparse item x item similarity matrix.
            track_uniques: Indexable sequence mapping an item column index to
                its track id (must support integer-array indexing and ``tolist``).
            df_music_info: DataFrame with at least 'track_id' and 'energy' columns.
        """
        self.interaction_matrix = interaction_matrix
        self.item_similarity = item_similarity
        self.track_uniques = track_uniques
        self.df_music_info = df_music_info
        self.user_index = None
        self.recommendations = None # List of tuples (track_id, energy, similarity, has been recommended)

    def _require_recommendations(self):
        """Return the current recommendation list, or raise if none was built yet."""
        if self.recommendations is None:
            raise ValueError("No recommendations available. Please call make_recommendations first.")
        return self.recommendations

    def make_recommendations(self, user_index, n=100):
        """Build the ranked candidate list for ``user_index``.

        Args:
            user_index: Row of ``interaction_matrix`` to recommend for.
            n: Maximum number of tracks to keep. Fewer are returned when the
                user has fewer than ``n`` tracks left to discover.

        The result is stored in ``self.recommendations``, best score first.
        """
        self.user_index = user_index
        user_ratings = self.interaction_matrix[self.user_index]
        interacted = user_ratings.nonzero()[1]
        similarities = self.item_similarity.dot(user_ratings.T).toarray().ravel()
        similarities[interacted] = -np.inf  # never recommend what the user already has

        # argpartition raises when n exceeds the number of items, and n == 0
        # would make the [-n:] slice return every item, so bound n first.
        n = min(n, similarities.size)
        if n <= 0:
            self.recommendations = []
            return

        top_n_index = np.argpartition(similarities, -n)[-n:]
        top_n_index = top_n_index[np.argsort(similarities[top_n_index])[::-1]]
        # When n is larger than the number of unseen tracks the selection
        # reaches the masked entries; drop them instead of recommending them.
        top_n_index = top_n_index[similarities[top_n_index] > -np.inf]

        track_ids = self.track_uniques[top_n_index].tolist()
        # Direct track_id -> score lookup (replaces a linear search through
        # the full track list for every recommended track).
        similarity_by_track = dict(zip(track_ids, similarities[top_n_index]))

        df_filtered = self.df_music_info.set_index('track_id').loc[track_ids][['energy']].reset_index()

        self.recommendations = [
            (track_id, energy, similarity_by_track[track_id], False)
            for track_id, energy in df_filtered.itertuples(index=False, name=None)
        ]

    def recommend_song(self, energy, energy_margin=0.05):
        """Pick the next track to play for a target ``energy``.

        The first not-yet-recommended track (in ranking order) whose energy is
        within ``energy_margin`` of ``energy`` wins; if none qualifies, the
        not-yet-recommended track with the closest energy is used. The chosen
        track is flagged as recommended so it is not returned again.

        Returns:
            Tuple ``(track_id, track_energy)``.

        Raises:
            ValueError: If ``make_recommendations`` was not called, or every
                track has already been recommended.
        """
        recommendations = self._require_recommendations()

        closest_track_index = None
        distance_to_energy = float('inf')

        for i, (track_id, track_energy, similarity, has_been_recommended) in enumerate(recommendations):
            if has_been_recommended:
                continue

            distance = abs(track_energy - energy)
            if distance <= energy_margin:
                recommendations[i] = (track_id, track_energy, similarity, True)
                return (track_id, track_energy)

            if distance < distance_to_energy:
                closest_track_index = i
                distance_to_energy = distance

        if closest_track_index is None:
            raise ValueError("All recommendations have already been recommended")

        # Re-read the chosen tuple so its own similarity is kept; the loop
        # variable still holds the value of the last track iterated.
        track_id, track_energy, similarity, _ = recommendations[closest_track_index]
        recommendations[closest_track_index] = (track_id, track_energy, similarity, True)
        return (track_id, track_energy)

    def get_recommendations(self):
        """Return the list of (track_id, energy, similarity, recommended) tuples."""
        return self._require_recommendations()

    def get_recommendations_ids(self):
        """Return the recommended track ids in ranking order."""
        return [track_id for track_id, _, _, _ in self._require_recommendations()]

    def get_recommendations_info(self):
        """Return the ``df_music_info`` rows of the recommended tracks, in ranking order."""
        track_ids_ordered = [track_id for track_id, _, _, _ in self._require_recommendations()]
        df_ordered = self.df_music_info.set_index('track_id').loc[track_ids_ordered].reset_index()
        return df_ordered

### User Based

In [7]:
class UserBasedRecommender:
    """User-based collaborative filtering with energy-aware track selection.

    A cosine, brute-force ``NearestNeighbors`` model is fitted on the
    interaction matrix. ``make_recommendations`` averages the interaction rows
    of a user's nearest neighbours and keeps the best-scoring tracks the user
    has not interacted with; ``recommend_song`` hands them out one at a time,
    preferring tracks whose ``energy`` is close to a requested value.
    """

    def __init__(self, interaction_matrix, track_uniques, df_music_info, num_neighbors=10):
        """Fit the neighbour model and store the inputs.

        Args:
            interaction_matrix: Sparse user x item matrix; row ``user_index``
                holds that user's interactions.
            track_uniques: Indexable sequence mapping an item column index to
                its track id (must support integer-array indexing and ``tolist``).
            df_music_info: DataFrame with at least 'track_id' and 'energy' columns.
            num_neighbors: Number of other users averaged per recommendation.
        """
        self.interaction_matrix = interaction_matrix
        self.knn_model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=num_neighbors + 1, n_jobs=-1) # num_neighbors+1 because the user itself is included in the neighbors and we will ignore it
        self.knn_model.fit(interaction_matrix)
        self.track_uniques = track_uniques
        self.df_music_info = df_music_info
        self.user_index = None
        self.recommendations = None # List of tuples (track_id, energy, has been recommended)

    def _require_recommendations(self):
        """Return the current recommendation list, or raise if none was built yet."""
        if self.recommendations is None:
            raise ValueError("No recommendations available. Please call make_recommendations first.")
        return self.recommendations

    def make_recommendations(self, user_index, n=5):
        """Build the ranked candidate list for ``user_index``.

        Args:
            user_index: Row of ``interaction_matrix`` to recommend for.
            n: Maximum number of tracks to keep. Fewer are returned when the
                user has fewer than ``n`` tracks left to discover.

        The result is stored in ``self.recommendations``, best score first.
        """
        self.user_index = user_index

        _, neighbors_indices = self.knn_model.kneighbors(self.interaction_matrix[user_index])
        neighbor_rows = np.asarray(neighbors_indices).ravel()

        # One extra neighbour is requested to make room for the user itself.
        # Remove the user by value rather than by position: with tied
        # distances it is not guaranteed to be returned first. The slice keeps
        # the neighbour count unchanged when the user was not in the result.
        num_to_keep = neighbor_rows.size - 1
        neighbor_rows = neighbor_rows[neighbor_rows != user_index][:num_to_keep]

        neighbors_interaction_matrix = self.interaction_matrix[neighbor_rows]

        # np.asarray(...).ravel() flattens both matrix and ndarray results.
        neighbors_mean_ratings = np.asarray(neighbors_interaction_matrix.mean(axis=0)).ravel()

        user_ratings = self.interaction_matrix[user_index]
        interacted = user_ratings.nonzero()[1]
        neighbors_mean_ratings[interacted] = -np.inf  # never recommend what the user already has

        # argpartition raises when n exceeds the number of items, and n == 0
        # would make the [-n:] slice return every item, so bound n first.
        n = min(n, neighbors_mean_ratings.size)
        if n <= 0:
            self.recommendations = []
            return

        top_n_index = np.argpartition(neighbors_mean_ratings, -n)[-n:]
        top_n_index = top_n_index[np.argsort(neighbors_mean_ratings[top_n_index])[::-1]]
        # When n is larger than the number of unseen tracks the selection
        # reaches the masked entries; drop them instead of recommending them.
        top_n_index = top_n_index[neighbors_mean_ratings[top_n_index] > -np.inf]

        track_ids = self.track_uniques[top_n_index].tolist()

        df_filtered = self.df_music_info.set_index('track_id').loc[track_ids][['energy']].reset_index()

        self.recommendations = [
            (track_id, energy, False)
            for track_id, energy in df_filtered.itertuples(index=False, name=None)
        ]

    def recommend_song(self, energy, energy_margin=0.05):
        """Pick the next track to play for a target ``energy``.

        The first not-yet-recommended track (in ranking order) whose energy is
        within ``energy_margin`` of ``energy`` wins; if none qualifies, the
        not-yet-recommended track with the closest energy is used. The chosen
        track is flagged as recommended so it is not returned again.

        Returns:
            Tuple ``(track_id, track_energy)``.

        Raises:
            ValueError: If ``make_recommendations`` was not called, or every
                track has already been recommended.
        """
        recommendations = self._require_recommendations()

        closest_track_index = None
        distance_to_energy = float('inf')

        for i, (track_id, track_energy, has_been_recommended) in enumerate(recommendations):
            if has_been_recommended:
                continue

            distance = abs(track_energy - energy)
            if distance <= energy_margin:
                recommendations[i] = (track_id, track_energy, True)
                return (track_id, track_energy)

            if distance < distance_to_energy:
                closest_track_index = i
                distance_to_energy = distance

        if closest_track_index is None:
            raise ValueError("All recommendations have already been recommended")

        track_id, track_energy, _ = recommendations[closest_track_index]
        recommendations[closest_track_index] = (track_id, track_energy, True)
        return (track_id, track_energy)

    def get_recommendations(self):
        """Return the list of (track_id, energy, recommended) tuples."""
        return self._require_recommendations()

    def get_recommendations_ids(self):
        """Return the recommended track ids in ranking order."""
        return [track_id for track_id, _, _ in self._require_recommendations()]

    def get_recommendations_info(self):
        """Return the ``df_music_info`` rows of the recommended tracks, in ranking order."""
        track_ids_ordered = [track_id for track_id, _, _ in self._require_recommendations()]
        df_ordered = self.df_music_info.set_index('track_id').loc[track_ids_ordered].reset_index()
        return df_ordered

### Matrix Factorization: Alternating Least Squares (ALS)

In [8]:
class ALSRecommender:
    """Recommender backed by an ``AlternatingLeastSquares`` model.

    ``make_recommendations`` asks the ALS model for the top tracks of one user
    and pairs them with their ``energy``; ``recommend_song`` hands them out one
    at a time, preferring tracks whose ``energy`` is close to a requested value.
    """

    def __init__(self, interaction_matrix, track_uniques, df_music_info, als_model=None):
        """Store the inputs and fit an ALS model unless one is supplied.

        Args:
            interaction_matrix: Sparse user x item matrix; row ``user_index``
                holds that user's interactions.
            track_uniques: Indexable sequence mapping an item column index to
                its track id (must support integer-array indexing and ``tolist``).
            df_music_info: DataFrame with at least 'track_id' and 'energy' columns.
            als_model: Already-fitted model to reuse. When ``None`` a model is
                created with the settings below and fitted on ``interaction_matrix``.
        """
        self.interaction_matrix = interaction_matrix
        self.track_uniques = track_uniques
        self.df_music_info = df_music_info

        if als_model is None:
            self.als_model = AlternatingLeastSquares(factors=50, regularization=0.1, iterations=20, num_threads=0, random_state=RANDOM_STATE)
            self.als_model.fit(self.interaction_matrix)
        else:
            self.als_model = als_model

        self.user_index = None
        self.recommendations = None # List of tuples (track_id, energy, similarity, has been recommended)

    def _require_recommendations(self):
        """Return the current recommendation list, or raise if none was built yet."""
        if self.recommendations is None:
            raise ValueError("No recommendations available. Please call make_recommendations first.")
        return self.recommendations

    def make_recommendations(self, user_index, n=100):
        """Build the ranked candidate list for ``user_index``.

        Args:
            user_index: Row of ``interaction_matrix`` to recommend for.
            n: Number of tracks requested from the ALS model.

        Returns:
            The list stored in ``self.recommendations``: tuples of
            (track_id, energy, score, recommended), in the model's ranking order.
        """
        self.user_index = user_index

        user_items = self.interaction_matrix.tocsr()[user_index]

        top_n_recommendations_indexes, top_n_recommendations_scores = self.als_model.recommend(user_index, user_items, N=n, filter_already_liked_items=True)

        track_ids = self.track_uniques[top_n_recommendations_indexes].tolist()
        # Look scores up by track id instead of zipping positionally with the
        # metadata rows, so a score can never be paired with the wrong track.
        score_by_track = dict(zip(track_ids, top_n_recommendations_scores))

        df_filtered = self.df_music_info.set_index('track_id').loc[track_ids][['energy']].reset_index()

        self.recommendations = [
            (track_id, energy, score_by_track[track_id], False)
            for track_id, energy in df_filtered.itertuples(index=False, name=None)
        ]
        return self.recommendations

    def recommend_song(self, energy, energy_margin=0.05):
        """Pick the next track to play for a target ``energy``.

        The first not-yet-recommended track (in ranking order) whose energy is
        within ``energy_margin`` of ``energy`` wins; if none qualifies, the
        not-yet-recommended track with the closest energy is used. The chosen
        track is flagged as recommended so it is not returned again.

        Returns:
            Tuple ``(track_id, track_energy)``.

        Raises:
            ValueError: If ``make_recommendations`` was not called, or every
                track has already been recommended.
        """
        recommendations = self._require_recommendations()

        closest_track_index = None
        distance_to_energy = float('inf')

        for i, (track_id, track_energy, similarity, has_been_recommended) in enumerate(recommendations):
            if has_been_recommended:
                continue

            distance = abs(track_energy - energy)
            if distance <= energy_margin:
                recommendations[i] = (track_id, track_energy, similarity, True)
                return (track_id, track_energy)

            if distance < distance_to_energy:
                closest_track_index = i
                distance_to_energy = distance

        if closest_track_index is None:
            raise ValueError("All recommendations have already been recommended")

        # Re-read the chosen tuple so its own score is kept; the loop variable
        # still holds the value of the last track iterated.
        track_id, track_energy, similarity, _ = recommendations[closest_track_index]
        recommendations[closest_track_index] = (track_id, track_energy, similarity, True)
        return (track_id, track_energy)

    def get_recommendations(self):
        """Return the list of (track_id, energy, score, recommended) tuples."""
        return self._require_recommendations()

    def get_recommendations_ids(self):
        """Return the recommended track ids in ranking order."""
        return [track_id for track_id, _, _, _ in self._require_recommendations()]

    def get_recommendations_info(self):
        """Return the ``df_music_info`` rows of the recommended tracks, in ranking order."""
        track_ids = [track_id for track_id, _, _, _ in self._require_recommendations()]
        df_ordered = self.df_music_info.set_index('track_id').loc[track_ids].reset_index()
        return df_ordered

## Interaction Matrices

In [9]:
# Highest playcount of each user. The named aggregation already produces a
# column called 'max_playcount', so no rename is needed afterwards.
df_users_agg = (
    df_users.groupby('user_id')['playcount']
    .agg(max_playcount='max')
    .reset_index()
)

# Left join validated as many-to-one: the result has exactly one row per
# df_users row, in df_users order, so it stays aligned with the factorized
# codes computed from df_users below.
df_users_rating = df_users.merge(df_users_agg, on='user_id', how='left', validate='many_to_one')
# Playcount relative to the user's most played track.
df_users_rating['rating'] = df_users_rating['playcount'] / df_users_rating['max_playcount']

# Integer codes (matrix row/column positions) and the id each code stands for.
user_codes, user_uniques = pd.factorize(df_users['user_id'])
track_codes, track_uniques = pd.factorize(df_users['track_id'])

In [10]:
from implicit.nearest_neighbours import bm25_weight

# Every matrix below shares the same (user row, track column) coordinates and shape.
matrix_coords = (user_codes, track_codes)
matrix_shape = (len(user_uniques), len(track_uniques))

# Unit-weight interactions: one entry of 1.0 per df_users row, stored as CSR for
# the user-based model and as CSC for the item-based model.
interaction_matrix_user_based = csr_matrix((np.ones(len(df_users)), matrix_coords), shape=matrix_shape)
interaction_matrix_item_based = csc_matrix((np.ones(len(df_users)), matrix_coords), shape=matrix_shape)
# Unused alternative for the item-based matrix: weight entries with
# df_users_rating['rating'] instead of 1.0.

# Raw playcounts, the input of the BM25 weighting used for ALS.
interaction_matrix_user_item_original = csr_matrix(
    (df_users_rating['playcount'], matrix_coords),
    shape=matrix_shape,
)

# NOTE: K1=1.2 and B=0.75 are passed explicitly here. An earlier note on this
# line cited K1=100, B=0.8 as defaults; those are not the values in use.
interaction_matrix_als = bm25_weight(interaction_matrix_user_item_original, K1=1.2, B=0.75).tocsr()


## Model Comparisons

In [11]:
def apk(real, predicted, k=20):
    """Average precision at ``k`` for one ranked prediction list.

    Args:
        real: Collection of relevant items (membership is tested with ``in``).
        predicted: Ranked sequence of predicted items, best first.
        k: Only the first ``k`` predictions are considered.

    Returns:
        Sum of precision values at each rank holding a new relevant item,
        divided by ``min(len(real), k)``; ``0.0`` when ``real`` is empty.
    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    precision_sum = 0.0
    n_hits = 0.0
    for rank, item in enumerate(top_k, start=1):
        # A repeated prediction only counts the first time it appears.
        is_new_hit = item in real and item not in top_k[:rank - 1]
        if not is_new_hit:
            continue
        n_hits += 1.0
        precision_sum += n_hits / rank

    if not real:
        return 0.0
    return precision_sum / min(len(real), k)


def mapk(actual_list, predicted_list, k=10):
    """Mean of ``apk`` over paired (relevant items, ranked predictions) entries.

    ``actual_list`` and ``predicted_list`` are walked in lockstep; ``k`` is
    forwarded to ``apk`` for every pair.
    """
    per_user_scores = []
    for relevant, ranked in zip(actual_list, predicted_list):
        per_user_scores.append(apk(relevant, ranked, k))
    return np.mean(per_user_scores)


In [12]:
def dcg(relevance_scores, k):
    """Discounted cumulative gain of the first ``k`` relevance scores.

    The score at 1-based rank ``r`` is divided by ``log2(r + 1)``.
    Returns ``0.0`` when there are no scores to sum.
    """
    gains = np.asarray(relevance_scores, dtype=float)[:k]
    if not gains.size:
        return 0.0
    discounts = np.log2(np.arange(2, gains.size + 2))
    return np.sum(gains / discounts)


def ndcg(actual, predicted, k=10):
    """Normalised DCG at ``k`` with binary relevance.

    Args:
        actual: Sized collection of relevant items.
        predicted: Ranked sequence of predicted items, best first.
        k: Only the first ``k`` predictions are considered.

    Returns:
        DCG of the predictions divided by the DCG of an ideal ranking, or
        ``0.0`` when there are no relevant items.
    """
    predicted = predicted[:k]
    # A relevant item is credited only at its first occurrence, so duplicated
    # predictions cannot push the score above 1.
    relevance_scores = [
        1 if p in actual and p not in predicted[:i] else 0
        for i, p in enumerate(predicted)
    ]
    # The ideal ranking places every relevant item (up to k) first. Sorting
    # only the hits that were retrieved would ignore relevant items the model
    # missed and overstate the score.
    ideal_scores = [1] * min(len(actual), k)

    actual_dcg = dcg(relevance_scores, k)
    ideal_dcg = dcg(ideal_scores, k)

    return actual_dcg / ideal_dcg if ideal_dcg > 0 else 0.0


def mean_ndcg(actual_list, predicted_list, k=10):
    """Mean of ``ndcg`` over paired (relevant items, ranked predictions) entries.

    ``actual_list`` and ``predicted_list`` are walked in lockstep; ``k`` is
    forwarded to ``ndcg`` for every pair.
    """
    per_user_scores = []
    for relevant, ranked in zip(actual_list, predicted_list):
        per_user_scores.append(ndcg(relevant, ranked, k))
    return np.mean(per_user_scores)


In [13]:
def build_split_from_indices(matrix, test_users, test_items):
    """Split ``matrix`` into train/test using explicit (user, item) positions.

    Each listed position is copied into an otherwise empty test matrix of the
    same shape and set to zero in a copy of ``matrix``; ``matrix`` itself is
    not modified.

    Args:
        matrix: Sparse user x item matrix.
        test_users: Row indices of the held-out entries.
        test_items: Column indices of the held-out entries, paired with
            ``test_users``.

    Returns:
        Tuple ``(train, test)``, both converted to CSR.
    """
    held_out = lil_matrix(matrix.shape)
    remaining = matrix.tolil()

    for user, item in zip(test_users, test_items):
        held_out[user, item] = matrix[user, item]
        remaining[user, item] = 0

    return remaining.tocsr(), held_out.tocsr()

def generate_3_splits(interaction_matrix_1, interaction_matrix_2, interaction_matrix_3, k=1, random_state=RANDOM_STATE):
    """Split three matrices so that they all hold out the same entries.

    The first matrix is split with ``leave_k_out_split``; the non-zero
    positions of its test part are then held out of the other two matrices.

    Returns:
        ``(train_1, test_1, train_2, test_2, train_3, test_3)``.
    """
    train_matrix_1, test_matrix_1 = leave_k_out_split(
        interaction_matrix_1, K=k, random_state=random_state
    )
    held_out_users, held_out_items = test_matrix_1.nonzero()

    split_2 = build_split_from_indices(interaction_matrix_2, held_out_users, held_out_items)
    split_3 = build_split_from_indices(interaction_matrix_3, held_out_users, held_out_items)

    return (train_matrix_1, test_matrix_1, *split_2, *split_3)

def generate_2_splits_form_split(train_matrix_1, test_matrix_1, interaction_matrix_2, interaction_matrix_3, k=1, random_state=RANDOM_STATE):
    """Reuse an existing split to hold the same entries out of two more matrices.

    Only ``test_matrix_1`` is read: its non-zero positions are the held-out
    entries. ``train_matrix_1``, ``k`` and ``random_state`` are accepted but
    not used.

    Returns:
        ``(train_2, test_2, train_3, test_3)``.
    """
    held_out_users, held_out_items = test_matrix_1.nonzero()

    split_2 = build_split_from_indices(interaction_matrix_2, held_out_users, held_out_items)
    split_3 = build_split_from_indices(interaction_matrix_3, held_out_users, held_out_items)

    return (*split_2, *split_3)

In [22]:
# Split the ALS matrix first, then hold the same entries out of the item-based
# and user-based matrices so all three models are evaluated on the same data.
(
    train_matrix_als,
    test_matrix_als,
    train_matrix_item_based,
    test_matrix_item_based,
    train_matrix_user_based,
    test_matrix_user_based,
) = generate_3_splits(
    interaction_matrix_als,
    interaction_matrix_item_based,
    interaction_matrix_user_based,
    k=1,
    random_state=RANDOM_STATE,
)

In [23]:
# Persist every split matrix, written in the order listed here.
matrices_to_save = {
    'matrices/train_matrix_als_final_prueba2.pkl': train_matrix_als,
    'matrices/test_matrix_als_final_prueba2.pkl': test_matrix_als,
    'matrices/train_matrix_item_based_final_prueba2.pkl': train_matrix_item_based,
    'matrices/test_matrix_item_based_final_prueba2.pkl': test_matrix_item_based,
    'matrices/train_matrix_user_based_final_prueba2.pkl': train_matrix_user_based,
    'matrices/test_matrix_user_based_final_prueba2.pkl': test_matrix_user_based,
}

for matrix_path, matrix_to_save in matrices_to_save.items():
    with open(matrix_path, 'wb') as f:
        pickle.dump(matrix_to_save, f)

In [24]:
# Settings of the ALS model used in the comparison; it is trained on the
# training part of the BM25-weighted matrix only.
als_params = dict(
    factors=600,
    regularization=1,
    iterations=20,
    alpha=9,
    num_threads=1,
    random_state=RANDOM_STATE,
)
als_model = AlternatingLeastSquares(**als_params)
als_model.fit(train_matrix_als)

100%|██████████| 20/20 [14:07<00:00, 42.37s/it]


In [25]:
# Persist the fitted ALS model.
als_model_path = 'models/als_comparation_600_1_9_prueba2.pkl'
with open(als_model_path, 'wb') as f:
    pickle.dump(als_model, f)

In [26]:
# Cosine similarity between the columns (tracks) of the training matrix, kept
# sparse and stored as CSC.
item_based_similarity = csc_matrix(
    cosine_similarity(train_matrix_item_based.T, dense_output=False)
)

# One recommender per approach, each built on its own training matrix.
item_based_recommender = ItemBasedRecommender(
    interaction_matrix=train_matrix_item_based,
    item_similarity=item_based_similarity,
    track_uniques=track_uniques,
    df_music_info=df_music_info,
)
user_based_recommender = UserBasedRecommender(
    interaction_matrix=train_matrix_user_based,
    track_uniques=track_uniques,
    df_music_info=df_music_info,
    num_neighbors=160,
)
als_recommender = ALSRecommender(
    interaction_matrix=train_matrix_als,
    track_uniques=track_uniques,
    df_music_info=df_music_info,
    als_model=als_model,
)

In [27]:
# Reproducible sample of distinct user rows to evaluate
# (pass size=num_users to evaluate every user instead).
rng = np.random.RandomState(RANDOM_STATE)
n_eval_users = 40000
user_indices = rng.choice(train_matrix_als.shape[0], size=n_eval_users, replace=False)

In [None]:
# Held-out items and top-20 predictions of each model, one entry per sampled user.
real_items = []
predicted_items_item_based = []
predicted_items_user_based = []
predicted_items_als = []

# Recommenders paired with the list collecting their predictions; they are
# queried in this order for every user.
recommenders_and_outputs = (
    (item_based_recommender, predicted_items_item_based),
    (user_based_recommender, predicted_items_user_based),
    (als_recommender, predicted_items_als),
)

for user_index in tqdm(user_indices):
    # Ground truth: track ids of the user's non-zero entries in the test matrix.
    real_indexes = test_matrix_als[user_index].nonzero()[1]
    real_items.append(set(track_uniques[real_indexes]))

    for recommender, collected_predictions in recommenders_and_outputs:
        recommender.make_recommendations(user_index, n=20)
        collected_predictions.append(recommender.get_recommendations_ids())

100%|██████████| 40000/40000 [1:40:22<00:00,  6.64it/s]


In [29]:
# MAP@20 of each model, in the order item-based, user-based, ALS.
mapk_item_based, mapk_user_based, mapk_als = (
    mapk(real_items, predictions, k=20)
    for predictions in (predicted_items_item_based, predicted_items_user_based, predicted_items_als)
)

# NDCG@20 of each model, in the same order.
ndcg_item_based, ndcg_user_based, ndcg_als = (
    mean_ndcg(real_items, predictions, k=20)
    for predictions in (predicted_items_item_based, predicted_items_user_based, predicted_items_als)
)

In [30]:
# Summary table: one row per model, one column per metric.
model_names = ['Item-Based', 'User-Based', 'ALS']
map_scores = [mapk_item_based, mapk_user_based, mapk_als]
ndcg_scores = [ndcg_item_based, ndcg_user_based, ndcg_als]

results_df = pd.DataFrame({
    'Model': model_names,
    'MAP@20': map_scores,
    'NDCG@20': ndcg_scores,
})

results_df

Unnamed: 0,Model,MAP@20,NDCG@20
0,Item-Based,0.175067,0.21975
1,User-Based,0.171706,0.210334
2,ALS,0.180535,0.228965
