# 8. Hybrid system

## Data loading

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from tqdm import tqdm
import pickle
from implicit.evaluation import leave_k_out_split
RANDOM_STATE = 123

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the user listening history; later cells read its 'user_id', 'track_id' and 'playcount' columns.
df_users = pd.read_csv('data/User Listening History_modified.csv')

In [3]:
# Load the per-track metadata and audio features (one row per track_id).
df_music = pd.read_csv('data/Million Song Dataset kaggle/Music Info.csv')

In [4]:
df_users.shape

(8332242, 3)

In [5]:
# Number of distinct users present in the listening history.
num_distinct_users = df_users['user_id'].nunique()
num_distinct_users

464573

In [6]:
df_music.head()

Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,TRIOREW128F424EAF0,Mr. Brightside,The Killers,https://p.scdn.co/mp3-preview/4d26180e6961fd46...,09ZQ5TmUG8TSL56n0knqrj,"rock, alternative, indie, alternative_rock, in...",,2004,222200,0.355,...,1,-4.36,1,0.0746,0.00119,0.0,0.0971,0.24,148.114,4
1,TRRIVDJ128F429B0E8,Wonderwall,Oasis,https://p.scdn.co/mp3-preview/d012e536916c927b...,06UfBBDISthj1ZJAtX4xjj,"rock, alternative, indie, pop, alternative_roc...",,2006,258613,0.409,...,2,-4.373,1,0.0336,0.000807,0.0,0.207,0.651,174.426,4
2,TROUVHL128F426C441,Come as You Are,Nirvana,https://p.scdn.co/mp3-preview/a1c11bb1cb231031...,0keNu0t0tqsWtExGM3nT1D,"rock, alternative, alternative_rock, 90s, grunge",RnB,1991,218920,0.508,...,4,-5.783,0,0.04,0.000175,0.000459,0.0878,0.543,120.012,4
3,TRUEIND128F93038C4,Take Me Out,Franz Ferdinand,https://p.scdn.co/mp3-preview/399c401370438be4...,0ancVQ9wEcHVd0RrGICTE4,"rock, alternative, indie, alternative_rock, in...",,2004,237026,0.279,...,9,-8.851,1,0.0371,0.000389,0.000655,0.133,0.49,104.56,4
4,TRLNZBD128F935E4D8,Creep,Radiohead,https://p.scdn.co/mp3-preview/e7eb60e9466bc3a2...,01QoK9DA7VTeTSE3MNzp4I,"rock, alternative, indie, alternative_rock, in...",RnB,2008,238640,0.515,...,7,-9.935,1,0.0369,0.0102,0.000141,0.129,0.104,91.841,4


In [7]:
# Keep only the columns the recommenders need: the track identifier,
# display fields (name, artist) and the 'energy' feature used to pick songs.
df_music_info = df_music[['track_id', 'name', 'artist', 'energy']]
df_music_info.head()

Unnamed: 0,track_id,name,artist,energy
0,TRIOREW128F424EAF0,Mr. Brightside,The Killers,0.918
1,TRRIVDJ128F429B0E8,Wonderwall,Oasis,0.892
2,TROUVHL128F426C441,Come as You Are,Nirvana,0.826
3,TRUEIND128F93038C4,Take Me Out,Franz Ferdinand,0.664
4,TRLNZBD128F935E4D8,Creep,Radiohead,0.43


We load the previously computed cluster assignment for each song.

In [8]:
# Read the cluster file with its first column as the index, then take the
# first remaining column as a Series (track_id -> cluster label).
id_to_cluster = pd.read_csv('data/track_clusters.csv', index_col=0).iloc[:, 0]

In [9]:
id_to_cluster

track_id
TRIOREW128F424EAF0    4
TRRIVDJ128F429B0E8    4
TROUVHL128F426C441    3
TRUEIND128F93038C4    2
TRLNZBD128F935E4D8    2
                     ..
TRQYCFV128F9322F50    3
TRHQCSH128F42724B7    4
TRZRODK128F92D68D7    0
TRGLMEM128F9322F63    3
TRIPFKO128F42383FE    2
Name: 0, Length: 50683, dtype: int64

## Interaction Matrix

In [10]:
# Highest play count each user has for any single track (one row per user).
# The named aggregation already yields the 'max_playcount' column, so no
# follow-up rename is needed.
df_users_agg = (
    df_users
    .groupby('user_id', as_index=False)
    .agg(max_playcount=('playcount', 'max'))
)

In [11]:
# Attach each user's max_playcount to every one of their listening rows.
df_users_rating = df_users.merge(df_users_agg, on='user_id')

In [12]:
from scipy.sparse import csr_matrix

We encode each user and each song as a unique integer ID.

In [13]:
# *_codes hold one integer per row of df_users; *_uniques map each integer
# back to the original identifier (codes are assigned in order of first appearance).
user_codes, user_uniques = pd.factorize(df_users['user_id'])
track_codes, track_uniques = pd.factorize(df_users['track_id'])

In [14]:
from implicit.nearest_neighbours import bm25_weight

# Build the sparse user x track play-count matrix.
# The values are taken from df_users itself because user_codes / track_codes
# were factorized from df_users: using the same frame guarantees that every
# play count is paired with the row/column codes of its own row, instead of
# relying on a merged frame keeping the exact same row order.
interaction_matrix_user_item_original = csr_matrix(
    (df_users['playcount'], (user_codes, track_codes)),
    shape=(len(user_uniques), len(track_uniques))
)

# Re-weight the raw play counts with BM25 and keep the result in CSR format,
# which the recommenders index by user row.
interaction_matrix_user_item = bm25_weight(interaction_matrix_user_item_original, K1=1.2, B=0.75).tocsr()

## Matrix Factorization: Alternating Least Squares (ALS)

In [15]:
from implicit.als import AlternatingLeastSquares

In [16]:
class ALSRecommender:
    """Collaborative-filtering recommender built on an ALS model.

    The recommender produces a ranked candidate list for one user at a time
    (``make_recommendations``) and then hands candidates out one by one,
    preferring tracks whose energy is close to a requested value
    (``recommend_song``).

    Parameters
    ----------
    interaction_matrix : sparse matrix
        User x item matrix; row ``user_index`` is passed to the model as the
        user's already-consumed items.
    track_uniques : index-like
        Maps item column indexes to track ids (indexed with an array of item
        indexes, result converted with ``.tolist()``).
    df_music_info : pandas.DataFrame
        Must contain a ``track_id`` and an ``energy`` column.
    als_model : object, optional
        Already fitted model exposing
        ``recommend(user_index, user_items, N=..., filter_already_liked_items=...)``
        returning ``(item_indexes, scores)``. When omitted, a new
        ``AlternatingLeastSquares`` model is fitted on ``interaction_matrix``.
    """

    def __init__(self, interaction_matrix, track_uniques, df_music_info, als_model=None):
        self.interaction_matrix = interaction_matrix
        self.track_uniques = track_uniques
        self.df_music_info = df_music_info

        if als_model is None:
            self.als_model = AlternatingLeastSquares(factors=100, regularization=0.1, iterations=20, num_threads=-1, random_state=RANDOM_STATE)
            self.als_model.fit(self.interaction_matrix)
        else:
            self.als_model = als_model

        self.user_index = None
        # List of tuples (track_id, energy, similarity, has_been_recommended)
        self.recommendations = None

    def make_recommendations(self, user_index, n=100):
        """Compute and store the top-``n`` candidates for ``user_index``.

        Items the user already interacted with are filtered out by the model.

        Returns
        -------
        list of tuple
            ``(track_id, energy, similarity, has_been_recommended)`` in the
            order returned by the model; the flag starts as ``False``.
        """
        self.user_index = user_index

        user_items = self.interaction_matrix.tocsr()[user_index]
        item_indexes, scores = self.als_model.recommend(
            user_index, user_items, N=n, filter_already_liked_items=True
        )

        track_ids = self.track_uniques[item_indexes].tolist()
        # Look the energies up in model order so they line up with the scores.
        energies = self.df_music_info.set_index('track_id').loc[track_ids, 'energy']

        self.recommendations = [
            (track_id, energy, similarity, False)
            for track_id, energy, similarity in zip(track_ids, energies, scores)
        ]
        return self.recommendations

    def recommend_song(self, energy, energy_margin=0.05):
        """Return the next not-yet-recommended track matching ``energy``.

        Candidates are scanned in ranking order; the first unused one whose
        energy lies within ``energy_margin`` of ``energy`` is returned. If no
        candidate is within the margin, the unused track with the closest
        energy is returned instead. The returned track is flagged as used.

        Returns
        -------
        tuple
            ``(track_id, track_energy)``.

        Raises
        ------
        ValueError
            If ``make_recommendations`` has not been called, or if every
            candidate has already been handed out.
        """
        if self.recommendations is None:
            raise ValueError("No recommendations available. Please call make_recommendations first.")

        closest_track_index = None
        distance_to_energy = float('inf')

        for i, (track_id, track_energy, similarity, has_been_recommended) in enumerate(self.recommendations):
            if has_been_recommended:
                continue

            distance = abs(track_energy - energy)

            if distance <= energy_margin:
                self.recommendations[i] = (track_id, track_energy, similarity, True)
                return (track_id, track_energy)

            if distance < distance_to_energy:
                closest_track_index = i
                distance_to_energy = distance

        if closest_track_index is not None:
            # Re-read the chosen entry so its own similarity is preserved; the
            # loop variable still holds the similarity of the last scanned item.
            track_id, track_energy, similarity, _ = self.recommendations[closest_track_index]
            self.recommendations[closest_track_index] = (track_id, track_energy, similarity, True)
            return (track_id, track_energy)

        raise ValueError("All recommendations have already been recommended")

    def get_recommendations(self):
        """Return the stored candidate tuples; raise ValueError if none exist yet."""
        if self.recommendations is None:
            raise ValueError("No recommendations available. Please call make_recommendations first.")
        return self.recommendations

    def get_recommendations_ids(self):
        """Return the candidate track ids in ranking order; raise ValueError if none exist yet."""
        if self.recommendations is None:
            raise ValueError("No recommendations available. Please call make_recommendations first.")
        return [track_id for track_id, _, _, _ in self.recommendations]

    def get_recommendations_info(self):
        """Return the ``df_music_info`` rows of the candidates, in ranking order.

        Raises
        ------
        ValueError
            If ``make_recommendations`` has not been called.
        """
        if self.recommendations is None:
            raise ValueError("No recommendations available. Please call make_recommendations first.")
        track_ids = [track_id for track_id, _, _, _ in self.recommendations]
        df_ordered = self.df_music_info.set_index('track_id').loc[track_ids].reset_index()
        return df_ordered

## Kmeans content-based filtering

In [17]:
class KmeansContentBasedRecommender:
    """Content-based component that profiles a user by track clusters.

    Parameters
    ----------
    id_to_cluster : pandas.Series
        Cluster label of each track, indexed by track id.
    """

    def __init__(self, id_to_cluster):
        self.id_to_cluster = id_to_cluster
        self.recommendations = None  # kept for backward compatibility; not used by this class
        # Share of the user's history per cluster; set by make_cluster_recommendation.
        self.recommended_cluster = None

    def make_cluster_recommendation(self, user_history):
        """Compute the fraction of ``user_history`` falling in each cluster.

        Parameters
        ----------
        user_history : list-like of track ids
            Every id must be present in ``id_to_cluster``.

        Returns
        -------
        pandas.Series
            Indexed by cluster label; each value is the number of history
            tracks in that cluster divided by the history length. Clusters the
            user never listened to are absent.
        """
        clusters = self.id_to_cluster[user_history]
        cluster_counts = clusters.value_counts()
        self.recommended_cluster = cluster_counts / len(clusters)
        return self.recommended_cluster

    def get_recommended_cluster(self):
        """Return the last computed cluster shares.

        Raises
        ------
        ValueError
            If ``make_cluster_recommendation`` has not been called yet.
        """
        if self.recommended_cluster is None:
            raise ValueError("No cluster recommendation available. Please call make_cluster_recommendation first.")
        return self.recommended_cluster
    

## Hybrid recommender

In [18]:
class HybridRecommender:
    """Hybrid recommender: ALS candidates re-ranked by cluster preferences.

    The collaborative recommender supplies candidates with a similarity
    score. Each score is then increased by ``alpha`` times the share of the
    user's listening history that falls in the candidate's cluster, and the
    candidates are re-sorted by the boosted score.

    Parameters
    ----------
    interaction_matrix, track_uniques
        Only used to build a default ``ALSRecommender`` when
        ``als_recommender`` is not given.
    df_music_info : pandas.DataFrame
        Track metadata with a ``track_id`` column.
    df_users : pandas.DataFrame
        Listening history with ``user_id`` and ``track_id`` columns. A user
        index is resolved to a user id through the order of first appearance
        in ``df_users['user_id']``. The complete history found here is used
        to compute cluster preferences, so if it also contains interactions
        held out for evaluation they will influence the ranking.
    id_to_cluster : pandas.Series
        Cluster label of each track, indexed by track id.
    als_recommender : optional
        Object exposing ``make_recommendations(user_index, n)``.
    content_based_recommender : optional
        Object exposing ``make_cluster_recommendation(user_history)``.
    alpha : number
        Weight of the cluster share added to the collaborative score.
    """

    def __init__(self, interaction_matrix, track_uniques, df_music_info, df_users, id_to_cluster, als_recommender = None, content_based_recommender = None, alpha = 2):
        if als_recommender is not None:
            self.collaborative_als_recommender = als_recommender
        else:
            self.collaborative_als_recommender = ALSRecommender(interaction_matrix, track_uniques, df_music_info)

        if content_based_recommender is not None:
            self.content_based_recommender = content_based_recommender
        else:
            self.content_based_recommender = KmeansContentBasedRecommender(id_to_cluster)

        self.df_music_info = df_music_info
        self.df_users = df_users
        self.id_to_cluster = id_to_cluster
        self.alpha = alpha  # Controls the influence of the content-based cluster share
        # List of tuples (track_id, energy, similarity, has_been_recommended)
        self.recommendations = None
        # Distinct user ids in order of first appearance; filled on first use so
        # the full user column is not re-scanned on every call.
        self._user_ids = None

    def make_recommendations(self, user_index, n=100):
        """Compute, store and return the hybrid ranking for ``user_index``.

        The ``n`` collaborative candidates are kept as they are; only their
        scores (and therefore their order) change.

        Returns
        -------
        list of tuple
            ``(track_id, energy, boosted_similarity, has_been_recommended)``
            sorted by boosted similarity, highest first.
        """
        if self._user_ids is None:
            self._user_ids = self.df_users['user_id'].unique()

        user_id = self._user_ids[user_index]
        user_history = self.df_users[self.df_users['user_id'] == user_id]['track_id']

        collaborative_recommendations = self.collaborative_als_recommender.make_recommendations(user_index, n)
        cluster_preferences = self.content_based_recommender.make_cluster_recommendation(user_history)

        boosted = []
        for track_id, energy, similarity, has_been_recommended in collaborative_recommendations:
            # No boost when the track has no cluster assignment or when the
            # user never listened to a track from that cluster.
            cluster_presence = 0
            song_cluster = self.id_to_cluster.get(track_id)
            if song_cluster is not None and song_cluster in cluster_preferences.index:
                cluster_presence = cluster_preferences.loc[song_cluster]

            # boosted score = collaborative score + cluster share * alpha
            boosted.append((track_id, energy, similarity + cluster_presence * self.alpha, has_been_recommended))

        self.recommendations = sorted(boosted, key=lambda rec: rec[2], reverse=True)
        return self.recommendations

    def make_recommendations_only_collaborative(self, user_index, n=100):
        """Store and return the plain collaborative ranking (no cluster boost)."""
        self.recommendations = self.collaborative_als_recommender.make_recommendations(user_index, n)
        return self.recommendations

    def recommend_song(self, energy, energy_margin=0.05):
        """Return the next not-yet-recommended track matching ``energy``.

        Candidates are scanned in ranking order; the first unused one whose
        energy lies within ``energy_margin`` of ``energy`` is returned. If no
        candidate is within the margin, the unused track with the closest
        energy is returned instead. The returned track is flagged as used.

        Returns
        -------
        tuple or None
            ``(track_id, track_energy)``, or ``None`` once every candidate has
            already been handed out.

        Raises
        ------
        ValueError
            If no ranking has been computed yet.
        """
        if self.recommendations is None:
            raise ValueError("No recommendations available. Please call make_recommendations first.")

        closest_track_index = None
        distance_to_energy = float('inf')

        for i, (track_id, track_energy, similarity, has_been_recommended) in enumerate(self.recommendations):
            if has_been_recommended:
                continue

            distance = abs(track_energy - energy)

            if distance <= energy_margin:
                self.recommendations[i] = (track_id, track_energy, similarity, True)
                return (track_id, track_energy)

            if distance < distance_to_energy:
                closest_track_index = i
                distance_to_energy = distance

        if closest_track_index is not None:
            # Re-read the chosen entry so its own score is preserved; the loop
            # variable still holds the score of the last scanned item.
            track_id, track_energy, similarity, _ = self.recommendations[closest_track_index]
            self.recommendations[closest_track_index] = (track_id, track_energy, similarity, True)
            return (track_id, track_energy)

        return None

    def get_recommendations(self):
        """Return the stored candidate tuples; raise ValueError if none exist yet."""
        if self.recommendations is None:
            raise ValueError("No recommendations available. Please call make_recommendations first.")
        return self.recommendations

    def get_recommendations_ids(self):
        """Return the candidate track ids in ranking order; raise ValueError if none exist yet."""
        if self.recommendations is None:
            raise ValueError("No recommendations available. Please call make_recommendations first.")
        return [track_id for track_id, _, _, _ in self.recommendations]

    def get_recommendations_info(self):
        """Return the ``df_music_info`` rows of the candidates, in ranking order.

        Raises
        ------
        ValueError
            If no ranking has been computed yet.
        """
        if self.recommendations is None:
            raise ValueError("No recommendations available. Please call make_recommendations first.")
        track_ids = [track_id for track_id, _, _, _ in self.recommendations]
        df_ordered = self.df_music_info.set_index('track_id').loc[track_ids].reset_index()
        return df_ordered

### Example

In [19]:
user_index = 0  # User for which recommendations will be made
# Show the metadata of the tracks this user has listened to.
user_listened_songs = df_users[df_users['user_id'] == user_uniques[user_index]].track_id
df_music_info[df_music_info['track_id'].isin(user_listened_songs)]

Unnamed: 0,track_id,name,artist,energy
79,TRAAHSY128F147BB5C,Speed of Sound,Coldplay,0.898
796,TRDLMWP128F426BF6C,Ragged Wood,Fleet Foxes,0.685
1158,TRPUGUW128F426BF6F,He Doesn't Know Why,Fleet Foxes,0.558
1743,TRRYCBO128F932A2C7,Love Shack,The B-52's,0.902
2350,TRFUPBA128F934F7E1,Paper Gangsta,Lady Gaga,0.53
2585,TRVODUZ128F934D094,All That We Perceive,Thievery Corporation,0.834
2722,TRHDDQG12903CB53EE,Heaven's Gonna Burn Your Eyes,Thievery Corporation,0.337
5416,TRTUCUY128F92E1D24,Stacked Actors,Foo Fighters,0.934
5428,TRLXSNR128F429361D,Generator,Foo Fighters,0.924
5819,TRADVZX128F426BF79,Sun Giant,Fleet Foxes,0.0626


In [20]:
# als_model = AlternatingLeastSquares(factors=600, regularization=1, iterations=20, num_threads=1, random_state=RANDOM_STATE)
# als_model.fit(interaction_matrix_user_item)

# with open('models/als_model.pkl', 'wb') as f:
#     pickle.dump(als_model, f)

In [21]:
# Load the pre-trained ALS model.
# NOTE: pickle.load can execute arbitrary code from the file; only load model files you trust.
with open('models/als_model.pkl', 'rb') as f:
    als_model = pickle.load(f)

In [22]:
# Wrap the already fitted model; passing als_model skips fitting a new one.
als_recommender = ALSRecommender(interaction_matrix_user_item, track_uniques, df_music_info, als_model=als_model)

In [23]:
# Baseline: rank the example user's candidates with the collaborative (ALS) scores only.
hybrid_recommender = HybridRecommender(interaction_matrix_user_item, track_uniques, df_music_info, df_users, id_to_cluster, als_recommender=als_recommender)
hybrid_recommender.make_recommendations_only_collaborative(user_index, n=100)
hybrid_recommender.get_recommendations_info().head(20)

Unnamed: 0,track_id,name,artist,energy
0,TRUJOHU128F424E6A6,White Winter Hymnal,Fleet Foxes,0.497
1,TRGEIDA128F933B4B8,Tiger Mountain Peasant Song,Fleet Foxes,0.146
2,TRGYEZE128F92F52B2,Great Indoors,John Mayer,0.617
3,TRDJRES128F933B4BA,Quiet Houses,Fleet Foxes,0.668
4,TRVRIBZ128F426BF71,Your Protector,Fleet Foxes,0.537
5,TRSBMHN128F426BF7E,Innocent Son,Fleet Foxes,0.0201
6,TRKABMK128F426BF72,Meadowlarks,Fleet Foxes,0.166
7,TRQEMRN128F933B4B2,Sun It Rises,Fleet Foxes,0.275
8,TRKOHVA128F426BF70,Heard Them Stirring,Fleet Foxes,0.47
9,TRORBHC128F42AD0F4,No Way Back,Foo Fighters,0.957


In [24]:
# Hybrid ranking: the same ALS candidates, re-sorted after adding the cluster-preference boost.
hybrid_recommender.make_recommendations(user_index, n=100)
hybrid_recommender.get_recommendations_info().head(20)

Unnamed: 0,track_id,name,artist,energy
0,TRUJOHU128F424E6A6,White Winter Hymnal,Fleet Foxes,0.497
1,TRGYEZE128F92F52B2,Great Indoors,John Mayer,0.617
2,TRLZYYR128F42671E1,Pida Me La,Gipsy Kings,0.949
3,TRLPCKY128F42ACFB8,DOA,Foo Fighters,0.914
4,TRVCFUI128F92E1D42,Live-In Skin,Foo Fighters,0.967
5,TRGEIDA128F933B4B8,Tiger Mountain Peasant Song,Fleet Foxes,0.146
6,TRSLJOC128F42889D5,The Sharing Song,Jack Johnson,0.427
7,TRDBQXT128F4289423,Wattershed,Foo Fighters,0.967
8,TRKYRGV128F428941E,Weenie Beenie,Foo Fighters,0.993
9,TRJDTBE128F4289421,For All the Cows,Foo Fighters,0.542


### Evaluation

In [25]:
def apk(real, predicted, k=20):
    """Average precision at ``k`` for a single user.

    Only the first ``k`` predictions are considered and a repeated prediction
    is counted as a hit only once. The precision sum is divided by
    ``min(len(real), k)``; the result is 0.0 when ``real`` is empty.
    """
    candidates = predicted[:k] if len(predicted) > k else predicted

    precision_sum = 0.0
    hit_count = 0
    for rank, item in enumerate(candidates, start=1):
        # A hit must be relevant and must not have appeared at an earlier rank.
        is_new_hit = item in real and item not in candidates[:rank - 1]
        if is_new_hit:
            hit_count += 1
            precision_sum += hit_count / rank

    if not real:
        return 0.0
    return precision_sum / min(len(real), k)


def mapk(actual_list, predicted_list, k=10):
    """Mean of ``apk`` over paired (actual, predicted) entries, truncated at ``k``."""
    per_user_scores = []
    for real, predicted in zip(actual_list, predicted_list):
        per_user_scores.append(apk(real, predicted, k))
    return np.mean(per_user_scores)

def dcg(relevance_scores, k):
    """Discounted cumulative gain of the first ``k`` relevance scores.

    The score at 1-based rank ``r`` is divided by ``log2(r + 1)``.
    Returns 0.0 for an empty input.
    """
    gains = np.asarray(relevance_scores, dtype=float)[:k]
    if gains.size == 0:
        return 0.0
    discounts = np.log2(np.arange(2, gains.size + 2))
    return np.sum(gains / discounts)


def ndcg(actual, predicted, k=10):
    """Normalized DCG at ``k`` with binary relevance for a single user.

    Parameters
    ----------
    actual : collection
        Relevant items (must support ``in`` and ``len``).
    predicted : sequence
        Ranked predictions; only the first ``k`` are scored.

    Returns
    -------
    float
        DCG of the prediction divided by the DCG of an ideal ranking that
        places ``min(len(actual), k)`` relevant items first; 0.0 when there
        are no relevant items.
    """
    top_k = predicted[:k]
    # A repeated prediction is only credited once, matching apk().
    relevance_scores = [
        1 if (item in actual and item not in top_k[:position]) else 0
        for position, item in enumerate(top_k)
    ]

    # The ideal ranking depends on how many relevant items exist, not on how
    # many the prediction happened to retrieve; otherwise any list whose hits
    # come first would score 1.0 regardless of the relevant items it missed.
    ideal_hits = min(len(actual), k)

    actual_dcg = dcg(relevance_scores, k)
    ideal_dcg = dcg([1] * ideal_hits, k)

    return actual_dcg / ideal_dcg if ideal_dcg > 0 else 0.0


def mean_ndcg(actual_list, predicted_list, k=10):
    """Mean of ``ndcg`` over paired (actual, predicted) entries, truncated at ``k``."""
    per_user_scores = []
    for real, predicted in zip(actual_list, predicted_list):
        per_user_scores.append(ndcg(real, predicted, k))
    return np.mean(per_user_scores)


In [26]:
# train_matrix, test_matrix = leave_k_out_split(interaction_matrix_user_item, K=1, random_state=RANDOM_STATE)

In [27]:
# als_model_ev = AlternatingLeastSquares(factors=10, regularization=1, iterations=20, alpha=9, num_threads=1, random_state=RANDOM_STATE)
# als_model_ev.fit(train_matrix)

In [28]:
# Load the previously saved train / test interaction matrices.
# NOTE(review): presumably produced by the commented-out leave_k_out_split call — confirm.
# NOTE: pickle.load can execute arbitrary code from the file; only load files you trust.
with open('matrices/train_matrix_als_final_prueba2.pkl', 'rb') as f:
    train_matrix = pickle.load(f)

with open('matrices/test_matrix_als_final_prueba2.pkl', 'rb') as f:
    test_matrix = pickle.load(f)

In [29]:
# Load the ALS model used for evaluation.
# NOTE(review): presumably fitted on train_matrix only — confirm against how the pickle was produced.
# NOTE: pickle.load can execute arbitrary code from the file; only load files you trust.
with open('models/als_comparation_600_1_9_prueba2.pkl', 'rb') as f:
    als_model_ev = pickle.load(f)

We evaluate the system recommending 20 songs and then calculating MAP@20 and NDCG@20.

In [34]:
# Draw a reproducible sample of 4000 distinct user rows to keep the evaluation loop tractable.
rng = np.random.RandomState(RANDOM_STATE)
user_indices = rng.choice(train_matrix.shape[0], size=4000, replace=False) #  size=num_users

In [35]:
# Both rankings are produced through the same hybrid recommender, which wraps
# the pre-trained evaluation ALS model.
# NOTE(review): the hybrid recommender receives the full df_users, so its cluster
# preferences are computed from a history that may include the held-out test
# interactions — confirm whether that leakage is acceptable.
als_recommender_ev = ALSRecommender(train_matrix, track_uniques, df_music_info, als_model=als_model_ev)
hybrid_recommender_ev = HybridRecommender(train_matrix, track_uniques, df_music_info, df_users, id_to_cluster, als_recommender=als_recommender_ev)

real_items = []
predicted_items_als = []
predicted_items_hybrid = []

for user_index in tqdm(user_indices):
    # Ground truth: track ids of the non-zero entries in the user's test row.
    real_indexes = test_matrix[user_index].nonzero()[1]
    real_items.append(set(track_uniques[real_indexes]))

    # The ids must be read right after each call: the next call overwrites
    # the recommender's stored ranking.
    hybrid_recommender_ev.make_recommendations_only_collaborative(user_index, n=20)
    predicted_items_als.append(hybrid_recommender_ev.get_recommendations_ids())

    # With n=20 the hybrid step can only reorder the same 20 ALS candidates.
    hybrid_recommender_ev.make_recommendations(user_index, n=20)
    predicted_items_hybrid.append(hybrid_recommender_ev.get_recommendations_ids())

100%|██████████| 4000/4000 [1:26:46<00:00,  1.30s/it]


In [36]:
# Score both rankings against the same held-out items, truncated at 20.
mapk_als, ndcg_als = (
    metric(real_items, predicted_items_als, k=20) for metric in (mapk, mean_ndcg)
)
mapk_hybrid, ndcg_hybrid = (
    metric(real_items, predicted_items_hybrid, k=20) for metric in (mapk, mean_ndcg)
)

In [37]:
# One row per model with its MAP@20 and NDCG@20.
results_df = pd.DataFrame(
    [
        ('ALS', mapk_als, ndcg_als),
        ('Hybrid', mapk_hybrid, ndcg_hybrid),
    ],
    columns=['Model', 'MAP@20', 'NDCG@20'],
)

results_df

Unnamed: 0,Model,MAP@20,NDCG@20
0,ALS,0.179521,0.226388
1,Hybrid,0.167891,0.217251


In the evaluation above each system produced exactly 20 recommendations. Because the hybrid system only re-ranks the candidates returned by ALS, both systems recommended the same 20 songs, just in a different order. To allow the top 20 songs to differ, we now generate 50 recommendations with each model and evaluate MAP@20 and NDCG@20 on the first 20 of them.

In [30]:
# Draw a reproducible sample of 4000 distinct user rows.
# The seed differs from the one used for the 20-candidate run, so this is a different sample of users.
rng = np.random.RandomState(RANDOM_STATE + 1)
user_indices = rng.choice(train_matrix.shape[0], size=4000, replace=False)

In [31]:
# Both rankings are produced through the same hybrid recommender, which wraps
# the pre-trained evaluation ALS model.
# NOTE(review): the hybrid recommender receives the full df_users, so its cluster
# preferences are computed from a history that may include the held-out test
# interactions — confirm whether that leakage is acceptable.
als_recommender_ev = ALSRecommender(train_matrix, track_uniques, df_music_info, als_model=als_model_ev)
hybrid_recommender_ev = HybridRecommender(train_matrix, track_uniques, df_music_info, df_users, id_to_cluster, als_recommender=als_recommender_ev)

real_items = []
predicted_items_als = []
predicted_items_hybrid = []

for user_index in tqdm(user_indices):
    # Ground truth: track ids of the non-zero entries in the user's test row.
    real_indexes = test_matrix[user_index].nonzero()[1]
    real_items.append(set(track_uniques[real_indexes]))

    # The ids must be read right after each call: the next call overwrites
    # the recommender's stored ranking.
    hybrid_recommender_ev.make_recommendations_only_collaborative(user_index, n=50)
    predicted_items_als.append(hybrid_recommender_ev.get_recommendations_ids())

    # 50 candidates are re-ranked, so the first 20 (the ones scored below)
    # can differ from the ALS top 20.
    hybrid_recommender_ev.make_recommendations(user_index, n=50)
    predicted_items_hybrid.append(hybrid_recommender_ev.get_recommendations_ids())

100%|██████████| 4000/4000 [1:23:52<00:00,  1.26s/it]


In [32]:
# Score the first 20 of the 50 candidates from each ranking against the held-out items.
mapk_als, ndcg_als = (
    metric(real_items, predicted_items_als, k=20) for metric in (mapk, mean_ndcg)
)
mapk_hybrid, ndcg_hybrid = (
    metric(real_items, predicted_items_hybrid, k=20) for metric in (mapk, mean_ndcg)
)

In [33]:
# One row per model with its MAP@20 and NDCG@20 (50-candidate run).
results_df = pd.DataFrame(
    [
        ('ALS', mapk_als, ndcg_als),
        ('Hybrid', mapk_hybrid, ndcg_hybrid),
    ],
    columns=['Model', 'MAP@20', 'NDCG@20'],
)

results_df

Unnamed: 0,Model,MAP@20,NDCG@20
0,ALS,0.182224,0.230703
1,Hybrid,0.170413,0.218951
