# Hybrid system

- TODO: work on random subsets of the data.
- TODO: consider item-based collaborative filtering, since it scales better; user-based filtering needs the similarity between every pair of users.

## Data loading

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
RANDOM_STATE = 123

In [2]:
# Disabled alternative: the raw triplets file. Note it would name the columns
# 'song_id' / 'play_count', whereas the cells below expect 'track_id' / 'playcount'.
#df_users = pd.read_csv('data/train_triplets.txt', sep='\t', header=None, names=['user_id', 'song_id', 'play_count'])
# Listening history: one row per (user, track) pair with its play count.
df_users = pd.read_csv('data/User Listening History_modified.csv')

In [3]:
# Track metadata and audio features, keyed by 'track_id'.
df_music = pd.read_csv('data/Million Song Dataset kaggle/Music Info.csv')

In [4]:
df_users.shape

(8332242, 3)

In [5]:
# Number of distinct users in the listening history.
num_distinct_users = df_users['user_id'].nunique()
num_distinct_users

464573

In [6]:
df_music.head()

Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,TRIOREW128F424EAF0,Mr. Brightside,The Killers,https://p.scdn.co/mp3-preview/4d26180e6961fd46...,09ZQ5TmUG8TSL56n0knqrj,"rock, alternative, indie, alternative_rock, in...",,2004,222200,0.355,...,1,-4.36,1,0.0746,0.00119,0.0,0.0971,0.24,148.114,4
1,TRRIVDJ128F429B0E8,Wonderwall,Oasis,https://p.scdn.co/mp3-preview/d012e536916c927b...,06UfBBDISthj1ZJAtX4xjj,"rock, alternative, indie, pop, alternative_roc...",,2006,258613,0.409,...,2,-4.373,1,0.0336,0.000807,0.0,0.207,0.651,174.426,4
2,TROUVHL128F426C441,Come as You Are,Nirvana,https://p.scdn.co/mp3-preview/a1c11bb1cb231031...,0keNu0t0tqsWtExGM3nT1D,"rock, alternative, alternative_rock, 90s, grunge",RnB,1991,218920,0.508,...,4,-5.783,0,0.04,0.000175,0.000459,0.0878,0.543,120.012,4
3,TRUEIND128F93038C4,Take Me Out,Franz Ferdinand,https://p.scdn.co/mp3-preview/399c401370438be4...,0ancVQ9wEcHVd0RrGICTE4,"rock, alternative, indie, alternative_rock, in...",,2004,237026,0.279,...,9,-8.851,1,0.0371,0.000389,0.000655,0.133,0.49,104.56,4
4,TRLNZBD128F935E4D8,Creep,Radiohead,https://p.scdn.co/mp3-preview/e7eb60e9466bc3a2...,01QoK9DA7VTeTSE3MNzp4I,"rock, alternative, indie, alternative_rock, in...",RnB,2008,238640,0.515,...,7,-9.935,1,0.0369,0.0102,0.000141,0.129,0.104,91.841,4


In [7]:
# Keep only the identifying columns plus 'energy', the audio feature the
# recommenders below use when picking a song for a target energy level.
df_music_info = df_music[['track_id', 'name', 'artist', 'energy']]
df_music_info.head()

Unnamed: 0,track_id,name,artist,energy
0,TRIOREW128F424EAF0,Mr. Brightside,The Killers,0.918
1,TRRIVDJ128F429B0E8,Wonderwall,Oasis,0.892
2,TROUVHL128F426C441,Come as You Are,Nirvana,0.826
3,TRUEIND128F93038C4,Take Me Out,Franz Ferdinand,0.664
4,TRLNZBD128F935E4D8,Creep,Radiohead,0.43


We load the previously computed cluster assigned to each song.

In [8]:
# The first CSV column becomes the index and the first remaining column is kept,
# giving a Series that maps track_id -> cluster label.
id_to_cluster = pd.read_csv('data/track_clusters.csv', index_col=0).iloc[:, 0]

In [9]:
id_to_cluster

track_id
TRIOREW128F424EAF0    4
TRRIVDJ128F429B0E8    4
TROUVHL128F426C441    3
TRUEIND128F93038C4    2
TRLNZBD128F935E4D8    2
                     ..
TRQYCFV128F9322F50    3
TRHQCSH128F42724B7    4
TRZRODK128F92D68D7    0
TRGLMEM128F9322F63    3
TRIPFKO128F42383FE    2
Name: 0, Length: 50683, dtype: int64

## Interaction Matrix

In [10]:
# Per-user maximum play count, used below to normalise each play count into a
# relative rating. Named aggregation already produces the 'max_playcount'
# column, so no rename is needed afterwards.
df_users_agg = (
    df_users
    .groupby('user_id')['playcount']
    .agg(max_playcount='max')
    .reset_index()
)

In [11]:
# Attach each user's maximum play count to every listening row.
# how='left' keeps exactly the rows of df_users, in their original order, so the
# result stays positionally aligned with anything derived from df_users.
# validate='many_to_one' fails loudly if df_users_agg ever holds duplicate users.
df_users_rating = df_users.merge(df_users_agg, on='user_id', how='left', validate='many_to_one')

In [12]:
# Relative rating: play count divided by the user's own maximum play count,
# so each user's most played track gets 1.0.
# NOTE(review): the interaction matrix built below uses the raw 'playcount',
# not this 'rating' column — confirm which one is intended.
df_users_rating['rating'] = df_users_rating['playcount'] / df_users_rating['max_playcount']

In [13]:
from scipy.sparse import csr_matrix, csc_matrix

We encode each user and each song as a unique integer ID.

In [14]:
# pd.factorize returns one integer code per row plus the array of unique labels;
# codes index into the uniques, which are listed in order of first appearance.
# The codes are used as the row / column coordinates of the interaction matrix.
user_codes, user_uniques = pd.factorize(df_users['user_id'])
track_codes, track_uniques = pd.factorize(df_users['track_id'])

In [15]:
# Confidence weighting c = 1 + ALPHA * r, with ALPHA = 40 as suggested in
# Hu, Koren & Volinsky (2008), "Collaborative Filtering for Implicit Feedback Datasets".
ALPHA = 40

# Take the play counts from df_users itself: user_codes / track_codes were
# factorized from df_users, so values and coordinates are guaranteed to be in
# the same row order (the merged df_users_rating frame gives no such guarantee).
confidence_values = (1 + ALPHA * df_users['playcount']).to_numpy()

# Sparse user x item matrix; rows follow user_uniques, columns follow track_uniques.
interaction_matrix_user_item = csr_matrix(
    (confidence_values, (user_codes, track_codes)),
    shape=(len(user_uniques), len(track_uniques))
)

## Matrix Factorization: Alternating Least Squares (ALS)

In [16]:
from implicit.als import AlternatingLeastSquares

In [17]:
class ALSRecommender:
    """Collaborative-filtering recommender built on an implicit-feedback ALS model.

    The recommender keeps the last list of recommendations produced by
    ``make_recommendations`` as tuples
    ``(track_id, energy, similarity, has_been_recommended)`` and can then hand
    them out one by one, matching a requested energy level.
    """

    def __init__(self, interaction_matrix, track_uniques, df_music_info, als_model=None):
        """Store the data and either reuse or train the ALS model.

        Args:
            interaction_matrix: Sparse user x item matrix passed to the model.
            track_uniques: Index-like mapping item column -> track_id.
            df_music_info: DataFrame with at least 'track_id' and 'energy' columns.
            als_model: Optional already fitted model. When omitted, a new
                AlternatingLeastSquares model is created and fitted on
                ``interaction_matrix``.
        """
        self.interaction_matrix = interaction_matrix
        self.track_uniques = track_uniques
        self.df_music_info = df_music_info

        if als_model is None:
            # NOTE(review): implicit documents num_threads=0 as "use all cores";
            # confirm that -1 is accepted by the installed version.
            self.als_model = AlternatingLeastSquares(factors=50, regularization=0.1, iterations=20, num_threads=-1, random_state=RANDOM_STATE)
            self.als_model.fit(self.interaction_matrix)
        else:
            self.als_model = als_model

        self.user_index = None
        self.recommendations = None  # List of tuples (track_id, energy, similarity, has been recommended)

    def _require_recommendations(self):
        """Return the stored recommendations or raise if none were computed yet."""
        if self.recommendations is None:
            raise ValueError("No recommendations available. Please call make_recommendations first.")
        return self.recommendations

    def make_recommendations(self, user_index, n=100):
        """Compute and store the top ``n`` recommendations for a user.

        Args:
            user_index: Row of the interaction matrix identifying the user.
            n: Number of items requested from the model.

        Returns:
            The stored list of ``(track_id, energy, similarity, False)`` tuples,
            in the order returned by the model.
        """
        self.user_index = user_index

        user_items = self.interaction_matrix.tocsr()[user_index]
        item_indexes, scores = self.als_model.recommend(user_index, user_items, N=n, filter_already_liked_items=True)

        track_ids = self.track_uniques[item_indexes].tolist()

        # Look the energies up in the same order as the model's ranking.
        df_filtered = self.df_music_info.set_index('track_id').loc[track_ids][['energy']].reset_index()

        self.recommendations = [
            (track_id, energy, similarity, False)
            for (track_id, energy), similarity in zip(df_filtered.itertuples(index=False, name=None), scores)
        ]

        return self.recommendations

    def recommend_song(self, energy, energy_margin=0.05):
        """Pick the next not-yet-recommended track for a target energy.

        The first unused track (in ranking order) whose energy lies within
        ``energy_margin`` of ``energy`` wins; if there is none, the unused
        track with the closest energy is chosen. The chosen entry is flagged
        as recommended and keeps its own similarity score.

        Returns:
            Tuple ``(track_id, track_energy)``.

        Raises:
            ValueError: If no recommendations were computed yet, or all of
                them have already been handed out.
        """
        recommendations = self._require_recommendations()

        chosen_index = None
        best_distance = float('inf')

        for i, (_, track_energy, _, has_been_recommended) in enumerate(recommendations):
            if has_been_recommended:
                continue

            distance = abs(track_energy - energy)

            if distance <= energy_margin:
                # Good enough: take the highest ranked match right away.
                chosen_index = i
                break

            if distance < best_distance:
                chosen_index = i
                best_distance = distance

        if chosen_index is None:
            raise ValueError("All recommendations have already been recommended")

        # Re-read the chosen tuple so its own similarity is preserved.
        track_id, track_energy, similarity, _ = recommendations[chosen_index]
        recommendations[chosen_index] = (track_id, track_energy, similarity, True)
        return (track_id, track_energy)

    def get_recommendations(self):
        """Return the stored recommendation tuples.

        Raises:
            ValueError: If make_recommendations has not been called.
        """
        return self._require_recommendations()

    def get_recommendations_ids(self):
        """Return the track ids of the stored recommendations, in ranking order.

        Raises:
            ValueError: If make_recommendations has not been called.
        """
        return [track_id for track_id, _, _, _ in self._require_recommendations()]

    def get_recommendations_info(self):
        """Return the df_music_info rows of the recommended tracks, in ranking order.

        Raises:
            ValueError: If make_recommendations has not been called.
        """
        track_ids = [track_id for track_id, _, _, _ in self._require_recommendations()]
        df_ordered = self.df_music_info.set_index('track_id').loc[track_ids].reset_index()
        return df_ordered

## Kmeans content-based filtering

In [18]:
class KmeansContentBasedRecommender: #For user history
    """Content-based profile of a user expressed as shares of track clusters.

    Given the tracks a user listened to, it computes which fraction of them
    belongs to each cluster of ``id_to_cluster``.
    """

    def __init__(self, id_to_cluster):
        """
        Args:
            id_to_cluster: Series mapping track_id -> cluster label.
        """
        self.id_to_cluster = id_to_cluster
        # Result of the last make_cluster_recommendation call (None until then).
        self.recommended_cluster = None
        # Kept only for backward compatibility; this class never fills it.
        self.recommendations = None

    def make_cluster_recommendation(self, user_history):
        """Compute and store the share of each cluster in the user's history.

        Args:
            user_history: Track ids listened to by the user; each one is
                looked up in ``id_to_cluster``.

        Returns:
            Series indexed by cluster label whose values are the number of
            history tracks in that cluster divided by the history length.
        """
        clusters = self.id_to_cluster[user_history]
        cluster_counts = clusters.value_counts()
        self.recommended_cluster = cluster_counts / len(clusters)
        return self.recommended_cluster

    def get_recommended_cluster(self):
        """Return the cluster shares computed by the last call.

        Raises:
            ValueError: If make_cluster_recommendation has not been called.
        """
        if self.recommended_cluster is None:
            raise ValueError("No cluster recommendation available. Please call make_cluster_recommendation first.")
        return self.recommended_cluster
    

## Hybrid recommender

In [35]:
class HybridRecommender:
    """Re-ranks collaborative (ALS) recommendations with content-based cluster preferences.

    Each collaborative score is multiplied by the share of the user's listening
    history that falls in the recommended track's cluster, so tracks from
    clusters the user rarely listens to are pushed down the list.
    """

    # Multiplier used when the track's cluster is absent from the user's cluster preferences.
    DEFAULT_CLUSTER_PRESENCE = 0.01

    def __init__(self, interaction_matrix, track_uniques, df_music_info, df_users, id_to_cluster, als_recommender=None, content_based_recommender=None, alpha=1):
        """
        Args:
            interaction_matrix: Sparse user x item matrix; only used to build
                an ALSRecommender when ``als_recommender`` is not given.
            track_uniques: Mapping item column -> track_id; only used to build
                an ALSRecommender when ``als_recommender`` is not given.
            df_music_info: DataFrame with at least a 'track_id' column.
            df_users: Listening history with 'user_id' and 'track_id' columns.
            id_to_cluster: Series mapping track_id -> cluster label.
            als_recommender: Optional ready-made collaborative recommender.
            content_based_recommender: Optional ready-made content-based recommender.
            alpha: Constant factor applied to every hybrid score.
        """
        if als_recommender is not None:
            self.collaborative_als_recommender = als_recommender
        else:
            self.collaborative_als_recommender = ALSRecommender(interaction_matrix, track_uniques, df_music_info)

        if content_based_recommender is not None:
            self.content_based_recommender = content_based_recommender
        else:
            self.content_based_recommender = KmeansContentBasedRecommender(id_to_cluster)

        self.df_music_info = df_music_info
        self.df_users = df_users
        self.id_to_cluster = id_to_cluster
        # NOTE(review): alpha multiplies every score by the same factor, so for any
        # positive value it does not change the ranking — confirm the intended role.
        self.alpha = alpha
        self.recommendations = None  # List of tuples (track_id, energy, similarity, has been recommended)
        self._user_ids = None  # Lazily cached unique user ids, in order of first appearance

    def _require_recommendations(self):
        """Return the stored recommendations or raise if none were computed yet."""
        if self.recommendations is None:
            raise ValueError("No recommendations available. Please call make_recommendations first.")
        return self.recommendations

    def _user_id_for_index(self, user_index):
        """Translate a user position into its user_id using this instance's df_users."""
        if self._user_ids is None:
            # unique() lists ids in order of first appearance; computed once because
            # it scans the whole listening history.
            self._user_ids = self.df_users['user_id'].unique()
        return self._user_ids[user_index]

    def make_recommendations(self, user_index, n=100):
        """Compute, store and return the hybrid recommendations for a user.

        Args:
            user_index: Position of the user among the unique user ids of ``df_users``.
            n: Number of collaborative recommendations to request and re-rank.

        Returns:
            List of ``(track_id, energy, hybrid_score, has_been_recommended)``
            tuples sorted by descending hybrid score.
        """
        # Use the instance's own listening history rather than a notebook global.
        user_id = self._user_id_for_index(user_index)
        user_history = self.df_users[self.df_users['user_id'] == user_id]['track_id']
        collaborative_recommendations = self.collaborative_als_recommender.make_recommendations(user_index, n)
        cluster_preferences = self.content_based_recommender.make_cluster_recommendation(user_history)

        # Penalise each collaborative score by how present the track's cluster is
        # in the user's listening history.
        reranked = []
        for track_id, energy, similarity, has_been_recommended in collaborative_recommendations:
            song_cluster = self.id_to_cluster[track_id]
            if song_cluster in cluster_preferences.index:
                cluster_presence = cluster_preferences[song_cluster]
            else:
                cluster_presence = self.DEFAULT_CLUSTER_PRESENCE

            reranked.append((track_id, energy, similarity * cluster_presence * self.alpha, has_been_recommended))

        reranked.sort(key=lambda recommendation: recommendation[2], reverse=True)
        self.recommendations = reranked
        return self.recommendations

    def make_recommendations_only_collaborative(self, user_index, n=100):
        """Store the plain collaborative recommendations, without re-ranking."""
        self.recommendations = self.collaborative_als_recommender.make_recommendations(user_index, n)

    def recommend_song(self, energy, energy_margin=0.05):
        """Pick the next not-yet-recommended track for a target energy.

        Works on this recommender's own list, so the hybrid ranking is honoured:
        the first unused track whose energy lies within ``energy_margin`` of
        ``energy`` wins; otherwise the unused track with the closest energy.

        Returns:
            Tuple ``(track_id, track_energy)``.

        Raises:
            ValueError: If no recommendations were computed yet, or all of
                them have already been handed out.
        """
        recommendations = self._require_recommendations()

        chosen_index = None
        best_distance = float('inf')

        for i, (_, track_energy, _, has_been_recommended) in enumerate(recommendations):
            if has_been_recommended:
                continue

            distance = abs(track_energy - energy)

            if distance <= energy_margin:
                chosen_index = i
                break

            if distance < best_distance:
                chosen_index = i
                best_distance = distance

        if chosen_index is None:
            raise ValueError("All recommendations have already been recommended")

        track_id, track_energy, similarity, _ = recommendations[chosen_index]
        recommendations[chosen_index] = (track_id, track_energy, similarity, True)
        return (track_id, track_energy)

    def get_recommendations(self):
        """Return the stored recommendation tuples.

        Raises:
            ValueError: If no recommendations were computed yet.
        """
        return self._require_recommendations()

    def get_recommendations_ids(self):
        """Return the track ids of the stored recommendations, in ranking order.

        Raises:
            ValueError: If no recommendations were computed yet.
        """
        return [track_id for track_id, _, _, _ in self._require_recommendations()]

    def get_recommendations_info(self):
        """Return the df_music_info rows of the recommended tracks, in ranking order.

        Raises:
            ValueError: If no recommendations were computed yet.
        """
        track_ids = [track_id for track_id, _, _, _ in self._require_recommendations()]
        df_ordered = self.df_music_info.set_index('track_id').loc[track_ids].reset_index()
        return df_ordered

### Example

In [36]:
user_index = 0 # User (row of the interaction matrix) for which recommendations will be made
# Disabled helper: list the songs this user has already listened to.
# user_listened_songs = df_users[df_users['user_id'] == user_uniques[user_index]].track_id
# df_music_info[df_music_info['track_id'].isin(user_listened_songs)]

In [37]:
# One-off training and export of the ALS model. Kept disabled because the
# pickled model is loaded from 'models/als_model.pkl' in the next cell.
# als_model = AlternatingLeastSquares(factors=50, regularization=0.1, iterations=20, num_threads=-1, random_state=RANDOM_STATE)
# als_model.fit(interaction_matrix_user_item)

# with open('models/als_model.pkl', 'wb') as f:
#     pickle.dump(als_model, f)

In [38]:
# Load the previously trained ALS model.
# NOTE: unpickling can execute arbitrary code — only load model files from a trusted source.
with open('models/als_model.pkl', 'rb') as f:
    als_model = pickle.load(f)

In [39]:
# Passing the loaded model makes the constructor skip fitting a new one.
als_recommender = ALSRecommender(interaction_matrix_user_item, track_uniques, df_music_info, als_model=als_model)

In [40]:
# Because als_recommender is supplied, the constructor does not use the
# interaction matrix / track_uniques arguments to build a new one.
hybrid_recommender = HybridRecommender(interaction_matrix_user_item, track_uniques, df_music_info, df_users, id_to_cluster, als_recommender=als_recommender)
# Baseline: collaborative ranking only, for comparison with the hybrid ranking below.
hybrid_recommender.make_recommendations_only_collaborative(user_index, n=10)
hybrid_recommender.get_recommendations_info().head(20)

Unnamed: 0,track_id,name,artist,energy
0,TROWQGL128E0784BAA,Taylor,Jack Johnson,0.725
1,TRXMJMD128F146AE5B,Don't Worry Be Happy,Bobby McFerrin,0.205
2,TRPTIGH128C71968FA,Love Comes Tumbling,U2,0.413
3,TRCQYJE128E078FA49,The News,Jack Johnson,0.0818
4,TRWAUCC128F428E6FD,Staple It Together,Jack Johnson,0.794
5,TRFWGOJ128E0780C8B,In My Place,Coldplay,0.566
6,TRCIVWB128F92FDBB3,Strawberry Swing,Coldplay,0.669
7,TRZCIWG128F4248B25,Lost!,Coldplay,0.779
8,TRBVOHY128F92E6A11,There Must Be A Better World Somewhere,B.B. King,0.722
9,TRMRHPL128F42890D1,Not Falling Apart,Maroon 5,0.714


In [41]:
# Hybrid ranking: the same request, re-ordered by the user's cluster preferences.
hybrid_recommender.make_recommendations(user_index, n=10)
hybrid_recommender.get_recommendations_info().head(20)

Unnamed: 0,track_id,name,artist,energy
0,TROWQGL128E0784BAA,Taylor,Jack Johnson,0.725
1,TRXMJMD128F146AE5B,Don't Worry Be Happy,Bobby McFerrin,0.205
2,TRPTIGH128C71968FA,Love Comes Tumbling,U2,0.413
3,TRWAUCC128F428E6FD,Staple It Together,Jack Johnson,0.794
4,TRBVOHY128F92E6A11,There Must Be A Better World Somewhere,B.B. King,0.722
5,TRCQYJE128E078FA49,The News,Jack Johnson,0.0818
6,TRFWGOJ128E0780C8B,In My Place,Coldplay,0.566
7,TRMRHPL128F42890D1,Not Falling Apart,Maroon 5,0.714
8,TRCIVWB128F92FDBB3,Strawberry Swing,Coldplay,0.669
9,TRZCIWG128F4248B25,Lost!,Coldplay,0.779
