In [21]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [22]:
user_data = pd.read_table("user_data.txt", header=None)
user_data.columns = ['user_id', 'song_id', 'listens']

song_data = pd.read_csv("song_data.csv")

songs_data = pd.merge(user_data, song_data.drop_duplicates(['song_id']), on="song_id", how="left")
songs_data.columns = ['user_id', 'song_id', 'listens', 'title', 'album', 'artist_name', 'year']

#print(list(songs_data.columns))
songs_data.head()


Unnamed: 0,user_id,song_id,listens,title,album,artist_name,year
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Thicker Than Water,Jack Johnson,0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Graduation,Kanye West,2007
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,In Between Dreams,Jack Johnson,2005
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999


In [23]:
song_sample = songs_data.sample(n=20000)

song_grouped = song_sample.groupby(['title']).agg({'listens': 'count'}).reset_index()
grouped_sum = song_grouped['listens'].sum()
song_grouped['percentage']  = song_grouped['listens'].div(grouped_sum)*100
song_grouped.sort_values(['listens', 'title'], ascending = [0,1])

song_grouped.head()

Unnamed: 0,title,listens,percentage
0,#40,5,0.025
1,& Down,4,0.02
2,'Round Midnight,3,0.015
3,'Til We Die (Album Version),3,0.015
4,'Till I Collapse,7,0.035


In [24]:
users = song_sample['user_id'].unique()
print(len(users))
songs = song_sample['title'].unique()
print(len(songs))

15362
7076


In [98]:
class CollaborativeBasedRecommender():
    def _init_(self):
        self.train_data = None
        self.user_id = None
        self.song_id = None
        self.cooc_matrix = None
        self.artists = None
        self.song_artist_dict = None
      
    # retrive song preferences for a given user
    def get_user_songs(self, user):
        data = self.train_data[self.train_data[self.user_id] == user]
        songs = list(data[self.song_id].unique())
        return songs
    
    # retrieve users who like a given song
    def get_song_users(self, song):
        data = self.train_data[self.train_data[self.song_id] == song]
        users = set(data[self.user_id].unique())
        return users
    
    def create_cooc_matrix(self, user_songs, all_songs):
        # get users for each song from user's songs
        songs_users = []
        for i in range(0, len(user_songs)):
            songs_users.append(self.get_song_users(user_songs[i]))
        
        # initialize cooccurence matrix
        user_song_len = len(user_songs)
        all_song_len = len(all_songs)
        cooccurence_matrix = np.matrix(np.zeros(shape=(user_song_len, all_song_len)), float)
        
        for i in range(all_song_len):
            #find unique users of song i
            songs_data = self.train_data[self.train_data[self.song_id] == all_songs[i]]
            users = set(songs_data[self.user_id].unique())
            
            for j in range(user_song_len):       
                    
                #find unique users of song j
                users_2 = songs_users[j]
                    
                #find intersection of listeners of both songs
                users_inter = users.intersection(users_2)
                
                #compute cooccurence_matrix[i,j] if intersection exists
                if len(users_inter) != 0:
                    #compute ratio users who listen to both songs to those who listen to either
                    cooccurence_matrix[j,i] = float(len(users_inter))/float(len(users.union(users_2)))
                else:
                    cooccurence_matrix[j,i] = 0
                    
        self.cooc_matrix = cooccurence_matrix
        return cooccurence_matrix
    
    def get_recommendations(self, user, cooccurence_matrix, all_songs, user_songs):
        print("Non-zero values in cooccurence_matrix :%d" % np.count_nonzero(cooccurence_matrix))
        
        avgs = cooccurence_matrix.sum(axis=0)/float(cooccurence_matrix.shape[0])
        user_scores = np.array(avgs)[0].tolist()
        
        sort_index = sorted(((a,b) for b,a in enumerate(list(user_scores))), reverse=True)
        #HERE
        df = pd.DataFrame(columns=['user_id', 'song', 'artist', 'score'])
        
        # recommend top 10 songs
        rank = 1 
        for i in range(len(sort_index)):
            if ~np.isnan(sort_index[i][0]) and all_songs[sort_index[i][1]] not in user_songs and rank <= 10:
                #HERE
                df.loc[len(df)]=[user, all_songs[sort_index[i][1]], self.song_artist_dict[all_songs[sort_index[i][1]]], sort_index[i][0]]
                rank += 1
        
        #if no recommendations
        if df.shape[0] == 0:
            print("The user has no recommended songs.")
            return -1
        else:
            return df
        
    def create_model(self, user_id, song_id, train_data, artists):
        self.user_id = user_id
        self.song_id = song_id
        self.train_data = train_data
        self.artists = artists
    
    #print recommendation based on user preferences
    def make_recommendation(self, user):
        #retrieve songs of input user
        user_songs = self.get_user_songs(user)     
        print("Number of songs for user: %d" % len(user_songs))
        
        #retrieve unique songs from training data
        #all_songs = list(self.train_data[self.song_id].unique())
        all_songs = list(self.train_data[self.song_id])
        print("Number of songs in the training data: %d" % len(all_songs))
        artists_test = list(self.train_data[self.artists])
        print("Number of artists in training data: %d" % len(artists_test))
        
        #create song artist dictionary
        song_artist_dict = {}
        for i in range(len(all_songs)):
            song_artist_dict[all_songs[i]] = artists_test[i]
        
        self.song_artist_dict = song_artist_dict
        all_songs = list(song_artist_dict.keys())
        
        cooccurence_matrix = self.create_cooc_matrix(user_songs, all_songs)
        self.cooc_matrix = cooccurence_matrix
        
        return self.get_recommendations(user, cooccurence_matrix, all_songs, user_songs)
    
    def get_similar_songs(self, songs):
        all_songs = list(self.train_data[self.song].unique())
        print("Number of songs in the training data: %d" % len(all_songs))
        
        cooccurence_matrix = self.create_cooc_matrix(songs, all_songs)
        
        user = ""
        df = self.get_recommendations(user, cooccurence_matrix, all_songs, user_songs)
         
        return df
        

In [99]:
train_data, test_data = train_test_split(song_sample, test_size = 0.20, random_state=0)

model = CollaborativeBasedRecommender()
model.create_model('user_id', 'title', train_data, 'artist_name')

In [122]:
#Print the songs for the user in training data
user_id = users[23]
user_songs = model.get_user_songs(user_id)
#
print("------------------------------------------------------------------------------------")
print("Training data songs for the user with user id: %s:" % user_id)
print("------------------------------------------------------------------------------------")

for song in user_songs:
    print(song)

print("----------------------------------------------------------------------")
print("Recommended Songs:")
print("----------------------------------------------------------------------")

#Recommend songs for the user using personalized model
model.make_recommendation(user_id)


------------------------------------------------------------------------------------
Training data songs for the user with user id: de0944de8b916ccf800a7b8ce7803a46166c5c4e:
------------------------------------------------------------------------------------
Apologize
Give It Back
----------------------------------------------------------------------
Recommended Songs:
----------------------------------------------------------------------
Number of songs for user: 2
Number of songs in the training data: 16000
Number of artists in training data: 16000
Non-zero values in cooccurence_matrix :11


Unnamed: 0,user_id,song,artist,score
0,de0944de8b916ccf800a7b8ce7803a46166c5c4e,Kicking And Screaming,Miley Cyrus,0.125
1,de0944de8b916ccf800a7b8ce7803a46166c5c4e,Walking Away,Lifehouse,0.083333
2,de0944de8b916ccf800a7b8ce7803a46166c5c4e,Learning To Fall,Boys Like Girls,0.071429
3,de0944de8b916ccf800a7b8ce7803a46166c5c4e,Bold As Love,John Mayer,0.071429
4,de0944de8b916ccf800a7b8ce7803a46166c5c4e,Situations,Escape The Fate,0.0625
5,de0944de8b916ccf800a7b8ce7803a46166c5c4e,Arizona,Kings Of Leon,0.0625
6,de0944de8b916ccf800a7b8ce7803a46166c5c4e,Alejandro,Lady GaGa,0.015152
7,de0944de8b916ccf800a7b8ce7803a46166c5c4e,Adir Adirim (Featuring Victoria Hanna),Balkan Beat Box,0.0
8,de0944de8b916ccf800a7b8ce7803a46166c5c4e,Lewis Takes Off His Shirt,Owen Pallett,0.0
9,de0944de8b916ccf800a7b8ce7803a46166c5c4e,Ella y yo (featuring Don Omar),Aventura,0.0
