# Objective: To Build a song Recommender

# Dataset used: Million Songs Dataset 
    
Source: http://labrosa.ee.columbia.edu/millionsong/ 

# Q1. Load Required Libraries and music data

In [1]:
%matplotlib inline

import pandas
from sklearn.model_selection import train_test_split
import numpy as np
import time
import joblib

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Load Music data

In [13]:
#Read userid-songid-listen_count triplets
#This step might take time to download data from external sources
triplets_file = 'dataset/kaggle_visible_evaluation_triplets.txt'
songs_metadata_file = 'dataset/kaggle_songs.txt'

song_df_1 = pandas.read_table(triplets_file,header=None)
song_df_1.columns = ['user_id', 'song_id', 'listen_count']

#Read song  metadata
song_df_2 =  pandas.read_csv(songs_metadata_file,header=None)
song_df_2.columns = ['song_id']


#Merge the two dataframes above to create input dataframe for recommender systems
song_df = pandas.merge(song_df_1, song_df_2.drop_duplicates(['song_id']), on="song_id", how="left") 

In [14]:
print(song_df_1.head())
len(song_df_1)

                                    user_id             song_id  listen_count
0  fd50c4007b68a3737fe052d5a4f78ce8aa117f3d  SOBONKR12A58A7A7E0             1
1  fd50c4007b68a3737fe052d5a4f78ce8aa117f3d  SOEGIYH12A6D4FC0E3             1
2  fd50c4007b68a3737fe052d5a4f78ce8aa117f3d  SOFLJQZ12A6D4FADA6             1
3  fd50c4007b68a3737fe052d5a4f78ce8aa117f3d  SOHTKMO12AB01843B0             1
4  fd50c4007b68a3737fe052d5a4f78ce8aa117f3d  SODQZCY12A6D4F9D11             1


1450933

In [15]:
song_df_2.head()
len(song_df_2)
print(song_df_2.head(5))
len(song_df)
print(song_df.head(5))

Unnamed: 0,song_id
0,SOAAADD12AB018A9DD 1
1,SOAAADE12A6D4F80CC 2
2,SOAAADF12A8C13DF62 3
3,SOAAADZ12A8C1334FB 4
4,SOAAAFI12A6D4F9C66 5


386213

                song_id
0  SOAAADD12AB018A9DD 1
1  SOAAADE12A6D4F80CC 2
2  SOAAADF12A8C13DF62 3
3  SOAAADZ12A8C1334FB 4
4  SOAAAFI12A6D4F9C66 5


1450933

                                    user_id             song_id  listen_count
0  fd50c4007b68a3737fe052d5a4f78ce8aa117f3d  SOBONKR12A58A7A7E0             1
1  fd50c4007b68a3737fe052d5a4f78ce8aa117f3d  SOEGIYH12A6D4FC0E3             1
2  fd50c4007b68a3737fe052d5a4f78ce8aa117f3d  SOFLJQZ12A6D4FADA6             1
3  fd50c4007b68a3737fe052d5a4f78ce8aa117f3d  SOHTKMO12AB01843B0             1
4  fd50c4007b68a3737fe052d5a4f78ce8aa117f3d  SODQZCY12A6D4F9D11             1


# Q2. Explore data

Music data shows how many times a user listened to a song, as well as the details of the song.

In [16]:
song_df.head()

Unnamed: 0,user_id,song_id,listen_count
0,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOBONKR12A58A7A7E0,1
1,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOEGIYH12A6D4FC0E3,1
2,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOFLJQZ12A6D4FADA6,1
3,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOHTKMO12AB01843B0,1
4,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SODQZCY12A6D4F9D11,1


# Length of the dataset

In [17]:
len(song_df)
song_df = song_df.head(10000)
len(song_df)

1450933

10000

# Q3. Create a subset of the dataset

a. Show the most popular songs in the dataset

b. Count number of unique users in the dataset

c. Count the number of unique songs in the dataset

In [None]:
song_df = song_df.head(10000)
#Merge song title and artist_name columns to make a merged column
song_df['song'] = song_df['title'].map(str) + " - " + song_df['artist_name']

In [None]:
song_grouped = song_df.groupby(['song']).agg({'listen_count': 'count'}).reset_index()
grouped_sum = song_grouped['listen_count'].sum()
song_grouped['percentage']  = song_grouped['listen_count'].div(grouped_sum)*100
song_grouped.sort_values(['listen_count', 'song'], ascending = [0,1])

# Count number of unique users in the dataset

In [21]:
users = song_df['user_id'].unique()
len(users)

753

# Count the number of unique songs in the dataset

In [None]:
###Fill in the code here
songs = song_df['song'].unique()
len(songs)

# Q4. Create a song recommender

In [23]:
train_data, test_data = train_test_split(song_df, test_size = 0.20, random_state=0)
print(train_data.head(5))

                                       user_id             song_id  \
7389  d3c75a5a579f29a3a853b3a4ba76d3a2e5243655  SOQTHZW12A8C1400F8   
9275  3071be7411e636fcda6c99277a5b51c45a7c4866  SODQPTQ12AF72A64BC   
2995  1fa6a4add6eacc3ce1aec44fc37037db2b09bf82  SOUCBEB12A6310E1F9   
5316  c8db3788ca8f60d92abf827ad59f424f05897cfc  SOBWFXM12AB0182808   
356   6530c4fc41b9110de5d39fe0355fa103c66385f0  SOSIZFO12A58A79934   

      listen_count  
7389             2  
9275             6  
2995             3  
5316            14  
356              2  


# Q5. Build Popularity Recommender model. (Non-personalised)

a. Count of user_id for each unique song as recommendation score 

b. Sort the songs on recommendation score 

c. Get the top 5 recommendations

In [24]:
#Count of user_id for each unique song as recommendation score 
train_data_grouped = train_data.groupby('song_id').agg({'user_id': 'count'}).reset_index()
train_data_grouped.rename(columns = {'user_id': 'score'},inplace=True)
train_data_grouped.head()

Unnamed: 0,song_id,score
0,SOAAFYH12A8C13717A,1
1,SOAAGFH12A8C13D072,1
2,SOAAGRT12AF72A2A6C,1
3,SOAALJB12A8C13C4B6,1
4,SOAAROC12A6D4FA420,1


In [25]:
#Sort the songs on recommendation score 
train_data_sort = train_data_grouped.sort_values(['score', 'song_id'], ascending = [0,1]) 
      
#Generate a recommendation rank based upon score 
train_data_sort['Rank'] = train_data_sort['score'].rank(ascending=0, method='first') 
          
#Get the top 5 recommendations 
popularity_recommendations = train_data_sort.head(5) 
popularity_recommendations 

Unnamed: 0,song_id,score,Rank
378,SOBONKR12A58A7A7E0,34,1.0
209,SOAUWYT12A81C206F1,31,2.0
1399,SOFRQTD12A81C233C0,30,3.0
223,SOAXGDH12A8C13F8A1,24,4.0
4526,SOSXLTC12AF72A7F54,23,5.0


# Q6. Use popularity based recommender model to make predictions and find recommendations for random list of users with inferences

In [26]:
# Use popularity based recommender model to make predictions
def recommend(user_id):     
    user_recommendations = popularity_recommendations 
          
    #Add user_id column for which the recommendations are being generated 
    user_recommendations['userID'] = user_id 
      
    #Bring user_id column to the front 
    cols = user_recommendations.columns.tolist() 
    cols = cols[-1:] + cols[:-1] 
    user_recommendations = user_recommendations[cols] 
          
    return user_recommendations 

In [27]:
find_recom = [15,121,53]   # This list is user choice.
for i in find_recom:
    print("Here is the recommendation for the userId: %d\n" %(i))
    print(recommend(i))    
    print("\n") 

Here is the recommendation for the userId: 15

      userID             song_id  score  Rank
378       15  SOBONKR12A58A7A7E0     34   1.0
209       15  SOAUWYT12A81C206F1     31   2.0
1399      15  SOFRQTD12A81C233C0     30   3.0
223       15  SOAXGDH12A8C13F8A1     24   4.0
4526      15  SOSXLTC12AF72A7F54     23   5.0


Here is the recommendation for the userId: 121

      userID             song_id  score  Rank
378      121  SOBONKR12A58A7A7E0     34   1.0
209      121  SOAUWYT12A81C206F1     31   2.0
1399     121  SOFRQTD12A81C233C0     30   3.0
223      121  SOAXGDH12A8C13F8A1     24   4.0
4526     121  SOSXLTC12AF72A7F54     23   5.0


Here is the recommendation for the userId: 53

      userID             song_id  score  Rank
378       53  SOBONKR12A58A7A7E0     34   1.0
209       53  SOAUWYT12A81C206F1     31   2.0
1399      53  SOFRQTD12A81C233C0     30   3.0
223       53  SOAXGDH12A8C13F8A1     24   4.0
4526      53  SOSXLTC12AF72A7F54     23   5.0




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_recommendations['userID'] = user_id
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_recommendations['userID'] = user_id
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_recommendations['userID'] = user_id


Since this is a popularity-based recommender model, recommendations remain the same for all users

We predict the products based on the popularity. It is not personalized to particular user