# Building a song recommender

In [None]:
%matplotlib inline

import pandas
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Try the current
# location first and fall back so older installations keep working.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
import numpy as np
import time
# The copy of joblib vendored under sklearn.externals was removed in
# scikit-learn 0.23; prefer the standalone package and fall back to the
# vendored one on older installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# Project-local modules providing the recommender and evaluation classes used below.
import Recommenders as Recommenders
import Evaluation as Evaluation

# Load music data

In [None]:
# Kept for reference: these two commented-out lines appear to be the one-off
# export of the original GraphLab SFrame to the CSV file that is read below.
#song_sf = graphlab.SFrame('song_data.gl/')
#song_sf.save('song_data.csv', format = 'csv')
# Read the listening records into a pandas DataFrame. The path is relative, so
# 'song_data.csv' must be present in the working directory.
song_df =  pandas.read_csv('song_data.csv')

# Explore data

Music data shows how many times a user listened to a song, as well as the details of the song.

In [None]:
# Preview the first rows (head() defaults to five) to see which columns are available.
song_df.head()

## Length of the dataset

In [None]:
# Number of rows currently held in song_df.
len(song_df)

## Create a subset of the dataset

In [None]:
# Keep only the first 10,000 rows (presumably to keep the later cells fast).
# song_df is rebound, so the full dataset is gone until the CSV is reloaded.
song_df = song_df.head(10000)

## Showing the most popular songs in the dataset

In [None]:
# One row per song; 'listen_count' becomes the number of records for that song
# (a row count, not the sum of the original listen_count values).
song_grouped = (
    song_df
    .groupby(['song'])
    .agg({'listen_count': 'count'})
    .reset_index()
)

# Express each song's record count as a percentage of all records.
grouped_sum = song_grouped['listen_count'].sum()
song_grouped['percentage'] = song_grouped['listen_count'] / grouped_sum * 100

# Display most-counted songs first; ties are ordered alphabetically by song.
song_grouped.sort_values(by=['listen_count', 'song'], ascending=[False, True])

## Count number of unique users in the dataset

In [None]:
# Distinct values of the 'user_id' column, in order of first appearance.
users = song_df['user_id'].unique()

In [None]:
# How many distinct users appear in song_df.
len(users)

## Quiz 1. Count the number of unique songs in the dataset

In [None]:
###Fill in the code here

# Create a song recommender

In [None]:
# Hold out 20% of the rows as test data; the fixed random_state makes the
# split reproducible from run to run.
train_data, test_data = train_test_split(
    song_df,
    test_size=0.20,
    random_state=0,
)

# Show the first five training rows as a quick sanity check.
print(train_data.head(n=5))

## Simple popularity-based recommender class (Can be used as a black box)

In [None]:
#Recommenders.popularity_recommender_py

### Create an instance of popularity based recommender class

In [None]:
# Instantiate the popularity recommender from the project-local Recommenders module.
pm = Recommenders.popularity_recommender_py()
# Hand it the training split together with the names of the 'user_id' and
# 'song' columns (presumably the user and item columns - confirm in Recommenders).
pm.create(train_data, 'user_id', 'song')

### Use the popularity model to make some predictions

In [None]:
# Ask the popularity model for recommendations for the first distinct user.
# The call is the cell's last expression, so the notebook displays its result.
user_id = users[0]
pm.recommend(user_id)

### Quiz 2: Use the popularity based model to make predictions for the following user id (Note the difference in recommendations from the first user id).

In [None]:
# Fourth distinct user (index 3); the quiz asks for this user's
# popularity-based recommendations.
user_id = users[3]
###Fill in the code here

## Build a song recommender with personalization

We now create an item-similarity-based collaborative filtering model that allows us to make personalized recommendations for each user.

## Class for an item similarity based personalized recommender system (Can be used as a black box)

In [None]:
#Recommenders.item_similarity_recommender_py

### Create an instance of item similarity based recommender class

In [None]:
# Instantiate the item-similarity recommender from the project-local Recommenders module.
is_model = Recommenders.item_similarity_recommender_py()
# Same inputs as the popularity model: the training split plus the names of
# the 'user_id' and 'song' columns.
is_model.create(train_data, 'user_id', 'song')

### Use the personalized model to make some song recommendations

In [None]:
#Print the songs for the user in training data
user_id = users[0]
user_items = is_model.get_user_items(user_id)
#
print("------------------------------------------------------------------------------------")
print("Training data songs for the user userid: %s:" % user_id)
print("------------------------------------------------------------------------------------")

for user_item in user_items:
    print(user_item)

print("----------------------------------------------------------------------")
print("Recommendation process going on:")
print("----------------------------------------------------------------------")

#Recommend songs for the user using personalized model
is_model.recommend(user_id)

### Quiz 3. Use the personalized model to make recommendations for the following user id. (Note the difference in recommendations from the first user id.)

In [None]:
# Fourth distinct user (index 3); the quiz asks for this user's
# personalized recommendations.
user_id = users[3]
#Fill in the code here

### We can also apply the model to find similar songs to any song in the dataset

In [None]:
# get_similar_items is given a list of song names - here a single entry,
# which appears to follow a '<title> - <artist>' format.
is_model.get_similar_items(['The Scientist - Coldplay'])

### Quiz 4. Use the personalized recommender model to get similar songs for the following song.

In [None]:
# Song to look up; the quiz asks for the songs the personalized model
# considers similar to it.
song = 'Easily (Album Version) - Red Hot Chili Peppers'
###Fill in the code here

# Quantitative comparison between the models

We now formally compare the popularity and the personalized models using precision-recall curves. 

## Class to calculate precision and recall (This can be used as a black box)

In [None]:
#Evaluation.precision_recall_calculator

## Use the above precision recall calculator class to calculate the evaluation measures

In [None]:
# Wall-clock timestamp taken before the evaluation starts.
start = time.time()

# Share of users handed to calculate_measures (0.04 - presumably a fraction,
# i.e. 4% of users; confirm against the Evaluation module).
user_sample = 0.04

# The calculator receives the test split, the training split and both models.
pr = Evaluation.precision_recall_calculator(test_data, train_data, pm, is_model)

# calculate_measures returns four lists: precision and recall for the
# popularity model (pm_*) followed by the item-similarity model (ism_*).
(
    pm_avg_precision_list,
    pm_avg_recall_list,
    ism_avg_precision_list,
    ism_avg_recall_list,
) = pr.calculate_measures(user_sample)

# Report the elapsed time in seconds.
end = time.time()
print(end - start)

## Code to plot precision recall curve

In [None]:
import pylab as pl

#Method to generate precision and recall curve
def plot_precision_recall(m1_precision_list, m1_recall_list, m1_label, m2_precision_list, m2_recall_list, m2_label,
                          xlim=(0.0, 0.20), ylim=(0.0, 0.20)):
    """Plot the precision-recall curves of two models on one figure.

    Recall goes on the x axis and precision on the y axis; each model gets
    its own line, labelled in the legend.

    Parameters:
        m1_precision_list, m1_recall_list: precision and recall values of the
            first model (same length, paired by position).
        m1_label: legend label for the first model.
        m2_precision_list, m2_recall_list: precision and recall values of the
            second model.
        m2_label: legend label for the second model.
        xlim: (min, max) range of the recall axis. Defaults to (0.0, 0.20),
            the range that used to be hard-coded. Pass None to let matplotlib
            choose the range from the data.
        ylim: (min, max) range of the precision axis; same default and None
            handling as xlim.

    Returns:
        None. The figure is displayed with pl.show().
    """
    # Start from an empty figure so repeated calls do not draw on top of each other.
    pl.clf()
    pl.plot(m1_recall_list, m1_precision_list, label=m1_label)
    pl.plot(m2_recall_list, m2_precision_list, label=m2_label)
    pl.xlabel('Recall')
    pl.ylabel('Precision')
    # Fixed limits would clip curves that rise above them, so they are optional.
    if ylim is not None:
        pl.ylim(ylim)
    if xlim is not None:
        pl.xlim(xlim)
    pl.title('Precision-Recall curve')
    # loc=9 is 'upper center'; anchoring that point at (0.5, -0.2) in axes
    # coordinates puts the legend below the plot, centred horizontally.
    pl.legend(loc=9, bbox_to_anchor=(0.5, -0.2))
    pl.show()


In [None]:
print("Plotting precision recall curves.")

# Model 1 is the popularity model, model 2 the item-similarity model.
plot_precision_recall(
    m1_precision_list=pm_avg_precision_list,
    m1_recall_list=pm_avg_recall_list,
    m1_label="popularity_model",
    m2_precision_list=ism_avg_precision_list,
    m2_recall_list=ism_avg_recall_list,
    m2_label="item_similarity_model",
)

In [None]:
print("Plotting precision recall curves for the whole dataset (user sample = 0.0005).")

# Replace the four result lists with previously saved ones. The .pkl files are
# read from the working directory; no cell shown in this notebook writes them,
# so they have to be supplied separately.
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code - only load files that come from a trusted source.
pm_avg_precision_list = joblib.load('pm_avg_precision_list.pkl')
pm_avg_recall_list = joblib.load('pm_avg_recall_list.pkl')
ism_avg_precision_list = joblib.load('ism_avg_precision_list.pkl')
ism_avg_recall_list = joblib.load('ism_avg_recall_list.pkl')

# Same plot as before, now drawn from the loaded lists.
print("Plotting precision recall curves.")
plot_precision_recall(pm_avg_precision_list, pm_avg_recall_list, "popularity_model",
                      ism_avg_precision_list, ism_avg_recall_list, "item_similarity_model")

The curve shows that the personalized model performs much better than the popularity model.