In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pymongo
from pymongo import MongoClient
import re
from collections import defaultdict

from surprise import SVD, NormalPredictor
from surprise import AlgoBase
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
from surprise import accuracy

In [None]:
# Connect to MongoDB using the client's default connection settings.
mongo = MongoClient()

# Database that holds this project's collections.
db = mongo['project-05']

# Collection iterated below; its documents are read for 'Username' and 'Tracks'.
db_user = db['Usernames']

In [None]:
# Empty frame that fixes the column layout of the listening data: one row per (user, track) with its rating.
listens_df = pd.DataFrame(columns=['Username', 'Track ID', 'Playcount'])

In [None]:
# Build the (user, track, rating) table from up to 500 user documents.
counter = 0

# Rows are collected in a plain list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
listen_rows = []

# Characters that are stripped from the "<field0>_<field1>" track identifier.
non_id_chars = re.compile('[^A-Za-z0-9_]')

cursor = db_user.find(batch_size=50, limit=500, no_cursor_timeout=True)

# The cursor is opened with no_cursor_timeout=True, so close it explicitly
# even when the loop raises -- hence try/finally.
try:
    for counter, document in enumerate(cursor, start=1):
        tracks = document['Tracks']
        if not tracks:
            # Users without any tracks are printed and removed from the collection.
            print(document)
            db_user.delete_one({'_id': document['_id']})
        else:
            username = document['Username']
            play_counts = [int(track[2]) for track in tracks]

            # Playcounts used as ratings, weighted by the most played track (highest value possible is 1).
            # max() gives the same result as taking the first track when the list is sorted by
            # playcount, and keeps the ratio <= 1 when it is not.
            max_plays = max(play_counts)

            for track, plays in zip(tracks, play_counts):
                if plays <= 0:
                    # log(0) is -inf and a zero maximum would divide by zero; such rows carry no signal.
                    continue
                raw_id = (track[0] + '_' + track[1]).replace(' ', '_')
                trackid = non_id_chars.sub('', raw_id)
                # Log of the relative playcount, shifted so the most played track scores 10.
                weight = np.log(plays / max_plays) + 10
                listen_rows.append((username, trackid, weight))

        print(counter)
finally:
    cursor.close()

# Rebuilding from scratch (instead of appending) keeps the cell idempotent on re-run.
listens_df = pd.DataFrame(listen_rows, columns=list(listens_df.columns))

In [None]:
# Quick sanity check of the assembled listening table.
listens_df.describe()

In [None]:
# Ratings are log(plays / max_plays) + 10, so the top of the scale is 10.
# NOTE(review): ratios below e**-10 give negative values, i.e. below this scale's lower bound -- confirm that is acceptable.
reader = Reader(rating_scale=(0, 10))

In [None]:
# Columns are passed in (user, item, rating) order.
data = Dataset.load_from_df(listens_df[["Username", "Track ID", "Playcount"]], reader)

# Dumb Model

Every good model needs a dumb baseline for comparison. To see how well our later models perform, we set up a dumb algorithm that predicts the mean of all ratings in the training set for every user–item pair. Its prediction is therefore the same no matter which user or track we ask about.

In [None]:
class DumbAlgo(AlgoBase):
    """Baseline recommender that predicts the same value for every user/item pair.

    The value is the mean of all ratings in the trainset passed to ``fit``.
    """

    def __init__(self):
        # The base class must be initialised before anything else.
        super().__init__()

    def fit(self, trainset):
        """Store the trainset via the base class, record its mean rating and return ``self``."""
        # The base method has to run first here as well.
        super().fit(trainset)

        # all_ratings() yields (userid, itemid, rating) triples; only the rating is needed.
        ratings = [rating for _uid, _iid, rating in self.trainset.all_ratings()]
        self.the_mean = np.mean(ratings)

        return self

    def estimate(self, u, i):
        """Return the stored mean rating; ``u`` and ``i`` are ignored."""
        return self.the_mean

In [None]:
# Hold out 25% of the ratings, fit the mean-rating baseline on the rest
# and predict the held-out part.
trainset, testset = train_test_split(data, test_size=.25)

dumb_algo = DumbAlgo()

# DumbAlgo.fit returns the fitted instance, so fit and test can be chained.
dumb_predictions = dumb_algo.fit(trainset).test(testset)

In [None]:
# RMSE of the mean-rating baseline on the held-out 25%.
accuracy.rmse(dumb_predictions)

In [None]:
# 5-fold cross-validated RMSE of the baseline; kept in dumb_cv for the error comparison plot.
dumb_cv = cross_validate(dumb_algo, data, measures=['RMSE'], cv=5, verbose=True)

# Normal Predictor

Another dumb baseline: this one assumes the ratings follow a normal distribution and makes its predictions based on that distribution.

In [None]:
# Second baseline: surprise's NormalPredictor.
norm_algo = NormalPredictor()

# Fresh 75/25 split; this rebinds trainset/testset from the previous section.
# NOTE(review): no random_state is passed, so the split differs between runs.
trainset, testset = train_test_split(data, test_size=.25)

norm_algo.fit(trainset)
norm_predictions = norm_algo.test(testset)

In [None]:
# RMSE of the NormalPredictor baseline on the held-out 25%.
accuracy.rmse(norm_predictions)

In [None]:
# 5-fold cross-validated RMSE of NormalPredictor.
# NOTE(review): unlike dumb_cv / svd_cv the result is not stored, so it is only displayed here.
cross_validate(norm_algo, data, measures=['RMSE'], cv=5, verbose=True)

# SVD Algorithm

Below we tune the SVD hyperparameters with a grid search, then train and evaluate the SVD model that is later used to make recommendations.

In [None]:
# Hyperparameter grid for SVD: 3 x 3 x 3 = 27 combinations, each scored with 5-fold CV.
param_grid = {
    "n_epochs": [10, 15, 20],
    "lr_all": [0.0025, 0.005, 0.0075],
    "reg_all": [0.2, 0.5, 0.8],
}

gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=5, verbose=True)
gs.fit(data)

# Report the best score and the parameters that achieved it, first for RMSE, then for MAE.
for measure in ("rmse", "mae"):
    print(gs.best_score[measure])
    print(gs.best_params[measure])

In [None]:
# SVD with fixed hyperparameters taken from the grid above.
# NOTE(review): these values are hard-coded -- confirm they still match gs.best_params["rmse"].
algo = SVD(n_epochs=20, lr_all=0.0025, reg_all=0.2, verbose=True)

# Train/test split where the test set is 25% of the ratings (rebinds trainset/testset again).
trainset, testset = train_test_split(data, test_size=.25)

# Train the algorithm on the trainset, and predict ratings for the testset.
# `predictions` therefore only covers user/track pairs that already have a known rating.
algo.fit(trainset)
predictions = algo.test(testset)

In [None]:
# RMSE of the SVD model on the held-out 25%.
accuracy.rmse(predictions)

In [None]:
# 5-fold cross-validated RMSE of SVD; kept in svd_cv for the error comparison plot.
svd_cv = cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)

# Plotting Error

In [None]:
# Summarise the cross-validation results for the comparison bar chart.
models = ['Dummy', 'SVD']

# Per-fold test RMSE of each model.
dumb_array = dumb_cv['test_rmse']
svd_array = svd_cv['test_rmse']

# Bar heights: mean RMSE across the folds.
dumb_mean, svd_mean = np.mean(dumb_array), np.mean(svd_array)
rmse = [dumb_mean, svd_mean]

# Error magnitudes: three standard deviations of the per-fold RMSE.
dumb_std, svd_std = 3 * np.std(dumb_array), 3 * np.std(svd_array)
error = [dumb_std, svd_std]

# One bar position per model.
x_pos = np.arange(len(models))

In [None]:
# Bar chart of the mean cross-validated RMSE per model.
fig, ax = plt.subplots(figsize=(15,10))

# yerr draws the +/- 3-standard-deviation whiskers held in `error`; without it
# `capsize` has no effect and the saved "with_error_bars" figure contains no error bars.
ax.bar(x_pos, rmse, yerr=error, align='center', alpha=0.8, color='#657e93', capsize=15)
ax.set_ylabel('Root Mean Square Error (RMSE)', fontsize=14)
ax.set_xticks(x_pos)
ax.set_xticklabels(models, fontsize=14)
ax.set_title('Model Error Comparison', fontsize=18)

# Save the figure and show
plt.tight_layout()
plt.savefig('bar_plot_with_error_bars.png')
plt.show()

# Example of Recommendations

In [None]:
# Example user: show the first 15 rows of this user's listening data for comparison
# with the recommendations produced below.

listens_df[listens_df['Username'] == 'aaron250401'].head(15)

In [None]:
# To recommend new tracks we score an "anti-set": user/item pairings without existing ratings.
# The model used for that is trained on every available rating instead of a train split,
# with the same hard-coded hyperparameters as the evaluated SVD model.

pred_algo = SVD(n_epochs=20, lr_all=0.0025, reg_all=0.2, verbose=True)

pred_trainset = data.build_full_trainset()

pred_algo.fit(pred_trainset)

In [None]:
# Anti-testset: the user/item pairings that have no rating in the full trainset.
# NOTE(review): presumably close to (users x tracks) in size, so this cell can be slow and memory hungry.
pred_testset = pred_trainset.build_anti_testset()
predictions_test = pred_algo.test(pred_testset)

In [None]:
def get_top_n(user, predictions, n=10):
    '''Return the top-N recommendations for one user from a set of predictions.

    Args:
        user: id of the user to recommend for, compared against each
            prediction's ``uid``.
        predictions: iterable of 5-item records ``(uid, iid, true_r, est, details)``.
        n: maximum number of recommendations to return.

    Returns:
        A list of ``(iid, est)`` tuples sorted by ``est`` in descending order
        (ties keep their input order), truncated to ``n`` entries. The list is
        empty when ``user`` has no predictions.
    '''

    # Keep only the requested user's predictions. Grouping and sorting every
    # user's list just to return one of them wastes time and memory on a
    # large prediction set.
    user_ratings = [
        (iid, est)
        for uid, iid, _true_r, est, _ in predictions
        if uid == user
    ]

    # sorted() is stable, so equal estimates stay in input order.
    return sorted(user_ratings, key=lambda pair: pair[1], reverse=True)[:n]

In [None]:
# Rank the anti-testset predictions, i.e. tracks this user has NOT listened to yet.
# `predictions` holds the held-out split of already-known listens, so ranking it
# would only echo tracks the user has already played.
get_top_n('aaron250401', predictions_test, n=10)