# Authors
 - Nwamaka Nzeocha
 - Fabian Okeke

# Recommendation System Datasets

- [MovieLens 10M data set](http://grouplens.org/datasets/movielens/10m/)
- [MovieLens 22M data set](http://grouplens.org/datasets/movielens/latest/)
- [Million song data set](http://labrosa.ee.columbia.edu/millionsong/tasteprofile)

# ======== PART 1 ========


### Adding relevant functions

In [47]:
import os
import sys
import contextlib

from math import sqrt 
from operator import add
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel


def exists(filepath):
    """Return True when ``filepath`` refers to an existing path, else False."""
    path_is_present = os.path.exists(filepath)
    return path_is_present


def parse_rating(line):
    """
    Turn one MovieLens rating line into a (user, movie, rating) tuple.

    :param str line: userId::movieId::rating::timestamp
    :return: (int userId, int movieId, float rating); the timestamp is dropped
    """
    parts = line.strip().split("::")

    user_id = int(parts[0])
    movie_id = int(parts[1])
    rating = float(parts[2])
    return (user_id, movie_id, rating)


def compute_rmse(model, data, dataCount, bias=None, biasKeyIndex=1):
    """
    Compute the RMSE (Root Mean Squared Error) of a model on a rating data set.

    :param object model: trained model exposing predictAll()
    :param RDD data: (userId, movieId, rating) tuples to evaluate against
    :param int dataCount: number of records in data; divisor of the summed squared error
    :param bias: bias that was removed from the training ratings and is added
        back to every prediction. A number is added to all predictions, a dict
        is looked up per prediction, None adds nothing.
    :param int biasKeyIndex: only used when bias is a dict; position inside the
        (userId, movieId, rating) prediction whose value is the dict key
        (1 = movieId for a per-item bias, 0 = userId for a per-user bias)
    :return: the RMSE as a float
    """
    predictions = model.predictAll(data.map(lambda x: (x[0], x[1])))  # (userId, movieId)

    if isinstance(bias, dict):
        # Look the bias up per prediction; keys without a recorded bias get no correction.
        predictions = predictions.map(
            lambda p: (p[0], p[1], p[2] + bias.get(p[biasKeyIndex], 0.0)))
    elif isinstance(bias, (int, float)):
        predictions = predictions.map(lambda p: (p[0], p[1], p[2] + bias))

    # Pair every prediction with the actual rating of the same (userId, movieId).
    predictionsAndRatings = \
        predictions.map(lambda x: ((x[0], x[1]), x[2])) \
                   .join(data.map(lambda x: ((x[0], x[1]), x[2]))) \
                   .values()

    squaredErrorSum = predictionsAndRatings.map(
        lambda x: (x[0] - x[1]) ** 2
    ).reduce(add)
    return sqrt(squaredErrorSum / float(dataCount))


def getBestTrainingParameters(training, validation, validationCount, biasDict=None, isImplicit=False,
                              rankList=(10,), lamdaList=(0.01,), iterations=5):
    """
    Train one ALS model per (rank, lambda) combination and keep the one with
    the lowest RMSE on the validation data.

    :param RDD training: (userId, itemId, rating) tuples used to fit the models
    :param RDD validation: (userId, itemId, rating) tuples used to score the models
    :param int validationCount: number of records in validation
    :param biasDict: bias handed to compute_rmse() to be added back to the predictions
    :param bool isImplicit: use ALS.trainImplicit() instead of ALS.train()
    :param rankList: latent-factor counts to try, e.g. [10, 20, 30]
    :param lamdaList: regularization parameters to try, e.g. [0.01, 0.1, 1.0]
    :param int iterations: number of ALS iterations per model
    :return: tuple (bestModel, bestRMSE, bestRank, bestLamda); the model, rank
        and lambda are None when no combination produced an RMSE below infinity
    """
    bestModel, bestRMSE, bestRank, bestLamda = None, float("inf"), None, None
    train = ALS.trainImplicit if isImplicit else ALS.train

    for rank in rankList:
        for lamda in lamdaList:
            model = train(training, rank, iterations, lamda)

            rmse = compute_rmse(model, validation, validationCount, biasDict)
            if rmse < bestRMSE:
                bestModel, bestRMSE, bestRank, bestLamda = model, rmse, rank, lamda

            print('RMSE={}: Rank={}, Lambda={}'.format(rmse, rank, lamda))

    return (bestModel, bestRMSE, bestRank, bestLamda)


def generate_recommendations(model, ratingsFile, numOfRec=5, isRecommendingSongs=False):
    """
    Use a trained ALS model (explicit/implicit) to recommend movies/songs to one user.

    The user is taken from the first record of ratingsFile. Every item of the
    full data set that does not occur in ratingsFile is scored for that user
    and the numOfRec best-scoring items are returned.

    Relies on the notebook globals sc, parse_rating, parse_song and allSongs.

    :param object model: trained model exposing predictAll()
    :param str ratingsFile: file with the ratings (movies) or play counts (songs) of a single user
    :param int numOfRec: maximum number of recommendations to return
    :param bool isRecommendingSongs: recommend songs when True, movies otherwise
    :return: list of movie titles or of song string ids, best prediction first;
        an empty list when ratingsFile holds no usable record
    """
    if isRecommendingSongs:
        # NOTE(review): parse_song assigns integer ids through module-level dicts;
        # confirm these ids match the ones the model was trained with.
        userRecords = sc.textFile(ratingsFile).map(parse_song).collect()
        allRecords = sc.textFile('train_triplets.txt').map(parse_song)
    else:
        userRecords = sc.textFile(ratingsFile) \
            .filter(lambda x: x and len(x.split('::')) == 4) \
            .map(parse_rating) \
            .collect()
        allRecords = sc.textFile('ml-10M100K/ratings.dat') \
            .filter(lambda x: x and len(x.split('::')) == 4) \
            .map(parse_rating)

    if not userRecords:
        return []

    userId = userRecords[0][0]
    usedItems = set(record[1] for record in userRecords)  # set: O(1) membership test per record

    # predictAll() expects (user, product) pairs, so pair the user with every
    # distinct item that is not already in the user's own records.
    unseenItems = allRecords.map(lambda x: x[1]) \
        .distinct() \
        .filter(lambda itemId: itemId not in usedItems) \
        .map(lambda itemId: (userId, itemId))

    # keep the numOfRec predictions with the highest predicted rating / play count
    predictions = model.predictAll(unseenItems).top(numOfRec, key=lambda x: x[2])
    recommendedIds = [prediction[1] for prediction in predictions]

    if isRecommendingSongs:
        # allSongs maps str id -> int id; invert it to get the str ids back
        songIdByInt = dict((intId, strId) for strId, intId in allSongs.items())
        return [songIdByInt[itemId] for itemId in recommendedIds if itemId in songIdByInt]

    # movies.dat lines are expected as movieId::title::genres; other lines are skipped
    titles = {}
    with open('ml-10M100K/movies.dat', 'r') as open_file:
        for line in open_file:
            fields = line.split('::')
            if len(fields) == 3:
                titles[int(fields[0])] = fields[1]

    return [titles[itemId] for itemId in recommendedIds if itemId in titles]


### Split movie dataset into 60-20-20 train-validate-test partitions

In [48]:
if (exists('ml-10M100K/train60.dat') and exists('ml-10M100K/validation20.dat') and exists('ml-10M100K/test20.dat')):
    print "Already created files: train60.dat, validation20.dat, test20.dat"    

else:
    # sort by timestamp (4th column)
    print 'sorting file...'
    !sort -t ':' -k4 ml-10M100K/ratings.dat > ml-10M100K/new_ratings.dat 
    print "sorting complete."
    
    # split into 5 parts of 2 million each: train(3 parts), validation (1 part), test (1 part)
    print "splitting file..."
    !split -l 2000000 ml-10M100K/new_ratings.dat ff
    !cat ffaa ffab ffac > ml-10M100K/train60.dat
    !mv ffad ml-10M100K/validation20.dat
    !mv ffae ml-10M100K/test20.dat
    
    # remove tmp files used to create partitions
    !rm new_ratings.dat ff*
    print "splitting complete."    
    print "Newly created files: train60.dat, validation20.dat, test20.dat"

Already created files: train60.dat, validation20.dat, test20.dat


### Load movie files

In [14]:
# Load the training split: keep non-empty lines with exactly four '::' fields
# and parse them into (userId, movieId, rating) tuples.
training = sc.textFile('ml-10M100K/train60.dat') \
         .filter(lambda x: x and len(x.split('::')) == 4) \
         .map(parse_rating)

In [15]:
# Load the validation split with the same filter and parser.
validation = sc.textFile('ml-10M100K/validation20.dat') \
         .filter(lambda x: x and len(x.split('::')) == 4) \
         .map(parse_rating)

In [16]:
# Load the test split with the same filter and parser.
test = sc.textFile('ml-10M100K/test20.dat') \
         .filter(lambda x: x and len(x.split('::')) == 4) \
         .map(parse_rating)

In [None]:
# Number of parsed records per split.
trainCount = training.count()
trainCount

In [None]:
validationCount = validation.count()
validationCount

In [None]:
testCount = test.count()
testCount

In [None]:
# Preview the first three parsed records of each split.
training.take(3)

In [None]:
validation.take(3)

In [None]:
test.take(3)

### Create the best explicit ALS model for movies, or load it if it already exists

In [33]:
chosenMovieModel, movieResults = None, None
if exists("chosenMovieModel"):
    print "Loading chosenMovieModel since it exists."
    chosenMovieModel = MatrixFactorizationModel.load(sc, "chosenMovieModel")
    print "chosenMovieModel loaded."
else:
    movieResults = getBestTrainingParameters(chosenMovieModel, validation, validationCount)
    chosenMovieModel = movieResults[0]
    chosenMovieModel.save(sc, "chosenMovieModel")
    print "chosenMovieModel created"
    print "movie best rmse:", movieResults[1]

Loading chosenMovieModel since it exists.
chosenMovieModel loaded.


### Create ratings file that contains movie ratings for one user

In [41]:
if exists('ml-10M100K/user01Ratings.dat'):
    print "User01Ratings.dat file already exists."
else:
    user01Ratings = sc.textFile('ml-10M100K/ratings.dat').filter(lambda x: x.split('::')[0] == '1') # userId == 1
    user01Ratings.saveAsTextFile('ml-10M100K/user01Ratings.dat')

User01Ratings.dat file already exists.


In [None]:
# Preview up to 30 raw rating lines whose first '::' field is '1'.
sc.textFile('ml-10M100K/ratings.dat').filter(lambda x: x.split('::')[0] == '1').take(30) # userId == 1

### Generate movie recommendations for a single user

In [None]:
# Recommend movies for the user whose ratings were saved to user01Ratings.dat
# (default number of recommendations).
ratingsFile = 'ml-10M100K/user01Ratings.dat'
generate_recommendations(chosenMovieModel, ratingsFile)

# ======== PART 2 ========

## Remove Global Bias/User Bias/Item Bias

### Global average bias

In [None]:
sumCount = training.map(lambda (u,m,r): (m,r)).combineByKey(lambda value: (value, 1),
                             lambda x, value: (x[0] + value, x[1] + 1),
                             lambda x, y: (x[0] + y[0], x[1] + y[1]))

globalAvg = sumCount.map(lambda (label, (value_sum, count)): (label, round(value_sum / count, 3))) # 3 dp
globalAvg = globalAvg.collectAsMap() # dict

sumKeys, sumValues = 0, 0
for k,v in enumerate(globalAvg):
    sumKeys += k
    sumValues += v

globalAvg = float(sumKeys)/sumValues
print "globalAvg:", globalAvg

### Remove global average bias

In [None]:
# Subtract the global average from every training rating.
trainingWithoutGlobalAvg = training.map(lambda (u,m,r): (u,m,r-globalAvg))
trainingWithoutGlobalAvg.take(3)

In [None]:
# Train on the de-biased ratings; globalAvg is passed along as the bias argument.
globalAvgResults = getBestTrainingParameters(trainingWithoutGlobalAvg, validation, validationCount, globalAvg)

In [None]:
# Index 1 of the returned tuple is the best RMSE.
print "rmse after treating avg ratings bias:", globalAvgResults[1]

### Item/Movie bias

In [None]:
sumCount = training.map(lambda (u,m,r): (m,r)).combineByKey(lambda value: (value, 1),
                             lambda x, value: (x[0] + value, x[1] + 1),
                             lambda x, y: (x[0] + y[0], x[1] + y[1]))

itemBias = sumCount.map(lambda (label, (value_sum, count)): (label, round(value_sum / count, 3))) # 3 dp
itemBias = itemBias.collectAsMap() # dict

# show n keys
i, N = 0, 5
for k,v in enumerate(itemBias):
    print k, ":", v
    i += 1
    if i == N: break

### Remove item bias

In [None]:
trainingWithoutItemBias = training.map(lambda (u,m,r): (u,m,r-globalAvg[m]))
itemBiasResults = getBestTrainingParameters(trainingWithoutItemBias, validation, validationCount, itemBias)

In [None]:
# Index 1 of the returned tuple is the best RMSE.
print "best rmse (item bias):", itemBiasResults[1]

### User bias

In [None]:
sumCount = training.map(lambda (u,m,r): (u,r)).combineByKey(lambda value: (value, 1),
                             lambda x, value: (x[0] + value, x[1] + 1),
                             lambda x, y: (x[0] + y[0], x[1] + y[1]))

userBias = sumCount.map(lambda (label, (value_sum, count)): (label, round(value_sum / count, 3))) # 3 dp
userBias = userBias.collectAsMap() # dict

# show n keys
i, N = 0, 5
for k,v in enumerate(userBias):
    print k, ":", v
    i += 1
    if i == N: break

### Remove user bias

In [None]:
trainingWithoutUserBias = training.map(lambda (u,m,r): (u,m,userBias.get(u,0.409))) #replace with avg when no rating
userBiasResults = getBestTrainingParameters(trainingWithoutUserBias, validation, validationCount, userBias)

In [None]:
# Index 1 of the returned tuple is the best RMSE.
print "best rmse (user bias):", userBiasResults[1]

# ======== PART 3 ========

### Split song dataset into 60-20-20

In [None]:
# Build the song train/validation/test files once; skip when all three exist.
if (exists('songTrain60.txt') and exists('songValidation20.txt') and exists('songTest20.txt')):
    print "Already created files: songTrain60.txt, songValidation20.txt, songTest20.txt"    

else:
    # split into chunks of 3.2 million each (total dataset: 48373586 lines)
    print "splitting file..."
    !split -l 3200000 train_triplets.txt ff
    
    # nine chunks are concatenated into the training file, then deleted
    !cat ffae ffaj ffab ffai ffaf ffad ffam ffac ffah > songTrain60.txt
    !rm ffae ffaj ffab ffai ffaf ffad ffam ffac ffah
    
    # three chunks are concatenated into the validation file, then deleted
    !cat ffal ffag ffaa > songValidation20.txt
    !rm ffal ffag ffaa
    
    # every chunk that is still left goes into the test file
    !cat ff* > songTest20.txt
    !rm ff*

    print "splitting complete."    
    print "Newly created files: songTrain60.txt, songValidation20.txt, songTest20.txt"

In [29]:
# Reuse the model saved under "chosenSongModel" when present; otherwise train,
# save and report a new implicit ALS model.
# NOTE(review): binarySongs, songValidation and songValidationCount are only
# assigned in cells that appear further down in this file, so the else-branch
# needs those cells to have run first.
chosenSongModel, songResults = None, None
if exists("chosenSongModel"):
    print "chosenSongModel loaded since it exists."
    chosenSongModel = MatrixFactorizationModel.load(sc, "chosenSongModel")
else:
    songResults = getBestTrainingParameters(binarySongs, songValidation, songValidationCount, isImplicit=True)
    chosenSongModel = songResults[0]
    chosenSongModel.save(sc, "chosenSongModel")
    print "chosenSongModel created"
    print "songResults best rmse:", songResults[1]

chosenSongModel loaded since it exists.


### Convert str ids to int ids

In [18]:
# str id -> int id lookups, filled in by parse_song as new ids are encountered.
# NOTE(review): parse_song is passed to RDD.map(); presumably it then runs in
# Spark worker processes, so these driver-side dicts may not see the updates
# and ids may differ between partitions/files - confirm.
allUsers = {}
allSongs = {}

def parse_song(line):
    """
    Parse one tab-separated "user<TAB>song<TAB>playcount" line into integers.

    An unseen user (or song) string id is given the next free integer id,
    starting at 1, and remembered in allUsers (or allSongs).

    :param str line: userId\\tsongId\\tplayCount
    :return: (int userId, int songId, int playCount)
    """
    userKey, songKey, plays = line.split("\t")

    if userKey not in allUsers:
        allUsers[userKey] = len(allUsers) + 1
    if songKey not in allSongs:
        allSongs[songKey] = len(allSongs) + 1

    return (allUsers[userKey], allSongs[songKey], int(plays))

In [131]:
# Parse each song split into (userId, songId, playCount) integer tuples and
# preview the first three records.
# NOTE(review): the recorded previews of all three files start at ids (1, 1) -
# confirm that the integer ids are consistent across the files.
songTraining = sc.textFile('songTrain60.txt').map(parse_song)
songTraining.take(3)

[(1, 1, 1), (1, 2, 1), (1, 3, 1)]

In [132]:
songValidation = sc.textFile('songValidation20.txt').map(parse_song)
songValidation.take(3)

[(1, 1, 1), (1, 2, 4), (1, 3, 6)]

In [133]:
songTest = sc.textFile('songTest20.txt').map(parse_song)
songTest.take(3)

[(1, 1, 1), (1, 2, 7), (1, 3, 1)]

In [134]:
# Number of records per song split.
songTrainingCount = songTraining.count()
songTrainingCount

28800000

In [135]:
songValidationCount = songValidation.count()
songValidationCount

9600000

In [136]:
songTestCount = songTest.count()
songTestCount

9973586

### Convert play count to binary ratings

In [138]:
# Binarise the play counts: 1 when a song was played more than 5 times, else 0.
binarySongs = songTraining.map(lambda rec: (rec[0], rec[1], 1 if rec[2] > 5 else 0)) #userId, songId, playCount
binarySongs.take(3)

[(1, 1, 0), (1, 2, 0), (1, 3, 0)]

### Create the implicit ALS model for songs, or load it if it already exists

In [29]:
chosenSongModel, songResults = None, None
if exists("chosenSongModel"):
    print "chosenSongModel loaded since it exists."
    chosenSongModel = MatrixFactorizationModel.load(sc, "chosenSongModel")
else:
    songResults = getBestTrainingParameters(binarySongs, songValidation, songValidationCount, isImplicit=True)
    chosenSongModel = songResults[0]
    chosenSongModel.save(sc, "chosenSongModel")
    print "chosenSongModel created"
    print "songResults best rmse:", songResults[1]

chosenSongModel loaded since it exists.


### Create single user ratings file

In [221]:
user01Songs = sc.textFile('train_triplets.txt')  #(user, song, play count) 
user01Songs = user01Songs.filter(lambda x: x.split('\t')[0] == 'b80344d063b5ccb3212f76538f3d9e43d87dca9e') # userId
if exists('user01Songs.txt'):
    print "user01Songs.txt already exists."
else:
    user01Songs.saveAsTextFile('user01Songs.txt')
    print "user01Songs.txt created."

user01Songs.txt already exists.


### Generate user song recommendations

In [None]:
# Recommend songs for the listener whose triplets were saved to user01Songs.txt.
songRatingsFile = 'user01Songs.txt'
songsRecommended = generate_recommendations(chosenSongModel, songRatingsFile, isRecommendingSongs=True)

In [None]:
Print "The id of songs recommended:", songsRecommended