In [1]:
import pyspark
import math
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from settings import *
#from pyechonest import config 
#config.ECHO_NEST_API_KEY
#from pyechonest import song


#install pyechonest for iPython from "https://github.com/echonest/pyechonest"


# Model Evaluation 

In [2]:
rdd =  sc.textFile(MSD).map(lambda line: line.split('\t')).map(lambda x: [str(x[0]), str(x[1]), float(x[2])])

In [3]:
# Creating User Dictionary
userDict = rdd.map(lambda x: (x[0], x[1])).reduceByKey(lambda a,b : 1).collectAsMap()
userCount = 0
for key in userDict.keys():
    userDict[key] = userCount
    userCount+=1

In [4]:
# Creating Song Dictionary
songDict = rdd.map(lambda x: (x[1], x[0])).reduceByKey(lambda a,b : 1).collectAsMap()
songCount = 0
for key in songDict.keys():
    songDict[key] = songCount
    songCount+=1

In [5]:
# Processing data to structure: Rating(user=62510, product=34148, rating=3.0)
train = rdd.zipWithIndex().filter(lambda x: x[-1] < 1000000*0.6).map(lambda x: Rating(userDict[x[0][0]], songDict[x[0][1]], x[0][2]))

In [6]:
print train.count()

600000


In [7]:
# Processing validation data
validationData = rdd.zipWithIndex().filter(lambda x: x[-1] > 1000000*0.6 and x[-1] < 1000000*0.8 and x[0][0] in userDict)
validationData = validationData.map(lambda x: Rating(userDict[x[0][0]], songDict[x[0][1]], x[0][2]))
validation = validationData.map(lambda p: (p[0], p[1]))

In [8]:
print validation.count()

199999


In [9]:
# Training ALS Model
latentFactors = [10,20,30,40,50]
numIterations = 15
regParameter = [0.01,0.1,1.0,10.0]

for i in range(5):
    for j in range(4):
        model = ALS.trainImplicit(train, latentFactors[i], numIterations, regParameter[j])
        predictions = model.predictAll(validation).map(lambda r: ((r[0], r[1]), r[2]))
        ratesAndPreds = validationData.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
        RMSE = ratesAndPreds.map(lambda r: math.sqrt((r[1][0] - r[1][1])**2)).mean()
        print("No. of latent factors: " + str(latentFactors[i]) + "\t Regulation Parameter: " + str(regParameter[j]))
        print("RMSE = " + str(RMSE))
        print "\n"

No. of latent factors: 10	 Regulation Parameter: 0.01
RMSE = 1.66987305012


No. of latent factors: 10	 Regulation Parameter: 0.1
RMSE = 1.66818674869


No. of latent factors: 10	 Regulation Parameter: 1.0
RMSE = 1.67226755025


No. of latent factors: 10	 Regulation Parameter: 10.0
RMSE = 1.67226890756


No. of latent factors: 20	 Regulation Parameter: 0.01
RMSE = 1.66668902432


No. of latent factors: 20	 Regulation Parameter: 0.1
RMSE = 1.65695627574


No. of latent factors: 20	 Regulation Parameter: 1.0
RMSE = 1.67226599272


No. of latent factors: 20	 Regulation Parameter: 10.0
RMSE = 1.67226890756


No. of latent factors: 30	 Regulation Parameter: 0.01
RMSE = 1.64735651008


No. of latent factors: 30	 Regulation Parameter: 0.1
RMSE = 1.62372194883


No. of latent factors: 30	 Regulation Parameter: 1.0
RMSE = 1.67226278263


No. of latent factors: 30	 Regulation Parameter: 10.0
RMSE = 1.67226890756


No. of latent factors: 40	 Regulation Parameter: 0.01
RMSE = 1.63378871304


No. o

# Model Testing

In [6]:
# Processing test data

validationData = rdd.zipWithIndex().filter(lambda x: x[-1] > 1000000*0.6 and x[-1] < 1000000*0.8 and x[0][0] in userDict)
validationData = validationData.map(lambda x: Rating(userDict[x[0][0]], songDict[x[0][1]], x[0][2]))
validation = validationData.map(lambda p: (p[0], p[1]))

testData = rdd.zipWithIndex().filter(lambda x: x[-1] > 1000000*0.8 and x[-1] < 1000000 and x[0][0] in userDict)
testData = testData.map(lambda x: Rating(userDict[x[0][0]], songDict[x[0][1]], x[0][2]))
test = testData.map(lambda p: (p[0], p[1]))

In [20]:
print test.count()

199999


In [8]:
# Training ALS Model
latentFactors = 40
numIterations = 25
regParameter = 0.1

model = ALS.trainImplicit(train, latentFactors, numIterations, regParameter)
predictions = model.predictAll(test).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = testData.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
RMSE = ratesAndPreds.map(lambda r: math.sqrt((r[1][0] - r[1][1])**2)).mean()
print("No. of latent factors: " + str(latentFactors) + "\t Regulation Parameter: " + str(regParameter))
print("RMSE = " + str(RMSE))

NameError: name 'tesr' is not defined

In [9]:
predictions = model.predictAll(test).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = testData.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
RMSE = ratesAndPreds.map(lambda r: math.sqrt((r[1][0] - r[1][1])**2)).mean()
print("No. of latent factors: " + str(latentFactors) + "\t Regulation Parameter: " + str(regParameter))
print("RMSE = " + str(RMSE))

No. of latent factors: 40	 Regulation Parameter: 0.1
RMSE = 0.0


# Song Recommendation

In [14]:
# Loading Best 10M_ALS Model

#model = MatrixFactorizationModel.load(sc, "./MSD_ALS")
latentFactors = 40
numIterations = 25
regParameter = 0.1

model = ALS.trainImplicit(train, latentFactors, numIterations, regParameter)
predictions = model.predictAll(validation).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = validationData.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
RMSE = ratesAndPreds.map(lambda r: math.sqrt((r[1][0] - r[1][1])**2)).mean()
print("No. of latent factors: " + str(latentFactors) + "\t Regulation Parameter: " + str(regParameter))
print("RMSE = " + str(RMSE))
print "\n"

No. of latent factors: 10	 Regulation Parameter: 0.1
RMSE = 1.80740423592




In [10]:
testUser = train.first()[0]

In [11]:
traindict = train.map(lambda data: (data[0], [data[1]])).reduceByKey(lambda a,b: a+b).collectAsMap()

In [12]:
# Create recommended songs that user have not heard before

topSongs = model.recommendProducts(testUser, 100)
filteredSongs = []
for row in topSongs:
    if not row[1] in traindict[row[0]]:
        filteredSongs.append(row)


In [13]:
for row in filteredSongs:
    print [key for key, value in songDict.iteritems() if value == row[1]]

['SOYIJIL12A6701F1C1']
['SOMYECL12A6701D9C8']
['SOJSTYO12A8C13F200']
['SOSPXWA12AB0181875']
['SOERYLG12A6701F07F']
['SOAYTRA12A8C136D0E']
['SOXDQPZ12A8C13F4FC']
['SOJJKTR12A6701F083']
['SOUMOMJ12A6701DFDC']
['SOOABBO12A6701DFDA']
['SODBMRI12A8151AF45']
['SOCZTMT12AF72A078E']
['SOOGZYY12A6701D9CB']
['SOKLVUI12A6701BF1B']
['SOENRRU12A6701BF1A']
['SOGCDYR12AC961854A']
['SOKUTUM12A6701D9CD']
['SOYDOZE12A6701FC22']
['SODJKMC12A8C137EC0']
['SOHGBHN12A6701F082']
['SOOQPIK12A6701F1C5']
['SOBNTFK12A6701F1CF']
['SODRJZO12AC4684FF6']
['SONCOJJ12A6701FC24']
['SOQLVIT12A8C137EA2']
['SOVGLTY12AF72A39CD']
['SOSRERB12A8C139735']
['SOJHVSF12A6701F084']
['SOKHHXJ12AF72A5325']
['SOHTSKK12A6701F07C']
['SOKMXEQ12A6D4F6AA8']
['SOISXVJ12A6701F1CD']
['SOBZCUC12A58A7D9AD']
['SOIUITF12A58A7D86C']
['SOAJNYK12AF729F33B']
['SOLLBAK12A6D4F6AA7']
['SOQPQWL12A58A7B964']
['SOBLIPF12AF729F53E']
['SOPBTDA12A58A7B7C3']
['SOVPAJA12A58A77B15']
['SOJJDYI12A6701FC23']
['SOPZHFK12A8C135493']
['SOUDQDW12AF729F367']
['SOJNHAY12

In [None]:
# Print out filtered Movies as string (names)
#for i in range(5):
#    print song.Song(filteredSongs[i])