In [15]:
import pyspark
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from settings import *


# Model Evaluation

In [51]:
# Sorting data by time
#RATINGS_10M = './ml-10M100K/ratings.dat'
rdd =  sc.textFile(RATINGS_10M).map(lambda line: [float(x) for x in line.split('::')]).sortBy(lambda x: x[3],False)

In [5]:
size = rdd.count()

In [6]:
# Processing data to structure: Rating(user=62510, product=34148, rating=3.0)
train = rdd.zipWithIndex().filter(lambda x: x[-1] < size*0.6).map(lambda x: Rating(int(x[0][0]), int(x[0][1]), x[0][2]))

In [7]:
print train.first()

Rating(user=62510, product=34148, rating=3.0)


In [9]:
# Processing validation data
testEval = rdd.zipWithIndex().filter(lambda x: x[-1] > size*0.6 and x[-1] < size*0.8).map(lambda x: Rating(int(x[0][0]), int(x[0][1]), x[0][2]))
test = testEval.map(lambda p: (p[0], p[1]))

In [12]:
# Training ALS Model
latentFactors = [10,20,30,40,50]
numIterations = 25
refParameter = [0.01,0.1,1.0,10.0]

for i in range(5):
    for j in range(4):
        model = ALS.train(train, latentFactors[i], numIterations, regParameter[j])
        predictions = model.predictAll(test).map(lambda r: ((r[0], r[1]), r[2]))
        ratesAndPreds = testEval.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
        MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
        print("No. of latent factors: " + str(latentFactors[i]) + "\t Regulation Parameter: " + str(refParameter[j]))
        print("RMSE = " + str(MSE))
        print "\n"
    

No. of latent factors: 10	 Regulation Parameter: 0.01
RMSE = 0.994771720874


No. of latent factors: 10	 Regulation Parameter: 0.1
RMSE = 0.832598364153


No. of latent factors: 10	 Regulation Parameter: 1.0
RMSE = 1.97907550365


No. of latent factors: 10	 Regulation Parameter: 10.0
RMSE = 13.644303884


No. of latent factors: 20	 Regulation Parameter: 0.01
RMSE = 1.06994780361


No. of latent factors: 20	 Regulation Parameter: 0.1
RMSE = 0.834236910709


No. of latent factors: 20	 Regulation Parameter: 1.0
RMSE = 1.97907550525


No. of latent factors: 20	 Regulation Parameter: 10.0
RMSE = 13.644303884


No. of latent factors: 30	 Regulation Parameter: 0.01
RMSE = 1.1039384923


No. of latent factors: 30	 Regulation Parameter: 0.1
RMSE = 0.834441417826


No. of latent factors: 30	 Regulation Parameter: 1.0
RMSE = 1.97907550497


No. of latent factors: 30	 Regulation Parameter: 10.0
RMSE = 13.644303884


No. of latent factors: 40	 Regulation Parameter: 0.01
RMSE = 1.13017770616


No. o

# Test Data

In [41]:
# Processing test data
testEval = rdd.zipWithIndex().filter(lambda x: x[-1] > size*0.8).map(lambda x: Rating(int(x[0][0]), int(x[0][1]), x[0][2]))
test = testEval.map(lambda p: (p[0], p[1]))

In [42]:
# Training ALS Model
latentFactors = 10
numIterations = 25
refParameter = 0.1

model = ALS.train(train, latentFactors, numIterations, refParameter)
predictions = model.predictAll(test).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = testEval.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("No. of latent factors: " + str(latentFactors[i]) + "\t Regulation Parameter: " + str(refParameter[j]))
print("RMSE = " + str(MSE))

((54040, 912), 4.367416929660967)
((31630, 1412), (4.0, 4.163697460854653))


# Recommendation System

In [16]:
# Training Best Model

latentFactors = 10
numIterations = 25
refParameter = 0.1

model = ALS.train(train, latentFactors, numIterations, refParameter)

In [14]:
testUser = train.first()[0]

62510


In [33]:
traindict = train.map(lambda data: (data[0], [data[1]])).reduceByKey(lambda a,b: a+b).collectAsMap()

{3: [33750,
  5527,
  5299,
  3408,
  7155,
  4535,
  27821,
  1564,
  8533,
  4677,
  3684,
  8783,
  5505,
  213,
  6287,
  8529,
  110,
  590,
  6377,
  6539,
  1597,
  1276,
  4995,
  1674,
  7153,
  1408,
  1552,
  1148,
  1252,
  1288,
  151,
  5952,
  1246],
 6: [1573,
  2628,
  1396,
  1527,
  1580,
  1748,
  32,
  1584,
  1653,
  3863,
  3986,
  3994,
  4446,
  4053,
  4270,
  3555,
  3623,
  4161,
  4299,
  3753,
  3755,
  349,
  1264,
  3578,
  2571,
  4369,
  3740,
  457,
  1197,
  1277,
  1304,
  2028,
  260,
  1198,
  3996,
  1193,
  1483,
  2396,
  858,
  1629,
  2405,
  1196],
 7: [5500,
  2791,
  1748,
  6273,
  5505,
  1895,
  1348,
  1245,
  2936,
  954,
  1256,
  951,
  1260,
  2804,
  1276,
  1207,
  3006,
  908,
  1254,
  1148,
  1244,
  101,
  1283,
  3467,
  3730,
  923,
  4306,
  4886,
  4975,
  1517,
  2683,
  5481,
  1240,
  3859,
  5528,
  3334,
  1234,
  2391,
  50,
  608,
  1590,
  1732,
  4206,
  1805,
  3044,
  3176,
  800,
  1086,
  2206,
  903,
  904,


In [35]:
topMovies = model.recommendProducts(testUser, 100)
filteredMovies = []
for row in topMovies:
    if not row[1] in traindict[row[0]]:
        filteredMovies.append(row)
#filteredMovies = topMovies.filter(lambda x: not x[1] in traindict[x[0]])


In [36]:
for i in range(5):
    print filteredMovies[i]


Rating(user=62510, product=42783, rating=5.068078596544477)
Rating(user=62510, product=33264, rating=4.841301731740355)
Rating(user=62510, product=32657, rating=4.788054686800457)
Rating(user=62510, product=4454, rating=4.722393769511833)
Rating(user=62510, product=61742, rating=4.721271023162554)


# Model Evaluation Using 22M

In [9]:
RATINGS_22M = './ml-latest/ratings.csv'

rdd =  sc.textFile(RATINGS_22M)
header = rdd.first()
rdd = rdd.filter(lambda line: line != header)
rdd = rdd.map(lambda line: [float(x) for x in line.split(',')]).sortBy(lambda x: x[3],False)

In [11]:
size = rdd.count()

In [12]:
train = rdd.zipWithIndex().filter(lambda x: x[-1] < size*0.6).map(lambda x: Rating(int(x[0][0]), int(x[0][1]), x[0][2]))

In [13]:
# Processing test data
testEval = rdd.zipWithIndex().filter(lambda x: x[-1] > size*0.8).map(lambda x: Rating(int(x[0][0]), int(x[0][1]), x[0][2]))
test = testEval.map(lambda p: (p[0], p[1]))

In [None]:
# Training ALS Model
latentFactors = 10
numIterations = 25
refParameter = 0.1

model = ALS.train(train, latentFactors, numIterations, refParameter)
predictions = model.predictAll(test).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = testEval.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("No. of latent factors: " + str(latentFactors[i]) + "\t Regulation Parameter: " + str(refParameter[j]))
print("RMSE = " + str(MSE))

# Recommendation System - 22M Data

In [None]:
testUser = train.first()[0]

In [None]:
traindict = train.map(lambda data: (data[0], [data[1]])).reduceByKey(lambda a,b: a+b).collectAsMap()

In [None]:
topMovies = model.recommendProducts(testUser, 100)
filteredMovies = []
for row in topMovies:
    if not row[1] in traindict[row[0]]:
        filteredMovies.append(row)
#filteredMovies = topMovies.filter(lambda x: not x[1] in traindict[x[0]])


In [None]:
for i in range(5):
    print filteredMovies[i]