In [15]:
import pyspark
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from settings import *
import math

# Read data, Split to train test

In [5]:
# Sorting data by time
rdd =  sc.textFile(RATINGS_10M).map(lambda line: [float(x) for x in line.split('::')]).sortBy(lambda x: x[3],False)
size = rdd.count()

In [6]:
# Processing data to structure: Rating(user=62510, product=34148, rating=3.0)
train = rdd.zipWithIndex().filter(lambda x: x[-1] < size*0.6).map(lambda x: Rating(int(x[0][0]), int(x[0][1]), x[0][2]))
testdata = rdd.zipWithIndex().filter(lambda x: x[-1] > size*0.6).map(lambda x: Rating(int(x[0][0]), int(x[0][1]), x[0][2]))

# Compute user bias

In [7]:
score_mean = train.map(lambda data: data[2]).mean()
user_bias = train.map(lambda data: (data[0], data[2] - score_mean)).groupByKey().map(lambda data: (data[0], sum(data[1])/len(data[1]))).collectAsMap()

# Compute movie bias

In [8]:
movie_bias = train.map(lambda data: (data[1], data[2] - score_mean - user_bias[data[0]])).groupByKey().map(lambda data: (data[0], sum(data[1])/len(data[1]))).collectAsMap()

In [9]:
train_rm_user = train.map(lambda rating: Rating(rating.user, rating.product, rating.rating - user_bias[rating.user]))
train_rm_movie = train.map(lambda rating: Rating(rating.user, rating.product, rating.rating - user_bias[rating.user] - movie_bias[rating.product]))


In [10]:
#print movie_count
#print movie_score_bias_sum
#print movie_bias
print train.first()
print train_rm_user.first()
print train_rm_movie.first()

Rating(user=62510, product=34148, rating=3.0)
Rating(user=62510, product=34148, rating=2.5208408027796696)
Rating(user=62510, product=34148, rating=2.289214762263664)


# Model Evaluation - removed user bias

In [11]:
test = testdata.map(lambda p: (p[0], p[1]))

In [12]:
# Training ALS Model
latentFactors = 10
numIterations = 25
regParameter = 0.1

model = ALS.train(train_rm_user, latentFactors, numIterations, regParameter)

In [13]:
predictions = model.predictAll(test).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = testdata.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)

In [16]:
RMSE = ratesAndPreds.map(lambda r: math.sqrt((r[1][0] - r[1][1])**2)).mean()
print("Root Mean Squared Error = " + str(RMSE))

Root Mean Squared Error = 0.782490357984


# Recommendation System

In [21]:
# Trying recommendation System 

testUsers = train.first()[0]

In [23]:
traindict = train.map(lambda data: (data[0], [data[1]])).reduceByKey(lambda a,b: a+b).collectAsMap()

In [25]:
# Creat recommended movies that user have not watched before

topMovies = model.recommendProducts(testUsers, 100)
filteredMovies = []
for row in topMovies:
    if not row[1] in traindict[row[0]]:
        filteredMovies.append(row)


Py4JError: An error occurred while calling o217.recommendProducts. Trace:
py4j.Py4JException: Method recommendProducts([class org.apache.spark.api.java.JavaRDD, class java.lang.Integer]) does not exist
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:335)
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:344)
	at py4j.Gateway.invoke(Gateway.java:252)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:209)
	at java.lang.Thread.run(Thread.java:745)



In [None]:
# Print out filtered Movies as string (names)
movies10M_dict =  sc.textFile(MOVIES_10M).map(lambda line: line.split('::')).map(lambda x: (float(x[0]), x[1])).collectAsMap()

for i in range(5):
    print movies10M_dict[filteredMovies[i][1]]