In [2]:
import pyspark
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from settings import *

# Read data and split into train/test sets

In [3]:
# Sorting data by time
rdd =  sc.textFile(RATINGS_10M).map(lambda line: [float(x) for x in line.split('::')]).sortBy(lambda x: x[3],False)
size = rdd.count()

In [4]:
# Processing data to structure: Rating(user=62510, product=34148, rating=3.0)
train = rdd.zipWithIndex().filter(lambda x: x[-1] < size*0.6).map(lambda x: Rating(int(x[0][0]), int(x[0][1]), x[0][2]))
testdata = rdd.zipWithIndex().filter(lambda x: x[-1] > size*0.6).map(lambda x: Rating(int(x[0][0]), int(x[0][1]), x[0][2]))

# Compute user bias

In [5]:
'''
test = sc.parallelize([(1,3), (3,4), (1,3), (1,4)])

print test.map(lambda data: (data[0], 1)).reduceByKey(lambda a,b: a+1).collectAsMap()
#Test counter
'''
user_count = train.map(lambda data: (data[0], 1)).reduceByKey(lambda a,b: a+1).collectAsMap()
score_mean = train.map(lambda data: data[2]).mean()
user_score_bias_sum = train.map(lambda data: (data[0], data[2] - score_mean)).reduceByKey(lambda a,b: a+b).collectAsMap()
user_bias = {}
for key in user_count.keys():
    user_bias[key] = user_score_bias_sum[key]/user_count[key]

# Compute movie bias

In [11]:
movie_count = train.map(lambda data: (data[1], 1)).reduceByKey(lambda a,b: a+1).collectAsMap()
movie_score_bias_sum = train.map(lambda data: (data[1], data[2] - score_mean -user_bias[data[0]])).reduceByKey(lambda a,b: a+b).collectAsMap()
movie_bias = {}
for key in movie_count.keys():
    movie_bias[key] = movie_score_bias_sum[key]/movie_count[key]

In [12]:
'''
test = sc.parallelize([(1,3), (3,4), (1,3), (1,4)])
def f(x): return (x[0], x[1]**2)
print test.map(f).collect()
'''

train_rm_user = train.map(lambda rating: Rating(rating.user, rating.product, rating.rating - user_bias[rating.user]))
train_rm_movie = train.map(lambda rating: Rating(rating.user, rating.product, rating.rating - movie_bias[rating.product]))


In [19]:
#print movie_count
#print movie_score_bias_sum
print movie_bias
print train.first()
print train_rm_user.first()
print train_rm_movie.first()

{1: 0.8482436795342417, 2: 0.32881520233289185, 3: 1.6491123459901464, 4: 7.938128360288937, 5: 0.8814431393972978, 6: 3.2239303867805083, 7: 3.7087887427599964, 8: 10.685323899011628, 9: 6.90444710630666, 10: 1.5653284329863313, 11: 3.503991196620293, 12: 10.630695734932521, 32770: 7.269350139605714, 14: 1.9223899125131456, 15: 17.864462293351487, 16: 4.331549756870528, 17: 2.290428975363458, 18: 7.336552867414475, 19: -0.9124264330729365, 20: 11.119368771155909, 21: 2.103271796096898, 22: 7.357735660774567, 23: 4.34938957776827, 24: 1.4516177140934228, 25: 1.7295762299240147, 26: 2.7442424147244, 27: 11.277810983345114, 28: 6.630173162733052, 29: 3.512483608300128, 30: 8.929069610389437, 31: 2.3732417543491793, 32: 1.872686236659143, 33: -11.115973631308407, 34: 0.7087370442568415, 35: 13.161715698121794, 36: 1.863159295154371, 37: 19.643527478183596, 38: 14.406645548394243, 39: 1.6057028458843385, 40: 9.311599592578785, 41: 4.72344235299422, 42: 21.906996400379434, 43: -1.0272734239

In [None]:
# Training ALS Model
rank = 10
numIterations = 10
model = ALS.train(train, rank, numIterations)

# Model Evaluation

In [41]:
# Trying model evaluation



In [42]:
predictions = model.predictAll(test).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = testdata.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)

((54040, 912), 4.367416929660967)
((31630, 1412), (4.0, 4.163697460854653))


In [43]:
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error = " + str(MSE))

Mean Squared Error = 1.00137982702


# Recommendation System

In [None]:
# Trying recommendation System 

#testdata = train.map(lambda p: (p[0], p[1]))
testUsers = train.map(lambda p: p[0])

In [None]:
model.recommendProducts(testUsers, 100)

#predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
#ratesAndPreds = train.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)


In [None]:
#MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
#print("Mean Squared Error = " + str(MSE))