In [1]:
import pyspark
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark.mllib.clustering import KMeans, KMeansModel
from settings import *

# Read data and split into train and test sets

In [2]:
# Load the ratings file and order the rows by time (field 3), largest value first.
# RATINGS_10M is presumably the input path exported by `settings` -- TODO confirm.
def _parse_rating_line(line):
    """Split one '::'-delimited line and convert every field to float."""
    return [float(field) for field in line.split('::')]

# NOTE(review): ascending=False puts the newest ratings at the lowest indices,
# so index-based splits downstream take the newest rows first -- confirm intent.
rdd = (
    sc.textFile(RATINGS_10M)
    .map(_parse_rating_line)
    .sortBy(lambda fields: fields[3], ascending=False)
)
size = rdd.count()

In [3]:
# Split the sorted ratings by position: rows with index below 60% of `size`
# form the training set, every remaining row forms the test set. Each row is
# converted to the structure Rating(user=62510, product=34148, rating=3.0).
def _to_rating(indexed_row):
    """Turn a (fields, index) pair produced by zipWithIndex into a Rating."""
    fields = indexed_row[0]
    return Rating(int(fields[0]), int(fields[1]), fields[2])

split_at = size*0.6
indexed_rdd = rdd.zipWithIndex()  # index the rows once, reuse for both splits
train = indexed_rdd.filter(lambda x: x[-1] < split_at).map(_to_rating)
# Use >= (not >) so a row whose index lands exactly on the boundary is not
# silently dropped from both sets.
testdata = indexed_rdd.filter(lambda x: x[-1] >= split_at).map(_to_rating)
print("read data finished")

read data finished


# KMeans

In [8]:
# Rows of the training window (index below 60% of `size`), still paired with
# their index; both groupings below are derived from it.
train_window = rdd.zipWithIndex().filter(lambda x: x[-1] < size*0.6)
# Group the rating values by user id (field 0) and by product id (field 1).
user_train = train_window.map(lambda x: (x[0][0], float(x[0][2]))).groupByKey()
product_train = train_window.map(lambda x: (x[0][1], float(x[0][2]))).groupByKey()

print("train user")
# NOTE(review): user_bias / movie_bias are not defined in any visible cell;
# presumably they come from `settings` or an earlier kernel state -- confirm,
# otherwise this cell fails on a fresh kernel.
user_cluster_count = int(len(user_bias)/2)
user_clusters = KMeans.train(
    user_train,
    user_cluster_count,
    maxIterations=20,
    initializationMode="random",
)

print("train cluster")
product_cluster_count = int(len(movie_bias)/2)
product_clusters = KMeans.train(
    product_train,
    product_cluster_count,
    maxIterations=20,
    initializationMode="random",
)

print("train finished")

train user
train cluster
train finished


In [None]:
def convert(data):
    """Replace the user and product ids of one rating with cluster ids.

    `data` is indexed as (user, product, rating). The user cluster is looked
    up first from (user, rating); the value returned with it is then used as
    the second coordinate for the product lookup. The result is the string
    "<user cluster>::<product cluster>::<value>", where <value> is the value
    returned by the product lookup.
    """
    user_category, user_value = predict((data[0], float(data[2])), user_clusters)
    product_category, product_value = predict((data[1], float(user_value)), product_clusters)
    pieces = [str(user_category), str(product_category), str(product_value)]
    return "::".join(pieces)
def predict(point, clusters):
    """Return (cluster index, center[1]) for `point`.

    `clusters` must provide predict(point) and an indexable `centers`
    collection; the element at position 1 of the selected center is returned
    together with the cluster index.
    """
    cluster_index = clusters.predict(point)
    return (cluster_index, clusters.centers[cluster_index][1])
# Build the cluster-reduced train / validate / test splits from the sorted
# ratings. Rows are selected by their position: [0, 60%) is train,
# [60%, 80%) is validate and [80%, 100%) is test. Every selected row is passed
# through convert(), so each split is an RDD of "::"-joined strings.
indexed_ratings = rdd.zipWithIndex()  # pair each row with its index once

def _reduced_split(lower, upper):
    """Return the converted rows whose index satisfies lower <= index < upper."""
    return (
        indexed_ratings
        .filter(lambda x: lower <= x[-1] < upper)
        .map(lambda x: (x[0][0], x[0][1], x[0][2]))
        .map(convert)
    )

train_reduce = _reduced_split(0, size*0.6)
validate_reduce = _reduced_split(size*0.6, size*0.8)
# The test split must start at the 80% mark; reusing the validate range here
# would make test identical to validate and leave the last 20% unused.
test_reduce = _reduced_split(size*0.8, size)

In [13]:
print("save model")
# Persist both trained clustering models under their fixed output paths.
for trained_model, model_path in (
    (user_clusters, "user_Model"),
    (product_clusters, "product_Model"),
):
    trained_model.save(sc, model_path)

save model


In [None]:
print("save data")
print("save train")
# Mark the reduced training RDD for caching, then have Spark write it out as
# text under "train". (A disabled variant that collected the rows and wrote
# DATA_FOLDER + "train.dat" by hand lived here inside a string literal; it
# never executed and has been dropped.)
train_reduce.cache().saveAsTextFile("train")
print("save train finished")

save data
save train


In [None]:
print("save validate")
# Mark the validation RDD for caching, then bring its rows to the driver.
validate_reduce.cache()
validate_reduce_list = validate_reduce.collect()
# Iterate the collected list (an RDD cannot be looped over directly). Each
# element is already a "::"-joined string produced by convert(), so it is
# written as-is, one per line.
# DATA_FOLDER is presumably exported by `settings` -- TODO confirm.
with open(DATA_FOLDER + "validate.dat", "wb") as f:
    for line in validate_reduce_list:
        f.write(line + "\n")
print("save validate finished")

In [None]:
print("save test")
# Mark the test RDD for caching, then bring its rows to the driver. The
# collected rows get their own name so `test_reduce` stays an RDD and this
# cell can be re-run without failing on `.cache()`.
test_reduce.cache()
test_reduce_list = test_reduce.collect()
# Each element is already a "::"-joined string produced by convert(), so it
# is written as-is, one per line.
# DATA_FOLDER is presumably exported by `settings` -- TODO confirm.
with open(DATA_FOLDER + "test.dat", "wb") as f:
    for line in test_reduce_list:
        f.write(line + "\n")
print("save test finished")

In [None]:
# Sanity check: show the first converted training row.
print(train_reduce.first())