In [6]:
import pyspark as ps
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [7]:
spark = ps.sql.SparkSession.builder \
            .master("local[4]") \
            .appName("building recommender") \
            .getOrCreate()
            
sc = spark.sparkContext

In [14]:
lines = spark.read.text("data/subset.csv").rdd
header = lines.take(1)[0]
# subset_data = subset_raw_data.filter(lambda line: line!=header)
parts = lines.filter(lambda line: line!=header).map(lambda row: row.value.split(","))
subsetRDD = parts.map(lambda p: Row(reader_id=int(p[3]), book_id=int(p[5]), claimed=float(p[9])))
# subsetRDD = parts.map(lambda p: Row(reader_id=int(p[3]), book_id=int(p[5]), claimed=int(p[9])))

In [15]:
subsetRDD.take(2)

[Row(book_id=19203, claimed=1.0, reader_id=186643),
 Row(book_id=19813, claimed=0.0, reader_id=186643)]

In [16]:
ratings = spark.createDataFrame(subsetRDD)
(training, test) = ratings.randomSplit([0.8, 0.2])

In [17]:
als = ALS(maxIter=5, regParam=0.01, userCol="reader_id", itemCol="book_id", ratingCol="claimed")
model = als.fit(training)

In [19]:
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="claimed",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)

In [20]:
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = nan


In [39]:
predictions.take(500)

[Row(book_id=3918, claimed=0.0, reader_id=254931, prediction=0.0),
 Row(book_id=3918, claimed=0.0, reader_id=417414, prediction=-0.027520660310983658),
 Row(book_id=3918, claimed=0.0, reader_id=159142, prediction=0.0),
 Row(book_id=3918, claimed=0.0, reader_id=295346, prediction=-0.06353689730167389),
 Row(book_id=3918, claimed=0.0, reader_id=114650, prediction=0.14652931690216064),
 Row(book_id=4900, claimed=0.0, reader_id=51736, prediction=0.09755256772041321),
 Row(book_id=4900, claimed=0.0, reader_id=36860, prediction=0.0),
 Row(book_id=4900, claimed=0.0, reader_id=498360, prediction=0.0),
 Row(book_id=4900, claimed=0.0, reader_id=17840, prediction=0.0),
 Row(book_id=4900, claimed=0.0, reader_id=511, prediction=-0.41067275404930115),
 Row(book_id=4900, claimed=0.0, reader_id=431523, prediction=-0.48397326469421387),
 Row(book_id=4900, claimed=0.0, reader_id=497007, prediction=0.4133593440055847),
 Row(book_id=4900, claimed=0.0, reader_id=568081, prediction=0.2572405934333801),
 Row