In [1]:
import pyspark as ps
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import Row
from pyspark.sql import functions as F
from pyspark.sql import DataFrameNaFunctions as DFna
from pyspark.sql.functions import udf, col, when
import matplotlib.pyplot as plt
import numpy as np
import math
import os

In [3]:
# Start (or reuse) a Spark session running locally with master "local[4]".
spark = (
    ps.sql.SparkSession.builder
    .master("local[4]")
    .appName("building recommender")
    .getOrCreate()
)

# Underlying SparkContext; the RDD-based loading below goes through it.
sc = spark.sparkContext

In [5]:
# Read the CSV as an RDD of raw text lines (the header row is still included).
subset_raw_data = sc.textFile(name='data/subset.csv')

In [6]:
# Confirm what textFile returned, then peek at the header and first records.
# The parenthesised print form runs on both Python 2 and Python 3.
print(type(subset_raw_data))
subset_raw_data.take(3)

<class 'pyspark.rdd.RDD'>


[u',id,source,reader_id,from_book_id,book_id,ad_id,boost_id,clicked,claimed,optin,created_at,updated_at',
 u'29295882,29295919,success_page,186643,20370,19203,0,0,1,1,0,2017-02-20 02:16:11,2017-02-20 02:26:24',
 u'29295883,29295920,success_page,186643,20370,19813,0,0,0,0,0,2017-02-20 02:16:11,2017-02-20 02:16:11']

In [14]:
# The first line of the file is the CSV header; capture it so it can be dropped.
subset_raw_data_header = subset_raw_data.take(1)[0]

# Drop the header line, split every remaining line on commas, and keep only
# fields 3, 5 and 9 (reader_id, book_id, claimed according to the header).
# The result is cached because several later cells reuse it.
subset_data = (
    subset_raw_data
    .filter(lambda row: row != subset_raw_data_header)
    .map(lambda row: row.split(","))
    .map(lambda fields: (fields[3], fields[5], fields[9]))
    .cache()
)

In [16]:
# Turn each string triple into a typed Row: integer ids and a float `claimed` label.
subsetRDD = subset_data.map(
    lambda rec: Row(
        reader_id=int(rec[0]),
        book_id=int(rec[1]),
        claimed=float(rec[2])
    )
)

In [15]:
# Peek at the first few parsed (reader_id, book_id, claimed) tuples; values are still strings here.
subset_data.take(4)

[(u'186643', u'19203', u'1'),
 (u'186643', u'19813', u'0'),
 (u'523754', u'16281', u'0'),
 (u'523754', u'22669', u'1')]

In [17]:
# Peek at the typed Row objects to check the int/float conversions.
subsetRDD.take(4)

[Row(book_id=19203, claimed=1.0, reader_id=186643),
 Row(book_id=19813, claimed=0.0, reader_id=186643),
 Row(book_id=16281, claimed=0.0, reader_id=523754),
 Row(book_id=22669, claimed=1.0, reader_id=523754)]

In [26]:
# Build a DataFrame of (reader_id, book_id, claimed) interactions from the typed Rows.
ratings = spark.createDataFrame(subsetRDD)

# 60/20/20 train/validation/test split with a fixed seed.
# A plain int is used for the seed: the `0L` long literal is Python 2-only syntax.
training, validation, test = ratings.randomSplit([0.6, 0.2, 0.2], seed=0)

In [28]:
# Search over the ALS rank, scoring each candidate by RMSE on the validation split.
iterations = 5
regularization_parameter = 0.1
ranks = [10, 20, 30, 40]
errors = []  # validation RMSE per rank, in the same order as `ranks`

min_error = float('inf')
best_rank = -1
best_iteration = -1  # never updated in this cell; kept so the name stays defined
best_model = None    # fitted model that achieved `min_error`

# The evaluator does not depend on the rank, so build it once outside the loop.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="claimed",
                                predictionCol="prediction")

for rank in ranks:
    # implicitPrefs is left at its default here.
    als = ALS(maxIter=iterations, regParam=regularization_parameter, rank=rank,
              userCol="reader_id", itemCol="book_id", ratingCol="claimed")
    model = als.fit(training)
    predictions = model.transform(validation)

    # Drop NaN predictions before scoring, otherwise the RMSE itself becomes NaN.
    # ALS can emit NaN, e.g. for readers/books absent from the training split.
    # F.isnan states the intent directly; comparing against np.nan only works
    # because Spark SQL treats NaN as equal to NaN.
    new_predictions = predictions.filter(~F.isnan(col('prediction')))
    rmse = evaluator.evaluate(new_predictions)
    errors.append(rmse)

    print('For rank %s the RMSE is %s' % (rank, rmse))
    if rmse < min_error:
        min_error = rmse
        best_rank = rank
        best_model = model  # keep the winner; `model` is overwritten each iteration
print('The best model was trained with rank %s' % best_rank)

For rank 10 the RMSE is 0.303873846647
For rank 20 the RMSE is 0.300888997867
For rank 30 the RMSE is 0.300137237713
For rank 40 the RMSE is 0.300607483298
The best model was trained with rank 30


In [37]:
# Inspect raw (unfiltered) validation predictions.
# NOTE: `predictions` is whatever the rank loop assigned last, i.e. the output
# for the final rank tried, which is not necessarily the best-scoring rank.
predictions.take(200)

[Row(book_id=3918, claimed=0.0, reader_id=158169, prediction=0.0),
 Row(book_id=3918, claimed=0.0, reader_id=541408, prediction=0.0),
 Row(book_id=3918, claimed=0.0, reader_id=394665, prediction=0.018479039892554283),
 Row(book_id=3918, claimed=0.0, reader_id=254931, prediction=0.0),
 Row(book_id=3918, claimed=0.0, reader_id=106401, prediction=0.014930207282304764),
 Row(book_id=3918, claimed=0.0, reader_id=417414, prediction=0.030561737716197968),
 Row(book_id=3918, claimed=0.0, reader_id=295346, prediction=0.0066400980576872826),
 Row(book_id=3918, claimed=0.0, reader_id=523242, prediction=0.21499432623386383),
 Row(book_id=4900, claimed=0.0, reader_id=498360, prediction=0.0),
 Row(book_id=4900, claimed=0.0, reader_id=517548, prediction=0.0),
 Row(book_id=4900, claimed=0.0, reader_id=298142, prediction=0.0),
 Row(book_id=4900, claimed=0.0, reader_id=568081, prediction=-0.009341178461909294),
 Row(book_id=4900, claimed=0.0, reader_id=217096, prediction=0.0),
 Row(book_id=4900, claimed

In [36]:
# Number of validation rows left after the NaN-prediction filter (from the last rank tried).
new_predictions.count()

58160