In [1]:
# Import Spark library for python
import findspark
findspark.init("/usr/local/spark")
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Python Spark SQL basic example").getOrCreate()

In [10]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
from pyspark.sql import Row

In [4]:
df2 = spark.read.csv("dataset/ratings.csv", header=True, inferSchema=True).limit(1000000)

In [None]:
df2.show()


In [12]:
(training, test) = df2.randomSplit([0.8, 0.2])

In [13]:
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop", nonnegative = True)
model = als.fit(training)

In [15]:
param_grid = ParamGridBuilder().addGrid(als.rank, [12,13,14]).addGrid(als.maxIter,[18,19,20]).addGrid(als.regParam,[.17,.18,.19]).build()

In [16]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")


In [17]:
tvs = TrainValidationSplit(
        estimator=als,
        estimatorParamMaps=param_grid,
        evaluator=evaluator)

In [18]:
model=tvs.fit(training)

In [19]:
best_model = model.bestModel

In [21]:
predictions = best_model.transform(test)
rmse=evaluator.evaluate(predictions)

In [8]:
userRecs = model.recommendForAllUsers(10)

In [22]:
print("RMSE = "+str(rmse))
print("Best Model --")
print("Rank : ", best_model.rank)
print("MaxIter : ", best_model._java_obj.parent().getMaxIter())
print("RegParam : ", best_model._java_obj.parent().getRegParam())

RMSE = 0.8484780282557481
Best Model --
Rank :  12
MaxIter :  20
RegParam :  0.17


In [9]:
userRecs.show(truncate=False)


+------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                                                                                                                               |
+------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1580  |[[118202, 3.8900042], [27246, 3.7186809], [47525, 3.634705], [3913, 3.5426269], [133333, 3.5249383], [4013, 3.4884882], [162824, 3.4520178], [71322, 3.4090197], [31856, 3.4090197], [56869, 3.4048421]]      |
|4900  |[[171603, 6.048735], [158611, 5.9433565], [72334, 5.9433565], [158603, 5.9433565], [158607, 5.9433565], [106212, 5.9433565], [14

In [None]:
display(predictions.sort("userId","rating"))