In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('recommender').getOrCreate()

In [3]:
movie_lens = spark.read.csv('../data/movielens_ratings.csv', inferSchema=True, header=True)
movie_lens.printSchema()
movie_lens.describe().show()
movie_lens.show(10)

root
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- userId: integer (nullable = true)

+-------+------------------+------------------+------------------+
|summary|           movieId|            rating|            userId|
+-------+------------------+------------------+------------------+
|  count|              1501|              1501|              1501|
|   mean| 49.40572951365756|1.7741505662891406|14.383744170552964|
| stddev|28.937034065088994| 1.187276166124803| 8.591040424293272|
|    min|                 0|               1.0|                 0|
|    max|                99|               5.0|                29|
+-------+------------------+------------------+------------------+

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      2|   3.0|     0|
|      3|   1.0|     0|
|      5|   2.0|     0|
|      9|   4.0|     0|
|     11|   1.0|     0|
|     12|   2.0|     0|
|     15|   1.0|     0|
|     17|   1.0|     0|
|     19|

In [4]:
train_data, test_data = movie_lens.randomSplit([.8, .2], seed=42)

In [5]:
from pyspark.ml.recommendation import ALS

In [6]:
als = ALS(
    maxIter=5, regParam=0.01, userCol='userId',
    itemCol='movieId', ratingCol='rating',
    coldStartStrategy='drop'
)

In [7]:
als_model = als.fit(train_data)

In [8]:
predictions = als_model.transform(test_data)

In [9]:
predictions.show()

+-------+------+------+------------+
|movieId|rating|userId|  prediction|
+-------+------+------+------------+
|     31|   1.0|    27|  0.78017753|
|     31|   1.0|     5|  -0.1638715|
|     31|   1.0|    19|   0.7568228|
|     31|   3.0|    14|  0.90974927|
|     31|   1.0|     0|   1.2369385|
|     85|   3.0|     6|   2.5026789|
|     85|   4.0|     7|   2.4807098|
|     53|   1.0|    12|   4.3190637|
|     78|   1.0|    28|  0.67938143|
|     34|   1.0|    16|    1.684705|
|     34|   1.0|    15|    1.258081|
|     34|   1.0|     0|   1.5491468|
|     28|   3.0|     1|   1.0931132|
|     28|   1.0|     5|-0.102508605|
|     28|   1.0|     2|   3.0077848|
|     76|   1.0|     1|   1.4828697|
|     76|   1.0|    19|   3.2280333|
|     76|   3.0|     7|   0.6411397|
|     76|   1.0|    25|   1.6725394|
|     76|   1.0|     2|   2.8872871|
+-------+------+------+------------+
only showing top 20 rows



In [10]:
from pyspark.ml.evaluation import RegressionEvaluator

In [11]:
evaluator = RegressionEvaluator(
    metricName='rmse', labelCol='rating', predictionCol='prediction'
)

In [12]:
rmse = evaluator.evaluate(predictions)
print(f'Root-mean-square error = {rmse}')

Root-mean-square error = 1.9788846361649453


In [13]:
# Generate top 5 movie recommendations for each user
user_recs = als_model.recommendForAllUsers(5)
# Generate top 5 user recommendations for each movie
movie_recs = als_model.recommendForAllItems(5)

In [14]:
user_recs.show(5, truncate=False)
movie_recs.show(5, truncate=False)

+------+------------------------------------------------------------------------------------+
|userId|recommendations                                                                     |
+------+------------------------------------------------------------------------------------+
|28    |[{64, 6.7887263}, {28, 5.6397724}, {17, 5.4291916}, {35, 5.180738}, {12, 5.1007643}]|
|26    |[{75, 5.350701}, {94, 5.234352}, {22, 5.112195}, {7, 5.0050864}, {77, 4.9674935}]   |
|27    |[{7, 4.195648}, {76, 3.9279258}, {18, 3.6610832}, {36, 3.2760515}, {32, 3.2433596}] |
|12    |[{76, 5.6571283}, {29, 5.3348837}, {17, 5.1697564}, {64, 4.791089}, {28, 4.734739}] |
|22    |[{39, 6.8509345}, {76, 5.9521003}, {53, 5.894269}, {28, 5.478248}, {83, 5.2109528}] |
+------+------------------------------------------------------------------------------------+
only showing top 5 rows

+-------+-----------------------------------------------------------------------------------+
|movieId|recommendations           

In [15]:
# Generate top 5 movie recommendations for a specified set of users
users = test_data.select(als.getUserCol()).distinct().limit(3)
user_subset_recs = als_model.recommendForUserSubset(users, 5)
# Generate top 5 user recommendations for a specified set of movies
movies = test_data.select(als.getItemCol()).distinct().limit(3)
movie_subset_recs = als_model.recommendForItemSubset(movies, 5)

In [16]:
user_subset_recs.show(5, truncate=False)
movie_subset_recs.show(5, truncate=False)

+------+------------------------------------------------------------------------------------+
|userId|recommendations                                                                     |
+------+------------------------------------------------------------------------------------+
|28    |[{64, 6.7887263}, {28, 5.6397724}, {17, 5.4291916}, {35, 5.180738}, {12, 5.1007643}]|
|26    |[{75, 5.350701}, {94, 5.234352}, {22, 5.112195}, {7, 5.0050864}, {77, 4.9674935}]   |
|27    |[{7, 4.195648}, {76, 3.9279258}, {18, 3.6610832}, {36, 3.2760515}, {32, 3.2433596}] |
+------+------------------------------------------------------------------------------------+

+-------+--------------------------------------------------------------------------------+
|movieId|recommendations                                                                 |
+-------+--------------------------------------------------------------------------------+
|31     |[{12, 4.078867}, {28, 3.161229}, {8, 3.0751467}, {7, 2.6423