# ALS on Movielens
- https://developers.google.com/machine-learning/recommendation/collaborative/matrix

## Init Spark 

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("ALS") \
    .getOrCreate()

## Reading Ratings and Movies

In [24]:
ratings = spark.read.parquet("/user/hive/warehouse/movielens_parquet.db/ratings/").repartition(15)

In [25]:
movies = spark.read.parquet("/user/hive/warehouse/movielens_parquet.db/movies/").repartition(15)

In [26]:
ratings.show(3)



+------+-------+------+----------+
|userid|movieid|rating| timestamp|
+------+-------+------+----------+
|  2469|    485|     3|1006233627|
|  7192|   4816|     4|1268986317|
|  6770|    282|     4| 868156873|
+------+-------+------+----------+
only showing top 3 rows



                                                                                

In [27]:
movies.show()

+-------+--------------------+----+--------------------+
|movieid|               title|year|              genres|
+-------+--------------------+----+--------------------+
| 167020|       A Fuller Life|2013|                  []|
| 191305|        Rudhramadevi|2015|     [Action, Drama]|
| 101489|Assassin's Bullet...|2012|[Action, Drama, T...|
|   8399|  We're Not Married!|1952|   [Comedy, Romance]|
| 162270|   Road of No Return|2008|[Action, Drama, T...|
|  98253|Samurai Banners (...|1969|[Action, Adventur...|
|   6560|       Loose Cannons|1990|    [Action, Comedy]|
| 177079| The Little Princess|1997|   [Children, Drama]|
| 184997|         Love, Simon|2018|     [Comedy, Drama]|
| 105805|Episode 3: Enjoy ...|2009|  [Documentary, War]|
| 183675|       Second Nature|2016|            [Comedy]|
|   5528|      One Hour Photo|2002|   [Drama, Thriller]|
| 133149|Barbie in A Merma...|2012|[Animation, Child...|
|   8545|Plain Dirty (a.k....|2003|    [Drama, Romance]|
|  30707| Million Dollar Baby|2

## Joining Ratings and Movies 

In [28]:
mr = movies.join(ratings, on="movieid").cache()

22/04/20 16:27:10 WARN CacheManager: Asked to cache already cached data.


In [29]:
mr.count()

25000095

In [30]:
mr.show(2)

+-------+--------------------+----+--------------------+------+------+----------+
|movieid|               title|year|              genres|userid|rating| timestamp|
+-------+--------------------+----+--------------------+------+------+----------+
|  26485|         Rumble Fish|1983|             [Drama]|147752|     4|1174780713|
|   4993|Lord of the Rings...|2001|[Adventure, Fantasy]|136291|     5|1540570227|
+-------+--------------------+----+--------------------+------+------+----------+
only showing top 2 rows



## Split into Train/Test 

In [31]:
(training, test) = mr.randomSplit([0.8, 0.2])

In [32]:
training.count()

                                                                                

20002148

In [33]:
test.count()

                                                                                

4997947

## ALS
- https://spark.apache.org/docs/latest/ml-collaborative-filtering.html

In [34]:
from pyspark.ml.recommendation import ALS

In [36]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=10, regParam=0.01, userCol="userid", itemCol="movieid", ratingCol="rating", coldStartStrategy="drop")
model = als.fit(training)

                                                                                

In [37]:
# Evaluate the model by computing the RMSE on the test data

from pyspark.ml.evaluation import RegressionEvaluator

predictions = model.transform(test)

evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))



Root-mean-square error = 0.8448234928775471


                                                                                

In [38]:
predictions.show(20)

                                                                                

+-------+--------------------+----+--------------------+------+------+----------+----------+
|movieid|               title|year|              genres|userid|rating| timestamp|prediction|
+-------+--------------------+----+--------------------+------+------+----------+----------+
|    480|       Jurassic Park|1993|[Action, Adventur...|    31|     2|1256229392| 2.2547963|
|    588|             Aladdin|1992|[Adventure, Anima...|    31|     2|1256229408| 2.3385558|
|    647|  Courage Under Fire|1996|[Action, Crime, D...|    31|     2|1256225436| 2.2291985|
|   1917|          Armageddon|1998|[Action, Romance,...|    31|     2|1256226586| 1.6885306|
|   2791|           Airplane!|1980|            [Comedy]|    31|     3|1256226140| 2.3572118|
|   2797|                 Big|1988|[Comedy, Drama, F...|    31|     3|1256230075|  2.637747|
|   3206|    Against All Odds|1984|           [Romance]|    31|     2|1256226098|  1.833347|
|   8120|         29th Street|1991|     [Comedy, Drama]|    31|     2|

In [39]:
# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)

In [40]:
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)

In [41]:
# Generate top 10 movie recommendations for a specified set of users
# users = ratings.select(als.getUserCol()).distinct().limit(3)
# userSubsetRecs = model.recommendForUserSubset(users, 10)

In [42]:
# Generate top 10 user recommendations for a specified set of movies
# movies = ratings.select(als.getItemCol()).distinct().limit(3)
# movieSubSetRecs = model.recommendForItemSubset(movies, 10)

## Own Movie Recommender

In [164]:
my_ratings = spark.createDataFrame([
    (999999, 589, 5),
    (999999, 4011, 5),
    (999999, 63992, 1),
    (999999, 59315, 4),
    (999999, 2571, 5),
    (999999, 6365, 4),
]).toDF("userid", "movieid", "rating")

In [165]:
my_ratings.show()

+------+-------+------+
|userid|movieid|rating|
+------+-------+------+
|999999|    589|     5|
|999999|   4011|     5|
|999999|  63992|     1|
|999999|  59315|     4|
|999999|   2571|     5|
|999999|   6365|     4|
+------+-------+------+



In [166]:
training_2 = training.select("userid", "movieid", "rating").union(my_ratings)

In [167]:
training_2.where("userid = 999999").show()

                                                                                

+------+-------+------+
|userid|movieid|rating|
+------+-------+------+
|999999|    589|     5|
|999999|   4011|     5|
|999999|  63992|     1|
|999999|  59315|     4|
|999999|   2571|     5|
|999999|   6365|     4|
+------+-------+------+



In [168]:
model_2 = als.fit(training_2)

                                                                                

In [169]:
my_user = training_2.select("userid").where("userid = 999999")

In [170]:
userSubsetRecs = model_2.recommendForUserSubset(my_user, 10)



In [171]:
userSubsetRecs.show()



+------+--------------------+
|userid|     recommendations|
+------+--------------------+
|999999|[{5271, 14.730714...|
+------+--------------------+



                                                                                

In [172]:
from pyspark.sql.functions import col

recommendations = userSubsetRecs \
.selectExpr("explode(recommendations)") \
.select(col("col.movieid")) \
.cache()

In [173]:
recommendations.show(10, False)

                                                                                

+-------+
|movieid|
+-------+
|5271   |
|117450 |
|160519 |
|113692 |
|187753 |
|172165 |
|178049 |
|132486 |
|173393 |
|167106 |
+-------+



In [174]:
movies.join(recommendations, on="movieid").show(10, False)

+-------+---------------------------------------+----+-------------------------+
|movieid|title                                  |year|genres                   |
+-------+---------------------------------------+----+-------------------------+
|187753 |Pope Francis: A Man of His Word        |2018|[Documentary]            |
|113692 |Happy New Year (La Bonne Année)        |1973|[Comedy, Crime, Drama]   |
|160519 |Curious George: A Very Monkey Christmas|2009|[Animation, Children]    |
|5271   |30 Years to Life                       |2001|[Comedy, Drama, Romance] |
|178049 |Happy-Go-Lucky                         |1972|[Comedy, Drama]          |
|117450 |Exit                                   |2011|[Drama]                  |
|172165 |Mister Designer                        |1988|[Drama, Fantasy, Mystery]|
|132486 |The Liberator                          |2013|[Drama]                  |
|173393 |Three Sisters                          |1970|[]                       |
|167106 |Breaking a Monster 

## stopping the context

In [None]:
spark.stop()