# ALS on MovieLens
- https://developers.google.com/machine-learning/recommendation/collaborative/matrix

## Init Spark 

In [None]:
# findspark.init() puts the local Spark installation's pyspark on sys.path,
# so it has to run before the pyspark import below.
import findspark

findspark.init()

from pyspark.sql import SparkSession

# Reuse the active session if one exists; otherwise start one named "ALS".
spark = (
    SparkSession.builder
    .appName("ALS")
    .getOrCreate()
)

## Reading Ratings and Movies

In [None]:
# Load the ratings table from the warehouse and redistribute it over 15 partitions.
ratings = (
    spark.read
    .parquet("/user/hive/warehouse/movielens_parquet.db/ratings/")
    .repartition(15)
)

In [None]:
# Load the movies table from the warehouse and redistribute it over 15 partitions.
movies = (
    spark.read
    .parquet("/user/hive/warehouse/movielens_parquet.db/movies/")
    .repartition(15)
)

In [None]:
# Preview the first 3 rating rows to sanity-check the load.
ratings.show(3)

In [None]:
# Preview the movies table (show() prints 20 rows by default).
movies.show()

## Joining Ratings and Movies 

In [None]:
# Attach movie metadata to every rating (inner join on the shared movieid key).
# The result is cached because it is counted, previewed and split below.
mr = movies.join(ratings, on="movieid", how="inner").cache()

In [None]:
# Number of joined rows. cache() is lazy, so this first action on `mr` is also
# what actually populates the cache.
mr.count()

In [None]:
# Preview 2 joined rows to check the combined schema.
mr.show(2)

## Split into Train/Test 

In [None]:
# Hold out roughly 20% of the joined ratings for evaluation.
# A fixed seed keeps the split, and every metric derived from it, repeatable
# between notebook runs (given unchanged input data); without it PySpark draws
# a fresh random seed on each call.
(training, test) = mr.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Size of the training split (expected to be roughly 80% of mr).
training.count()

In [None]:
# Size of the held-out test split (expected to be roughly 20% of mr).
test.count()

## ALS
- https://spark.apache.org/docs/latest/ml-collaborative-filtering.html

In [None]:
from pyspark.ml.recommendation import ALS

In [None]:
# Configure the ALS estimator and fit it on the training split.
# coldStartStrategy="drop" removes predictions for users/movies that were not
# seen during fitting, so the evaluation metric cannot become NaN.
als = ALS(
    userCol="userid",
    itemCol="movieid",
    ratingCol="rating",
    maxIter=10,
    regParam=0.01,
    coldStartStrategy="drop",
)
model = als.fit(training)

In [None]:
# Score the held-out ratings with the fitted model and report the RMSE.

from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)

# transform() appends a "prediction" column to the test rows.
predictions = model.transform(test)

rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")

In [None]:
# Inspect 20 test rows side by side with their predicted rating.
predictions.show(20)

In [None]:
# Top 10 recommended movies for every user known to the model.
userRecs = model.recommendForAllUsers(numItems=10)

In [None]:
# Top 10 recommended users for every movie known to the model.
movieRecs = model.recommendForAllItems(numUsers=10)

In [None]:
# Generate top 10 movie recommendations for a specified set of users
# users = ratings.select(als.getUserCol()).distinct().limit(3)
# userSubsetRecs = model.recommendForUserSubset(users, 10)

In [None]:
# Generate top 10 user recommendations for a specified set of movies
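# (the subset is named `movieSubset`, not `movies`, so enabling this cell does
#  not overwrite the movies DataFrame that later cells still join against)
# movieSubset = ratings.select(als.getItemCol()).distinct().limit(3)
# movieSubSetRecs = model.recommendForItemSubset(movieSubset, 10)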

## Own Movie Recommender

In [None]:
# Hand-entered ratings for a synthetic user (id 999999), as
# (userid, movieid, rating) rows. Column types are inferred from the values.
my_ratings = spark.createDataFrame(
    [
        (999999, 589, 5),
        (999999, 4011, 5),
        (999999, 63992, 1),
        (999999, 59315, 4),
        (999999, 2571, 5),
        (999999, 6365, 4),
    ],
    ["userid", "movieid", "rating"],
)

In [None]:
# Display the hand-entered ratings.
my_ratings.show()

In [None]:
# Append the custom user's ratings to the training split.
# unionByName matches columns by name instead of by position, so the rows stay
# correct even if the column order of either side changes later.
training_2 = (
    training
    .select("userid", "movieid", "rating")
    .unionByName(my_ratings)
)

In [None]:
# Confirm the custom user's rows made it into the augmented training set.
training_2.where("userid = 999999").show()

In [None]:
# Refit the same ALS estimator on the augmented training set so the model
# also learns factors for the custom user.
model_2 = als.fit(training_2)

In [None]:
# DataFrame of user ids to request recommendations for.
# The selection yields one id per rating row, so distinct() collapses the
# duplicates into a single row, matching the usage shown in the Spark docs.
my_user = (
    training_2
    .select("userid")
    .where("userid = 999999")
    .distinct()
)

In [None]:
# Top 10 movie recommendations for the user ids in my_user.
userSubsetRecs = model_2.recommendForUserSubset(my_user, numItems=10)

In [None]:
# Raw output: one row per user with an array of recommendation structs.
userSubsetRecs.show()

In [None]:
from pyspark.sql.functions import col

# Flatten the per-user array of recommendation structs into one row per
# recommended movie. The predicted rating is kept next to movieid so that
# later steps can still rank or filter the recommendations by score.
recommendations = (
    userSubsetRecs
    .selectExpr("explode(recommendations) AS rec")
    .select(col("rec.movieid"), col("rec.rating"))
    .cache()
)

In [None]:
# Show up to 10 flattened recommendations; False disables column truncation.
recommendations.show(10, False)

In [None]:
# Look up the movie metadata for each recommended movieid and print it in full.
# NOTE(review): a join does not guarantee row order, so this listing is
# presumably not in recommendation-rank order.
recommended_movies = movies.join(recommendations, on="movieid")
recommended_movies.show(10, False)

## Stopping the Spark Session

In [None]:
# Shut down the Spark session; DataFrames created above are unusable afterwards.
spark.stop()