## Collaborative filtering

* #### Khởi tạo (Initialization)

In [5]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

# Build (or reuse) a local Spark session: master "local[*]", 6g of driver
# memory, and a fixed application name.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "6g")
    .appName("movieRecommendationPySpark")
    .getOrCreate()
)

* #### Load dữ liệu (Load data)

In [6]:
# Read the ratings CSV with an explicit schema, keep only the
# (userId, movieId, rating) columns -- the timestamp column is dropped --
# and mark the result for caching because it is reused by later cells.
ratings = (
    spark.read
    .csv(
        path="../data/ml-25m/ratings.csv",
        schema="userId INT, movieId INT, rating DOUBLE, timestamp INT",
        header=True,
        sep=",",
        quote='"',
    )
    .select("userId", "movieId", "rating")
    .cache()
)

In [None]:
# Peek at the first 10 ratings without truncating cell contents.
ratings.show(n=10, truncate=False)

In [None]:
# Print Spark's summary statistics for the three selected rating columns.
ratings.summary().show()

* #### Training

In [7]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [8]:
# Baseline ALS recommender with default hyper-parameters.
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    # Users/movies that appear only in the validation split have no learned
    # factors. The default strategy ("nan") emits NaN predictions for them;
    # "drop" removes those rows so downstream metrics are well defined.
    coldStartStrategy="drop",
    # Fixed seed: ALS initialises its factor matrices randomly.
    seed=42,
)
# randomSplit normalises the weights, so [8.0, 2.0] is an 80/20 split.
# The seed makes the split repeatable across "Restart Kernel -> Run All".
(training_data, validation_data) = ratings.randomSplit([8.0, 2.0], seed=42)

# RMSE between the true "rating" column and the model's "prediction" column.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)

# Fit on the training split, then score the held-out validation split.
model = als.fit(training_data)
predictions = model.transform(validation_data)

In [9]:
# Persist the baseline model. A plain save() raises if the target directory
# already exists, which breaks every re-run of the notebook; overwrite()
# makes this cell idempotent.
model.write().overwrite().save("modelRecNormal")

In [10]:
# Show the first 10 validation rows alongside their predicted rating.
predictions.show(n=10, truncate=False)

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|31563 |104841 |3.5   |3.4680753 |
|31563 |96966  |4.0   |4.109942  |
|31563 |106452 |3.0   |3.9050436 |
|31563 |109487 |5.0   |3.8087015 |
|31563 |111759 |3.5   |3.5323467 |
|31563 |109374 |5.0   |4.07057   |
|31563 |103539 |3.0   |3.424638  |
|31563 |106766 |5.0   |3.7999783 |
|31563 |101525 |3.5   |3.7429621 |
|31563 |111364 |3.5   |2.6357195 |
+------+-------+------+----------+
only showing top 10 rows



In [11]:
# Rows containing a null/NaN value (e.g. a NaN prediction) are removed before
# computing the RMSE so the metric itself is not NaN.
rmse = evaluator.evaluate(predictions.dropna())
print(rmse)

0.8046279138022526


In [None]:
# userFactors=model.userFactors
# itemFactors = model.itemFactors
# userFactors.sort('id').show(5,False)
# itemFactors.sort('id').show(5,False)
# import numpy as np
# user91Features = model.userFactors.filter(f.col('id')==91).select(f.col('features')).rdd.flatMap(lambda x:x).collect()[0]
# item471Features = model.itemFactors.filter(f.col('id')==471).select(f.col('features')).rdd.flatMap(lambda x:x).collect()[0]

# print(user91Features)
# print(item471Features)
# print('Predicted rating of user 91 for movie 471: ' + str(np.dot(user91Features, item471Features)))

* #### Train best model

In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

# Build (or reuse) a local Spark session: master "local[*]", 6g of driver
# memory, and a fixed application name.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "6g")
    .appName("movieRecommendationPySpark")
    .getOrCreate()
)

# Register a checkpoint directory on the underlying SparkContext
# (presumably so the 20-iteration ALS runs below can checkpoint -- confirm).
sc = spark.sparkContext
sc.setCheckpointDir("checkpoint")

# Read the ratings CSV with an explicit schema, keep only the
# (userId, movieId, rating) columns -- the timestamp column is dropped --
# and mark the result for caching because cross-validation reuses it.
ratings = (
    spark.read
    .csv(
        path="../data/ml-25m/ratings.csv",
        schema="userId INT, movieId INT, rating DOUBLE, timestamp INT",
        header=True,
        sep=",",
        quote='"',
    )
    .select("userId", "movieId", "rating")
    .cache()
)

# ALS estimator that the cross-validator below tunes.
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    # Random fold splits routinely put users/movies in the evaluation fold
    # that are absent from the training fold. With the default strategy
    # ("nan") their predictions are NaN, the fold RMSE becomes NaN, and the
    # cross-validator can no longer rank the candidates. "drop" removes
    # those rows so every fold metric is a real number.
    coldStartStrategy="drop",
    # Fixed seed: ALS initialises its factor matrices randomly.
    seed=42,
)
# randomSplit normalises the weights, so [8.0, 2.0] is an 80/20 split.
# The seed makes the split repeatable across "Restart Kernel -> Run All".
(training_data, validation_data) = ratings.randomSplit([8.0, 2.0], seed=42)

# Hyper-parameter grid: 2 ranks x 1 maxIter x 2 regParams = 4 candidates.
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [5, 10])
    .addGrid(als.maxIter, [20])
    .addGrid(als.regParam, [0.05, 0.1])
    .build()
)
# Score each candidate by RMSE between the true "rating" column and the
# model's "prediction" column.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)
# 5-fold cross-validation over every combination in param_grid. The seed
# pins the fold assignment so repeated runs compare candidates on the same
# folds instead of a fresh random partition each time.
cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=5,
    seed=42,
)

In [2]:
# Run the cross-validated grid search on the training split only; the
# validation split stays untouched for the final evaluation.
model_cv = cv.fit(training_data)
# Keep the model the cross-validator selected as best.
best_model = model_cv.bestModel

In [3]:
# Persist the tuned model. A plain save() raises if the target directory
# already exists, which breaks every re-run of the notebook; overwrite()
# makes this cell idempotent.
best_model.write().overwrite().save("modelRecBest")

In [4]:
# Score the selected model on the held-out validation split. Rows containing
# a null/NaN value (e.g. a NaN prediction) are removed before computing RMSE.
predictions = best_model.transform(validation_data)
rmse = evaluator.evaluate(predictions.dropna())
print(rmse)

0.8044351138766392
