## Collaborative filtering

* #### Khởi tạo (Initialization)

In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

# Create (or reuse) the Spark session for this notebook: local master,
# 8g of driver memory, and a fixed application name.
spark = (
    SparkSession.builder
    .master('local[*]')
    .config("spark.driver.memory", "8g")
    .appName('movieRecommendationPySpark')
    .getOrCreate()
)

# Register a checkpoint directory (relative to the working directory) on the
# underlying SparkContext.
# NOTE(review): presumably needed for ALS's periodic checkpointing - confirm.
sc = spark.sparkContext
sc.setCheckpointDir('checkpoint')

* #### Load dữ liệu (Load data)

In [2]:
# Read the ratings CSV with an explicit schema, keep only the three columns
# used downstream (the timestamp is parsed but dropped), and cache the result
# because it is reused by several later cells.
ratings = (
    spark.read
    .csv(
        "../data/ml-25m/ratings.csv",
        schema="userId INT, movieId INT, rating DOUBLE, timestamp INT",
        header=True,
        sep=",",
        quote='"',
    )
    .select("userId", "movieId", "rating")
    .cache()
)

In [3]:
# Preview the first 10 ratings without truncating cell contents.
ratings.show(n=10, truncate=False)

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|1     |296    |5.0   |
|1     |306    |3.5   |
|1     |307    |5.0   |
|1     |665    |5.0   |
|1     |899    |3.5   |
|1     |1088   |4.0   |
|1     |1175   |3.5   |
|1     |1217   |3.5   |
|1     |1237   |5.0   |
|1     |1250   |4.0   |
+------+-------+------+
only showing top 10 rows



In [4]:
# Print descriptive statistics (count, mean, stddev, min, quartiles, max)
# for every column of the ratings frame.
ratings.summary().show()

+-------+-----------------+------------------+------------------+
|summary|           userId|           movieId|            rating|
+-------+-----------------+------------------+------------------+
|  count|         25000095|          25000095|          25000095|
|   mean|81189.28115381162|21387.981943268616| 3.533854451353085|
| stddev|46791.71589745555| 39198.86210105983|1.0607439611423475|
|    min|                1|                 1|               0.5|
|    25%|            40510|              1197|               3.0|
|    50%|            80906|              2947|               3.5|
|    75%|           121545|              8623|               4.0|
|    max|           162541|            209171|               5.0|
+-------+-----------------+------------------+------------------+



* #### Training

In [5]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator

In [6]:
# Baseline ALS estimator. Every hyperparameter except the seed is left at its
# library default; the seed is fixed so the factor initialisation is
# repeatable between runs.
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    seed=42,
)

# 80/20 hold-out split (Spark normalises the weights). The seed is fixed so
# that Restart & Run All yields the same partitions and a comparable RMSE.
(training_data, validation_data) = ratings.randomSplit([8.0, 2.0], seed=42)

# RMSE between the true `rating` column and the model's `prediction` column.
evaluator = RegressionEvaluator(
    metricName="rmse", labelCol="rating", predictionCol="prediction"
)

# Fit the baseline model and score the hold-out set.
model = als.fit(training_data)
predictions = model.transform(validation_data)

In [7]:
# Persist the baseline model. A plain save() raises if the target directory
# already exists, which breaks every re-run of the notebook; overwrite instead.
model.write().overwrite().save("modelRecNormal")

In [8]:
# Preview the first 10 hold-out predictions without truncating cell contents.
predictions.show(n=10, truncate=False)

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|31563 |107406 |3.5   |3.4534233 |
|31563 |96966  |4.0   |4.1158876 |
|31563 |103253 |3.0   |2.9561546 |
|31563 |106452 |3.0   |3.9261878 |
|31563 |96821  |2.0   |3.7020555 |
|31563 |108727 |4.5   |3.698239  |
|31563 |105246 |4.0   |3.5068374 |
|31563 |104923 |3.5   |3.865512  |
|31563 |107771 |4.0   |3.8892705 |
|31563 |99145  |3.5   |3.3802044 |
+------+-------+------+----------+
only showing top 10 rows



In [9]:
# Rows containing a null/NaN value (e.g. a missing prediction) are removed
# before scoring so they cannot turn the RMSE into NaN.
rmse = evaluator.evaluate(predictions.dropna())

In [10]:
# Report the hold-out RMSE together with the baseline model's hyperparameters.
# MaxIter and RegParam are read through the JVM-side parent of the fitted
# model (private `_java_obj` handle).
base_parent = model._java_obj.parent()
print("*Base Model*")
print(f"RMSE: {rmse}")
print(f" Rank: {model.rank}")
print(f" MaxIter: {base_parent.getMaxIter()}")
print(f" RegParam: {base_parent.getRegParam()}")

*Base Model*
RMSE: 0.8068647889211222
 Rank: 10
 MaxIter: 10
 RegParam: 0.1


In [11]:
# Disabled sanity check, kept for reference: inspects the learned user/item
# factor tables and reproduces one prediction by hand as the dot product of
# the factor vectors for user 91 and movie 471.
# userFactors=model.userFactors
# itemFactors = model.itemFactors
# userFactors.sort('id').show(5,False)
# itemFactors.sort('id').show(5,False)
# import numpy as np
# user91Features = model.userFactors.filter(f.col('id')==91).select(f.col('features')).rdd.flatMap(lambda x:x).collect()[0]
# item471Features = model.itemFactors.filter(f.col('id')==471).select(f.col('features')).rdd.flatMap(lambda x:x).collect()[0]

# print(user91Features)
# print(item471Features)
# print('Predicted rating of user 91 for movie 471: ' + str(np.dot(user91Features, item471Features)))

* #### Train best model

In [12]:
# Needs a powerful machine to train:
# param_grid = ParamGridBuilder()\
#     .addGrid(als.rank, [5, 40, 80, 120])\
#     .addGrid(als.maxIter, [5, 100, 250, 500])\
#     .addGrid(als.regParam, [0.05, 0.1, 1.5])\
#     .build()

# Hyperparameter values to try for each candidate model.
# coldStartStrategy is pinned to "drop" in every param map: with the default
# ("nan"), users/movies that are absent from a training fold get NaN
# predictions, the fold RMSE becomes NaN, and CrossValidator can no longer
# rank the candidates in a meaningful way.
param_grid = (
    ParamGridBuilder()
    .baseOn({als.coldStartStrategy: "drop"})
    .addGrid(als.rank, [5, 10])
    .addGrid(als.maxIter, [20])
    .addGrid(als.regParam, [0.05, 0.1])
    .build()
)

# 5-fold cross validation over the grid, scored with the RMSE evaluator.
# The seed fixes the fold assignment so the selection is repeatable.
cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=5,
    seed=42,
)


In [13]:
# Fit every parameter combination with cross validation on the training split
# (the hold-out validation split is not used here).
model_cv = cv.fit(training_data)
# Keep the model that CrossValidator selected as best according to the evaluator.
best_model = model_cv.bestModel

In [14]:
# Persist the selected model. A plain save() raises if the target directory
# already exists, which breaks every re-run of the notebook; overwrite instead.
best_model.write().overwrite().save("modelRecBest")

In [15]:
# Score the hold-out split with the cross-validated model. Rows containing a
# null/NaN value (e.g. a missing prediction) are excluded from the RMSE.
predictions = best_model.transform(validation_data)
rmse = evaluator.evaluate(predictions.dropna())

In [16]:
# Print evaluation metrics and model parameters
print ("*Best Model*")
print (f"RMSE: {rmse}")
print (f" Rank: {best_model.rank}")
print (f" MaxIter: {best_model._java_obj.parent().getMaxIter()}")
print (f" RegParam: {best_model._java_obj.parent().getRegParam()}") 

*Best Model*
RMSE: 0.8024075892997405
 Rank: 5
 MaxIter: 20
 RegParam: 0.05
