In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from modules.my_pyspark import *

In [3]:
spark = MyPySpark(session=True)

# 1. Đọc dữ liệu

In [4]:
data_path = r"./data/movielens_ratings.csv"

In [5]:
data = spark.readFile(data_path, 'csv')

In [6]:
data.show(5, False)

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|2      |3.0   |0     |
|3      |1.0   |0     |
|5      |2.0   |0     |
|9      |4.0   |0     |
|11     |1.0   |0     |
+-------+------+------+
only showing top 5 rows



* Distinct `users` và `movies`

In [7]:
users = data.select('userId').distinct().count()
movies = data.select('movieId').distinct().count()
numerator = data.count()

In [8]:
numerator, users, movies

(1501, 30, 100)

# 2. Chuẩn hóa dữ liệu, chuyển đổi dữ liệu
# 3. Chia dữ liệu train/test

In [9]:
training, test = data.randomSplit((.8, .2))

# 4. Xây dựng model

In [10]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [11]:
als = ALS(maxIter=5, regParam=0.01, userCol='userId', itemCol='movieId', ratingCol='rating')
model = als.fit(training)

<hr>

In [12]:
als_1 = ALS(maxIter=10, regParam=0.1, alpha=0.001, userCol='userId', itemCol='movieId', ratingCol='rating')
model_1 = als_1.fit(training)

# 5. Đánh giá kết quả

In [13]:
predictions = model.transform(test)

In [14]:
predictions.show(5, False)

+-------+------+------+----------+
|movieId|rating|userId|prediction|
+-------+------+------+----------+
|31     |1.0   |19    |0.84917116|
|31     |3.0   |7     |3.5913668 |
|31     |1.0   |29    |-0.7865362|
|31     |1.0   |0     |1.1094093 |
|85     |1.0   |23    |-2.4602509|
+-------+------+------+----------+
only showing top 5 rows



In [15]:
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')
rmse = evaluator.evaluate(predictions)

In [16]:
rmse

1.9136859893240186

> **Nhận xét**
> * RMSE quá cao, `rating` có range là từ $[1, 5]$ mà RMSE là 1.81 là quá cao.

<hr>

In [17]:
predictions_1 = model_1.transform(test)

In [18]:
predictions_1.show(5, False)

+-------+------+------+----------+
|movieId|rating|userId|prediction|
+-------+------+------+----------+
|31     |1.0   |19    |1.0588802 |
|31     |3.0   |7     |1.9613568 |
|31     |1.0   |29    |1.3128003 |
|31     |1.0   |0     |1.7009472 |
|85     |1.0   |23    |0.6054102 |
+-------+------+------+----------+
only showing top 5 rows



In [19]:
evaluator_1 = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')
rmse_1 = evaluator_1.evaluate(predictions_1)

In [20]:
rmse_1

0.8934605900007409

# 6. Dự đoán

In [21]:
userId = 27
single_user = test.filter(test['userId'] == userId).select(['movieId', 'userId'])

In [22]:
single_user.show()

+-------+------+
|movieId|userId|
+-------+------+
|      9|    27|
|     19|    27|
|     44|    27|
|     52|    27|
|     61|    27|
|     66|    27|
|     72|    27|
|     75|    27|
|     83|    27|
+-------+------+



In [23]:
recommendations = model.transform(single_user)

In [24]:
# prediction
recommendations.orderBy('prediction', ascending=False).show()

+-------+------+-----------+
|movieId|userId| prediction|
+-------+------+-----------+
|      9|    27|  2.4293168|
|     83|    27|  1.6857698|
|     72|    27|  1.4565545|
|     44|    27|  1.4292258|
|     66|    27|  1.1104711|
|     19|    27| 0.96073085|
|     61|    27|-0.23743144|
|     75|    27|  -0.265545|
|     52|    27| -1.1673692|
+-------+------+-----------+



# 7. Đưa ra đề xuất cho tất cả user

In [25]:
user_recs = model_1.recommendForAllUsers(3)

for user in user_recs.head(2):
    print(user)
    print('\n')

Row(userId=28, recommendations=[Row(movieId=89, rating=3.3902018070220947), Row(movieId=2, rating=3.281651020050049), Row(movieId=92, rating=3.0954973697662354)])


Row(userId=26, recommendations=[Row(movieId=22, rating=4.822081089019775), Row(movieId=32, rating=4.730332851409912), Row(movieId=51, rating=4.602514266967773)])




# 8. Đưa ra đề xuất cho một user

In [26]:
userId = 27
user_recs.filter(user_recs['userId'] == userId).show(truncate=False)

+------+-------------------------------------------------+
|userId|recommendations                                  |
+------+-------------------------------------------------+
|27    |[{18, 3.2424033}, {32, 2.882874}, {30, 2.806702}]|
+------+-------------------------------------------------+

