![](./images/09_04.jpg)

# 1. Đọc dữ liệu

In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('Recommendation_Beauty').getOrCreate()

In [3]:
data = spark.read.json("../../local_data/Beauty_5.json")

In [4]:
data.show(5)

+----------+-------+-------+--------------------+-----------+--------------+------------+--------------------+--------------+
|      asin|helpful|overall|          reviewText| reviewTime|    reviewerID|reviewerName|             summary|unixReviewTime|
+----------+-------+-------+--------------------+-----------+--------------+------------+--------------------+--------------+
|7806397051| [3, 4]|    1.0|Very oily and cre...|01 30, 2014|A1YJEY40YUW4SE|      Andrea|Don't waste your ...|    1391040000|
|7806397051| [1, 1]|    3.0|This palette was ...|04 18, 2014| A60XNB876KYML|  Jessica H.|         OK Palette!|    1397779200|
|7806397051| [0, 1]|    4.0|The texture of th...| 09 6, 2013|A3G6XNM240RMWA|       Karen|       great quality|    1378425600|
|7806397051| [2, 2]|    2.0|I really can't te...| 12 8, 2013|A1PQFP6SAJ6D80|       Norah|Do not work on my...|    1386460800|
|7806397051| [0, 0]|    3.0|It was a little s...|10 19, 2013|A38FVHZTNQ271F|   Nova Amor|          It's okay.|    1382

In [5]:
data_sub = data.select(['asin', 'overall', 'reviewerID'])

In [6]:
data_sub.count()

198502

# 2. Chuẩn dữ liệu

In [7]:
from pyspark.sql.functions import col, udf, isnan, when, count

In [8]:
data_sub.select([count(when(col(c).isNull(), c)).alias(c) for c in data_sub.columns]).toPandas().T

Unnamed: 0,0
asin,0
overall,0
reviewerID,0


In [9]:
users = data_sub.select('reviewerID').distinct().count()
products = data_sub.select('asin').distinct().count()
numerator = data_sub.count()

In [10]:
users, products, numerator

(22363, 12101, 198502)

* Tính toán độ thưa thớt của ma trận

In [11]:
denominator = users * products

In [12]:
denominator

270614663

In [13]:
sparsity = 1 - (numerator / denominator)

In [14]:
sparsity

0.9992664772935825

## 2.1. Indexer cho `data_sub`

In [15]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

In [16]:
indexer = StringIndexer(inputCol='asin', outputCol='asin_idx')

In [17]:
indexer_model = indexer.fit(data_sub)

In [18]:
data_indexed = indexer_model.transform(data_sub)

In [19]:
indexer1 = StringIndexer(inputCol='reviewerID', outputCol='reviewerID_idx')

In [20]:
indexer1_model = indexer1.fit(data_indexed)

In [21]:
data_indexed = indexer1_model.transform(data_indexed)

In [22]:
data_indexed.show(3)

+----------+-------+--------------+--------+--------------+
|      asin|overall|    reviewerID|asin_idx|reviewerID_idx|
+----------+-------+--------------+--------+--------------+
|7806397051|    1.0|A1YJEY40YUW4SE|  6194.0|       16983.0|
|7806397051|    3.0| A60XNB876KYML|  6194.0|       10399.0|
|7806397051|    4.0|A3G6XNM240RMWA|  6194.0|        5985.0|
+----------+-------+--------------+--------+--------------+
only showing top 3 rows



# 3. Tạo tập training và test data

In [23]:
training, test = data_indexed.randomSplit((0.8, 0.2))

# 4. Build model

In [24]:
from pyspark.ml.recommendation import ALS

In [25]:
als = ALS(maxIter=10,
            regParam=0.09,
            rank=25,
            userCol='reviewerID_idx',
            itemCol='asin_idx',
            ratingCol='overall',
            coldStartStrategy='drop',
            nonnegative=True)

In [26]:
model = als.fit(training)

# 5. Dự đoán

In [27]:
predictions = model.transform(test)

In [28]:
predictions.select('asin_idx', 'reviewerID_idx', 'overall', 'prediction').show(5)

+--------+--------------+-------+----------+
|asin_idx|reviewerID_idx|overall|prediction|
+--------+--------------+-------+----------+
|   148.0|         493.0|    4.0| 3.6282616|
|   148.0|           1.0|    5.0| 3.9160838|
|   148.0|        3764.0|    4.0| 3.3301628|
|   148.0|       16014.0|    5.0| 3.3559623|
|   148.0|         455.0|    5.0| 3.6481733|
+--------+--------------+-------+----------+
only showing top 5 rows



# 6. Đánh giá model

In [29]:
from pyspark.ml.evaluation import RegressionEvaluator

In [30]:
evaluator = RegressionEvaluator(metricName='rmse',
                                labelCol='overall',
                                predictionCol='prediction')

In [31]:
rmse = evaluator.evaluate(predictions)

In [32]:
rmse

1.2859662330735018

> **Nhận xét**
> * Trên thang điểm là từ $[1, 5]$ thì nhìn chung rmse là 1 thì khá lớn, nhưng tạm chấp nhận dc

# 7. Dự đoán cho tất cả users

In [33]:
user_recs = model.recommendForAllUsers(20)

In [34]:
for user in user_recs.head(5):
    print(user)
    print("\n\n")

Row(reviewerID_idx=1580, recommendations=[Row(asin_idx=8643, rating=5.330796241760254), Row(asin_idx=5867, rating=5.309784889221191), Row(asin_idx=8890, rating=5.287563800811768), Row(asin_idx=5137, rating=5.26399564743042), Row(asin_idx=10281, rating=5.249861717224121), Row(asin_idx=12075, rating=5.138065338134766), Row(asin_idx=12067, rating=5.138065338134766), Row(asin_idx=12068, rating=5.138065338134766), Row(asin_idx=4116, rating=5.039095401763916), Row(asin_idx=12071, rating=5.028798580169678), Row(asin_idx=11395, rating=5.0271430015563965), Row(asin_idx=12069, rating=5.0271430015563965), Row(asin_idx=11233, rating=4.978743553161621), Row(asin_idx=3591, rating=4.9759840965271), Row(asin_idx=11754, rating=4.963536262512207), Row(asin_idx=4104, rating=4.9616217613220215), Row(asin_idx=8113, rating=4.942749977111816), Row(asin_idx=7517, rating=4.935157299041748), Row(asin_idx=9328, rating=4.930989742279053), Row(asin_idx=11108, rating=4.928097248077393)])



Row(reviewerID_idx=4900,

# 8. Chuyển đổi lại sang hiển thị ID thật của user và product

In [35]:
import pandas as pd
import pyspark

In [36]:
recs = user_recs.toPandas()

In [37]:
recs

Unnamed: 0,reviewerID_idx,recommendations
0,1580,"[(8643, 5.330796241760254), (5867, 5.309784889..."
1,4900,"[(6714, 5.990791320800781), (2981, 5.877415657..."
2,5300,"[(4097, 6.324025630950928), (10644, 6.26779985..."
3,6620,"[(4327, 5.79592752456665), (6059, 5.7842936515..."
4,7240,"[(5137, 5.48166561126709), (12068, 5.416862487..."
...,...,...
22356,18729,"[(8890, 3.915904998779297), (4529, 3.896273612..."
22357,21499,"[(4376, 4.627009868621826), (8373, 4.031133651..."
22358,21599,"[(12067, 6.012242794036865), (12075, 6.0122427..."
22359,21819,"[(4227, 5.501155853271484), (10661, 5.47741746..."


In [38]:
nrecs = recs['recommendations'].apply(pd.Series).merge(recs, right_index=True, left_index=True) \
                                                .drop(['recommendations'], axis=1) \
                                                .melt(id_vars=['reviewerID_idx'], value_name='recommendation') \
                                                .drop('variable', axis=1)\
                                                .dropna()

In [39]:
nrecs = nrecs.sort_values('reviewerID_idx')
nrecs = pd.concat([nrecs['recommendation'].apply(pd.Series),
                    nrecs['reviewerID_idx']], axis=1)
nrecs.columns = ['ProductID_index', 'Rating', 'UserID_index']

In [40]:
nrecs

Unnamed: 0,ProductID_index,Rating,UserID_index
334461,5687.0,5.439965,0
43768,2730.0,5.706645,0
245017,8637.0,5.464397,0
289739,12067.0,5.449728,0
200295,3207.0,5.481113,0
...,...,...,...
308650,10814.0,5.218294,22362
85040,12071.0,5.499613,22362
129762,12069.0,5.498857,22362
196845,7334.0,5.352735,22362


In [41]:
md = data_indexed.select(['reviewerID', 'reviewerID_idx', 'asin', 'asin_idx']).toPandas()
dict1 = dict(zip(md['reviewerID_idx'], md['reviewerID']))
dict2 = dict(zip(md['asin_idx'], md['asin']))
nrecs['reviewerID'] = nrecs['UserID_index'].map(dict1)
nrecs['asin'] = nrecs['ProductID_index'].map(dict2)
nrecs = nrecs.sort_values('reviewerID')
nrecs.reset_index(drop=True, inplace=True)
new = nrecs[['reviewerID', 'asin', 'Rating']]
new['recommendations'] = list(zip(new['asin'], new['Rating']))
res = new[['reviewerID', 'recommendations']]
res_new = res['recommendations'].groupby([res['reviewerID']]).apply(list).reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['recommendations'] = list(zip(new['asin'], new['Rating']))


In [42]:
res_new

Unnamed: 0,reviewerID,recommendations
0,A00414041RD0BXM6WK0GX,"[(B007UUE824, 4.85647439956665), (B00GRTQBTM, ..."
1,A00473363TJ8YSZ3YAGG9,"[(B001CJJ5XG, 4.142149448394775), (B0000Y8H3S,..."
2,A00700212KB3K0MVESPIY,"[(B00H8JPMX6, 5.4662675857543945), (B000PHP8L4..."
3,A0078719IR14X3NNUG0F,"[(B003Z4OD24, 6.351077556610107), (B0006IXSG4,..."
4,A01198201H0E3GHV2Z17I,"[(B0045SU5ZI, 5.226602554321289), (B000TD2QXC,..."
...,...,...
22356,AZZNK89PXD006,"[(B000C1W38O, 3.896512985229492), (B00HAQAREQ,..."
22357,AZZQXL8VDCFTV,"[(B001EXHT2O, 5.426084995269775), (B000TUB4BU,..."
22358,AZZT1ERHBSNQ8,"[(B000JLAWIA, 5.530824184417725), (B000ORV5HQ,..."
22359,AZZU6NXB8YJN9,"[(B000A3I2X4, 5.121776580810547), (B003JMYYQW,..."
