![](./images/09_03.jpg)

# 1. Đọc dữ liệu

In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName("Recommendation_System").getOrCreate()

In [3]:
data = spark.read.json('./data/Musical_Instruments_5.json')

In [4]:
data.show(3)

+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|      asin| helpful|overall|          reviewText| reviewTime|    reviewerID|        reviewerName|             summary|unixReviewTime|
+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|1384719342|  [0, 0]|    5.0|Not much to write...|02 28, 2014|A2IBPI20UZIR0U|cassandra tu "Yea...|                good|    1393545600|
|1384719342|[13, 14]|    5.0|The product does ...|03 16, 2013|A14VAT5EAX3D9S|                Jake|                Jake|    1363392000|
|1384719342|  [1, 1]|    5.0|The primary job o...|08 28, 2013|A195EZSQDW3E21|Rick Bennette "Ri...|It Does The Job Well|    1377648000|
+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
only showing top 3 rows



In [5]:
data.printSchema()

root
 |-- asin: string (nullable = true)
 |-- helpful: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)



In [6]:
data.describe().show()

+-------+-------------+------------------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+
|summary|         asin|           overall|          reviewText|reviewTime|          reviewerID|        reviewerName|             summary|      unixReviewTime|
+-------+-------------+------------------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+
|  count|        10261|             10261|               10261|     10261|               10261|               10234|               10261|               10261|
|   mean|1.384719342E9| 4.488743787155248|                null|      null|                null|                null|                null|1.3606059557547998E9|
| stddev|          0.0|0.8946423761647279|                null|      null|                null|                null|                null| 3.779735074639003E7|
|    min|   1384719342|               1.0|    

# 2. Lựa chọn thuộc tính

In [7]:
data_sub = data.select('asin', 'overall', 'reviewerID')

In [8]:
data_sub.show(3)

+----------+-------+--------------+
|      asin|overall|    reviewerID|
+----------+-------+--------------+
|1384719342|    5.0|A2IBPI20UZIR0U|
|1384719342|    5.0|A14VAT5EAX3D9S|
|1384719342|    5.0|A195EZSQDW3E21|
+----------+-------+--------------+
only showing top 3 rows



# 3. Chuẩn dữ liệu

In [9]:
from pyspark.sql.functions import col, udf, isnan, when, count

## 3.1. Kiểm tra dữ liệu có _null value_ hay ko

In [10]:
data_sub.select([count(when(col(c).isNull(), c)).alias(c) for c in data_sub.columns]).toPandas().T

Unnamed: 0,0
asin,0
overall,0
reviewerID,0


## 3.2. Đếm số lượng các unique value trên `asin` và `reviewerID`

In [11]:
users = data_sub.select('reviewerID').distinct().count()
products = data_sub.select('asin').distinct().count()
numerator = data_sub.count()

In [12]:
users, products, numerator

(1429, 900, 10261)

## 3.3. Tính toán sparsity
* Nếu xây dựng một **rating matrix** thì tổng số cell sẽ là

In [13]:
denominator = users*products

In [14]:
denominator

1286100

In [15]:
sparsity = 1 - (numerator / denominator)

In [16]:
sparsity

0.992021615737501

> **Nhận xét**
> * Mật độ thưa thớt rất cao

<hr>

## 3.4. Indexer cho `data_sub`

In [17]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

In [18]:
indexer = StringIndexer(inputCol='asin', outputCol='asin_idx')

In [19]:
indexer_model = indexer.fit(data_sub)

In [20]:
data_indexed = indexer_model.transform(data_sub)

In [21]:
indexer1 = StringIndexer(inputCol='reviewerID', outputCol='reviewerID_idx')

In [22]:
indexer1_model = indexer1.fit(data_indexed)

In [23]:
data_indexed = indexer1_model.transform(data_indexed)

In [24]:
data_indexed.show(3)

+----------+-------+--------------+--------+--------------+
|      asin|overall|    reviewerID|asin_idx|reviewerID_idx|
+----------+-------+--------------+--------+--------------+
|1384719342|    5.0|A2IBPI20UZIR0U|   703.0|          66.0|
|1384719342|    5.0|A14VAT5EAX3D9S|   703.0|         266.0|
|1384719342|    5.0|A195EZSQDW3E21|   703.0|         395.0|
+----------+-------+--------------+--------+--------------+
only showing top 3 rows



# 4. Tạo tập training và test data

In [25]:
training, test = data_indexed.randomSplit((0.8, 0.2))

# 5. Build model

In [26]:
from pyspark.ml.recommendation import ALS

In [27]:
als = ALS(maxIter=10,
            regParam=0.09,
            rank=25,
            userCol='reviewerID_idx',
            itemCol='asin_idx',
            ratingCol='overall',
            coldStartStrategy='drop',
            nonnegative=True)

In [28]:
model = als.fit(training)

# 6. Dự đoán

In [29]:
predictions = model.transform(test)

In [30]:
predictions.select('asin_idx', 'reviewerID_idx', 'overall', 'prediction').show(5)

+--------+--------------+-------+----------+
|asin_idx|reviewerID_idx|overall|prediction|
+--------+--------------+-------+----------+
|   148.0|        1403.0|    4.0| 3.3198583|
|   148.0|         465.0|    5.0| 4.9273977|
|   148.0|         121.0|    5.0|  4.687597|
|   463.0|        1226.0|    5.0|  4.460263|
|   471.0|         154.0|    5.0| 4.1507263|
+--------+--------------+-------+----------+
only showing top 5 rows



# 7. Đánh giá model

In [31]:
from pyspark.ml.evaluation import RegressionEvaluator

In [32]:
evaluator = RegressionEvaluator(metricName='rmse',
                                labelCol='overall',
                                predictionCol='prediction')

In [33]:
rmse = evaluator.evaluate(predictions)

In [34]:
rmse

1.0945917511078211

> **Nhận xét**
> * Trên thang điểm là từ $[1, 5]$ thì nhìn chung rmse là 1 thì khá lớn, nhưng tạm chấp nhận dc

# 8. Dự đoán cho tất cả users

* Mỗi user lấy 20 sản phẩm tốt nhất

In [35]:
user_recs = model.recommendForAllUsers(20)

In [36]:
for user in user_recs.head(5):
    print(user)
    print("\n\n")

Row(reviewerID_idx=471, recommendations=[Row(asin_idx=25, rating=4.768749237060547), Row(asin_idx=49, rating=4.7623138427734375), Row(asin_idx=811, rating=4.661533832550049), Row(asin_idx=290, rating=4.64962100982666), Row(asin_idx=347, rating=4.621330261230469), Row(asin_idx=320, rating=4.592568397521973), Row(asin_idx=881, rating=4.556781768798828), Row(asin_idx=579, rating=4.555968284606934), Row(asin_idx=148, rating=4.511232376098633), Row(asin_idx=668, rating=4.505163669586182), Row(asin_idx=62, rating=4.503698348999023), Row(asin_idx=251, rating=4.499073505401611), Row(asin_idx=760, rating=4.495541572570801), Row(asin_idx=485, rating=4.484894752502441), Row(asin_idx=131, rating=4.484628200531006), Row(asin_idx=437, rating=4.484457969665527), Row(asin_idx=504, rating=4.478999614715576), Row(asin_idx=53, rating=4.477274417877197), Row(asin_idx=274, rating=4.46561861038208), Row(asin_idx=699, rating=4.463167667388916)])



Row(reviewerID_idx=1342, recommendations=[Row(asin_idx=337, 

# 9. Chuyển đổi lại sang pandas.DataFrame

In [37]:
import pandas as pd
import pyspark

In [38]:
recs = user_recs.toPandas()

In [39]:
recs

Unnamed: 0,reviewerID_idx,recommendations
0,471,"[(25, 4.768749237060547), (49, 4.7623138427734..."
1,1342,"[(337, 5.52674674987793), (421, 5.513857841491..."
2,463,"[(436, 4.945775508880615), (228, 4.82504081726..."
3,833,"[(775, 4.778197765350342), (821, 4.76319313049..."
4,496,"[(881, 5.427275657653809), (515, 5.34544277191..."
...,...,...
1424,422,"[(579, 5.587418556213379), (347, 5.44565153121..."
1425,1413,"[(881, 4.8156514167785645), (354, 4.4654445648..."
1426,517,"[(579, 5.504660606384277), (437, 5.10363101959..."
1427,1138,"[(515, 5.340065002441406), (242, 5.32987785339..."


In [40]:
nrecs = recs['recommendations'].apply(pd.Series).merge(recs, right_index=True, left_index=True) \
                                                .drop(['recommendations'], axis=1) \
                                                .melt(id_vars=['reviewerID_idx'], value_name='recommendation') \
                                                .drop('variable', axis=1)\
                                                .dropna()

In [41]:
nrecs = nrecs.sort_values('reviewerID_idx')
nrecs = pd.concat([nrecs['recommendation'].apply(pd.Series),
                    nrecs['reviewerID_idx']], axis=1)
nrecs.columns = ['ProductID_index', 'Rating', 'UserID_index']

In [42]:
nrecs

Unnamed: 0,ProductID_index,Rating,UserID_index
7085,188.0,4.981926,0
1369,769.0,5.217552,0
2798,421.0,5.116728,0
24233,780.0,4.916155,0
9943,504.0,4.956274,0
...,...,...,...
24165,352.0,5.152024,1428
19878,732.0,5.174199,1428
27023,596.0,5.142692,1428
7017,307.0,5.253382,1428


In [43]:
md = data_indexed.select(['reviewerID', 'reviewerID_idx', 'asin', 'asin_idx']).toPandas()
dict1 = dict(zip(md['reviewerID_idx'], md['reviewerID']))
dict2 = dict(zip(md['asin_idx'], md['asin']))
nrecs['reviewerID'] = nrecs['UserID_index'].map(dict1)
nrecs['asin'] = nrecs['ProductID_index'].map(dict2)
nrecs = nrecs.sort_values('reviewerID')
nrecs.reset_index(drop=True, inplace=True)
new = nrecs[['reviewerID', 'asin', 'Rating']]
new['recommendations'] = list(zip(new['asin'], new['Rating']))
res = new[['reviewerID', 'recommendations']]
res_new = res['recommendations'].groupby([res['reviewerID']]).apply(list).reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['recommendations'] = list(zip(new['asin'], new['Rating']))


In [44]:
res_new

Unnamed: 0,reviewerID,recommendations
0,A00625243BI8W1SSZNLMD,"[(B000068O1N, 5.075376510620117), (B0015RIN6U,..."
1,A10044ECXDUVKS,"[(B003BFYDBS, 4.450658798217773), (B00923G9Q0,..."
2,A102MU6ZC9H1N6,"[(B0073XCYO2, 5.112588405609131), (B000OR5928,..."
3,A109JTUZXO61UY,"[(B0015RIN6U, 5.419037818908691), (B000RY68PA,..."
4,A109ME7C09HM2M,"[(B009E3EWPI, 5.347193717956543), (B000RWJQRE,..."
...,...,...
1424,AZJPNK73JF3XP,"[(B000RKVH0K, 5.327242374420166), (B00C5B20QE,..."
1425,AZMHABTPXVLG3,"[(B000068O1N, 4.04396390914917), (B002HFC6P8, ..."
1426,AZMIKIG4BB6BZ,"[(B000S5V510, 5.176730155944824), (B0002GWXKC,..."
1427,AZPDO6FLSMLFP,"[(B000RKVH0K, 5.191473484039307), (B000U0DU34,..."
