![](../images/FE_00.png)

# 1. Đọc dữ liệu đã làm sạch từ file parquet đã làm từ **part_1**

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('Womens_Clothing_E_Commerce_Reviews').getOrCreate()

In [4]:
data = spark.read.parquet("../data/womens-ecommerce-clothing-reviews/womens-ecommerce-clothing-reviews_clean_data.parquet")

In [5]:
data.show(5)

+---+------+--------------------+--------------+---------------------+---------+-------------+--------------------+--------------------+--------------------+--------------------+
|Age|Rating|          ReviewText|RecommendedIND|PositiveFeedbackCount|ClassName|ClassName_idx|      ReviewText_tok|      ReviewText_stp|      ReviewText_cvt|      ReviewText_idf|
+---+------+--------------------+--------------+---------------------+---------+-------------+--------------------+--------------------+--------------------+--------------------+
| 46|     5|I tried these on ...|             1|                    8|    Pants|          4.0|[i, tried, these,...|[tried, whim, lik...|(14158,[3,8,14,19...|(14158,[3,8,14,19...|
| 65|     4|Great feature...p...|             1|                    0|    Knits|          1.0|[great, feature, ...|[great, feature, ...|(14158,[3,5,8,16,...|(14158,[3,5,8,16,...|
| 32|     3|I'm usually an xs...|             1|                    0|  Dresses|          0.0|[i, m, usua

# 2. Chuyển dữ liệu

In [6]:
from pyspark.ml.feature import VectorAssembler

In [7]:
features = ['Age', 'RecommendedIND', 'PositiveFeedbackCount', 'ClassName_idx', 'ReviewText_idf']
target = 'Rating'

In [8]:
assembler = VectorAssembler(inputCols=features, outputCol='features')

In [9]:
final_data = assembler.transform(data).select('features', target)

In [10]:
final_data.show(5)

+--------------------+------+
|            features|Rating|
+--------------------+------+
|(14162,[0,1,2,3,7...|     5|
|(14162,[0,1,3,7,9...|     4|
|(14162,[0,1,4,8,1...|     3|
|(14162,[0,1,2,3,1...|     4|
|(14162,[0,1,4,6,8...|     5|
+--------------------+------+
only showing top 5 rows



# 3. Xem các group của `Rating` đã cân bằng chưa

In [11]:
tmp = final_data.groupBy(target).count()

In [12]:
tmp.show()

+------+-----+
|Rating|count|
+------+-----+
|     1|  820|
|     3| 2822|
|     5|12523|
|     4| 4907|
|     2| 1548|
+------+-----+



In [13]:
tmp1 = final_data.count()

In [14]:
tmp1

22620

In [15]:
from pyspark.sql.functions import col

In [16]:
tmp2 = tmp.select('Rating', col('count')/tmp1)

In [17]:
tmp2.show()

+------+--------------------+
|Rating|     (count / 22620)|
+------+--------------------+
|     1|0.036251105216622455|
|     3| 0.12475685234305924|
|     5|  0.5536251105216623|
|     4|  0.2169319186560566|
|     2| 0.06843501326259947|
+------+--------------------+



> **Nhận xét**:
> * Các group 1, 2, 3 chiếm số lượng quá nhỏ trong tập dữ liệu, cần oversampling chúng

# 4. Áp dụng Oversampling cho các group 1, 2, 3

In [18]:
from modules.utils import oversampling

In [19]:
# oversampling(pDf: pyspark.sql.DataFrame, pColumn: str, pMajorValue, pMinorValue):
data_sempling = oversampling(final_data, target, 4, 1)
data_sempling = data_sempling.unionAll(final_data)

In [20]:
data_sempling.groupBy(target).count().show()

+------+-----+
|Rating|count|
+------+-----+
|     1| 4920|
|     3| 2822|
|     5|12523|
|     4| 9814|
|     2| 1548|
+------+-----+



In [21]:
data_sempling = oversampling(data_sempling, target, 3, 2).unionAll(data_sempling)

In [22]:
data_sempling.groupBy(target).count().show()


+------+-----+
|Rating|count|
+------+-----+
|     1| 4920|
|     3| 5644|
|     5|12523|
|     4| 9814|
|     2| 3096|
+------+-----+



# 5. Tách dữ liệu train test

In [23]:
data_sempling.show(5)

+--------------------+------+
|            features|Rating|
+--------------------+------+
|(14162,[0,1,4,8,1...|     3|
|(14162,[0,1,2,3,8...|     3|
|(14162,[0,3,5,12,...|     3|
|(14162,[0,2,3,5,1...|     3|
|(14162,[0,1,2,3,1...|     3|
+--------------------+------+
only showing top 5 rows



In [24]:
train, test = data_sempling.randomSplit((0.8, 0.2))

# 6. Build model
## 6.1. Logistic Regression
### 6.1.1. Build model

In [25]:
from pyspark.ml.classification import LogisticRegression

In [26]:
logistic = LogisticRegression(labelCol=target)

In [27]:
logistic_model_0 = logistic.fit(train)

### 6.1.2. Đánh giá model
#### 6.1.2.1. Trên train

In [28]:
logistic_train_res_0 = logistic_model_0.evaluate(train).predictions

In [29]:
logistic_train_res_0.show(5)

+--------------------+------+--------------------+--------------------+----------+
|            features|Rating|       rawPrediction|         probability|prediction|
+--------------------+------+--------------------+--------------------+----------+
|(14162,[0,1,2,3,5...|     3|[-7.2919727746246...|[2.39322398868903...|       3.0|
|(14162,[0,1,2,3,5...|     3|[-7.2905506980260...|[4.16238553024314...|       3.0|
|(14162,[0,1,2,3,5...|     3|[-7.3144170086712...|[7.92955940845964...|       3.0|
|(14162,[0,1,2,3,5...|     3|[-7.2378497226699...|[4.88671502121080...|       3.0|
|(14162,[0,1,2,3,5...|     3|[-7.3329497665807...|[4.68334405917946...|       3.0|
+--------------------+------+--------------------+--------------------+----------+
only showing top 5 rows



In [30]:
logistic_train_res_0.groupBy(target, 'prediction').count().show()

+------+----------+-----+
|Rating|prediction|count|
+------+----------+-----+
|     5|       4.0|  173|
|     4|       4.0| 7604|
|     5|       5.0| 9900|
|     4|       5.0|  165|
|     3|       3.0| 4545|
|     3|       5.0|    2|
|     2|       2.0| 2460|
|     1|       1.0| 3974|
+------+----------+-----+



#### 6.1.2.2. Trên test data

In [31]:
logistic_test_res_0 = logistic_model_0.evaluate(test).predictions

In [32]:
logistic_test_res_0.groupBy(target, 'prediction').count().show()

+------+----------+-----+
|Rating|prediction|count|
+------+----------+-----+
|     2|       1.0|    8|
|     4|       3.0|   72|
|     2|       4.0|    6|
|     4|       1.0|    2|
|     3|       4.0|   26|
|     5|       4.0|  610|
|     3|       1.0|   10|
|     4|       4.0| 1722|
|     5|       5.0| 1664|
|     5|       2.0|   34|
|     5|       1.0|    1|
|     2|       3.0|   58|
|     3|       2.0|   50|
|     4|       5.0|  241|
|     3|       3.0|  977|
|     3|       5.0|   34|
|     2|       2.0|  556|
|     1|       1.0|  946|
|     5|       3.0|  141|
|     2|       5.0|    8|
+------+----------+-----+
only showing top 20 rows



#### 6.1.2.3. Confusion matrix

In [33]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [34]:
predictions_labels = logistic_test_res_0.withColumnRenamed('Rating', 'label').select('prediction', 'label')

In [35]:
predictions_labels.show(5)

+----------+-----+
|prediction|label|
+----------+-----+
|       3.0|    3|
|       3.0|    3|
|       3.0|    3|
|       3.0|    3|
|       3.0|    3|
+----------+-----+
only showing top 5 rows



#### 6.1.2.4. Dựa theo **Accuracy**, **F1-Score**, **Precision**, **Recall** 

In [36]:
evaluator = MulticlassClassificationEvaluator()

In [37]:
evaluator.evaluate(predictions_labels, {evaluator.metricName: "accuracy"})

0.8175355450236966

In [38]:
evaluator.evaluate(predictions_labels, {evaluator.metricName: "f1"})

0.8157614702221767

In [39]:
evaluator.evaluate(predictions_labels, {evaluator.metricName: "weightedPrecision"})

0.8242917519833826

In [40]:
evaluator.evaluate(predictions_labels, {evaluator.metricName: "weightedRecall"})

0.8175355450236967

> **Nhận xét:**
> * Các chỉ số đánh giá sức mạnh của model nhìn chung rất tốt, đều trên 80%.