![](../images/FE_00.png)

# 6. Đọc dữ liệu đã làm sạch từ file parquet được tạo ở **part_1**

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell executes, so edits to them are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Obtain the Spark session for this notebook (reuses an active one if present).
spark = (
    SparkSession.builder
    .appName('Womens_Clothing_E_Commerce_Reviews')
    .getOrCreate()
)

In [4]:
# Load the cleaned review dataset from its parquet file.
data = spark.read.parquet(
    "../data/womens-ecommerce-clothing-reviews/womens-ecommerce-clothing-reviews_clean_data.parquet"
)

In [5]:
# Preview the first five rows of the loaded data.
data.show(n=5)

+---+------+--------------------+--------------+---------------------+---------+-------------+--------------------+--------------------+--------------------+--------------------+
|Age|Rating|          ReviewText|RecommendedIND|PositiveFeedbackCount|ClassName|ClassName_idx|      ReviewText_tok|      ReviewText_stp|      ReviewText_cvt|      ReviewText_idf|
+---+------+--------------------+--------------+---------------------+---------+-------------+--------------------+--------------------+--------------------+--------------------+
| 46|     5|I tried these on ...|             1|                    8|    Pants|          4.0|[i, tried, these,...|[tried, whim, lik...|(14158,[3,8,14,19...|(14158,[3,8,14,19...|
| 65|     4|Great feature...p...|             1|                    0|    Knits|          1.0|[great, feature, ...|[great, feature, ...|(14158,[3,5,8,16,...|(14158,[3,5,8,16,...|
| 32|     3|I'm usually an xs...|             1|                    0|  Dresses|          0.0|[i, m, usua

# 7. Chuyển dữ liệu thành vector đặc trưng bằng `VectorAssembler`

In [6]:
from pyspark.ml.feature import VectorAssembler

In [7]:
# Columns that are assembled into the feature vector.
features = [
    'Age',
    'RecommendedIND',
    'PositiveFeedbackCount',
    'ClassName_idx',
    'ReviewText_idf',
]
# Label column to predict.
target = 'Rating'

In [8]:
# Combine the selected input columns into a single vector column 'features'.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=features,
)

In [9]:
# Apply the assembler and keep only the feature vector and the label.
final_data = (
    assembler
    .transform(data)
    .select('features', target)
)

In [10]:
# Preview the first five assembled rows.
final_data.show(n=5)

+--------------------+------+
|            features|Rating|
+--------------------+------+
|(14162,[0,1,2,3,7...|     5|
|(14162,[0,1,3,7,9...|     4|
|(14162,[0,1,4,8,1...|     3|
|(14162,[0,1,2,3,1...|     4|
|(14162,[0,1,4,6,8...|     5|
+--------------------+------+
only showing top 5 rows



# 8. Tách dữ liệu train test

In [11]:
# 80/20 random split. A fixed seed keeps the split repeatable across kernel
# restarts (for the same input and partitioning); without it, every run yields
# different train/test rows and the counts reported in this notebook drift.
train, test = final_data.randomSplit([0.8, 0.2], seed=42)

# 9. Xem các group của `Rating` đã cân bằng trên `train` data chưa

In [12]:
# Number of training rows for each value of the label column.
tmp = (
    train
    .groupBy(target)
    .count()
)

In [13]:
# Display the per-rating row counts of the training split.
tmp.show()

+------+-----+
|Rating|count|
+------+-----+
|     1|  658|
|     3| 2253|
|     5| 9949|
|     4| 3916|
|     2| 1269|
+------+-----+



> **Nhận xét:**
> * Dữ liệu train gồm $18,045$ mẫu

In [14]:
from pyspark.sql.functions import col

In [15]:
# Total number of rows in the training split (used as the denominator below).
tmp1 = train.count()

In [16]:
# Fraction of the training split held by each rating: per-class count divided
# by the total row count. Uses `target` rather than a hard-coded column name so
# it stays consistent with the groupBy that produced `tmp`.
tmp2 = tmp.select(target, col('count') / tmp1)

In [17]:
# Display each rating's share of the training split.
tmp2.show()

+------+--------------------+
|Rating|     (count / 18045)|
+------+--------------------+
|     1|0.036464394569132724|
|     3| 0.12485453034081463|
|     5|  0.5513438625658077|
|     4|  0.2170130229980604|
|     2| 0.07032418952618454|
+------+--------------------+



> **Nhận xét**:
> * Các group 1, 2, 3 chiếm số lượng quá nhỏ trong tập dữ liệu, cần oversampling chúng

# 10. Ghi `train` và `test` data ra file parquet

In [18]:
# Persist the training split. Overwrite any previous output: the default save
# mode raises when the path already exists, which breaks a second full run.
train.write.mode("overwrite").parquet("../data/womens-ecommerce-clothing-reviews/womens-ecommerce-clothing-reviews_train.parquet")

In [19]:
# Persist the test split. Overwrite any previous output: the default save
# mode raises when the path already exists, which breaks a second full run.
test.write.mode("overwrite").parquet("../data/womens-ecommerce-clothing-reviews/womens-ecommerce-clothing-reviews_test.parquet")

# 11. Áp dụng Oversampling cho các group 1, 2, 3 trên `train` data

In [20]:
from modules.utils import oversampling

In [21]:
# oversampling(pDf: pyspark.sql.DataFrame, pColumn: str, pMajorValue, pMinorValue)
# Oversample Rating 1 against Rating 4, then append the original training rows.
# NOTE(review): the recorded output after this cell shows Rating 4 doubling as
# well — presumably the helper's result already contains the majority rows, so
# the union with `train` adds them a second time. Confirm this is intended.
data_resampling = oversampling(train, target, 4, 1).union(train)

In [22]:
# Per-rating row counts after the first oversampling pass.
(
    data_resampling
    .groupBy(target)
    .count()
    .show()
)

+------+-----+
|Rating|count|
+------+-----+
|     1| 3948|
|     3| 2253|
|     5| 9949|
|     4| 7832|
|     2| 1269|
+------+-----+



In [23]:
# Second pass: oversample Rating 2 against Rating 3 on the current frame and
# append the current frame to the result.
# NOTE(review): this cell rebinds `data_resampling` from itself, so re-running
# it compounds the resampling — execute it exactly once per kernel session.
data_resampling = (
    oversampling(data_resampling, target, 3, 2)
    .union(data_resampling)
)

In [24]:
# Per-rating row counts after the second oversampling pass.
(
    data_resampling
    .groupBy(target)
    .count()
    .show()
)


+------+-----+
|Rating|count|
+------+-----+
|     1| 3948|
|     3| 4506|
|     5| 9949|
|     4| 7832|
|     2| 2538|
+------+-----+



# 12. Ghi `data_resampling` ra file parquet

In [25]:
# Persist the resampled training data. Overwrite any previous output: the
# default save mode raises when the path already exists, which breaks a
# second full run of the notebook.
data_resampling.write.mode("overwrite").parquet("../data/womens-ecommerce-clothing-reviews/womens-ecommerce-clothing-reviews_resampling.parquet")