![](images/11_01.jpg)

# 1. Đọc dữ liệu

In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('nlp_musical').getOrCreate()

In [3]:
data = spark.read.json("./data/Musical_Instruments_5.json")

In [4]:
data.show(5)

+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|      asin| helpful|overall|          reviewText| reviewTime|    reviewerID|        reviewerName|             summary|unixReviewTime|
+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|1384719342|  [0, 0]|    5.0|Not much to write...|02 28, 2014|A2IBPI20UZIR0U|cassandra tu "Yea...|                good|    1393545600|
|1384719342|[13, 14]|    5.0|The product does ...|03 16, 2013|A14VAT5EAX3D9S|                Jake|                Jake|    1363392000|
|1384719342|  [1, 1]|    5.0|The primary job o...|08 28, 2013|A195EZSQDW3E21|Rick Bennette "Ri...|It Does The Job Well|    1377648000|
|1384719342|  [0, 0]|    5.0|Nice windscreen p...|02 14, 2014|A2C00NNG1ZQQG2|RustyBill "Sunday...|GOOD WINDSCREEN F...|    1392336000|
|1384719342|  [0, 0]|    5.0|This pop filter i...|02 21

In [5]:
from pyspark.sql.functions import *

In [6]:
data = data.withColumn('class', when(data['overall'] >= 4, 'like')
                                .when(data['overall'] <= 2, 'not_like')
                                .otherwise("neutral"))

In [7]:
data = data.select('reviewText', 'overall', 'class')

# 2. Làm sạch và chuẩn dữ liệu

In [8]:
data = data.withColumn('length', length(data['reviewText']))

In [9]:
data.show(5)

+--------------------+-------+-----+------+
|          reviewText|overall|class|length|
+--------------------+-------+-----+------+
|Not much to write...|    5.0| like|   268|
|The product does ...|    5.0| like|   544|
|The primary job o...|    5.0| like|   436|
|Nice windscreen p...|    5.0| like|   206|
|This pop filter i...|    5.0| like|   159|
+--------------------+-------+-----+------+
only showing top 5 rows



In [10]:
data.groupby('class').mean().show()

+--------+------------------+-----------------+
|   class|      avg(overall)|      avg(length)|
+--------+------------------+-----------------+
|not_like|1.5353319057815846|579.2055674518201|
| neutral|               3.0|579.2111398963731|
|    like|4.7690090888938155|473.1188206606074|
+--------+------------------+-----------------+



In [11]:
data.groupBy('class').count().show()

+--------+-----+
|   class|count|
+--------+-----+
|not_like|  467|
| neutral|  772|
|    like| 9022|
+--------+-----+



# 3. Feature transformations

In [12]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer,VectorAssembler

In [13]:
tokenizer = Tokenizer(inputCol='reviewText', outputCol='token_text')
stopremove = StopWordsRemover(inputCol='token_text', outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens', outputCol='c_vec')
idf = IDF(inputCol='c_vec', outputCol='tf_idf')
class_to_num = StringIndexer(inputCol='class', outputCol='label')

In [14]:
clean_up = VectorAssembler(inputCols=['tf_idf', 'length'], outputCol='features')

# 4. Pipeline 

In [15]:
from pyspark.ml import Pipeline

In [16]:
data_prep_pipe = Pipeline(stages=[class_to_num, tokenizer, stopremove, count_vec, idf, clean_up])

In [17]:
cleaner = data_prep_pipe.fit(data)

In [18]:
clean_data = cleaner.transform(data)

# 5. Tách dữ liệu train và test

In [19]:
clean_data = clean_data.select('label', 'features')

In [20]:
clean_data.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(51949,[3,12,14,3...|
|  0.0|(51949,[2,3,12,16...|
|  0.0|(51949,[11,19,44,...|
|  0.0|(51949,[18,37,57,...|
|  0.0|(51949,[2,122,132...|
+-----+--------------------+
only showing top 5 rows



In [21]:
training, testing = clean_data.randomSplit((.7, .3))

In [22]:
training.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0| 6373|
|  1.0|  560|
|  2.0|  323|
+-----+-----+



# 6. Build model
## 6.1. Bằng Naive Bayes

In [23]:
from pyspark.ml.classification import NaiveBayes

In [24]:
nb = NaiveBayes()

In [25]:
predictor = nb.fit(training)

In [26]:
test_result = predictor.transform(testing)

In [27]:
test_result.show(3)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(51949,[0],[1.025...|[-6.4303442138508...|[0.88433370870768...|       0.0|
|  0.0|(51949,[0,1,2,3,4...|[-8039.6692015177...|[1.20288693328170...|       2.0|
|  0.0|(51949,[0,1,2,3,4...|[-9444.5825286226...|[3.92316017105901...|       1.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 3 rows



In [28]:
test_result.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0|   63|
|  1.0|       1.0|   75|
|  0.0|       1.0|  532|
|  1.0|       0.0|  123|
|  2.0|       2.0|   33|
|  2.0|       1.0|   48|
|  1.0|       2.0|   14|
|  0.0|       0.0| 1910|
|  0.0|       2.0|  207|
+-----+----------+-----+



In [29]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [30]:
acc_eval = MulticlassClassificationEvaluator()

In [31]:
acc = acc_eval.evaluate(test_result)

In [32]:
acc

0.7298356480665139

> **Nhận xét**
> * Độ chính xác thấp

## 6.2. Build bằng Logistic regression

In [33]:
from pyspark.ml.classification import LogisticRegression

In [34]:
lg = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

In [35]:
predictor_1 = lg.fit(training)

In [36]:
test_result_1 = predictor_1.transform(testing)

In [37]:
test_result_1.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0|  139|
|  1.0|       1.0|    2|
|  0.0|       1.0|    3|
|  1.0|       0.0|  209|
|  2.0|       2.0|    3|
|  2.0|       1.0|    2|
|  1.0|       2.0|    1|
|  0.0|       0.0| 2644|
|  0.0|       2.0|    2|
+-----+----------+-----+



In [38]:
acc_1 = acc_eval.evaluate(test_result_1)

In [39]:
acc_1

0.8295721122041596

> **Nhận xét**
> * Độ chính xác có cải thiện

## 6.3. Áp dụng random forest

In [40]:
from pyspark.ml.classification import RandomForestClassifier

In [41]:
rf = RandomForestClassifier(labelCol='label', featuresCol='features', numTrees=500, maxDepth=5, maxBins=64)

In [42]:
predictor_2 = rf.fit(training)

In [43]:
test_result_2 = rf.fit(training)

In [44]:
test_result_2 = predictor_2.transform(testing)

In [45]:
test_result_2.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0|  144|
|  1.0|       0.0|  212|
|  0.0|       0.0| 2649|
+-----+----------+-----+



In [46]:
acc_2 = acc_eval.evaluate(test_result_2)

In [47]:
acc_2

0.8260258371409047

> **Nhận xét**
> * Độ chính xác cao nhưng kết quả cho ra ko tốt

# 8. Resampling data

In [48]:
like_df = training.filter(col('label') == 0)
neutral_df = training.filter(col('label') == 1)
not_like_df = training.filter(col('label') == 2)
ratio_1 = int(like_df.count()/neutral_df.count())
ratio_2 = int(like_df.count()/not_like_df.count())

In [49]:
ratio_1, ratio_2

(11, 19)

* Resample neutral

In [50]:
a1 = range(ratio_1)

In [51]:
oversampled_neutral_df = neutral_df.withColumn('dummy', explode(array([lit(x) for x in a1]))).drop('dummy')

In [52]:
combined_df = like_df.unionAll(oversampled_neutral_df)

In [53]:
combined_df.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(51949,[0],[1.025...|
|  0.0|(51949,[0,1,2,3,4...|
|  0.0|(51949,[0,1,2,3,4...|
|  0.0|(51949,[0,1,2,3,4...|
|  0.0|(51949,[0,1,2,3,4...|
+-----+--------------------+
only showing top 5 rows



In [54]:
combined_df.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0| 6373|
|  1.0| 6160|
+-----+-----+



In [55]:
a2 = range(ratio_2)

In [56]:
oversampled_notlike_df = not_like_df.withColumn('dummy', explode(array([lit(x) for x in a2]))).drop('dummy')

In [57]:
combined_df = combined_df.unionAll(oversampled_notlike_df)

In [58]:
combined_df.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(51949,[0],[1.025...|
|  0.0|(51949,[0,1,2,3,4...|
|  0.0|(51949,[0,1,2,3,4...|
|  0.0|(51949,[0,1,2,3,4...|
|  0.0|(51949,[0,1,2,3,4...|
+-----+--------------------+
only showing top 5 rows



In [59]:
combined_df.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0| 6373|
|  1.0| 6160|
|  2.0| 6137|
+-----+-----+



In [60]:
predictor_4 = nb.fit(combined_df)

In [61]:
test_result_4 = predictor_4.transform(testing)

In [62]:
test_result_4.show(5)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(51949,[0],[1.025...|[-7.3751807657850...|[0.32985809918260...|       1.0|
|  0.0|(51949,[0,1,2,3,4...|[-8040.6140380697...|[1.0,5.6113723706...|       0.0|
|  0.0|(51949,[0,1,2,3,4...|[-9445.5273651746...|[1.0,1.0949387349...|       0.0|
|  0.0|(51949,[0,1,2,3,4...|[-19080.435082691...|       [1.0,0.0,0.0]|       0.0|
|  0.0|(51949,[0,1,2,3,4...|[-3373.9498746200...|[1.0,1.7067128318...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [63]:
test_result_4.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0|  117|
|  1.0|       1.0|   28|
|  0.0|       1.0|  144|
|  1.0|       0.0|  179|
|  2.0|       2.0|   17|
|  2.0|       1.0|   10|
|  1.0|       2.0|    5|
|  0.0|       0.0| 2449|
|  0.0|       2.0|   56|
+-----+----------+-----+



In [64]:
acc_eval.evaluate(test_result_4)

0.817836880816284

<hr>

In [65]:
predictor_5 = lg.fit(combined_df)

In [66]:
test_result_5 = predictor_5.transform(testing)

In [67]:
test_result_5.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0|  112|
|  1.0|       1.0|   21|
|  0.0|       1.0|   82|
|  1.0|       0.0|  187|
|  2.0|       2.0|   21|
|  2.0|       1.0|   11|
|  1.0|       2.0|    4|
|  0.0|       0.0| 2554|
|  0.0|       2.0|   13|
+-----+----------+-----+



In [68]:
acc_eval.evaluate(test_result_5)

0.8385517156955946

<hr>

In [69]:
predictor_6 = rf.fit(combined_df)

In [70]:
test_result_6 = predictor_6.transform(testing)

In [71]:
test_result_6.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0|   98|
|  1.0|       1.0|   23|
|  0.0|       1.0|   39|
|  1.0|       0.0|  178|
|  2.0|       2.0|   39|
|  2.0|       1.0|    7|
|  1.0|       2.0|   11|
|  0.0|       0.0| 2559|
|  0.0|       2.0|   51|
+-----+----------+-----+



In [72]:
acc_eval.evaluate(test_result_6)

0.8495029310028307