# Cleaning Data

In [1]:
from pyspark.sql import SparkSession

# Khởi tạo SparkSession
spark = SparkSession.builder \
    .appName("Data Cleaning") \
    .getOrCreate()

In [2]:
df = spark.read.json('../data/Amazon/Books.json')

In [5]:
df_meta = spark.read.json('../data/Amazon/meta_Books.json')

In [3]:
df.show() 

+----------+-----+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------------+--------------+--------+----+
|      asin|image|overall|          reviewText| reviewTime|    reviewerID|        reviewerName|               style|             summary|unixReviewTime|verified|vote|
+----------+-----+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------------+--------------+--------+----+
|0001713353| null|    5.0|This book is a wi...|08 12, 2005|A1C6M8LCIX4M6M|            June Bug|{null,  Paperback...| Children's favorite|    1123804800|   false|null|
|0001713353| null|    5.0|The King, the Mic...|03 30, 2005|A1REUF3A1YCPHM|         TW Ervin II|{null,  Hardcover...|A story children ...|    1112140800|   false|null|
|0001713353| null|    5.0|My daughter got h...| 04 4, 2004| A1YRBRK2XM5D5|   Rebecca L. Menner|{null,  Hardcover...|          Third copy|    1081036800|   false|   5

In [6]:
# Lựa chọn các cột cần merge
df_meta_selected = df_meta.select('title', 'asin')

# Thực hiện merge bằng cách join DataFrame
df = df.join(df_meta_selected, on=['asin'], how='left')

In [7]:
# Đếm số lượng dòng trong DataFrame
row_count = df.count()

# Lấy danh sách các cột trong DataFrame
column_list = df.columns

print("Số lượng dòng: ", row_count)
print("Danh sách các cột: ", column_list)

Số lượng dòng:  51343361
Danh sách các cột:  ['asin', 'image', 'overall', 'reviewText', 'reviewTime', 'reviewerID', 'reviewerName', 'style', 'summary', 'unixReviewTime', 'verified', 'vote', 'title']


In [8]:
from pyspark.sql.functions import col, sum as spark_sum

# Kiểm tra giá trị null trong DataFrame
null_counts = df.select([spark_sum(col(column).isNull().cast("int")).alias(column) for column in df.columns])

# Hiển thị kết quả
null_counts.show()

+----+--------+-------+----------+----------+----------+------------+-------+-------+--------------+--------+--------+-----+
|asin|   image|overall|reviewText|reviewTime|reviewerID|reviewerName|  style|summary|unixReviewTime|verified|    vote|title|
+----+--------+-------+----------+----------+----------+------------+-------+-------+--------------+--------+--------+-----+
|   0|51158841|      0|     13834|         0|         0|        1842|1787079|  13922|             0|       0|40882790| 2590|
+----+--------+-------+----------+----------+----------+------------+-------+-------+--------------+--------+--------+-----+



Bỏ các giá trị null trong reviewText. Vì có 2 feature quan trọng nhất đó là overall và reviewText

In [9]:
# Loại bỏ các dòng có giá trị null trong cột 'reviewText'
df = df.na.drop(subset=['reviewText'])

In [10]:
from pyspark.sql.functions import col, expr

# Chuyển đổi định dạng cột 'style' từ struct sang string
df = df.withColumn('style', expr('CONCAT_WS(", ", style.*)'))

# Kiểm tra các column quan trọng

In [11]:
df.printSchema()

root
 |-- asin: string (nullable = true)
 |-- image: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- style: string (nullable = false)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)
 |-- verified: boolean (nullable = true)
 |-- vote: string (nullable = true)
 |-- title: string (nullable = true)



In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count
value_counts = df.groupBy("overall").agg(count("*").alias("count"))
value_counts.show()

+-------+--------+
|overall|   count|
+-------+--------+
|    1.0| 2089496|
|    4.0| 9561674|
|    3.0| 3837365|
|    2.0| 1851737|
|    5.0|33989247|
|    0.0|       8|
+-------+--------+



In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count
value_counts = df.groupBy('verified').agg(count("*").alias("count"))
value_counts.show()

+--------+--------+
|verified|   count|
+--------+--------+
|    true|34767314|
|   false|16562213|
+--------+--------+



In [14]:
from pyspark.sql.functions import countDistinct
unique_count = df.agg(countDistinct("reviewTime").alias("count")).collect()[0]["count"]
print(unique_count)

7837


### tách Ngày Tháng Năm

In [15]:
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

In [16]:
from pyspark.sql.functions import to_date, dayofmonth, month, year

# Chuyển đổi cột "reviewTime" sang định dạng ngày tháng
df = df.withColumn("reviewTime", to_date(df["reviewTime"], "MM dd, yyyy"))

# Tạo các cột "day", "month", và "year" từ cột "reviewTime"
df = df.withColumn("day", dayofmonth(df["reviewTime"]))
df = df.withColumn("month", month(df["reviewTime"]))
df = df.withColumn("year", year(df["reviewTime"]))


In [17]:
from pyspark.sql.functions import countDistinct

# Tính số lượng giá trị duy nhất trong cột "reviewerID"
unique_count = df.agg(countDistinct("reviewerID").alias("unique_count"))

# Lấy giá trị số lượng duy nhất
result = unique_count.collect()[0]["unique_count"]

# Hiển thị kết quả
print(result)

15356625


In [18]:
from pyspark.sql.functions import countDistinct

# Tính số lượng giá trị duy nhất trong cột 'reviewerName'
unique_count = df.agg(countDistinct('reviewerName').alias("unique_count"))

# Lấy giá trị số lượng duy nhất
result = unique_count.collect()[0]["unique_count"]

# Hiển thị kết quả
print(result)

8324006


Ta thấy reviewname < reviewID nên có thể có nhiều người trùng tên với nhau, vì vậy mình sẽ bỏ đi cột reviewName.

In [19]:
# Chọn tất cả các cột trừ cột 'reviewerName'
df = df.select([col for col in df.columns if col != 'reviewerName'])

In [20]:
from pyspark.sql.functions import countDistinct

# Tính số lượng giá trị duy nhất trong cột "asin"
unique_count = df.agg(countDistinct("asin").alias("unique_count"))

# Lấy giá trị số lượng duy nhất
result = unique_count.collect()[0]["unique_count"]

# Hiển thị kết quả
print(result)

2930225


### 'style' Column

In [21]:
from pyspark.sql import functions as F

# Nhóm theo cột 'style' và đếm số lần xuất hiện của mỗi giá trị
result = df.groupBy('style').count()

# Hiển thị kết quả
result.show()

Py4JJavaError: An error occurred while calling o221.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 94.0 failed 1 times, most recent failure: Lost task 2.0 in stage 94.0 (TID 3457) (LTC executor driver): java.lang.OutOfMemoryError: Java heap space
	at java.nio.HeapByteBuffer.<init>(HeapByteBuffer.java:57)
	at java.nio.ByteBuffer.allocate(ByteBuffer.java:335)
	at org.apache.spark.io.ReadAheadInputStream.<init>(ReadAheadInputStream.java:105)
	at org.apache.spark.util.collection.unsafe.sort.UnsafeSorterSpillReader.<init>(UnsafeSorterSpillReader.java:74)
	at org.apache.spark.util.collection.unsafe.sort.UnsafeSorterSpillWriter.getReader(UnsafeSorterSpillWriter.java:159)
	at org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter.getSortedIterator(UnsafeExternalSorter.java:555)
	at org.apache.spark.sql.execution.UnsafeExternalRowSorter.sort(UnsafeExternalRowSorter.java:172)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage5.hashAgg_doAggregateWithKeys_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage5.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$2.hasNext(WholeStageCodegenExec.scala:779)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:101)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.executor.Executor$TaskRunner$$Lambda$2383/635562420.apply(Unknown Source)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2785)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2721)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2720)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2720)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1206)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2984)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2923)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2912)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
Caused by: java.lang.OutOfMemoryError: Java heap space
	at java.nio.HeapByteBuffer.<init>(HeapByteBuffer.java:57)
	at java.nio.ByteBuffer.allocate(ByteBuffer.java:335)
	at org.apache.spark.io.ReadAheadInputStream.<init>(ReadAheadInputStream.java:105)
	at org.apache.spark.util.collection.unsafe.sort.UnsafeSorterSpillReader.<init>(UnsafeSorterSpillReader.java:74)
	at org.apache.spark.util.collection.unsafe.sort.UnsafeSorterSpillWriter.getReader(UnsafeSorterSpillWriter.java:159)
	at org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter.getSortedIterator(UnsafeExternalSorter.java:555)
	at org.apache.spark.sql.execution.UnsafeExternalRowSorter.sort(UnsafeExternalRowSorter.java:172)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage5.hashAgg_doAggregateWithKeys_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage5.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$2.hasNext(WholeStageCodegenExec.scala:779)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:101)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.executor.Executor$TaskRunner$$Lambda$2383/635562420.apply(Unknown Source)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)


Lọc ra cái style ????

In [26]:
from pyspark.sql.functions import col

df = df.filter(col("style") == " Kindle Edition")

# Hiển thị kết quả
df.show()



+----------+-----+-------+--------------------+----------+--------------+---------------+--------------------+--------------+--------+----+-----+---+-----+----+
|      asin|image|overall|          reviewText|reviewTime|    reviewerID|          style|             summary|unixReviewTime|verified|vote|title|day|month|year|
+----------+-----+-------+--------------------+----------+--------------+---------------+--------------------+--------------+--------+----+-----+---+-----+----+
|B000FA5KKA| null|    5.0|Best sci-fi novel...|2017-07-05|A1LC8JBYBO82AA| Kindle Edition|A.K. Barnes at hi...|    1499212800|    true|null|     |  5|    7|2017|
|B000FA5KKA| null|    5.0|Arthur K. Barnes ...|2016-04-22|A1V070P3VG7XEM| Kindle Edition|Classic SF at its...|    1461283200|    true|null|     | 22|    4|2016|
|B000FA5KKA| null|    5.0|         Great story|2016-01-31| A67ZKMMBKOP24| Kindle Edition|          Five Stars|    1454198400|    true|null|     | 31|    1|2016|
|B000FA5KKA| null|    5.0|Good stu

                                                                                

In [27]:
df.count()

                                                                                

5043778

In [28]:
from pyspark.sql.functions import countDistinct

# Tính số lượng giá trị duy nhất trong cột "summary"
unique_count = df.select(countDistinct("summary")).first()[0]

# In kết quả
print(unique_count)



2600164


                                                                                

In [29]:
first_summary = df.select(col("summary")).first()[0]
# In kết quả
print(first_summary)

[Stage 161:>                                                        (0 + 8) / 8]

Love the WB way of life


                                                                                

In [30]:
first_review = df.select(col('reviewText')).first()[0]
# In kết quả
print(first_review)

[Stage 166:>                                                        (0 + 8) / 8]

Excellent way of life that everyone can follow and succeed in.


                                                                                

Cột summary giữ lại để làm phần RS

In [31]:
from pyspark.sql.functions import col

# Tính số lượng giá trị duy nhất trong cột "vote" và lấy các giá trị phổ biến nhất
vote_counts = df.groupBy("vote").count().orderBy(col("count").desc())

# Kiểm tra nếu DataFrame không rỗng
if vote_counts.count() > 0:
    # Lấy 5 giá trị phổ biến nhất
    top_votes = vote_counts.head(5)

    # In kết quả
    for vote, count in top_votes:
        print(vote, count)
else:
    print("Không có dữ liệu hoặc không có giá trị phổ biến trong cột 'vote'.")


23/07/15 22:17:24 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/07/15 22:17:24 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/07/15 22:17:24 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/07/15 22:17:24 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/07/15 22:17:24 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/07/15 22:17:24 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/07/15 22:17:24 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/07/15 22:17:24 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.

None 4414873
2 288321
3 122719
4 64748
5 38274


                                                                                

In [32]:
from pyspark.sql.functions import col

# Tính số lượng giá trị duy nhất trong cột "image" và lấy các giá trị phổ biến nhất
image_counts = df.groupBy("image").count().orderBy(col("count").desc())

# Kiểm tra nếu DataFrame không rỗng
if image_counts.count() > 0:
    # Lấy 5 giá trị phổ biến nhất
    top_images = image_counts.head(5)

    # In kết quả
    for image, count in top_images:
        print(image, count)
else:
    print("Không có dữ liệu hoặc không có giá trị phổ biến trong cột 'image'.")

[Stage 211:>                                                        (0 + 8) / 8]

None 5039911
['https://images-na.ssl-images-amazon.com/images/I/51-Fi5BvzbL._SY88.jpg'] 9
['https://images-na.ssl-images-amazon.com/images/I/71eN5U4UQoL._SY88.jpg'] 7
['https://images-na.ssl-images-amazon.com/images/I/61Rr6pfHbeL._SY88.jpg'] 7
['https://images-na.ssl-images-amazon.com/images/I/01+nEU0JBaL._SY88.jpg'] 6


                                                                                

Bỏ cột image với vote vì có quá nhiều null.

In [33]:
df = df.drop("image", "vote")

Mình cũng sẽ bỏ luôn cột unixReviewTime vì mình đã có đầy đủ dữ kiện về thời gian

In [34]:
df = df.drop('unixReviewTime')

In [35]:
df = df.withColumn("overall", col("overall").cast("integer"))

In [37]:
overall_counts = df.groupBy("overall").count()

# In kết quả
overall_counts.show()

23/07/15 22:18:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.

+-------+-------+
|overall|  count|
+-------+-------+
|      1| 214901|
|      3| 447279|
|      5|3051786|
|      4|1139110|
|      2| 190701|
|      0|      1|
+-------+-------+



                                                                                

In [38]:
df = df.filter(col("overall") != 0)

Để phân loại các đánh giá tích cực hoặc tiêu cực, mình sẽ gom 4 và 5 cùng nhau thành tích cực và mã hóa chúng thành 2.Mã hóa xếp hạng 3 thành 0 và xếp hạng 1 và 2 thành 1.

In [39]:
def calc_sentiment_with_neutral(overall):
    '''encoding the sentiments of the ratings.'''
    if overall == 5 or overall == 4:
        return 2
    elif overall == 1 or overall==2 : 
        return 0
    else:
        return 1

In [40]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

# Định nghĩa hàm udf từ hàm calc_sentiment_with_neutral
calc_sentiment_udf = udf(calc_sentiment_with_neutral, IntegerType())

# Áp dụng hàm calc_sentiment_with_neutral lên cột "overall" và tạo cột "sentiment"
df = df.withColumn("sentiment", calc_sentiment_udf("overall"))

In [41]:
df.show()

[Stage 229:>                                                        (0 + 1) / 1]

+----------+-------+--------------------+----------+--------------+---------------+--------------------+--------+-----+---+-----+----+---------+
|      asin|overall|          reviewText|reviewTime|    reviewerID|          style|             summary|verified|title|day|month|year|sentiment|
+----------+-------+--------------------+----------+--------------+---------------+--------------------+--------+-----+---+-----+----+---------+
|B000FA5KKA|      5|Best sci-fi novel...|2017-07-05|A1LC8JBYBO82AA| Kindle Edition|A.K. Barnes at hi...|    true|     |  5|    7|2017|        2|
|B000FA5KKA|      5|Arthur K. Barnes ...|2016-04-22|A1V070P3VG7XEM| Kindle Edition|Classic SF at its...|    true|     | 22|    4|2016|        2|
|B000FA5KKA|      5|         Great story|2016-01-31| A67ZKMMBKOP24| Kindle Edition|          Five Stars|    true|     | 31|    1|2016|        2|
|B000FA5KKA|      5|Good stuff. well ...|2014-04-04| ASOP1MX20LD8K| Kindle Edition|Gerry Carlyle is ...|    true|     |  4|    4|2

                                                                                

In [43]:
sentiment_counts = df.groupBy("sentiment").count()
# In kết quả
sentiment_counts.show()

23/07/15 22:18:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/07/15 22:18:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/07/15 22:18:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/07/15 22:18:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/07/15 22:18:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/07/15 22:18:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/07/15 22:18:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/07/15 22:18:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/07/15 22:18:52 WARN RowBasedKeyValueBatch: Calling spill() on

+---------+-------+
|sentiment|  count|
+---------+-------+
|        1| 447279|
|        2|4190896|
|        0| 405602|
+---------+-------+



                                                                                

# Review Column

In [44]:
from pyspark.sql.functions import lower
# Chuyển đổi chữ cái trong cột "reviewText" thành chữ thường
df = df.withColumn("reviewText", lower(col("reviewText")))

In [45]:
df.select("reviewText").show(truncate=False)

[Stage 249:>                                                        (0 + 8) / 8]

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|reviewText                                                                                                                                                                                                                                                     

                                                                                

In [46]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [47]:
def remove_punctuation(text):
    text_no_punc = "".join([c for c in text if c not in string.punctuation])
    return text_no_punc

In [48]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType


# Định nghĩa hàm udf từ hàm remove_punctuation
remove_punc_udf = udf(remove_punctuation, StringType())

# Áp dụng hàm remove_punctuation lên cột "reviewText" và tạo cột mới "review_clean"
df = df.withColumn("review_clean", remove_punc_udf("reviewText"))
# In một số hàng đầu của DataFrame
df.show(5)



+----------+-------+--------------------+----------+--------------+---------------+--------------------+--------+-----+---+-----+----+---------+--------------------+
|      asin|overall|          reviewText|reviewTime|    reviewerID|          style|             summary|verified|title|day|month|year|sentiment|        review_clean|
+----------+-------+--------------------+----------+--------------+---------------+--------------------+--------+-----+---+-----+----+---------+--------------------+
|B000FA5KKA|      5|best sci-fi novel...|2017-07-05|A1LC8JBYBO82AA| Kindle Edition|A.K. Barnes at hi...|    true|     |  5|    7|2017|        2|best scifi novel ...|
|B000FA5KKA|      5|arthur k. barnes ...|2016-04-22|A1V070P3VG7XEM| Kindle Edition|Classic SF at its...|    true|     | 22|    4|2016|        2|arthur k barnes w...|
|B000FA5KKA|      5|         great story|2016-01-31| A67ZKMMBKOP24| Kindle Edition|          Five Stars|    true|     | 31|    1|2016|        2|         great story|
|B00

                                                                                

In [49]:
df.write.json("cleaned_data.json")



AnalysisException: [PATH_ALREADY_EXISTS] Path file:/Users/phongminh/kindle data/notebooks/cleaned_data.json already exists. Set mode as "overwrite" to overwrite the existing path.