In [1]:
import os

# Work around the "python3" lookup failure on Windows by pointing both the
# Spark workers and the driver at the "python" executable.
os.environ.update({
    "PYSPARK_PYTHON": "python",
    "PYSPARK_DRIVER_PYTHON": "python",
})

from pyspark.sql import SparkSession

# Local Spark session using every available core and an 8 GB driver heap.
spark = (
    SparkSession.builder
    .appName("AmazonReviewPreprocess")
    .master("local[*]")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

# Input locations: product metadata is matched by a glob over Parquet files,
# the reviews live in a single JSON-lines file.
meta_path = r"file:///C:/Users/PC/OneDrive/Dokumen/Amazon_sales_forecasting/data/metadata/meta_Clothing_Shoes_and_Jewelry_*.parquet"
review_path = r"D:/Clothing_Shoes_and_Jewelry/Clothing_Shoes_and_Jewelry.jsonl"

# Both reads are lazy; nothing is materialised until an action runs.
df_review = spark.read.json(review_path)
df_meta = spark.read.parquet(meta_path)

In [2]:
from pyspark.sql.functions import to_date, from_unixtime, col

# Reviews: derive a calendar "date" from the epoch "timestamp" column (the
# value is divided by 1000 before conversion, i.e. it is treated as
# milliseconds), then keep only the columns that matter downstream.
df_review = (
    df_review
    .withColumn("date", to_date(from_unixtime(col("timestamp") / 1000)))
    .select("parent_asin", "rating", "text", "user_id", "date")
)

# Metadata: columns we would like to keep, in output order.
cols_needed = [
    "parent_asin",
    "title",
    "price",
    "average_rating",
    "rating_number",
    "categories",
    "features",
    "description",
    "main_category",
    "store",
]

# Tolerate metadata files that lack some of the wanted columns by selecting
# only the names that are actually present.
df_meta = df_meta.select(*(name for name in cols_needed if name in df_meta.columns))

In [3]:
from pyspark.sql.functions import col, pandas_udf, when
import pandas as pd
from transformers import pipeline

# Attach product metadata to every review. A left join keeps reviews whose
# product has no matching metadata row.
df = df_review.join(df_meta, on="parent_asin", how="left")

# Drop reviews without text BEFORE scoring, so the sentiment model is never
# handed a null value. (There is no verified_purchase filter here: that
# column is not among the review columns selected earlier.)
df = df.filter(col("text").isNotNull())

# Load the sentiment-analysis model from Hugging Face.
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")


@pandas_udf("int")
def compute_sentiment(text_series: pd.Series) -> pd.Series:
    """Score a batch of review texts with the sentiment pipeline.

    Args:
        text_series: Batch of review texts; entries may be null.

    Returns:
        A Series of the same length holding 1 where the model label is
        'POSITIVE', -1 for any other label, and None where the input text
        was not a string (so a null text yields a null score instead of an
        exception inside the tokenizer).
    """
    texts = text_series.tolist()
    scorable = [i for i, text in enumerate(texts) if isinstance(text, str)]

    # object dtype keeps the None placeholders as real nulls instead of NaN.
    scores = [None] * len(texts)
    if scorable:  # skip the model call entirely for an empty/all-null batch
        results = sentiment_pipeline([texts[i] for i in scorable], truncation=True)
        for i, result in zip(scorable, results):
            scores[i] = 1 if result['label'] == 'POSITIVE' else -1
    return pd.Series(scores, dtype="object")


# Add the sentiment score as a new column.
df = df.withColumn("sentiment_score", compute_sentiment(df["text"]))

df.printSchema()

df.show(5)

# Export a 10,000-row sample to CSV via pandas.
# NOTE(review): to_csv also writes the pandas index as an unnamed first
# column; left unchanged in case downstream readers rely on it.
pandas_df = df.limit(10000).toPandas()

pandas_df.to_csv("tft_10000.csv")

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cuda:0


root
 |-- parent_asin: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- text: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- date: date (nullable = true)
 |-- title: string (nullable = true)
 |-- price: double (nullable = true)
 |-- average_rating: double (nullable = true)
 |-- rating_number: long (nullable = true)
 |-- categories: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- description: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- main_category: string (nullable = true)
 |-- store: string (nullable = true)
 |-- sentiment_score: integer (nullable = true)

+-----------+------+--------------------+--------------------+----------+--------------------+-----+--------------+-------------+--------------------+--------------------+--------------------+--------------+------------+---------------+
|parent_

# Xây dựng và huấn luyện mô hình Temporal Fusion Transformer (TFT)