In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, get_json_object
import logging
from pyspark.sql.functions import col, count,desc
from pyspark.sql.window import Window

# Logging setup: create this module's logger and configure the root logger at INFO.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [2]:
# Build (or fetch the already-running) Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("ExtractOrder")
    .getOrCreate()
)

In [3]:

# Read the CSV file: the first row holds column names and column types are inferred.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("dataset_with_name.csv")
)



In [4]:
# Preview the first 5 rows of the loaded data.
df.show(n=5)

+-------------------+----------+----------+-------------------+--------------------+-------+------+---------+--------------------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|  brand| price|  user_id|        user_session|                name|
+-------------------+----------+----------+-------------------+--------------------+-------+------+---------+--------------------+--------------------+
|2019-11-16 11:54:03|      view|   1004858|2053013555631882655|electronics.smart...|samsung|128.42|557027114|56c4fa27-2298-400...|Smartphone Samsun...|
|2019-11-05 09:36:31|      view|   1004872|2053013555631882655|electronics.smart...|samsung|270.23|513209166|56b2bf0c-18c7-4b4...|Smartphone Samsun...|
|2019-11-16 13:21:15|      view|   2702053|2053013563911439225|appliances.kitche...|     lg|516.33|564970140|ac97a1c3-8412-4b6...|Refrigerators Lg ...|
|2019-11-17 13:40:34|      cart|   1802037|2053013554415534427|electronics.video.tv|    

In [5]:
# Print each column's name, inferred type and nullability.
df.printSchema()

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)
 |-- name: string (nullable = true)



In [6]:
# Keep only purchase events. The event_type value in the data is the literal
# "purchase" (not "purchased").
purchased_df = df.filter(col("event_type") == "purchase")

# Count the number of purchase events per user_id.
user_purchase_counts = purchased_df.groupBy("user_id").agg(
    count("*").alias("purchase_count")
)

# Take the 3 users with the most purchases. user_id is a secondary sort key so
# that ties on purchase_count are broken the same way on every run; sorting on
# the count alone leaves the choice among tied users unspecified.
top_3_users = user_purchase_counts.orderBy(
    desc("purchase_count"), col("user_id")
).limit(3)

# Bring the (at most 3) user ids back to the driver as a plain Python list.
top_user_ids = [row["user_id"] for row in top_3_users.collect()]

# Select every purchase row that belongs to one of those users.
top_users_data = purchased_df.filter(col("user_id").isin(top_user_ids))

# Display the result without truncating long column values.
top_users_data.show(truncate=False)

+-------------------+----------+----------+-------------------+--------------------------------------+-------+-------+---------+------------------------------------+-----------------------------+
|event_time         |event_type|product_id|category_id        |category_code                         |brand  |price  |user_id  |user_session                        |name                         |
+-------------------+----------+----------+-------------------+--------------------------------------+-------+-------+---------+------------------------------------+-----------------------------+
|2019-11-07 13:14:35|purchase  |1004833   |2053013555631882655|electronics.smartphone                |samsung|169.94 |564068124|cbbf5bc0-5b8b-4168-834f-c5190eabf3e5|Smartphone Samsung 1004833   |
|2019-11-02 15:54:04|purchase  |1004767   |2053013555631882655|electronics.smartphone                |samsung|241.97 |564068124|3b00665a-daff-4a2c-bba2-a152cc6e62c9|Smartphone Samsung 1004767   |
|2019-11-02 19:52:43