In [None]:
from pyspark.sql.functions import col, from_json, sum, window
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, sum, window, to_timestamp


# Build (or reuse) the Spark session for this notebook; the Kafka source
# connector is requested through spark.jars.packages.
# NOTE(review): the connector version (3.4.0) presumably has to match the
# installed Spark version — confirm.
spark = (
    SparkSession.builder
    .appName("StreamingApp")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.0")
    .getOrCreate()
)



In [None]:

# Schema of the JSON payload; every field is declared nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("transaction_id", StringType()),
        ("user_id", StringType()),
        ("amount", DoubleType()),
        ("timestamp", TimestampType()),
    )
])

# Open a streaming read on the Kafka topic (records are not parsed here).
streaming_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "ecommerce_topic")
    .load()
)

In [None]:
# Directory used as the checkpointLocation of the console streaming query.
checkpoint_dir = "/tmp/quickcommerce_streaming_checkpoint"

In [None]:
# Start a streaming query that prints streaming_df to the console sink in
# append mode, checkpointing under checkpoint_dir.
query = (
    streaming_df.writeStream
    .outputMode("append")
    .format("console")
    .option("checkpointLocation", checkpoint_dir)
    .start()
)

In [None]:
# Cast the `value` column to a string, parse it as JSON using `schema`, and
# flatten the parsed struct into top-level columns.
streaming_df = (
    streaming_df
    .selectExpr("CAST(value AS STRING)")
    .select(from_json(col("value"), schema).alias("data"))
    .select("data.*")
)

In [None]:
# Print the stream to the console using a 10-second processing-time trigger.
streaming_query = (
    streaming_df.writeStream
    .format("console")
    .option("checkpointLocation", "/tmp/trigger_checkpoint")
    .trigger(processingTime="10 seconds")
    .start()
)

In [None]:

# Cast the event-time column to a timestamp first, and only then declare a
# 5-minute watermark on it.
streaming_df = (
    streaming_df
    .withColumn("timestamp", col("timestamp").cast(TimestampType()))
    .withWatermark("timestamp", "5 minute")
)

# Keep only transactions greater than $300.
filtered_df = streaming_df.where("amount > 300")

# Total of the filtered amounts per user.
aggregated_df = (
    filtered_df
    .groupBy("user_id")
    .agg(sum("amount").alias("total_amount"))
)

# Print the per-user totals to the console for testing (complete output mode).
(
    aggregated_df.writeStream
    .outputMode("complete")
    .format("console")
    .start()
)

# Total amount per user within 10-minute windows of the event timestamp.
windowed_df = (
    filtered_df
    .groupBy(window(col("timestamp"), "10 minutes"), col("user_id"))
    .agg(sum(col("amount")).alias("total_amount"))
)

# Write the windowed totals out as JSON files.
(
    windowed_df.writeStream
    .format("json")
    .option("path", "/tmp/late_data")
    .option("checkpointLocation", "/tmp/late_data_checkpoint")
    .start()
)

In [None]:
# Keep only transactions whose amount exceeds 1000.
filtered_df = streaming_df.where("amount > 1000")


In [None]:
from pyspark.sql.functions import col, when

# Label each transaction by amount tier. Conditions are evaluated top-down, so
# >= 5000 takes precedence over >= 3000; anything else falls back to "low value".
classified_df = filtered_df.withColumn(
    "classification",
    (
        when(col("amount") >= 5000, "very high value")
        .when(col("amount") >= 3000, "high value")
        .otherwise("low value")
    ),
)


In [None]:
# Print the classified stream to the console in append mode.
# awaitTermination() blocks this cell until the query stops or fails, so the
# cells below do not run while it is active.
(
    classified_df.writeStream
    .outputMode("append")
    .format("console")
    .option("checkpointLocation", "/tmp/classified_checkpoint")
    .start()
    .awaitTermination()
)


In [None]:
# Define the total transaction amount per user over the classified stream
# (this only builds the aggregation; no streaming query is started here).
aggregated_df = (
    classified_df
    .groupBy("user_id")
    .agg(sum("amount").alias("total_amount"))
)

In [None]:
# Register the classified streaming DataFrame as the temporary view
# "transactions" so it can be queried with Spark SQL.
classified_df.createOrReplaceTempView("transactions")

In [None]:
# Per-user, per-classification spend over transactions above 10000, ordered by
# total spend (descending), read from the "transactions" temporary view.
result_df = spark.sql("""
SELECT user_id, 
       SUM(amount) AS total_spent, 
       classification
FROM transactions
WHERE amount > 10000
GROUP BY user_id, classification
ORDER BY total_spent DESC 
""")

# Print the aggregated result to the console in complete output mode.
query = (
    result_df.writeStream
    .format("console")
    .outputMode("complete")
    .start()
)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, sum, window, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType

# Initialize the Spark session with the Kafka connector package.
# NOTE(review): the connector version here (3.5.1) presumably has to match the
# installed Spark version — confirm.
spark = (
    SparkSession.builder
    .appName("QuickCommerce Streaming Pipeline")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1")
    .getOrCreate()
)

# Schema of the JSON payload; every field is nullable. The timestamp is read
# as a string here and parsed into a real timestamp further down.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("transaction_id", StringType()),
        ("user_id", StringType()),
        ("amount", DoubleType()),
        ("timestamp", StringType()),
    )
])

# Read the stream from Kafka.
streaming_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "ecommerce_topic")
    .load()
)

# Turn the Kafka records into a structured DataFrame: cast `value` to a string,
# parse it as JSON with `schema`, and flatten the parsed struct.
streaming_df = (
    streaming_df
    .selectExpr("CAST(value AS STRING) as json")
    .select(from_json(col("json"), schema).alias("data"))
    .select("data.*")
)

# Convert column types, then declare a 5-minute watermark on the parsed
# event-time column.
streaming_df = (
    streaming_df
    .withColumn("amount", col("amount").cast("double"))
    .withColumn("timestamp", to_timestamp(col("timestamp"), "yyyy-MM-dd HH:mm:ss"))
    .withWatermark("timestamp", "5 minutes")
)

# Keep only high-value transactions (amount above 1000).
filtered_df = streaming_df.filter(col("amount") > 1000)

# Aggregate the amount per user over 10-minute windows.
windowed_df = (
    filtered_df
    .groupBy(window(col("timestamp"), "10 minutes"), col("user_id"))
    .agg(sum("amount").alias("total_amount"))
)

# Write the windowed totals to Parquet in append mode.
query = (
    windowed_df.writeStream
    .outputMode("append")
    .format("parquet")
    .option("path", "/tmp/high_value_transactions")
    .option("checkpointLocation", "/tmp/high_value_checkpoint")
    .start()
)

# Block until the streaming query stops or fails.
query.awaitTermination()
