In [1]:
from pyspark.sql import SparkSession 
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType
from pyspark.sql.functions import col, from_json, sum, window

#initialize spark sesion
spark = SparkSession.builder\
.appName("Quick Commerce Streaming Pipeline")\
.config("spark.jar.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.0")\
.getOrCreate()
#connect to our kafka topic, subcribe kafka topic to read data streaming
streaming_df = spark.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers", "localhost:9092") \
.option("subscribe", "ecommerce_topic") \
.load()

from pyspark.sql.functions import col, from_json, to_timestamp

# Convert Kafka message value from binary to a JSON string
streaming_df = streaming_df.selectExpr("CAST(value AS STRING) as json") \
    .selectExpr("from_json(json, 'transaction_id STRING, user_id STRING, amount DOUBLE, timestamp STRING') AS data") \
    .select("data.*")

# Convert the "amount" column from string to double for numerical operations
streaming_df = streaming_df.withColumn("amount", col("amount").cast("double"))

# Convert the "timestamp" column from string format to Spark's TimestampType
streaming_df = streaming_df.withColumn("timestamp", to_timestamp("timestamp", "yyyy-MM-dd HH:mm:ss"))

# ensure any data arrive after 5 mins won't be precessed
streaming_df = streaming_df.withWaterMark("timestamp", "5 minutes")

filtered_df = streaming_df.filter("amount > 1000")

# Group transactions into 10-minute windows per user and calculate the total amount spent
windowed_df = filtered_df.groupBy(
    window(col("timestamp"), "10 minutes"),
    col("user_id")).agg(
    sum(col("amount")).alias("total_amount"))


# Write the aggregated transaction data to Parquet format for efficient storage and fast querying.
# The output is stored in "/tmp/high_value_transactions" with a checkpoint at "/tmp/high_value_checkpoint".
# Checkpointing prevents duplicate processing and ensures fault tolerance.
# "append" mode ensures new data is continuously added without overwriting existing records.
# The stream starts and runs indefinitely, processing incoming data in real-time.
query = windowed_df.writeStream \
    .format("parquet") \
    .option("path", "/tmp/high_value_transactions") \
    .option("checkpointLocation", "/tmp/high_value_checkpoint") \
    .outputMode("append") \
    .start()

query.awaitTermination()




Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/08 14:30:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


AnalysisException: Failed to find data source: kafka. Please deploy the application as per the deployment section of Structured Streaming + Kafka Integration Guide.