Before we start, we need to make sure that we have a Kafka cluster running and a topic that produces some streaming data. For simplicity, we will use a single-node Kafka cluster and a topic named `events`. Open the `4.0 events-gen-kafka.ipynb` notebook and run all of its cells. That notebook produces an event record every second and puts it on a Kafka topic called `events`.

In [14]:
# Delta Lake helper that adds the Delta (and any extra) packages to a session builder.
from delta import configure_spark_with_delta_pip
# Entry point for Spark SQL and Structured Streaming.
from pyspark.sql import SparkSession
# Column functions used below to parse, timestamp, window and count the events.
from pyspark.sql.functions import col, from_json, window, count, to_timestamp
# Types used to declare the schema of the JSON payload carried in each Kafka message.
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [15]:
# Describe the Spark session: application name, cluster master, executor
# memory, and the Delta Lake SQL extension and catalog.
builder = (
    SparkSession.builder
    .appName("apply-window-aggregations")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "512m")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# The Kafka connector is passed as an extra package so that the "kafka"
# streaming source used later in the notebook can be resolved.
spark = configure_spark_with_delta_pip(
    builder, ['org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1']
).getOrCreate()
spark.sparkContext.setLogLevel("ERROR")  # only show ERROR-level log lines in the notebook

In [16]:
# Open a streaming DataFrame over the `events` Kafka topic, starting from the
# earliest available offsets.
df = (spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "kafka:9092")
      .option("subscribe", "events")
      .option("startingOffsets", "earliest")
      .load())

In [17]:
# Schema of the JSON document stored in each Kafka message value. Both time
# fields are declared as strings here and converted to timestamps later.
schema = StructType([
    StructField('user_id', IntegerType(), True),
    StructField('event_type', StringType(), True),
    StructField('event_time', StringType(), True),
    StructField('processing_time', StringType(), True)])

# Replace the `value` column with the struct parsed from its string form.
df = df.withColumn('value', from_json(col('value').cast("STRING"), schema))

In [18]:
# Flatten the parsed struct into top-level columns, then convert both time
# columns from "MM/dd/yyyy, HH:mm:ss" strings into timestamps.
df = (df
      .select(
          col('value.user_id').alias('user_id'),
          col('value.event_type').alias('event_type'),
          col('value.event_time').alias('event_time'),
          col('value.processing_time').alias('processing_time'))
      .withColumn(
          "event_time",
          to_timestamp(col("event_time"), "MM/dd/yyyy, HH:mm:ss"))
      .withColumn(
          "processing_time",
          to_timestamp(col("processing_time"), "MM/dd/yyyy, HH:mm:ss")))

In [19]:
# Group events by a 60-minute window on `event_time` (window length and slide
# are both "60 minute") and by event type.
# NOTE: count(user_id) counts rows with a non-null user_id, not distinct
# users, despite the "NumberOfUsers" alias.
df = (df
      .groupBy(
          window(col("event_time"), "60 minute", "60 minute"),
          col("event_type"))
      .agg(count(col("user_id")).alias("NumberOfUsers")))

In [20]:
# Complete output mode: start a streaming query that prints the aggregated
# result to the console without truncating column values.
query = (df.writeStream
    .outputMode('complete')
    .format('console')
    .option("truncate", False)
    .start())

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+------------------------------------------+----------+-------------+
|window                                    |event_type|NumberOfUsers|
+------------------------------------------+----------+-------------+
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|view      |4            |
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|click     |5            |
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|purchase  |4            |
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|like      |8            |
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|share     |2            |
+------------------------------------------+----------+-------------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------------+----------+-------------+
|window                                    |event_type|NumberOfUsers|
+------------------------------------------+----------+-------------+
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|view      |4            |
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|click     |5            |
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|purchase  |5            |
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|like      |8            |
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|share     |3            |
+------------------------------------------+----------+-------------+



[Stage 4:>                                                          (0 + 1) / 1]

In [21]:
# Stop the streaming query started in the previous cell.
query.stop()

24/02/04 18:15:30 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 2, writer: ConsoleWriter[numRows=20, truncate=false]] is aborting.
24/02/04 18:15:30 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 2, writer: ConsoleWriter[numRows=20, truncate=false]] aborted.


In [22]:
# Update output mode: start a console query with outputMode("update").
# The mode must be set explicitly; it was previously swallowed by a comment.
query = (df.writeStream
    .outputMode("update")
    .format("console")
    .option("truncate", False)
    .start())

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+------------------------------------------+----------+-------------+
|window                                    |event_type|NumberOfUsers|
+------------------------------------------+----------+-------------+
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|view      |4            |
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|click     |5            |
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|purchase  |5            |
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|like      |8            |
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|share     |4            |
+------------------------------------------+----------+-------------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------------+----------+-------------+
|window                                    |event_type|NumberOfUsers|
+------------------------------------------+----------+-------------+
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|like      |9            |
+------------------------------------------+----------+-------------+





In [25]:
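# # Append output mode
# # NOTE(review): left disabled — presumably because append mode on a streaming
# # aggregation requires a watermark, which this notebook does not define; confirm.
# query = (df.writeStream
#     .outputMode("append")
#     .format("console")
#     .option("truncate", False)
#     .start())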

In [26]:
# Stop the most recently started streaming query (the one bound to `query`).
query.stop()

In [27]:
spark.stop()  # Stop the Spark session and release its resources