Before we start, we need to make sure that we have a Kafka cluster running and a topic that produces some streaming data. For simplicity, we will use a single-node Kafka cluster and a topic named `users`. Open the `5.0 user-gen-kafka.ipynb` notebook and execute the cell. This notebook produces a user record every few seconds and puts it on the Kafka topic called `users`. 

In [26]:
# Delta Lake helpers: session configuration and the DeltaTable merge API.
from delta import configure_spark_with_delta_pip, DeltaTable
from pyspark.sql import SparkSession  # SparkSession for Spark SQL work
from pyspark.sql.functions import col, from_json  # Spark SQL functions
from pyspark.sql.types import StructType, StructField, IntegerType, StringType  # Spark SQL data types

In [27]:
# Configure a SparkSession builder with the Delta Lake SQL extension and catalog.
builder = (SparkSession.builder
           .appName("merge-cdc-streaming")  # application name
           .master("spark://spark-master:7077")  # Spark master URL
           .config("spark.executor.memory", "512m")
           .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
           .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"))

# The Kafka connector is passed as an extra package next to the Delta ones;
# getOrCreate() builds the session or returns an existing one.
spark = configure_spark_with_delta_pip(builder,['org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1']).getOrCreate()
spark.sparkContext.setLogLevel("ERROR")  # set the log level to ERROR

In [28]:
# Load the sparksql_magic extension (provides the %%sparksql cell magic used
# below) and set its SparkSql.limit option to 20.
get_ipython().run_line_magic('load_ext', 'sparksql_magic')
get_ipython().run_line_magic('config', 'SparkSql.limit=20')

The sparksql_magic extension is already loaded. To reload it, use:
  %reload_ext sparksql_magic


In [29]:
%%sparksql
-- Create (or replace) the Delta table that the streaming merge writes into.
CREATE OR REPLACE TABLE default.users (
    id INT,
    name STRING,
    age INT,
    gender STRING,
    country STRING 
) USING DELTA LOCATION '/opt/workspace/data/delta_lake/merge-cdc-streaming/users';

                                                                                

In [30]:
# Read the `users` Kafka topic as a streaming DataFrame, starting from the
# earliest available offsets.
df = (spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "kafka:9092")
      .option("subscribe", "users")
      .option("startingOffsets", "earliest")
      .load())

In [31]:
# Schema of the JSON user record; its fields mirror the columns of the
# `default.users` table created above. All fields are nullable.
schema = StructType([
    StructField('id', IntegerType(), True),
    StructField('name', StringType(), True),
    StructField('age', IntegerType(), True),
    StructField('gender', StringType(), True),
    StructField('country', StringType(), True)])

# Cast the `value` column to a string and parse it as JSON into a struct,
# replacing the original `value` column.
df = df.withColumn('value', from_json(col('value').cast("STRING"), schema))

In [32]:
# Flatten the parsed `value` struct into top-level columns.
df = df.select(
    col('value.id').alias('id'),
    col('value.name').alias('name'),
    col('value.age').alias('age'),
    col('value.gender').alias('gender'),
    col('value.country').alias('country'))

In [33]:
def upsertToDelta(microBatchDf, batchId):
    """Merge one streaming micro-batch into the `users` Delta table.

    Rows whose ``id`` already exists in the table are updated; all other rows
    are inserted.

    Args:
        microBatchDf: DataFrame holding the rows of the current micro-batch,
            with columns ``id``, ``name``, ``age``, ``gender``, ``country``.
        batchId: Identifier of the micro-batch. Unused here; it is part of the
            callback signature used by ``foreachBatch``.
    """
    deltaTable = DeltaTable.forPath(spark, "/opt/workspace/data/delta_lake/merge-cdc-streaming/users" )

    # Target column -> source expression. Every table column is mapped from
    # the source column of the same name; previously `age` was populated from
    # `sdf.gender` and `gender` was never written.
    columnMapping = {
        "id": "sdf.id",
        "name": "sdf.name",
        "age": "sdf.age",
        "gender": "sdf.gender",
        "country": "sdf.country"
    }

    # NOTE(review): if a micro-batch can contain several rows with the same id,
    # consider de-duplicating it before the merge -- confirm against the producer.
    (deltaTable.alias("dt")
     .merge(source=microBatchDf.alias("sdf"),
          condition="sdf.id = dt.id")
     .whenMatchedUpdate(set=columnMapping)
     .whenNotMatchedInsert(values=columnMapping)
    .execute())

In [34]:
# Start the streaming query: each micro-batch is handed to upsertToDelta,
# and progress is tracked in the checkpoint directory.
query = (df.writeStream
         .format("delta")
         .foreachBatch(upsertToDelta)
         .outputMode("update")
         .option("checkpointLocation", "/opt/workspace/data/delta_lake/merge-cdc-streaming/users/_checkpoints/")
         .start("/opt/workspace/data/delta_lake/merge-cdc-streaming/users"))

                                                                                

In [35]:
%%sparksql
-- Show the commit history of the Delta table stored at this path.
DESCRIBE HISTORY delta.`/opt/workspace/data/delta_lake/merge-cdc-streaming/users`;

                                                                                

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
7,2024-02-04 18:39:29.470000,,,MERGE,"{'matchedPredicates': '[{""actionType"":""update""}]', 'predicate': '[""(id#1869 = id#5736)""]', 'notMatchedBySourcePredicates': '[]', 'notMatchedPredicates': '[{""actionType"":""insert""}]'}",,,,6,Serializable,False,"{'numOutputRows': '175', 'numTargetBytesAdded': '2537', 'numTargetRowsInserted': '0', 'numTargetFilesAdded': '1', 'numTargetRowsMatchedDeleted': '0', 'numTargetFilesRemoved': '1', 'numTargetRowsMatchedUpdated': '2', 'executionTimeMs': '2726', 'numTargetRowsCopied': '173', 'rewriteTimeMs': '646', 'numTargetRowsUpdated': '2', 'numTargetRowsDeleted': '0', 'scanTimeMs': '2030', 'numSourceRows': '1', 'numTargetChangeFilesAdded': '0', 'numTargetRowsNotMatchedBySourceUpdated': '0', 'numTargetRowsNotMatchedBySourceDeleted': '0', 'numTargetBytesRemoved': '2537'}",,Apache-Spark/3.4.1 Delta-Lake/2.4.0
6,2024-02-04 18:39:20.134000,,,MERGE,"{'matchedPredicates': '[{""actionType"":""update""}]', 'predicate': '[""(id#1869 = id#4971)""]', 'notMatchedBySourcePredicates': '[]', 'notMatchedPredicates': '[{""actionType"":""insert""}]'}",,,,5,Serializable,False,"{'numOutputRows': '175', 'numTargetBytesAdded': '2537', 'numTargetRowsInserted': '0', 'numTargetFilesAdded': '1', 'numTargetRowsMatchedDeleted': '0', 'numTargetFilesRemoved': '1', 'numTargetRowsMatchedUpdated': '1', 'executionTimeMs': '2531', 'numTargetRowsCopied': '174', 'rewriteTimeMs': '549', 'numTargetRowsUpdated': '1', 'numTargetRowsDeleted': '0', 'scanTimeMs': '1949', 'numSourceRows': '1', 'numTargetChangeFilesAdded': '0', 'numTargetRowsNotMatchedBySourceUpdated': '0', 'numTargetRowsNotMatchedBySourceDeleted': '0', 'numTargetBytesRemoved': '2541'}",,Apache-Spark/3.4.1 Delta-Lake/2.4.0
5,2024-02-04 18:39:08.867000,,,MERGE,"{'matchedPredicates': '[{""actionType"":""update""}]', 'predicate': '[""(id#1869 = id#4206)""]', 'notMatchedBySourcePredicates': '[]', 'notMatchedPredicates': '[{""actionType"":""insert""}]'}",,,,4,Serializable,False,"{'numOutputRows': '175', 'numTargetBytesAdded': '2541', 'numTargetRowsInserted': '0', 'numTargetFilesAdded': '1', 'numTargetRowsMatchedDeleted': '0', 'numTargetFilesRemoved': '1', 'numTargetRowsMatchedUpdated': '2', 'executionTimeMs': '2153', 'numTargetRowsCopied': '173', 'rewriteTimeMs': '546', 'numTargetRowsUpdated': '2', 'numTargetRowsDeleted': '0', 'scanTimeMs': '1579', 'numSourceRows': '1', 'numTargetChangeFilesAdded': '0', 'numTargetRowsNotMatchedBySourceUpdated': '0', 'numTargetRowsNotMatchedBySourceDeleted': '0', 'numTargetBytesRemoved': '2544'}",,Apache-Spark/3.4.1 Delta-Lake/2.4.0
4,2024-02-04 18:38:59.384000,,,MERGE,"{'matchedPredicates': '[{""actionType"":""update""}]', 'predicate': '[""(id#1869 = id#3441)""]', 'notMatchedBySourcePredicates': '[]', 'notMatchedPredicates': '[{""actionType"":""insert""}]'}",,,,3,Serializable,False,"{'numOutputRows': '175', 'numTargetBytesAdded': '2544', 'numTargetRowsInserted': '0', 'numTargetFilesAdded': '1', 'numTargetRowsMatchedDeleted': '0', 'numTargetFilesRemoved': '1', 'numTargetRowsMatchedUpdated': '2', 'executionTimeMs': '2485', 'numTargetRowsCopied': '173', 'rewriteTimeMs': '844', 'numTargetRowsUpdated': '2', 'numTargetRowsDeleted': '0', 'scanTimeMs': '1620', 'numSourceRows': '1', 'numTargetChangeFilesAdded': '0', 'numTargetRowsNotMatchedBySourceUpdated': '0', 'numTargetRowsNotMatchedBySourceDeleted': '0', 'numTargetBytesRemoved': '2551'}",,Apache-Spark/3.4.1 Delta-Lake/2.4.0
3,2024-02-04 18:38:53.107000,,,MERGE,"{'matchedPredicates': '[{""actionType"":""update""}]', 'predicate': '[""(id#1869 = id#2676)""]', 'notMatchedBySourcePredicates': '[]', 'notMatchedPredicates': '[{""actionType"":""insert""}]'}",,,,2,Serializable,False,"{'numOutputRows': '175', 'numTargetBytesAdded': '2551', 'numTargetRowsInserted': '1', 'numTargetFilesAdded': '1', 'numTargetRowsMatchedDeleted': '0', 'numTargetFilesRemoved': '1', 'numTargetRowsMatchedUpdated': '8', 'executionTimeMs': '6020', 'numTargetRowsCopied': '166', 'rewriteTimeMs': '1980', 'numTargetRowsUpdated': '8', 'numTargetRowsDeleted': '0', 'scanTimeMs': '4015', 'numSourceRows': '5', 'numTargetChangeFilesAdded': '0', 'numTargetRowsNotMatchedBySourceUpdated': '0', 'numTargetRowsNotMatchedBySourceDeleted': '0', 'numTargetBytesRemoved': '2544'}",,Apache-Spark/3.4.1 Delta-Lake/2.4.0
2,2024-02-04 18:38:42.222000,,,MERGE,"{'matchedPredicates': '[{""actionType"":""update""}]', 'predicate': '[""(id#1869 = id#1899)""]', 'notMatchedBySourcePredicates': '[]', 'notMatchedPredicates': '[{""actionType"":""insert""}]'}",,,,1,Serializable,False,"{'numOutputRows': '174', 'numTargetBytesAdded': '2544', 'numTargetRowsInserted': '174', 'numTargetFilesAdded': '1', 'numTargetRowsMatchedDeleted': '0', 'numTargetFilesRemoved': '0', 'numTargetRowsMatchedUpdated': '0', 'executionTimeMs': '6415', 'numTargetRowsCopied': '0', 'rewriteTimeMs': '2397', 'numTargetRowsUpdated': '0', 'numTargetRowsDeleted': '0', 'scanTimeMs': '3944', 'numSourceRows': '174', 'numTargetChangeFilesAdded': '0', 'numTargetRowsNotMatchedBySourceUpdated': '0', 'numTargetRowsNotMatchedBySourceDeleted': '0', 'numTargetBytesRemoved': '0'}",,Apache-Spark/3.4.1 Delta-Lake/2.4.0
1,2024-02-04 18:38:29.574000,,,CREATE OR REPLACE TABLE,"{'description': None, 'partitionBy': '[]', 'properties': '{}', 'isManaged': 'false'}",,,,0,Serializable,False,{},,Apache-Spark/3.4.1 Delta-Lake/2.4.0
0,2024-02-04 18:37:41.889000,,,CREATE OR REPLACE TABLE,"{'description': None, 'partitionBy': '[]', 'properties': '{}', 'isManaged': 'false'}",,,,,Serializable,True,{},,Apache-Spark/3.4.1 Delta-Lake/2.4.0


                                                                                

In [36]:
# Stop the streaming query started by writeStream.start().
query.stop()

In [37]:
spark.stop()  # Stop the Spark session and release its resources