Before we start, we need to make sure that we have a Kafka cluster running and a topic that receives some streaming data. For simplicity, we will use a single-node Kafka cluster and a topic named `orders`. Open the `5.0 orders-gen-kafka.ipynb` notebook and execute the cell. That notebook simulates a stream of online orders, each of which contains the order ID, the product ID, the quantity, and the timestamp.

In [5]:
# Delta Lake helpers: session configuration and the DeltaTable API
from delta import configure_spark_with_delta_pip, DeltaTable
# SparkSession entry point for Spark SQL work
from pyspark.sql import SparkSession
# Column functions used to parse the Kafka payload
from pyspark.sql.functions import col, from_json, to_timestamp
# Data types used to declare the payload schema
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [2]:
# Configure a SparkSession builder for this notebook.
builder = (SparkSession.builder
           .appName("joining-stream-static-data")
           .master("spark://spark-master:7077")
           .config("spark.executor.memory", "512m")
           # Register the Delta Lake SQL extension and catalog implementation
           .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
           .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"))

# Pass the Kafka connector as an extra package, then create the session or reuse an existing one.
spark = configure_spark_with_delta_pip(builder, ['org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1']).getOrCreate()
spark.sparkContext.setLogLevel("ERROR")  # Set the log level to ERROR

:: loading settings :: url = jar:file:/usr/local/lib/python3.10/dist-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-f4a9e442-56e1-4b01-bc24-a873d6fa472c;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.4.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.4.1 in central
	found org.apache.kafka#kafka-clients;3.3.2 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.1 in central
	found org.slf4j#slf4j-api;2.0.6 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central

In [3]:
# Load the sparksql_magic extension, then set its SparkSql.limit option to 20.
for magic_name, magic_args in (('load_ext', 'sparksql_magic'), ('config', 'SparkSql.limit=20')):
    get_ipython().run_line_magic(magic_name, magic_args)

In [6]:
# Define the schema of the streaming data (used to parse the JSON in each record's value)
streaming_schema = StructType([
    StructField("order_id", IntegerType()),
    StructField("product_id", IntegerType()),
    StructField("quantity", IntegerType()),
    StructField("timestamp", IntegerType())
])

# Read the `orders` topic as a stream, starting from the earliest offsets.
streaming_df = (spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "kafka:9092")
      .option("subscribe", "orders")
      .option("startingOffsets", "earliest")
      .option("failOnDataLoss", "false")
      .load()
      # Replace the raw `value` column with a struct parsed from its JSON text
      .withColumn('value', from_json(col('value').cast("STRING"), streaming_schema)))

# Flatten the parsed `value` struct into top-level order columns.
streaming_df = (streaming_df
      .select(
          col('value.order_id').alias('order_id'),
          col('value.product_id').alias('product_id'),
          col('value.quantity').alias('quantity'),
          # NOTE(review): "timestamp" is resolved against the top-level columns of the
          # frame, not `value.timestamp`. The Kafka source exposes a record-level
          # `timestamp` column, so this presumably selects the record time rather than
          # the payload field — confirm that is intended.
          to_timestamp(col("timestamp"), "MM/dd/yyyy, HH:mm:ss").alias('timestamp'))
     )

In [7]:
# Static product data: one (product_id, name, price) tuple per product
product_details = [
    (1001, "Laptop", 999.99),
    (1002, "Mouse", 19.99),
    (1003, "Keyboard", 29.99),
    (1004, "Monitor", 199.99),
    (1005, "Speaker", 49.99),
]

# Column names, listed in the same order as the tuple fields
columns = ["product_id", "name", "price"]

# Create a DataFrame from the list of tuples
static_df = spark.createDataFrame(data=product_details, schema=columns)

In [8]:
# Join the streaming data with the static data
joined_df = (streaming_df
             .join(static_df, streaming_df.product_id == static_df.product_id, "inner")
             # Drop the static side's product_id so the result carries a single product_id column
             .drop(static_df.product_id)
             # Invoice amount for the order line: quantity times price
             .withColumn('invoice_amount', streaming_df.quantity * static_df.price))

In [10]:
# Write the joined stream to a Delta table in append mode and start the query.
query = (joined_df.writeStream
   .format("delta")
   .outputMode("append")
   # NOTE(review): the Kafka reader sets failOnDataLoss to "false", while this sink
   # option sets it to "true"; kept as in the original — confirm whether it has any
   # effect on a Delta sink.
   .option("failOnDataLoss", "true")
   .option("checkpointLocation", "/opt/workspace/data/delta_lake/joining-stream-static/orders/_checkpoints/")
   .start("/opt/workspace/data/delta_lake/joining-stream-static/orders")
)

[Stage 0:>                                                          (0 + 1) / 1]

In [12]:
%%sparksql
SELECT * FROM delta.`/opt/workspace/data/delta_lake/joining-stream-static/orders`;

                                                                                

0,1,2,3,4,5,6
order_id,product_id,quantity,timestamp,name,price,invoice_amount
246759,1001,1,2024-02-04 18:43:54.858000,Laptop,999.99,999.99
470003,1001,4,2024-02-04 18:44:04.871000,Laptop,999.99,3999.96
200860,1004,1,2024-02-04 18:43:44.847000,Monitor,199.99,199.99
460339,1004,4,2024-02-04 18:44:14.882000,Monitor,199.99,799.96




In [14]:
query.stop()  # Stop the streaming query

In [15]:
spark.stop()  # Stop the Spark session and release its resources