# Kafkaから読み込んでDelta Lakeに書き込み

Kafkaから読み込んだレコードを、キーと値を文字列にキャストした列(`id`、`state`)を追加するだけで、ほぼそのままDelta Lakeに書き込む例。

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
from pyspark.sql.types import StringType
import os
import time

In [2]:
# Checkpoint directory handed to the streaming writer (a file:// URL under /tmp).
checkpoint_location = 'file:///tmp/_checkpoints/etl-from-json'

# Base URL of the output storage. It must be supplied through the OUTPUT_URL
# environment variable; a missing variable raises KeyError right here.
output_base_url = os.environ['OUTPUT_URL']

# Destination of the Delta table. The '/' separator is inserted only when the
# base URL does not already end with one, so 's3a://bucket' and 's3a://bucket/'
# both resolve to '<base>/el_aircon' instead of the former silently becoming
# 's3a://bucketel_aircon'.
_separator = '' if output_base_url.endswith('/') else '/'
output_url = output_base_url + _separator + 'el_aircon'

In [3]:
# Kafka connection settings consumed by the streaming read further down.
topic_name = "el_aircon"              # topic to subscribe to
bootstrap_servers = "localhost:9092"  # value for kafka.bootstrap.servers

In [4]:
# Reuse the session injected by a PySpark-launched kernel if there is one;
# otherwise create it, so the notebook no longer depends on a `spark` name
# that nothing in this notebook defines.
# NOTE(review): the Kafka source and Delta packages still have to be available
# to the session (e.g. via spark-defaults / submit args) - confirm for your setup.
spark = SparkSession.builder.getOrCreate()

# Streaming DataFrame over the Kafka topic, reading from the earliest offsets.
kafka_reader = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", bootstrap_servers)
    .option("startingOffsets", "earliest")
    .option("subscribe", topic_name)
)
df = kafka_reader.load()

In [5]:
# Kafka delivers key and value as binary; expose string-cast copies as
# `id` and `state`, followed by every original column unchanged.
id_column = df['key'].cast(StringType()).alias('id')
state_column = df['value'].cast(StringType()).alias('state')
output = df.select(id_column, state_column, df['*'])

In [6]:
# Start the streaming write to the Delta table in append mode. The value
# returned by start() is the last expression, so the cell displays it.
(
    output.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", checkpoint_location)
    .start(output_url)
)

21/10/17 00:05:46 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


<pyspark.sql.streaming.StreamingQuery at 0x7feb8f669a10>

In [7]:
# Pause before the table is read back in the next section - presumably to let
# the streaming query commit some data first (TODO confirm the intent).
wait_seconds = 30
time.sleep(wait_seconds)

                                                                                

# 書き込まれたデータを読み取る

バッチ的に読み取って確認する。

In [8]:
# Batch (non-streaming) read of the Delta table that the stream writes to.
delta_reader = spark.read.format('delta')
written_df = delta_reader.load(output_url)

# Bare expression so the cell displays the DataFrame's schema summary.
written_df

DataFrame[id: string, state: string, key: binary, value: binary, topic: string, partition: int, offset: bigint, timestamp: timestamp, timestampType: int]

In [9]:
# Print the first rows as a text table; the arguments spell out show()'s
# defaults (up to 20 rows, long cell values truncated).
written_df.show(n=20, truncate=True)

                                                                                

+------------+-----+--------------------+----------------+---------+---------+------+--------------------+-------------+
|          id|state|                 key|           value|    topic|partition|offset|           timestamp|timestampType|
+------------+-----+--------------------+----------------+---------+---------+------+--------------------+-------------+
|0068#0x0,0x1|49,21|[30 30 36 38 23 3...|[34 39 2C 32 31]|el_aircon|        0|     0|2021-10-17 00:04:...|            0|
|0068#0x0,0x1|49,21|[30 30 36 38 23 3...|[34 39 2C 32 31]|el_aircon|        0|     1|2021-10-17 00:04:...|            0|
|0068#0x0,0x1|49,21|[30 30 36 38 23 3...|[34 39 2C 32 31]|el_aircon|        0|     2|2021-10-17 00:04:...|            0|
+------------+-----+--------------------+----------------+---------+---------+------+--------------------+-------------+

