In [1]:
from pyspark.sql.functions import col, to_date
from pyspark.sql.streaming import DataStreamReader, DataStreamWriter

from typing import Dict

## Kafka Source
The Kafka broker is running in a Docker container.
~~~
kafka.brokers=PLAINTEXT://kafka-rp:29092
kafka.topic=ecomm.v1.clickstream
kafka.group.id=delta-dldg-1
~~~

> The Kafka JARs are loaded by the `startup.sh` script in the Docker Compose setup.



In [11]:
# example 11-3
# Declares a Kafka -> Delta streaming pipeline for the bronze layer.
# This cell only builds the reader and the writer; the query itself is started
# by calling `.start()` on `bronze_layer_stream`.
# Assumes `spark` is an active SparkSession supplied by the notebook kernel — TODO confirm.

# Options for the Kafka source (every value is passed as a string).
reader_opts: Dict[str, str] = {
    "kafka.bootstrap.servers": "PLAINTEXT://kafka-rp:29092",
    "subscribe": "ecomm.v1.clickstream",
    "groupIdPrefix": "delta-dldg-1",
    "startingOffsets": "earliest",
    "minPartitions": "4",
}

# Options for the Delta sink: target path, checkpoint directory, and the two
# schema-change switches, both set to "false".
writer_opts: Dict[str, str] = {
    "path": "/opt/spark/work-dir/data/delta/bronze_raw/",
    "checkpointLocation": "/opt/spark/work-dir/data/medallion_1/_checkpoints",
    "mergeSchema": "false",
    "overwriteSchema": "false",
}

# Configured streaming reader for the Kafka topic; `.load()` is called below.
kafka_source_df: DataStreamReader = spark.readStream.format("kafka").options(**reader_opts)

# Keep four columns of each Kafka record, derive `event_date` from the record
# timestamp, and describe a Delta write partitioned by that date.
bronze_layer_stream: DataStreamWriter = (
    kafka_source_df.load()
    .select("key", "value", "topic", "timestamp")
    .withColumn("event_date", to_date(col("timestamp")))
    .writeStream
    .options(**writer_opts)
    .format("delta")
    .trigger(availableNow=True)
    .partitionBy("event_date")
)

In [12]:
# Start the availableNow query, then block until it has finished processing.
# `.start()` returns immediately, so without the wait the cells that read
# `streaming_query.status` and `streaming_query.lastProgress` race the running
# query and may show a transient state (or no progress at all) on a fresh
# Restart & Run All. Waiting also surfaces any query failure in this cell.
streaming_query = bronze_layer_stream.start()
streaming_query.awaitTermination()

23/10/08 22:14:06 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


In [13]:
# Display the query's current status: a dict with a message plus the
# `isDataAvailable` and `isTriggerActive` flags.
streaming_query.status

{'message': 'Stopped', 'isDataAvailable': False, 'isTriggerActive': False}

In [14]:
# Display the most recent progress report: batch id, row counts and rates,
# per-source Kafka offsets, and the sink description.
streaming_query.lastProgress

{'id': '6212ab34-c7ea-4ab0-9ad1-fd0f06102ae5',
 'runId': '7a30b22a-40ba-481d-b4c1-de043fdc1546',
 'name': None,
 'timestamp': '2023-10-08T22:14:06.383Z',
 'batchId': 2,
 'numInputRows': 0,
 'inputRowsPerSecond': 0.0,
 'processedRowsPerSecond': 0.0,
 'durationMs': {'latestOffset': 0, 'triggerExecution': 17},
 'stateOperators': [],
 'sources': [{'description': 'KafkaV2[Subscribe[ecomm.v1.clickstream]]',
   'startOffset': {'ecomm.v1.clickstream': {'0': 2}},
   'endOffset': {'ecomm.v1.clickstream': {'0': 2}},
   'latestOffset': {'ecomm.v1.clickstream': {'0': 2}},
   'numInputRows': 0,
   'inputRowsPerSecond': 0.0,
   'processedRowsPerSecond': 0.0,
   'metrics': {'avgOffsetsBehindLatest': '0.0',
    'maxOffsetsBehindLatest': '0',
    'minOffsetsBehindLatest': '0'}}],
 'sink': {'description': 'DeltaSink[/opt/spark/work-dir/data/delta/bronze_raw]',
  'numOutputRows': -1}}

In [15]:
# Explicitly stop the streaming query.
# NOTE(review): the recorded status output already reports 'Stopped', so this
# is presumably a clean-up no-op here — confirm.
streaming_query.stop()
