In [9]:
'''
    Kafka notes:
        
        Topic has replicator factor: number of times the partition replicates accross brokers
        One brocker is the Leader for the partiton, the others are the Followers
        The Leader is Read/Write
        The followers will replicate in case the leader dies
        Replication factor number cannot be greater than the number of brokers 
        
        kafka-topics --list --bootstrap-server localhost:19092
        kafka-topics --create --topic consume-kafka --partitions 3 --replication-factor 1 --bootstrap-server localhost:19092
        kafka-topics --describe --topic consume-kafka --bootstrap-server localhost:19092
        kafka-get-offsets --topic consume-kafka --bootstrap-server localhost:19092
        kafka-console-producer --topic consume-kafka --bootstrap-server localhost:19092
        kafka-console-consumer --topic consume-kafka --bootstrap-server localhost:19092 --partition 0 --offset earliest
'''

'\n    Kafka notes:\n        \n        Topic has replicator factor: number of times the partition replicates accross brokers\n        One brocker is the Leader for the partiton, the others are the Followers\n        The Leader is Read/Write\n        The followers will replicate in case the leader dies\n        Replication factor number cannot be greater than the number of brokers \n        \n        kafka-topics --list --bootstrap-server localhost:19092\n        kafka-topics --create --topic consume-kafka --partitions 3 --replication-factor 1 --bootstrap-server localhost:19092\n        kafka-topics --describe --topic consume-kafka --bootstrap-server localhost:19092\n        kafka-get-offsets --topic consume-kafka --bootstrap-server localhost:19092\n        kafka-console-producer --topic consume-kafka --bootstrap-server localhost:19092\n        kafka-console-consumer --topic consume-kafka --bootstrap-server localhost:19092 --partition 0 --offset earliest\n'

In [10]:
# Create the Spark Session
from pyspark.sql import SparkSession

spark = (
    SparkSession 
    .builder 
    .appName("Streaming from Kafka") 
    .config("spark.streaming.stopGracefullyOnShutdown", True) 
    .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0') # spark will install the package we need for kafka connection
    .config("spark.sql.shuffle.partitions", 4)
    .master("local[*]") 
    .getOrCreate()
)

spark

In [11]:
kafka_df = (
    spark
    .readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:19092")
    .option("subscribe", "device-data")
    .option("startingOffsets", "earliest")
    .load()
)

In [12]:
kafka_df.printSchema()


root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [13]:
from pyspark.sql.functions import expr
streaming_df = kafka_df.withColumn("value", expr('cast(value as string)'))


In [14]:
# Schema of the Pyaload

from pyspark.sql.types import StringType, StructField, StructType, ArrayType, LongType
from pyspark.sql.functions import from_json

json_schema = (
    StructType(
    [StructField('customerId', StringType(), True), 
    StructField('data', StructType(
        [StructField('devices', 
                     ArrayType(StructType([ 
                        StructField('deviceId', StringType(), True), 
                        StructField('measure', StringType(), True), 
                        StructField('status', StringType(), True), 
                        StructField('temperature', LongType(), True)
                    ]), True), True)
        ]), True), 
    StructField('eventId', StringType(), True), 
    StructField('eventOffset', LongType(), True), 
    StructField('eventPublisher', StringType(), True), 
    StructField('eventTime', StringType(), True)
    ])
)


streaming_df = streaming_df.withColumn("value_json", from_json("value", json_schema)).selectExpr("value_json.*")
streaming_df.printSchema()

root
 |-- customerId: string (nullable = true)
 |-- data: struct (nullable = true)
 |    |-- devices: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- deviceId: string (nullable = true)
 |    |    |    |-- measure: string (nullable = true)
 |    |    |    |-- status: string (nullable = true)
 |    |    |    |-- temperature: long (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)



In [15]:
from pyspark.sql.functions import explode, col

streaming_df = streaming_df.withColumn("data_devices", explode("data.devices"))
streaming_df_flatten = (
    streaming_df.drop("data")
    .withColumn("deviceId", col("data_devices.deviceId"))
    .withColumn("measure", col("data_devices.measure"))
    .withColumn("status", col("data_devices.status"))
    .withColumn("temperature", col("data_devices.temperature"))
    .drop("data_devices")
)



In [17]:


'''
    trigger options:
        once=True, availableNow: start, process the data until no other data and stop the stream (like batch job)
        processingTime='10 secinds': run microbatch in 10 seconds
        continuous='10 seconds': it will run continuoously and checkpoint asynchronously every 10 seconds 

'''

(
 streaming_df_flatten
 .writeStream
 .format("console")
 .outputMode("append")
 .trigger(processingTime='10 seconds')
 .option("checkpointLocation", "/home/jovyan/data/checkpoint_kafka")
 .start()
 .awaitTermination()
)

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 