In [None]:
'''
    Each time a writeStream query is started, Spark processes the whole DAG beginning at spark.readStream.
    If I have two writeStream queries, Spark will process the stream twice.
    
    The two write streams have separate checkpoint locations and might be at different offsets, because one of them might process the messages more slowly.
    To solve this issue we use foreachBatch.
    With foreachBatch, a Python function is executed for every micro-batch.
    Inside that Python function we can have as many batch writes (df.write) as we want.
    
    
'''



'''
postgresql: create table device_data (customerId varchar(100),eventId varchar(100), eventOffset int,eventPublisher varchar(100),eventTime varchar(100),deviceId varchar(100),measure varchar(100),status varchar(100),temperature int);
'''

In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) the local Spark session used by every cell below.
spark = (
    SparkSession.
    builder.
    appName("Spark write to two sinks").
    config("spark.streaming.stopGracefullyOnShutdown", True).
    # Kafka source connector, resolved from Maven at start-up.
    # NOTE(review): the artifact version should match the installed Spark/Scala build - confirm.
    config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0').
    # PostgreSQL JDBC driver used by the JDBC sink.
    config("spark.jars", '/home/jovyan/jars/postgresql-42.2.20.jar').
    # `spark.sql.shuffle.partitions` is the key Spark SQL reads; the previous
    # `spark.shuffle.partitions` key is not a Spark setting, so the value 4 was never applied.
    config("spark.sql.shuffle.partitions", 4).
    master("local[*]").
    getOrCreate()
)

spark

In [2]:
# Kafka source settings: broker address, topic to subscribe to, and where to start reading.
kafka_source_options = {
    "kafka.bootstrap.servers": "kafka:19092",
    "subscribe": "device-data",
    "startingOffsets": "earliest",
}

# Streaming DataFrame over the `device-data` topic.
kafka_df = spark.readStream.format("kafka").options(**kafka_source_options).load()

In [3]:
from pyspark.sql.functions import expr
# Replace the Kafka `value` column with its string form so it can be parsed as JSON later.
value_as_string = expr('cast(value as string)')
streaming_df = kafka_df.withColumn("value", value_as_string)


In [4]:
# Schema of the payload

from pyspark.sql.types import StringType, StructField, StructType, ArrayType, LongType
from pyspark.sql.functions import from_json

# One entry of the `data.devices` array.
device_schema = StructType([
    StructField('deviceId', StringType(), True),
    StructField('measure', StringType(), True),
    StructField('status', StringType(), True),
    StructField('temperature', LongType(), True),
])

# The `data` struct wraps a nullable array of nullable device entries.
data_schema = StructType([
    StructField('devices', ArrayType(device_schema, True), True),
])

# Top-level event: customer/event metadata plus the nested `data` struct.
json_schema = StructType([
    StructField('customerId', StringType(), True),
    StructField('data', data_schema, True),
    StructField('eventId', StringType(), True),
    StructField('eventOffset', LongType(), True),
    StructField('eventPublisher', StringType(), True),
    StructField('eventTime', StringType(), True),
])


# Parse the JSON text in `value` into a struct column using `json_schema` ...
parsed_value = from_json("value", json_schema)
with_parsed_json = streaming_df.withColumn("value_json", parsed_value)

# ... then keep only the parsed fields, promoted to top-level columns.
streaming_df = with_parsed_json.selectExpr("value_json.*")
streaming_df.printSchema()

root
 |-- customerId: string (nullable = true)
 |-- data: struct (nullable = true)
 |    |-- devices: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- deviceId: string (nullable = true)
 |    |    |    |-- measure: string (nullable = true)
 |    |    |    |-- status: string (nullable = true)
 |    |    |    |-- temperature: long (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)



In [6]:
from pyspark.sql.functions import explode, col

# One output row per element of `data.devices`, held in a temporary struct column.
streaming_df = streaming_df.withColumn("data_devices", explode("data.devices"))

# Promote each field of the exploded struct to a top-level column, in this order,
# then discard the temporary struct column.
device_fields = ("deviceId", "measure", "status", "temperature")
flattened = streaming_df.drop("data")
for field_name in device_fields:
    flattened = flattened.withColumn(field_name, col(f"data_devices.{field_name}"))
streaming_df_flatten = flattened.drop("data_devices")

streaming_df_flatten.printSchema()

root
 |-- customerId: string (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)
 |-- deviceId: string (nullable = true)
 |-- measure: string (nullable = true)
 |-- status: string (nullable = true)
 |-- temperature: long (nullable = true)



In [None]:
# python function that will run for each batch
def write_micro_batch(df, batch_id):
    """Write one micro-batch to the Parquet sink and the PostgreSQL sink.

    Intended to be passed to ``writeStream.foreachBatch``.

    Args:
        df: Batch DataFrame holding the rows of the current micro-batch.
        batch_id: Identifier of the micro-batch; only logged here.

    Raises:
        Whatever either sink write raises. The cached batch is released
        before the exception propagates.
    """
    print(f"Batch_id: {batch_id}")

    # Three actions run on `df` below (two writes and a show). Cache the batch
    # so it is computed once instead of once per action.
    df.persist()
    try:
        # write to parquet
        df.write.format("parquet").mode("append").save("/home/jovyan/data/multiplesinks.parquet/")

        # write to postgresql
        # NOTE(review): credentials are hard-coded; consider reading them from the environment.
        (
            df.
            write.
            format("jdbc").
            mode("append").
            option("driver", "org.postgresql.Driver").
            option("url", "jdbc:postgresql://source_postgresql:5432/source_pg").
            option("dbtable", "public.device_data").
            option("user", "root").
            option("password", "root").
            save()
        )

        df.show()
    finally:
        # Always release the cached batch, even when a sink write fails.
        df.unpersist()
        
        

In [None]:
# Configure the sink: hand every micro-batch to `write_micro_batch`, trigger every
# 10 seconds, and keep one checkpoint location for the single query.
stream_writer = (
    streaming_df_flatten.writeStream
    .foreachBatch(write_micro_batch)
    .trigger(processingTime='10 seconds')
    .option("checkpointLocation", "/home/jovyan/data/checkpoint_kafka")
)

# Start the query, then block this cell until it terminates.
streaming_query = stream_writer.start()
streaming_query.awaitTermination()
    
    
 