In [0]:
import os

from pyspark.sql.functions import *
from pyspark.sql.types import *

class InvoiceBronzeLayer:
    """Stream invoice events from a Kafka topic into the bronze Delta table.

    ``load_raw_data`` builds the streaming read and parses the JSON payload,
    ``persist_to_bronze`` starts the streaming write, and ``start_streaming``
    chains the two.
    """

    # Environment variables consulted when credentials are not passed explicitly.
    API_KEY_ENV_VAR = "KAFKA_API_KEY"
    API_SECRET_ENV_VAR = "KAFKA_API_SECRET"

    def __init__(self, api_key=None, api_secret=None):
        """Set up the Kafka connection settings and the invoice JSON schemas.

        Credentials are never stored in the notebook. Pass them in (for
        example from a secrets manager) or export them as environment
        variables before constructing the object.

        Args:
            api_key: Kafka SASL username. Defaults to the value of the
                ``KAFKA_API_KEY`` environment variable.
            api_secret: Kafka SASL password. Defaults to the value of the
                ``KAFKA_API_SECRET`` environment variable.

        Raises:
            ValueError: If either credential is missing or empty.
        """
        if api_key is None:
            api_key = os.environ.get(self.API_KEY_ENV_VAR)
        if api_secret is None:
            api_secret = os.environ.get(self.API_SECRET_ENV_VAR)
        if not api_key or not api_secret:
            raise ValueError(
                "Kafka credentials are missing: pass api_key/api_secret or set "
                f"the {self.API_KEY_ENV_VAR} and {self.API_SECRET_ENV_VAR} "
                "environment variables."
            )

        self.bootstrap_server = 'pkc-619z3.us-east1.gcp.confluent.cloud:9092'
        self.jaas_module = 'org.apache.kafka.common.security.plain.PlainLoginModule'
        self.api_key = api_key
        self.api_secret = api_secret
        self.topic = 'invoices'

        # Schema of the nested "DeliveryAddress" object.
        self.delivery_add_schema = (StructType([
            StructField("AddressLine", StringType(), True),
            StructField("City", StringType(), True),
            StructField("State", StringType(), True),
            StructField("PinCode", StringType(), True),
            StructField("ContactNumber", StringType(), True)
        ]))

        # Schema of one element of the "InvoiceLineItems" array.
        self.line_item_schema = (StructType([
            StructField("ItemCode", StringType(), True),
            StructField("ItemDescription", StringType(), True),
            StructField("ItemPrice", DoubleType(), True),
            StructField("ItemQty", IntegerType(), True),
            StructField("TotalValue", DoubleType(), True)
        ]))

        # Schema of the full invoice JSON document carried in the Kafka value.
        self.invoice_schema = (StructType([
            StructField("InvoiceNumber", StringType(), True),
            StructField("CreatedTime", LongType(), True),
            StructField("StoreID", StringType(), True),
            StructField("PosID", StringType(), True),
            StructField("CashierID", StringType(), True),
            StructField("CustomerType", StringType(), True),
            # NOTE(review): card numbers may exceed the 32-bit INT range or be
            # sent as JSON strings; confirm against the producer's payload.
            StructField("CustomerCardNo", IntegerType(), True),
            StructField("TotalAmount", DoubleType(), True),
            StructField("NumberOfItems", IntegerType(), True),
            StructField("PaymentMethod", StringType(), True),
            StructField("TaxableAmount", DoubleType(), True),
            StructField("CGST", DoubleType(), True),
            StructField("SGST", DoubleType(), True),
            StructField("CESS", DoubleType(), True),
            StructField("DeliveryType", StringType(), True),
            StructField("DeliveryAddress", self.delivery_add_schema),
            StructField("InvoiceLineItems", ArrayType(self.line_item_schema))]))

    def load_raw_data(self):
        """Build the streaming DataFrame that reads and parses the invoice topic.

        Connects to Kafka over SASL_SSL with the PLAIN mechanism using the
        notebook-provided ``spark`` session, reads at most 10 offsets per
        trigger, and parses the message value as JSON with ``invoice_schema``.

        Returns:
            A streaming DataFrame with the columns ``key`` (string), ``value``
            (struct parsed from the JSON payload), ``topic`` and ``timestamp``.
        """
        raw_df = (spark
                  .readStream
                  .format("kafka")
                  .option("kafka.bootstrap.servers", self.bootstrap_server)
                  .option("kafka.security.protocol", "SASL_SSL")
                  .option("kafka.sasl.mechanism", "PLAIN")
                  .option("kafka.sasl.jaas.config", f"{self.jaas_module} required username='{self.api_key}' password='{self.api_secret}';")
                  .option("maxOffsetsPerTrigger",10)
                  # Timestamp 1 presumably makes a fresh query start from the
                  # earliest available record of every partition.
                  .option("startingTimestamp",1)
                  .option("subscribe", self.topic)
                  .load()
                  )

        # Kafka delivers key and value as binary; cast both to string and
        # parse the value as JSON.
        processed_df = raw_df.select(raw_df.key.cast("string").alias("key"),
                      from_json(raw_df.value.cast("string"),self.invoice_schema).alias("value"),
                      "topic","timestamp")

        return processed_df

    def persist_to_bronze(self, processed_df):
        """Start the streaming write of ``processed_df`` into the bronze table.

        ``toTable`` is an action: it starts the query immediately, and it
        creates the target table if it does not exist yet.

        Args:
            processed_df: Streaming DataFrame produced by ``load_raw_data``.

        Returns:
            The handle returned by ``toTable`` for the started query
            ``bronze_layer_streaming_query``.
        """
        bronze_layer_streaming_query = (processed_df
                                        .writeStream
                                        .queryName("bronze_layer_streaming_query")
                                        .format("delta")
                                        .option("checkPointLocation","/Volumes/dev/retail_store/checkpoint_invoices_raw")
                                        .outputMode("append")
                                        .toTable("dev.retail_store.invoices_raw")
                                        )
        return bronze_layer_streaming_query

    def start_streaming(self):
        """Load the parsed Kafka stream and start persisting it to bronze.

        Returns:
            The query handle returned by ``persist_to_bronze``.
        """
        processed_df = self.load_raw_data()
        return self.persist_to_bronze(processed_df)



In [0]:
# Instantiate the bronze-layer pipeline and start the Kafka -> Delta streaming query.
# The returned query handle is kept in a notebook-level variable so a later cell can stop it.
bronze_layer = InvoiceBronzeLayer()
bronze_layer_streaming_query = bronze_layer.start_streaming()


In [0]:
# Stop the bronze-layer streaming query that was started in the previous cell.
bronze_layer_streaming_query.stop()

In [0]:
%sql
-- Show the DDL of the bronze table to inspect the schema it was created with.
show create table  dev.retail_store.invoices_raw;

createtab_stmt
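"CREATE TABLE dev.retail_store.invoices_raw (  key STRING,  value STRUCT<InvoiceNumber: STRING, CreatedTime: BIGINT, StoreID: STRING, PosID: STRING, CashierID: STRING, CustomerType: STRING, CustomerCardNo: INT, TotalAmount: DOUBLE, NumberOfItems: INT, PaymentMethod: STRING, TaxableAmount: DOUBLE, CGST: DOUBLE, SGST: DOUBLE, CESS: DOUBLE, DeliveryType: STRING, DeliveryAddress: STRUCT<AddressLine: STRING, City: STRING, State: STRING, PinCode: STRING, ContactNumber: STRING>, InvoiceLineItems: ARRAY<STRUCT<ItemCode: STRING, ItemDescription: STRING, ItemPrice: DOUBLE, ItemQty: INT, TotalValue: DOUBLE>>>,  topic STRING,  timestamp TIMESTAMP) USING delta TBLPROPERTIES (  'delta.enableDeletionVectors' = 'true',  'delta.feature.deletionVectors' = 'supported',  'delta.minReaderVersion' = '3',  'delta.minWriterVersion' = '7')"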
