In [0]:
class Bronze:
    """Bronze-layer streaming job: Kafka topic ``invoices`` -> table ``invoices_bz``.

    The methods reference a ``spark`` name that is not defined in this class;
    it is expected to already exist in the enclosing scope (presumably the
    notebook-provided SparkSession -- confirm for other runtimes).
    """

    def __init__(self):
        # Root directory under which the streaming checkpoint is stored.
        self.base_data_dir = "/FileStore/data_spark_streaming"
        # Kafka broker address handed to the Kafka source.
        self.BOOTSTRAP_SERVER = "localhost:9092"

    def getSchema(self):
        """Return the DDL schema string used to parse the invoice JSON payload."""
        return """InvoiceNumber string, CreatedTime bigint, StoreID string, PosID string, CashierID string,
                CustomerType string, CustomerCardNo string, TotalAmount double, NumberOfItems bigint, 
                PaymentMethod string, TaxableAmount double, CGST double, SGST double, CESS double, 
                DeliveryType string,
                DeliveryAddress struct<AddressLine string, City string, ContactNumber string, PinCode string, 
                State string>,
                InvoiceLineItems array<struct<ItemCode string, ItemDescription string, 
                    ItemPrice double, ItemQty bigint, TotalValue double>>
            """

    def _kafkaOptions(self, startingTime=1):
        """Build the Kafka source options as an ordered ``{name: value}`` dict.

        Kept separate from ``ingestFromKafka`` so the option names and values
        can be inspected without a Spark session.
        """
        return {
            "kafka.bootstrap.servers": self.BOOTSTRAP_SERVER,
            "subscribe": "invoices",
            # The documented option name is plural ("Offsets"); the previous
            # spelling "maxOffsetPerTrigger" is not a recognized Kafka source
            # option, so the per-trigger rate limit was never applied.
            "maxOffsetsPerTrigger": 10,
            "startingTimestamp": startingTime,
        }

    def ingestFromKafka(self, startingTime=1):
        """Open a streaming read on the ``invoices`` Kafka topic.

        Args:
            startingTime: Forwarded as the Kafka source's ``startingTimestamp``
                option (presumably epoch milliseconds, following Kafka's
                timestamp convention -- confirm against the caller).

        Returns:
            The streaming DataFrame produced by ``load()``.
        """
        reader = spark.readStream.format("kafka")
        for optionName, optionValue in self._kafkaOptions(startingTime).items():
            reader = reader.option(optionName, optionValue)
        return reader.load()

    def getInvoices(self, kafka_df):
        """Project the raw Kafka DataFrame into parsed invoice columns.

        Casts ``key`` to string, parses ``value`` (cast to string) as JSON
        using ``getSchema()``, and keeps ``topic`` and ``timestamp`` as-is.

        Args:
            kafka_df: DataFrame exposing ``key``, ``value``, ``topic`` and
                ``timestamp`` columns.

        Returns:
            DataFrame with columns ``key``, ``value`` (struct), ``topic``,
            ``timestamp``.
        """
        from pyspark.sql.functions import from_json

        return kafka_df.select(
            kafka_df.key.cast("string").alias("key"),
            from_json(kafka_df.value.cast("string"), self.getSchema()).alias("value"),
            "topic",
            "timestamp",
        )

    def process(self, startingTime=1):
        """Start the bronze ingestion stream and return its query handle.

        Args:
            startingTime: Passed through to ``ingestFromKafka``.

        Returns:
            The object returned by ``writeStream...toTable("invoices_bz")``.
        """
        print("Starting Bronze Streaming Job...", end="")
        rawDF = self.ingestFromKafka(startingTime)
        invoicesDF = self.getInvoices(rawDF)
        sQuery = (
            invoicesDF.writeStream
            .queryName("bronze-ingestion")
            .option("checkpointLocation", f"{self.base_data_dir}/checkpoint/invoices_bz")
            .outputMode("append")
            .toTable("invoices_bz")
        )
        print("\nDone")
        return sQuery