In [None]:
class Bronze:
    """Bronze layer: stream raw invoice JSON files into the ``invoices_bz`` table.

    All Spark access goes through a ``spark`` session object that must already
    exist in the enclosing (notebook) scope; this class does not create one.
    """

    def __init__(self, base_data_dir: str = "data/invoices"):
        """Store the root directory used for source data and checkpoints.

        Args:
            base_data_dir: Root directory under which the source files, the
                archive directory and the stream checkpoint are resolved.
                Defaults to ``"data/invoices"``.
        """
        self.base_data_dir = base_data_dir

    def getSchema(self) -> str:
        """Return the invoice schema as a DDL-formatted string."""
        return """InvoiceNumber string, CreatedTime bigint, StoreID string, PosID string, CashierID string,
                CustomerType string, CustomerCardNo string, TotalAmount double, NumberOfItems bigint, 
                PaymentMethod string, TaxableAmount double, CGST double, SGST double, CESS double, 
                DeliveryType string,
                DeliveryAddress struct<AddressLine string, City string, ContactNumber string, PinCode string, 
                State string>,
                InvoiceLineItems array<struct<ItemCode string, ItemDescription string, 
                    ItemPrice double, ItemQty bigint, TotalValue double>>
            """

    def readInvoice(self):
        """Build the streaming reader over the JSON invoice files.

        Returns:
            The streaming DataFrame produced by ``spark.readStream...load()``,
            read with the schema from :meth:`getSchema`.
        """
        medallion_dir = f"{self.base_data_dir}/data/medallion"
        return (
            spark.readStream
            .format("json")
            .schema(self.getSchema())
            # Processed files are moved to the archive directory. The
            # alternative, cleanSource="delete", removes them instead.
            .option("cleanSource", "archive")
            .option("sourceArchiveDir", f"{medallion_dir}/invoices_archive")
            .load(f"{medallion_dir}/invoices")
        )

    def process(self):
        """Recreate ``invoices_bz`` and start the bronze ingestion stream.

        Returns:
            Whatever ``DataStreamWriter.toTable`` returns for the started
            ``bronze-ingestion`` query.
        """
        print("\nStarting Bronze Stream...", end="")
        invoiceDF = self.readInvoice()
        # NOTE(review): the table is dropped on every call while the checkpoint
        # directory is left in place -- confirm the checkpoint is cleared
        # elsewhere, otherwise files already recorded there will presumably
        # not be ingested again into the fresh table.
        spark.sql("DROP TABLE IF EXISTS invoices_bz")
        sQuery = (
            invoiceDF.writeStream
            .queryName("bronze-ingestion")
            .option("checkpointLocation", f"{self.base_data_dir}/checkpoint/invoices_bz")
            .outputMode("append")
            .toTable("invoices_bz")
        )
        print("Done")
        return sQuery

In [None]:
class Silver:
    """Silver layer: flatten ``invoices_bz`` into one row per invoice line item.

    All Spark access goes through a ``spark`` session object that must already
    exist in the enclosing (notebook) scope; this class does not create one.
    """

    # Fields of each ``InvoiceLineItems`` element promoted to top-level
    # columns, in output order.
    _LINE_ITEM_FIELDS = ("ItemCode", "ItemDescription", "ItemPrice", "ItemQty", "TotalValue")

    def __init__(self, base_data_dir: str = "data/invoices"):
        """Store the root directory used for the stream checkpoint.

        Args:
            base_data_dir: Root directory under which the checkpoint location
                is resolved. Defaults to ``"data/invoices"``.
        """
        self.base_data_dir = base_data_dir

    def readInvoice(self):
        """Return a streaming DataFrame reading from the ``invoices_bz`` table."""
        return spark.readStream.table("invoices_bz")

    def explodeInvoices(self, invoiceDF):
        """Select the invoice header columns and explode the line-item array.

        Args:
            invoiceDF: DataFrame containing the invoice columns selected below,
                including the ``DeliveryAddress`` struct and the
                ``InvoiceLineItems`` array.

        Returns:
            DataFrame with the selected header columns plus a ``LineItem``
            column produced by ``explode(InvoiceLineItems)``.
        """
        return invoiceDF.selectExpr(
            "InvoiceNumber", "CreatedTime", "StoreID", "PosID",
            "CustomerType", "PaymentMethod", "DeliveryType", "DeliveryAddress.City",
            "DeliveryAddress.State", "DeliveryAddress.PinCode",
            "explode(InvoiceLineItems) as LineItem",
        )

    def flattenInvoices(self, explodeDF):
        """Promote the ``LineItem`` struct fields to columns and drop the struct.

        Args:
            explodeDF: DataFrame carrying a ``LineItem`` struct column, as
                produced by :meth:`explodeInvoices`.

        Returns:
            DataFrame with one column per entry in ``_LINE_ITEM_FIELDS`` added
            and ``LineItem`` removed.
        """
        # Imported locally so the class can be defined without pyspark loaded.
        from pyspark.sql.functions import expr

        flattenedDF = explodeDF
        for field in self._LINE_ITEM_FIELDS:
            flattenedDF = flattenedDF.withColumn(field, expr(f"LineItem.{field}"))
        return flattenedDF.drop("LineItem")

    def appendInvoices(self, flattenedDF):
        """Start the append-mode stream writing to ``invoice_line_items``.

        Args:
            flattenedDF: Streaming DataFrame to write.

        Returns:
            Whatever ``DataStreamWriter.toTable`` returns for the started
            ``silver-processing`` query.
        """
        # The sink is a table; a path-based sink would use .start(<dir>)
        # in place of .toTable(...).
        return (
            flattenedDF.writeStream
            .queryName("silver-processing")
            .format("parquet")
            .option("checkpointLocation", f"{self.base_data_dir}/checkpoint/invoice_line_items")
            .outputMode("append")
            .toTable("invoice_line_items")
        )

    def process(self):
        """Run the read -> explode -> flatten -> append pipeline.

        Returns:
            The value returned by :meth:`appendInvoices`.
        """
        print("\nStarting Silver Stream...", end="")
        invoiceDF = self.readInvoice()
        explodeDF = self.explodeInvoices(invoiceDF)
        resultDF = self.flattenInvoices(explodeDF)
        sQuery = self.appendInvoices(resultDF)
        print("Done.\n")
        return sQuery