In [None]:
class Bronze:
    """Bronze layer: stream raw invoice JSON files into the ``invoices_bz`` table.

    Relies on a ``spark`` session being available as a global (as in a
    notebook environment).
    """

    def __init__(self):
        # Root directory that the source path and checkpoint path are built from.
        self.base_data_dir = "data/invoices"

    def getSchema(self):
        """Return the DDL-style schema string applied to the invoice JSON source."""
        invoice_ddl = """InvoiceNumber string, CreatedTime bigint, StoreID string, PosID string, CashierID string,
                CustomerType string, CustomerCardNo string, TotalAmount double, NumberOfItems bigint, 
                PaymentMethod string, TaxableAmount double, CGST double, SGST double, CESS double, 
                DeliveryType string,
                DeliveryAddress struct<AddressLine string, City string, ContactNumber string, PinCode string, 
                State string>,
                InvoiceLineItems array<struct<ItemCode string, ItemDescription string, 
                    ItemPrice double, ItemQty bigint, TotalValue double>>
            """
        return invoice_ddl

    def readInvoice(self):
        """Build the streaming DataFrame over the invoice JSON files.

        Returns:
            A streaming DataFrame with the invoice schema plus an
            ``InputFile`` column holding the source file of each row.
        """
        from pyspark.sql.functions import input_file_name

        json_reader = spark.readStream.format("json").schema(self.getSchema())
        # Optional source clean-up, currently disabled. To enable, add one of:
        #   .option("cleanSource", "delete")   -> delete files processed by a previous micro-batch
        #   .option("cleanSource", "archive")  -> move processed files to an archive location
        #   .option("sourceArchiveDir", f"{self.base_data_dir}/data/medallion/invoices_archive")
        raw_invoices = json_reader.load(f"{self.base_data_dir}/data/aggregate/invoices")
        return raw_invoices.withColumn("InputFile", input_file_name())

    def process(self):
        """Start the bronze ingestion stream and return its query handle.

        Appends the invoice stream to the ``invoices_bz`` table, tracking
        progress under ``<base_data_dir>/checkpoint/invoices_bz``.
        """
        print("\nStarting Bronze Stream...", end="")
        source_df = self.readInvoice()
        # To rebuild from scratch, drop the target first:
        #   spark.sql("DROP TABLE IF EXISTS invoices_bz")
        checkpoint_dir = f"{self.base_data_dir}/checkpoint/invoices_bz"
        writer = source_df.writeStream.queryName("bronze-ingestion")
        writer = writer.option("checkpointLocation", checkpoint_dir)
        streaming_query = writer.outputMode("append").toTable("invoices_bz")
        print("Done")
        return streaming_query

In [None]:
class Gold():
    """Gold layer: keep per-customer reward aggregates from the bronze invoice stream.

    Relies on a ``spark`` session being available as a global (as in a
    notebook environment).
    """

    def __init__(self):
        # Root directory for this pipeline's data; not read by the methods below.
        self.base_data_dir = "data/invoices"

    def readBronze(self):
        """Return a streaming DataFrame over the ``invoices_bz`` table."""
        return spark.readStream.table("invoices_bz")

    def getAggregates(self, invoices_df):
        """Aggregate invoices per customer card.

        Args:
            invoices_df: DataFrame with ``CustomerCardNo`` and ``TotalAmount`` columns.

        Returns:
            A DataFrame grouped by ``CustomerCardNo`` with ``TotalAmount``
            (sum of amounts) and ``TotalPoints`` (sum of 2% of each amount).
        """
        # Namespaced import so the builtin ``sum`` is not shadowed.
        from pyspark.sql import functions as F
        return (invoices_df.groupBy("CustomerCardNo")
                    .agg(F.sum("TotalAmount").alias("TotalAmount"),
                         F.sum(F.expr("TotalAmount*0.02")).alias("TotalPoints"))
        )

    def saveResults(self, results_df):
        """Start the aggregate stream into the in-memory ``customer_rewards`` table.

        Args:
            results_df: Streaming DataFrame of aggregates to publish.

        Returns:
            The handle returned by ``start()`` for the running query.
        """
        print("\nStarting Gold Stream...", end='')
        # Start the query first and keep the handle, so the "Done" message is
        # actually printed before returning (it used to sit after the return).
        sQuery = (results_df.writeStream
                    .format("memory")
                    .queryName("customer_rewards")
                    .outputMode("complete")
                    .start()
            )
        print("Done")
        return sQuery

    def process(self):
        """Read bronze, compute the customer aggregates and start the output stream.

        Returns:
            The streaming query handle from ``saveResults``.
        """
        invoices_df = self.readBronze()
        aggregate_df = self.getAggregates(invoices_df)
        sQuery = self.saveResults(aggregate_df)
        return sQuery