In [1]:
# Make the local Spark installation's pyspark package importable in this
# session; this has to run before anything imports pyspark.
import findspark
findspark.init()

In [None]:
class invoiceStream():
    def __init__(self):
        self.base_data_dir = "data/invoices"

    def getSchema(self):
        return """InvoiceNumber string, CreatedTime bigint, StoreID string, PosID string, CashierID string,
                CustomerType string, CustomerCardNo string, TotalAmount double, NumberOfItems bigint, 
                PaymentMethod string, TaxableAmount double, CGST double, SGST double, CESS double, 
                DeliveryType string,
                DeliveryAddress struct<AddressLine string, City string, ContactNumber string, PinCode string, 
                State string>,
                InvoiceLineItems array<struct<ItemCode string, ItemDescription string, 
                    ItemPrice double, ItemQty bigint, TotalValue double>>
            """
    
    def readInvoice(self):
        """Create a streaming DataFrame over the invoice files.

        Reads from ``<base_data_dir>/results/invoices`` and applies the
        schema returned by ``getSchema()``.

        Returns:
            The streaming DataFrame produced by ``spark.readStream ... .load()``.

        Note:
            Relies on a ``spark`` session object already existing in the
            enclosing (notebook/global) scope; it is not created here.
        """
        # format() expects the data-source name, not the schema string that
        # was passed before. The schema has nested struct/array columns, so
        # the files are assumed to be JSON -- TODO confirm against the data.
        return (
            spark.readStream
            .format("json")
            .schema(self.getSchema())
            .load(f"{self.base_data_dir}/results/invoices")
        )
    
    def explodeInvoices(self, invoiceDF):
        """Produce one row per invoice line item.

        Selects the invoice header columns and the city, state and pin code
        from ``DeliveryAddress``, and explodes ``InvoiceLineItems`` into a
        ``LineItem`` column.

        Args:
            invoiceDF: DataFrame with the columns named in ``getSchema()``.

        Returns:
            The result of ``invoiceDF.selectExpr(...)`` with the columns above.
        """
        return invoiceDF.selectExpr(
            "InvoiceNumber", "CreatedTime", "StoreID", "PosID",
            # Must match the schema's column name exactly; the earlier
            # misspelling "PaymentMethhod" could not be resolved.
            "CustomerType", "PaymentMethod", "DeliveryType", "DeliveryAddress.City",
            "DeliveryAddress.State", "DeliveryAddress.PinCode",
            "explode(InvoiceLineItems) as LineItem",
        )
    
    def flattenInvoices(self, explodeDF):
        """Promote the fields of ``LineItem`` to top-level columns.

        Adds ``ItemCode``, ``ItemDescription``, ``ItemPrice``, ``ItemQty`` and
        ``TotalValue`` columns taken from the ``LineItem`` struct, then drops
        ``LineItem`` itself.

        Args:
            explodeDF: DataFrame containing a ``LineItem`` struct column, such
                as the one returned by ``explodeInvoices()``.

        Returns:
            The flattened DataFrame without the ``LineItem`` column.
        """
        from pyspark.sql.functions import expr

        line_item_fields = (
            "ItemCode",
            "ItemDescription",
            "ItemPrice",
            "ItemQty",
            "TotalValue",
        )
        # Start from the parameter itself; the body previously referenced an
        # undefined name ("exploreDF") and raised NameError on every call.
        flatDF = explodeDF
        for field in line_item_fields:
            flatDF = flatDF.withColumn(field, expr(f"LineItem.{field}"))
        return flatDF.drop("LineItem")
    
    def appendInvoices(self, flattenedDF):
        return ( flattenedDF.writeStream
                            .format("").)