In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

class InvoiceBronzeLeyer:
    """Bronze layer: stream raw invoice JSON files into a Delta table.

    The constructor records where to read from and write to, and builds the
    explicit schema used to parse each invoice. ``process()`` chains the two
    steps: ``ingest_raw_files()`` followed by ``persist_raw_data()``.

    Relies on the notebook-level ``spark`` session and on the star imports
    from ``pyspark.sql.functions`` / ``pyspark.sql.types``.
    """

    def __init__(self,
                 landing_zone='/Volumes/dev/retail_store/landing_zone',
                 archive_dir='/Volumes/dev/retail_store/invoices_archive',
                 checkpoint_location='/Volumes/dev/retail_store/checkpoint_invoices_raw',
                 target_table='dev.retail_store.invoices_raw'):
        """Store the I/O locations and build the invoice schema.

        Args:
            landing_zone: Directory the JSON file stream reads from.
            archive_dir: Value passed as the ``sourceArchiveDir`` option of the
                file source.
            checkpoint_location: Value passed as the ``checkpointLocation``
                option of the streaming write.
            target_table: Name of the table the stream appends to.

        All defaults reproduce the locations that were previously hard-coded,
        so ``InvoiceBronzeLeyer()`` behaves exactly as before.
        """
        self.landing_zone = landing_zone
        self.archive_dir = archive_dir
        self.checkpoint_location = checkpoint_location
        self.target_table = target_table

        # Nested struct describing the optional delivery address of an invoice.
        self.delivery_add_schema = StructType([
            StructField("AddressLine", StringType(), True),
            StructField("City", StringType(), True),
            StructField("State", StringType(), True),
            StructField("PinCode", StringType(), True),
            StructField("ContactNumber", StringType(), True),
        ])

        # One entry of the InvoiceLineItems array.
        self.line_item_schema = StructType([
            StructField("ItemCode", StringType(), True),
            StructField("ItemDescription", StringType(), True),
            StructField("ItemPrice", DoubleType(), True),
            StructField("ItemQty", IntegerType(), True),
            StructField("TotalValue", DoubleType(), True),
        ])

        # Top-level invoice record; every field is nullable.
        self.invoice_schema = StructType([
            StructField("InvoiceNumber", StringType(), True),
            StructField("CreatedTime", LongType(), True),
            StructField("StoreID", StringType(), True),
            StructField("PosID", StringType(), True),
            StructField("CashierID", StringType(), True),
            StructField("CustomerType", StringType(), True),
            StructField("CustomerCardNo", StringType(), True),
            StructField("TotalAmount", DoubleType(), True),
            StructField("NumberOfItems", IntegerType(), True),
            StructField("PaymentMethod", StringType(), True),
            StructField("TaxableAmount", DoubleType(), True),
            StructField("CGST", DoubleType(), True),
            StructField("SGST", DoubleType(), True),
            StructField("CESS", DoubleType(), True),
            StructField("DeliveryType", StringType(), True),
            StructField("DeliveryAddress", self.delivery_add_schema),
            StructField("InvoiceLineItems", ArrayType(self.line_item_schema)),
        ])

    def ingest_raw_files(self):
        """Open a streaming JSON read over the landing zone.

        Returns:
            A streaming DataFrame with the invoice schema plus an
            ``input_file_name`` column holding the source file of each row.
        """
        raw_df = (spark
                  .readStream
                  .schema(self.invoice_schema)
                  .format("json")
                  # Ask the file source to archive files it has finished with.
                  .option("cleanSource", "archive")
                  .option("sourceArchiveDir", self.archive_dir)
                  .load(self.landing_zone)
                  # NOTE(review): Databricks documents input_file_name() as
                  # deprecated in favour of the _metadata.file_path column on
                  # recent runtimes - confirm before switching, since the
                  # stored path format may differ.
                  .withColumn("input_file_name", input_file_name()))

        return raw_df

    def persist_raw_data(self, raw_df):
        """Start the streaming append of ``raw_df`` into the target table.

        Args:
            raw_df: Streaming DataFrame to persist.

        Returns:
            The handle returned by ``toTable`` for the started query
            (named ``invoices_raw_stream``).
        """
        invoices_raw_stream_query = (raw_df
                                     .writeStream
                                     .queryName("invoices_raw_stream")
                                     .format("delta")
                                     .option("checkpointLocation", self.checkpoint_location)
                                     .outputMode("append")
                                     .toTable(self.target_table))
        return invoices_raw_stream_query

    def process(self):
        """Run the bronze pipeline end to end and return the started query."""
        raw_df = self.ingest_raw_files()
        return self.persist_raw_data(raw_df)


In [0]:

class InvoicesSilverLayer:
    """Silver layer: aggregate raw invoices into per-customer revenue and rewards.

    Pipeline (see ``process``): stream ``dev.retail_store.invoices_raw`` ->
    drop rows without a customer card -> sum amounts per ``CustomerCardNo`` ->
    MERGE every micro-batch into ``DEV.retail_store.customer_revenue``.

    Relies on the notebook-level ``spark`` session and on the star import from
    ``pyspark.sql.functions`` (``sum`` and ``expr`` below are the Spark
    column functions, not Python builtins).
    """

    def __init__(self):
        """No per-instance configuration is needed."""

    def load_raw_table(self):
        """Return a streaming DataFrame over the bronze invoices table."""
        return spark.readStream.table("dev.retail_store.invoices_raw")

    def get_quality_data(self, raw_df):
        """Keep only the rows that carry a customer card number.

        Args:
            raw_df: DataFrame with a ``CustomerCardNo`` column.

        Returns:
            ``raw_df`` filtered to rows where ``CustomerCardNo`` is not null.
        """
        return raw_df.where("CustomerCardNo is not null")

    def process_data(self, cleansed_df):
        """Aggregate invoice totals per customer.

        Args:
            cleansed_df: DataFrame with ``CustomerCardNo`` and ``TotalAmount``.

        Returns:
            One row per ``CustomerCardNo`` with ``total_amount`` (sum of
            ``TotalAmount``) and ``rewards`` (sum of 2% of each ``TotalAmount``).
        """
        final_df = (cleansed_df
                    .groupBy("CustomerCardNo")
                    .agg(sum("TotalAmount").alias("total_amount"),
                         sum(expr("TotalAmount*0.02")).alias("rewards")))

        return final_df

    def upsert(self, df, batchId):
        """foreachBatch handler: MERGE one micro-batch into the revenue table.

        Args:
            df: Micro-batch DataFrame handed over by ``foreachBatch``.
            batchId: Micro-batch id supplied by ``foreachBatch`` (unused).
        """
        df.createOrReplaceTempView("customer_revenue_silver")
        merge_statement = """
         merge into DEV.retail_store.customer_revenue t
         using customer_revenue_silver s
         on t.CustomerCardNo = s.CustomerCardNo
         when matched then update set t.total_amount = s.total_amount, t.rewards = s.rewards
         when not matched then insert *
        """
        # Run the MERGE on the session that owns the micro-batch (and therefore
        # the temp view). Use the public DataFrame.sparkSession property
        # (PySpark 3.3+) instead of the private py4j handle df._jdf, which is
        # an implementation detail of the classic JVM-backed DataFrame.
        df.sparkSession.sql(merge_statement)

    def load_data(self, final_df):
        """Start the streaming query that upserts the aggregates.

        Args:
            final_df: Streaming aggregate produced by ``process_data``.

        Returns:
            The started query, named ``customer_revenue_silver_stream``.
        """
        sQuery = (final_df
                  .writeStream
                  .queryName("customer_revenue_silver_stream")
                  .format("delta")
                  .option("checkpointLocation", "/Volumes/dev/retail_store/checkpoint_customer_revenue")
                  # Spark uses the state store to keep the aggregate results of a
                  # micro-batch so that later micro-batches can build on them.
                  # NOTE(review): the original note recommended the RocksDB state
                  # store over the default one for efficiency, but nothing in this
                  # notebook configures a provider - confirm where it is set.
                  # Update mode sends only the new and changed records to the sink.
                  .outputMode("update")
                  .foreachBatch(self.upsert)
                  .start())
        return sQuery

    def process(self):
        """Run the silver pipeline end to end and return the started query."""
        raw_df = self.load_raw_table()
        cleansed_df = self.get_quality_data(raw_df)
        final_df = self.process_data(cleansed_df)
        return self.load_data(final_df)


In [0]:
# Start the bronze stream (landing-zone JSON -> dev.retail_store.invoices_raw).
# The query handle is kept in a notebook global so a later cell can stop it.
bronze_layer = InvoiceBronzeLeyer()
invoices_raw_stream_query = bronze_layer.process()

In [0]:
# Start the silver stream (invoices_raw -> per-customer MERGE into customer_revenue).
# The query handle is kept in a notebook global so a later cell can stop it.
silver_layer = InvoicesSilverLayer()
invoices_silver_stream_query = silver_layer.process()

In [0]:
# Stop the silver streaming query started by silver_layer.process().
invoices_silver_stream_query.stop()

In [0]:
# Stop the bronze streaming query started by bronze_layer.process().
invoices_raw_stream_query.stop()

In [0]:
%sql
-- Inspect the rows the bronze stream appended to the raw invoices table.
select * from dev.retail_store.invoices_raw;


InvoiceNumber,CreatedTime,StoreID,PosID,CashierID,CustomerType,CustomerCardNo,TotalAmount,NumberOfItems,PaymentMethod,TaxableAmount,CGST,SGST,CESS,DeliveryType,DeliveryAddress,InvoiceLineItems,input_file_name
94201418,1595689270697,STR7443,POS365,OAS845,PRIME,8790333340,9000.0,4,CASH,9000.0,225.0,225.0,11.25,HOME-DELIVERY,"List(House No 383, 4427 Pellentesque Rd., Bokaro Steel City, Jharkhand, 509723, 1442202063)","List(List(258, Closet, 1687.0, 2, 3374.0), List(538, Grandmother clock, 1301.0, 1, 1301.0), List(528, Projection clock, 2365.0, 1, 2365.0), List(673, Dough scraper, 980.0, 2, 1960.0))",dbfs:/Volumes/dev/retail_store/landing_zone/invoices-3.json
8749479,1595689270697,STR5864,POS872,OAS287,PRIME,7589671731,7646.0,4,CASH,7646.0,191.15,191.15,9.5575,HOME-DELIVERY,"List(1852 Est St., Imphal, Manipur, 120023, 6124913142)","List(List(593, Hanging curtains, 1896.0, 2, 3792.0), List(308, Butterfly chair, 857.0, 2, 1714.0), List(383, Innerspring Mattress, 655.0, 1, 655.0), List(423, Quilt, 1485.0, 1, 1485.0))",dbfs:/Volumes/dev/retail_store/landing_zone/invoices-3.json
91509413,1595689270798,STR2629,POS253,OAS737,NONPRIME,2461788838,7453.0,3,CARD,7453.0,186.325,186.325,9.31625,HOME-DELIVERY,"List(House No 740, 6689 Tempor Av., Bharatpur, Rajasthan, 932264, 8563079826)","List(List(528, Projection clock, 2365.0, 2, 4730.0), List(503, Chef's knife, 1973.0, 1, 1973.0), List(653, Browning tray, 375.0, 2, 750.0))",dbfs:/Volumes/dev/retail_store/landing_zone/invoices-3.json
54315437,1595689270798,STR5864,POS872,OAS287,PRIME,7589671731,4006.0,2,CASH,4006.0,100.15,100.15,5.0075,HOME-DELIVERY,"List(Flat No. #335-7984 Senectus Rd., Pali, Rajasthan, 900530, 6125618251)","List(List(238, Dining table, 1582.0, 2, 3164.0), List(273, Bedroom set, 842.0, 1, 842.0))",dbfs:/Volumes/dev/retail_store/landing_zone/invoices-3.json
84526449,1595689270798,STR2952,POS152,OAS329,PRIME,3027514652,3288.0,1,CASH,3288.0,82.2,82.2,4.11,TAKEAWAY,,"List(List(458, Wine glass, 1644.0, 2, 3288.0))",dbfs:/Volumes/dev/retail_store/landing_zone/invoices-3.json
4014389,1595689270898,STR5494,POS353,OAS969,NONPRIME,8189067868,1894.0,1,CASH,1894.0,47.35,47.35,2.3675,TAKEAWAY,,"List(List(268, Floating shelf, 1894.0, 1, 1894.0))",dbfs:/Volumes/dev/retail_store/landing_zone/invoices-3.json
36524242,1595689270898,STR3781,POS129,OAS311,PRIME,4692642935,1955.0,1,CASH,1955.0,48.875,48.875,2.44375,HOME-DELIVERY,"List(8612 Non Rd., Guna, Madhya Pradesh, 210683, 8336980338)","List(List(643, Blow torch, 1955.0, 1, 1955.0))",dbfs:/Volumes/dev/retail_store/landing_zone/invoices-3.json
35058148,1595689270898,STR1534,POS135,OAS285,PRIME,5582740626,2944.0,3,CARD,2944.0,73.60000000000001,73.60000000000001,3.68,TAKEAWAY,,"List(List(668, Crab cracker, 785.0, 1, 785.0), List(658, Chinois, 567.0, 1, 567.0), List(633, Cafe Curtains, 796.0, 2, 1592.0))",dbfs:/Volumes/dev/retail_store/landing_zone/invoices-3.json
95067626,1595689270999,STR2629,POS172,OAS622,NONPRIME,7829975914,2297.0,2,CASH,2297.0,57.425,57.425,2.87125,HOME-DELIVERY,"List(7409 Laoreet Rd., Jammu, Jammu and Kashmir, 697806, 8335722151)","List(List(233, Coffee table, 1055.0, 1, 1055.0), List(408, Confidante, 1242.0, 1, 1242.0))",dbfs:/Volumes/dev/retail_store/landing_zone/invoices-3.json
17921881,1595689270999,STR1955,POS324,OAS183,NONPRIME,3916555911,773.0,1,CASH,773.0,19.325000000000003,19.325000000000003,0.96625,HOME-DELIVERY,"List(529-4520 Libero. Ave, Raigarh, Chhattisgarh, 183678, 3057906681)","List(List(398, Latex Mattress, 773.0, 1, 773.0))",dbfs:/Volumes/dev/retail_store/landing_zone/invoices-3.json


In [0]:
%sql
-- Inspect the per-customer totals and rewards maintained by the silver stream's MERGE.
select * from DEV.retail_store.customer_revenue;

CustomerCardNo,total_amount,rewards
3716602332,64129.0,1282.58
7829975914,66839.0,1336.78
5576072500,71734.0,1434.68
2325763742,55277.0,1105.54
6733153948,41798.0,835.96
9316477281,31979.0,639.58
7543202868,21488.0,429.76
2502121621,48412.0,968.24
8189067868,27166.0,543.3199999999999
6797767929,42928.0,858.5600000000001
