In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

class InvoiceBronzeLeyer:
    """Bronze layer: stream invoice JSON files from a landing zone into a Delta table.

    The class name keeps its original spelling ("Leyer") because a later
    notebook cell instantiates it by that name.

    Every location used by the pipeline is a constructor argument whose default
    is the value that used to be hard-coded, so ``InvoiceBronzeLeyer()`` behaves
    exactly as before while other environments can pass their own paths.

    Relies on the notebook-global ``spark`` session and on the names made
    available by the wildcard ``pyspark.sql.functions`` / ``pyspark.sql.types``
    imports at the top of the notebook.
    """

    def __init__(self,
                 landing_zone='/Volumes/dev/retail_store/landing_zone',
                 archive_dir='/Volumes/dev/retail_store/invoices_archive',
                 checkpoint_location='/Volumes/dev/retail_store/checkpoint_invoices_raw',
                 target_table='dev.retail_store.invoices_raw',
                 query_name='invoices_raw_stream'):
        """Store the pipeline locations and build the explicit invoice schema.

        Args:
            landing_zone: Directory the JSON file stream is loaded from.
            archive_dir: Value passed as the ``sourceArchiveDir`` reader option.
            checkpoint_location: Value passed as the ``checkpointLocation``
                writer option.
            target_table: Table name handed to ``toTable``.
            query_name: Name given to the streaming query.
        """
        self.landing_zone = landing_zone
        self.archive_dir = archive_dir
        self.checkpoint_location = checkpoint_location
        self.target_table = target_table
        self.query_name = query_name

        # Nested struct used for the DeliveryAddress column.
        self.delivery_add_schema = StructType([
            StructField("AddressLine", StringType(), True),
            StructField("City", StringType(), True),
            StructField("State", StringType(), True),
            StructField("PinCode", StringType(), True),
            StructField("ContactNumber", StringType(), True),
        ])

        # Element type of the InvoiceLineItems array column.
        self.line_item_schema = StructType([
            StructField("ItemCode", StringType(), True),
            StructField("ItemDescription", StringType(), True),
            StructField("ItemPrice", DoubleType(), True),
            StructField("ItemQty", IntegerType(), True),
            StructField("TotalValue", DoubleType(), True),
        ])

        # Top-level invoice record; handed to the stream reader in
        # ingest_raw_files() so the reader is given the schema explicitly.
        self.invoice_schema = StructType([
            StructField("InvoiceNumber", StringType(), True),
            StructField("CreatedTime", LongType(), True),
            StructField("StoreID", StringType(), True),
            StructField("PosID", StringType(), True),
            StructField("CashierID", StringType(), True),
            StructField("CustomerType", StringType(), True),
            StructField("CustomerCardNo", StringType(), True),
            StructField("TotalAmount", DoubleType(), True),
            StructField("NumberOfItems", IntegerType(), True),
            StructField("PaymentMethod", StringType(), True),
            StructField("TaxableAmount", DoubleType(), True),
            StructField("CGST", DoubleType(), True),
            StructField("SGST", DoubleType(), True),
            StructField("CESS", DoubleType(), True),
            StructField("DeliveryType", StringType(), True),
            StructField("DeliveryAddress", self.delivery_add_schema),
            StructField("InvoiceLineItems", ArrayType(self.line_item_schema)),
        ])

    def ingest_raw_files(self):
        """Return a streaming DataFrame over the JSON files in the landing zone.

        The reader is configured with ``cleanSource=archive`` and
        ``sourceArchiveDir=self.archive_dir``, and the result gets an extra
        ``input_file_name`` column.
        """
        reader = (spark
                  .readStream
                  .schema(self.invoice_schema)
                  .format("json")
                  .option("cleanSource", "archive")
                  .option("sourceArchiveDir", self.archive_dir))

        # NOTE(review): Databricks documents `_metadata.file_path` as the
        # replacement for input_file_name() on newer runtimes - confirm before
        # upgrading; kept as-is so the stored column values do not change.
        return (reader
                .load(self.landing_zone)
                .withColumn("input_file_name", input_file_name()))

    def persist_raw_data(self, raw_df):
        """Start the append-mode Delta write of ``raw_df`` and return its query handle.

        Args:
            raw_df: Streaming DataFrame, e.g. the result of ingest_raw_files().

        Returns:
            Whatever ``toTable`` returns for the started stream; a later
            notebook cell calls ``.stop()`` on it.
        """
        return (raw_df
                .writeStream
                .queryName(self.query_name)
                .format("delta")
                .option("checkpointLocation", self.checkpoint_location)
                .outputMode("append")
                .toTable(self.target_table))

    def process(self):
        """Run the bronze pipeline: ingest the raw files, then persist them."""
        return self.persist_raw_data(self.ingest_raw_files())


In [0]:

class InvoicesSilverLayer:
    """Silver layer: aggregate raw invoices into per-customer revenue and rewards.

    Streams from the raw invoices table, drops rows without a customer card
    number, sums ``TotalAmount`` per ``CustomerCardNo`` and writes the result to
    a Delta table in ``complete`` output mode.

    Table names, checkpoint path, query name and the reward rate are
    constructor arguments whose defaults are the values that used to be
    hard-coded, so ``InvoicesSilverLayer()`` behaves exactly as before.

    Relies on the notebook-global ``spark`` session and on ``sum`` / ``expr``
    from the wildcard ``pyspark.sql.functions`` import (which shadows the
    built-in ``sum``).
    """

    def __init__(self,
                 source_table="dev.retail_store.invoices_raw",
                 target_table="DEV.retail_store.customer_revenue",
                 checkpoint_location="/Volumes/dev/retail_store/checkpoint_customer_revenue",
                 reward_rate=0.02,
                 query_name="customer_revenue_silver_stream"):
        """Store the pipeline configuration.

        Args:
            source_table: Table the stream is read from.
            target_table: Table name handed to ``toTable``.
            checkpoint_location: Value passed as the ``checkpointLocation``
                writer option.
            reward_rate: Multiplier applied to ``TotalAmount`` to compute the
                ``rewards`` column.
            query_name: Name given to the streaming query.
        """
        self.source_table = source_table
        self.target_table = target_table
        self.checkpoint_location = checkpoint_location
        self.reward_rate = reward_rate
        self.query_name = query_name

    def load_raw_table(self):
        """Return a streaming DataFrame over the source table."""
        return spark.readStream.table(self.source_table)

    def get_quality_data(self, raw_df):
        """Keep only the rows that carry a customer card number."""
        return raw_df.where("CustomerCardNo is not null")

    def process_data(self, cleansed_df):
        """Aggregate invoices per customer.

        Returns:
            A DataFrame grouped by ``CustomerCardNo`` with ``total_amount``
            (sum of ``TotalAmount``) and ``rewards`` (sum of
            ``TotalAmount * reward_rate``).
        """
        # float() makes sure only a numeric literal is interpolated into the
        # SQL expression; with the default rate this yields "TotalAmount*0.02",
        # the same text the expression was previously hard-coded to.
        rewards_expr = f"TotalAmount*{float(self.reward_rate)}"
        return (cleansed_df
                .groupBy("CustomerCardNo")
                .agg(sum("TotalAmount").alias("total_amount"),
                     sum(expr(rewards_expr)).alias("rewards")))

    def load_data(self, final_df):
        """Start the streaming write of the aggregate and return its query handle.

        Spark keeps the running aggregate in a state store so that each new
        micro-batch can build on the previous results.
        NOTE(review): the original notes recommend the RocksDB state store
        over the default one for efficiency; nothing in this class configures
        a state store provider - confirm it is set elsewhere if intended.

        ``complete`` output mode sends the whole result (new, updated and
        unchanged rows) to the sink on every trigger, so the target table is
        overwritten each time.
        """
        return (final_df
                .writeStream
                .queryName(self.query_name)
                .format("delta")
                .option("checkpointLocation", self.checkpoint_location)
                .outputMode("complete")
                .toTable(self.target_table))

    def process(self):
        """Run the silver pipeline: load, filter, aggregate, then write."""
        raw_df = self.load_raw_table()
        cleansed_df = self.get_quality_data(raw_df)
        final_df = self.process_data(cleansed_df)
        return self.load_data(final_df)


In [0]:
# Start the bronze stream. process() returns the streaming query handle, which
# is kept in a notebook-level name so a later cell can call .stop() on it.
bronze_layer = InvoiceBronzeLeyer()
invoices_raw_stream_query = bronze_layer.process()

In [0]:
# Start the silver stream. process() returns the streaming query handle, which
# is kept in a notebook-level name so a later cell can call .stop() on it.
silver_layer = InvoicesSilverLayer()
invoices_silver_stream_query = silver_layer.process()

In [0]:
# Stop the silver streaming query started in the earlier cell.
invoices_silver_stream_query.stop()

In [0]:
# Stop the bronze streaming query started in the earlier cell.
invoices_raw_stream_query.stop()

In [0]:
%sql
-- Inspect the rows in the table the bronze stream writes to.
select * from dev.retail_store.invoices_raw;


InvoiceNumber,CreatedTime,StoreID,PosID,CashierID,CustomerType,CustomerCardNo,TotalAmount,NumberOfItems,PaymentMethod,TaxableAmount,CGST,SGST,CESS,DeliveryType,DeliveryAddress,InvoiceLineItems,input_file_name
82712625,1595689201906,STR5864,POS465,OAS582,NONPRIME,1211843745,1718.0,1,CARD,1718.0,42.95,42.95,2.1475,TAKEAWAY,,"List(List(568, Pinch Pleated Curtains, 1718.0, 1, 1718.0))",dbfs:/Volumes/dev/retail_store/landing_zone/invoices-2.json
58841190,1595689202006,STR2699,POS792,OAS152,PRIME,7475384243,4341.0,2,CASH,4341.0,108.525,108.525,5.4262500000000005,HOME-DELIVERY,"List(548-8023 Vulputate, Rd., Gonda, Uttar Pradesh, 945821, 2509549312)","List(List(523, Oil-lamp clock, 1371.0, 1, 1371.0), List(423, Quilt, 1485.0, 2, 2970.0))",dbfs:/Volumes/dev/retail_store/landing_zone/invoices-2.json
46768081,1595689202160,STR2699,POS779,OAS161,PRIME,6797767929,3196.0,1,CASH,3196.0,79.9,79.9,3.995,HOME-DELIVERY,"List(Flat No. #296-6151 Neque Road, Varanasi, Uttar Pradesh, 340397, 9083194560)","List(List(518, Hourglass, 1598.0, 2, 3196.0))",dbfs:/Volumes/dev/retail_store/landing_zone/invoices-2.json
25233068,1595689260896,STR4899,POS954,OAS151,NONPRIME,3716602332,9941.0,4,CASH,9941.0,248.525,248.525,12.42625,HOME-DELIVERY,"List(Flat No. #396-1703 Ullamcorper, Street, Hassan, Karnataka, 701126, 1157976639)","List(List(208, Canopy bed, 508.0, 2, 1016.0), List(363, Rocking chair, 2021.0, 2, 4042.0), List(363, Rocking chair, 2021.0, 2, 4042.0), List(418, Wing chair, 1431.0, 2, 2862.0))",dbfs:/Volumes/dev/retail_store/landing_zone/invoices-2.json
19176672,1595689261097,STR5646,POS639,OAS167,PRIME,9711257112,3303.0,2,CARD,3303.0,82.575,82.575,4.12875,HOME-DELIVERY,"List(House No 424, 5509 Ut Street, Ganganagar, Rajasthan, 692405, 8395635840)","List(List(253, Bathroom cabinet, 1513.0, 1, 1513.0), List(218, Sofa bed, 895.0, 2, 1790.0))",dbfs:/Volumes/dev/retail_store/landing_zone/invoices-2.json
45056546,1595689261308,STR4899,POS954,OAS151,NONPRIME,3716602332,4382.0,3,CASH,4382.0,109.55,109.55,5.4775,HOME-DELIVERY,"List(Flat No. #630-2414 Enim. St., Satna, Madhya Pradesh, 991102, 8931080189)","List(List(248, TV tray table, 1947.0, 1, 1947.0), List(473, Mason jars, 1358.0, 1, 1358.0), List(488, Bread knife, 1077.0, 1, 1077.0))",dbfs:/Volumes/dev/retail_store/landing_zone/invoices-2.json
98162165,1595689261415,STR2699,POS779,OAS161,PRIME,6797767929,11613.0,4,CASH,11613.0,290.325,290.325,14.51625,HOME-DELIVERY,"List(6363 Non Rd., Darbhanga, Bihar, 570288, 5656699770)","List(List(313, Brewster Chair, 2195.0, 1, 2195.0), List(468, Lunch box, 1467.0, 2, 2934.0), List(518, Hourglass, 1598.0, 2, 3196.0), List(458, Wine glass, 1644.0, 2, 3288.0))",dbfs:/Volumes/dev/retail_store/landing_zone/invoices-2.json
43170532,1595689261817,STR5864,POS872,OAS287,PRIME,7589671731,2420.0,1,CASH,2420.0,60.5,60.5,3.025,HOME-DELIVERY,"List(735-1235 Pretium Street, Unnao, Uttar Pradesh, 103540, 4331966638)","List(List(223, Hammock, 1210.0, 2, 2420.0))",dbfs:/Volumes/dev/retail_store/landing_zone/invoices-2.json
37063494,1595689261832,STR2699,POS779,OAS161,PRIME,6797767929,9036.0,4,CASH,9036.0,225.9,225.9,11.295,HOME-DELIVERY,"List(444-1842 Dui. Rd., Shivapuri, Madhya Pradesh, 561012, 7243866404)","List(List(588, Sheer Curtains, 801.0, 2, 1602.0), List(623, Box Pleat Drapes, 1683.0, 2, 3366.0), List(228, Divan, 1083.0, 2, 2166.0), List(333, Gaming chair, 1902.0, 1, 1902.0))",dbfs:/Volumes/dev/retail_store/landing_zone/invoices-2.json
25251857,1595689261917,STR2699,POS792,OAS152,PRIME,7475384243,1259.0,1,CASH,1259.0,31.475,31.475,1.57375,HOME-DELIVERY,"List(295-7690 At Street, Shahjahanpur, Uttar Pradesh, 228410, 4624129756)","List(List(513, Flip clock, 1259.0, 1, 1259.0))",dbfs:/Volumes/dev/retail_store/landing_zone/invoices-2.json


In [0]:
%sql
-- Inspect the per-customer totals and rewards in the table the silver stream writes to.
select * from DEV.retail_store.customer_revenue;

CustomerCardNo,total_amount,rewards
2262471989,36859.0,737.18
3716602332,61743.0,1234.8599999999997
5576072500,23994.0,479.88
7829975914,38799.0,775.98
2325763742,30549.0,610.98
7543202868,47796.0,955.92
9316477281,47894.0,957.88
2502121621,30121.0,602.42
8189067868,22515.0,450.3
6797767929,30570.0,611.4000000000001
