In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

class InvoiceBronzeLeyer:
    """Bronze layer: stream raw invoice JSON files into a Delta table.

    Files are read from ``landing_zone`` with an explicit schema, archived once
    processed, and appended unchanged to ``dev.retail_store.invoices_raw``.

    The class name (including its spelling) is kept as-is because other cells
    instantiate it by this name.
    """

    # Where processed source files are moved by the file source (cleanSource=archive).
    ARCHIVE_DIR = '/Volumes/dev/retail_store/invoices_archive'
    # Checkpoint directory that tracks the progress of the bronze stream.
    CHECKPOINT_LOCATION = '/Volumes/dev/retail_store/checkpoint_invoices_raw'
    # Delta table receiving the raw invoices.
    TARGET_TABLE = 'dev.retail_store.invoices_raw'
    # Name under which the streaming query is registered.
    QUERY_NAME = 'invoices_raw_stream'
    # Number of new files picked up per micro-batch.
    MAX_FILES_PER_TRIGGER = 1

    def __init__(self):
        """Set the landing-zone path and build the invoice JSON schema."""
        self.landing_zone = '/Volumes/dev/retail_store/landing_zone'

        # Every field is nullable (the StructField default).
        self.delivery_add_schema = StructType([
            StructField("AddressLine", StringType()),
            StructField("City", StringType()),
            StructField("State", StringType()),
            StructField("PinCode", StringType()),
            StructField("ContactNumber", StringType()),
        ])
        self.line_item_schema = StructType([
            StructField("ItemCode", StringType()),
            StructField("ItemDescription", StringType()),
            StructField("ItemPrice", DoubleType()),
            StructField("ItemQty", IntegerType()),
            StructField("TotalValue", DoubleType()),
        ])
        self.invoice_schema = StructType([
            StructField("InvoiceNumber", StringType()),
            StructField("CreatedTime", LongType()),
            StructField("StoreID", StringType()),
            StructField("PosID", StringType()),
            StructField("CashierID", StringType()),
            StructField("CustomerType", StringType()),
            # Card numbers such as 4629185211 exceed the 32-bit IntegerType
            # maximum (2147483647), so a 64-bit type is required to parse them.
            # NOTE(review): assumes the JSON carries this value as an unquoted
            # number; if the feed quotes it, StringType is needed - confirm.
            StructField("CustomerCardNo", LongType()),
            StructField("TotalAmount", DoubleType()),
            StructField("NumberOfItems", IntegerType()),
            StructField("PaymentMethod", StringType()),
            StructField("TaxableAmount", DoubleType()),
            StructField("CGST", DoubleType()),
            StructField("SGST", DoubleType()),
            StructField("CESS", DoubleType()),
            StructField("DeliveryType", StringType()),
            StructField("DeliveryAddress", self.delivery_add_schema),
            StructField("InvoiceLineItems", ArrayType(self.line_item_schema)),
        ])

    def ingest_raw_files(self):
        """Return a streaming DataFrame over the JSON files in the landing zone.

        Processed files are moved to ``ARCHIVE_DIR``. ``maxFilesPerTrigger`` is
        an option of the file source, so it is set here on the reader; when set
        on the writer it does not limit how many files a micro-batch reads.
        """
        return (
            spark.readStream
            .schema(self.invoice_schema)
            .format("json")
            .option("maxFilesPerTrigger", self.MAX_FILES_PER_TRIGGER)
            .option("cleanSource", "archive")
            .option("sourceArchiveDir", self.ARCHIVE_DIR)
            .load(self.landing_zone)
        )

    def persist_raw_data(self, raw_df):
        """Start appending ``raw_df`` to the bronze Delta table.

        Args:
            raw_df: streaming DataFrame produced by ``ingest_raw_files``.

        Returns:
            The handle returned by ``toTable`` for the started streaming query.
        """
        return (
            raw_df.writeStream
            .queryName(self.QUERY_NAME)
            .format("delta")
            .option("checkpointLocation", self.CHECKPOINT_LOCATION)
            .outputMode("append")
            .toTable(self.TARGET_TABLE)
        )

    def process(self):
        """Run the bronze pipeline end to end and return the streaming query handle."""
        raw_df = self.ingest_raw_files()
        return self.persist_raw_data(raw_df)


In [0]:
class InvoicesSilverLayer:
    """Silver layer: turn raw invoices into one row per invoice line item.

    Streams from dev.retail_store.invoices_raw, keeps rows that have an invoice
    number and line items, explodes the line items, renames the invoice and
    item columns to snake_case, and appends to DEV.retail_store.invoices.
    """

    # Columns kept after the line items are exploded, in output order.
    _FLATTENED_COLUMNS = (
        "InvoiceNumber", "CreatedTime", "StoreID", "PosID", "CustomerType",
        "PaymentMethod", "DeliveryType",
        "DeliveryAddress.city", "DeliveryAddress.state", "DeliveryAddress.pincode",
        "InvoiceLineItem.ItemCode", "InvoiceLineItem.ItemDescription",
        "InvoiceLineItem.ItemPrice", "InvoiceLineItem.ItemQty",
        "InvoiceLineItem.TotalValue",
    )

    # (current name, snake_case name) pairs applied to the flattened frame.
    _SNAKE_CASE_NAMES = (
        ("InvoiceNumber", "invoice_number"),
        ("CreatedTime", "created_time"),
        ("StoreID", "store_id"),
        ("PosID", "pos_id"),
        ("CustomerType", "customer_type"),
        ("PaymentMethod", "payment_method"),
        ("DeliveryType", "delivery_type"),
        ("ItemCode", "item_code"),
        ("ItemDescription", "item_description"),
        ("ItemPrice", "item_price"),
        ("ItemQty", "item_qty"),
        ("TotalValue", "total_value"),
    )

    def __init__(self):
        """No instance state is needed."""

    def load_raw_table(self):
        """Return a streaming DataFrame over the bronze invoices table."""
        stream_reader = spark.readStream
        return stream_reader.table("dev.retail_store.invoices_raw")

    def get_quality_data(self, raw_df):
        """Keep only rows that have both an invoice number and line items."""
        with_invoice_number = raw_df.filter("InvoiceNumber is not null")
        return with_invoice_number.filter("InvoiceLineItems is not null")

    def process_data(self, cleansed_df):
        """Explode the line items, project the kept columns and rename them."""
        line_items_df = cleansed_df.withColumn(
            "InvoiceLineItem", explode(col("InvoiceLineItems"))
        )
        flattened_df = line_items_df.select(*self._FLATTENED_COLUMNS)
        for current_name, snake_name in self._SNAKE_CASE_NAMES:
            flattened_df = flattened_df.withColumnRenamed(current_name, snake_name)
        return flattened_df

    def load_data(self, final_df):
        """Start appending ``final_df`` to the silver table; return the query handle."""
        silver_writer = (
            final_df.writeStream
            .queryName("invoices_silver_stream")
            .format("delta")
            .option("checkpointLocation", "/Volumes/dev/retail_store/checkpoint_invoices")
            .outputMode("append")
        )
        return silver_writer.toTable("DEV.retail_store.invoices")

    def process(self):
        """Run the silver pipeline end to end and return the streaming query handle."""
        quality_df = self.get_quality_data(self.load_raw_table())
        return self.load_data(self.process_data(quality_df))


In [0]:
# Start the bronze stream (landing-zone JSON files -> dev.retail_store.invoices_raw).
# The handle returned by process() is kept so that a later cell can stop the stream.
bronze_layer = InvoiceBronzeLeyer()
invoices_raw_stream_query = bronze_layer.process()


In [0]:
# Start the silver stream (dev.retail_store.invoices_raw -> DEV.retail_store.invoices).
# The cell that defines InvoicesSilverLayer must have been executed first;
# otherwise this cell fails with NameError.
silver_layer = InvoicesSilverLayer()
invoices_silver_stream_query = silver_layer.process()

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-2429001055567067>, line 1[0m
[0;32m----> 1[0m silver_layer [38;5;241m=[39m InvoicesSilverLayer()
[1;32m      2[0m invoices_silver_stream_query [38;5;241m=[39m silver_layer[38;5;241m.[39mprocess()

[0;31mNameError[0m: name 'InvoicesSilverLayer' is not defined

In [0]:
# Stop the bronze and silver streams by their query names instead of through
# the notebook variables: when an earlier cell fails, a handle such as
# invoices_silver_stream_query is never assigned and calling .stop() on it
# raises NameError. Queries that are not running are simply not listed.
_INVOICE_STREAM_NAMES = {"invoices_raw_stream", "invoices_silver_stream"}
for _active_query in spark.streams.active:
    if _active_query.name in _INVOICE_STREAM_NAMES:
        _active_query.stop()

In [0]:
%sql
-- Inspect the raw invoice rows in the bronze table (target of the invoices_raw_stream query).
select * from  dev.retail_store.invoices_raw ;

InvoiceNumber,CreatedTime,StoreID,PosID,CashierID,CustomerType,CustomerCardNo,TotalAmount,NumberOfItems,PaymentMethod,TaxableAmount,CGST,SGST,CESS,DeliveryType,DeliveryAddress,InvoiceLineItems,input_file_name
51402977,1595688900348,STR7188,POS956,OAS134,PRIME,4629185211,11114.0,4,CARD,11114.0,277.85,277.85,13.8925,TAKEAWAY,,"List(List(458, Wine glass, 1644.0, 2, 3288.0), List(283, Portable Lamps, 2236.0, 1, 2236.0), List(498, Carving knifes, 1424.0, 2, 2848.0), List(523, Oil-lamp clock, 1371.0, 2, 2742.0))",dbfs:/Volumes/dev/retail_store/landing_zone/invoices-1.json
91372973,1595688901219,STR8513,POS163,OAS961,PRIME,2762345282,8272.0,4,CARD,8272.0,206.8,206.8,10.34,HOME-DELIVERY,"List(444-1842 Dui. Rd., Shivapuri, Madhya Pradesh, 561012, 7243866404)","List(List(413, Slipcover, 1896.0, 1, 1896.0), List(483, Teacups and saucers, 1781.0, 1, 1781.0), List(583, Tab Top Curtains, 1329.0, 1, 1329.0), List(558, Balloon clock, 1633.0, 2, 3266.0))",dbfs:/Volumes/dev/retail_store/landing_zone/invoices-1.json
93647513,1595688902254,STR8513,POS881,OAS354,NONPRIME,2599848717,3374.0,1,CARD,3374.0,84.35000000000001,84.35000000000001,4.2175,TAKEAWAY,,"List(List(258, Closet, 1687.0, 2, 3374.0))",dbfs:/Volumes/dev/retail_store/landing_zone/invoices-1.json
8320594,1595688902254,STR7188,POS825,OAS329,PRIME,7051101351,5824.0,3,CASH,5824.0,145.6,145.6,7.28,HOME-DELIVERY,"List(2465 Laoreet, Street, Dehri, Bihar, 637308, 2662305605)","List(List(288, Hutch, 1812.0, 2, 3624.0), List(558, Balloon clock, 1633.0, 1, 1633.0), List(658, Chinois, 567.0, 1, 567.0))",dbfs:/Volumes/dev/retail_store/landing_zone/invoices-1.json
4888607,1595688903258,STR2629,POS172,OAS622,NONPRIME,7829975914,1750.0,2,CASH,1750.0,43.75,43.75,2.1875,HOME-DELIVERY,"List(7114 Eu, Rd., Ratlam, Madhya Pradesh, 925281, 4057182350)","List(List(208, Canopy bed, 508.0, 1, 508.0), List(408, Confidante, 1242.0, 1, 1242.0))",dbfs:/Volumes/dev/retail_store/landing_zone/invoices-1.json
99794761,1595688903258,STR6347,POS333,OAS697,NONPRIME,9740604930,6566.0,3,CARD,6566.0,164.15,164.15,8.2075,HOME-DELIVERY,"List(517-8912 Nulla St., Champdani, West Bengal, 680616, 8183195143)","List(List(353, Recliner, 1585.0, 2, 3170.0), List(488, Bread knife, 1077.0, 2, 2154.0), List(408, Confidante, 1242.0, 1, 1242.0))",dbfs:/Volumes/dev/retail_store/landing_zone/invoices-1.json
89101510,1595688904763,STR6347,POS994,OAS274,PRIME,6296964163,5560.0,4,CASH,5560.0,139.0,139.0,6.95,TAKEAWAY,,"List(List(493, Boning knife, 513.0, 1, 513.0), List(468, Lunch box, 1467.0, 1, 1467.0), List(348, Navy chair, 1998.0, 1, 1998.0), List(238, Dining table, 1582.0, 1, 1582.0))",dbfs:/Volumes/dev/retail_store/landing_zone/invoices-1.json
26723058,1595689028262,STR7188,POS664,OAS971,PRIME,9316477281,5235.0,3,CARD,5235.0,130.875,130.875,6.54375,HOME-DELIVERY,"List(5418 Magna. Rd., Chennai, Tamil Nadu, 386032, 6557358508)","List(List(653, Browning tray, 375.0, 1, 375.0), List(568, Pinch Pleated Curtains, 1718.0, 2, 3436.0), List(498, Carving knifes, 1424.0, 1, 1424.0))",dbfs:/Volumes/dev/retail_store/landing_zone/invoices-1.json
96837834,1595689029002,STR6162,POS835,OAS583,NONPRIME,5611417583,13481.0,4,CARD,13481.0,337.02500000000003,337.02500000000003,16.85125,TAKEAWAY,,"List(List(213, Infant bed, 1755.0, 1, 1755.0), List(528, Projection clock, 2365.0, 2, 4730.0), List(288, Hutch, 1812.0, 2, 3624.0), List(203, Bunk bed, 1686.0, 2, 3372.0))",dbfs:/Volumes/dev/retail_store/landing_zone/invoices-1.json
78302645,1595689029506,STR8513,POS155,OAS558,NONPRIME,2389018842,6543.0,3,CASH,6543.0,163.57500000000002,163.57500000000002,8.17875,HOME-DELIVERY,"List(Flat No. #210-902 Neque Street, South Dum Dum, West Bengal, 504795, 7508353683)","List(List(533, Rolling ball clock, 1651.0, 2, 3302.0), List(538, Grandmother clock, 1301.0, 1, 1301.0), List(278, Dining set, 1940.0, 1, 1940.0))",dbfs:/Volumes/dev/retail_store/landing_zone/invoices-1.json


In [0]:
%sql
-- Inspect the flattened invoice line items in the silver table (one row per line item).
select * from dev.retail_store.invoices;

invoice_number,created_time,store_id,pos_id,customer_type,payment_method,Delivery_type,city,state,pincode,item_code,item_description,item_price,item_qty,total_value
94201418,1595689270697,STR7443,POS365,PRIME,CASH,HOME-DELIVERY,Bokaro Steel City,Jharkhand,509723.0,258,Closet,1687.0,2,3374.0
94201418,1595689270697,STR7443,POS365,PRIME,CASH,HOME-DELIVERY,Bokaro Steel City,Jharkhand,509723.0,538,Grandmother clock,1301.0,1,1301.0
94201418,1595689270697,STR7443,POS365,PRIME,CASH,HOME-DELIVERY,Bokaro Steel City,Jharkhand,509723.0,528,Projection clock,2365.0,1,2365.0
94201418,1595689270697,STR7443,POS365,PRIME,CASH,HOME-DELIVERY,Bokaro Steel City,Jharkhand,509723.0,673,Dough scraper,980.0,2,1960.0
8749479,1595689270697,STR5864,POS872,PRIME,CASH,HOME-DELIVERY,Imphal,Manipur,120023.0,593,Hanging curtains,1896.0,2,3792.0
8749479,1595689270697,STR5864,POS872,PRIME,CASH,HOME-DELIVERY,Imphal,Manipur,120023.0,308,Butterfly chair,857.0,2,1714.0
8749479,1595689270697,STR5864,POS872,PRIME,CASH,HOME-DELIVERY,Imphal,Manipur,120023.0,383,Innerspring Mattress,655.0,1,655.0
8749479,1595689270697,STR5864,POS872,PRIME,CASH,HOME-DELIVERY,Imphal,Manipur,120023.0,423,Quilt,1485.0,1,1485.0
91509413,1595689270798,STR2629,POS253,NONPRIME,CARD,HOME-DELIVERY,Bharatpur,Rajasthan,932264.0,528,Projection clock,2365.0,2,4730.0
91509413,1595689270798,STR2629,POS253,NONPRIME,CARD,HOME-DELIVERY,Bharatpur,Rajasthan,932264.0,503,Chef's knife,1973.0,1,1973.0
