In [0]:
%run ./07-medallion-approach

In [0]:
# Start from a clean slate: delete the medallion data directory.
# The second argument (True) asks dbutils to delete recursively.
dbutils.fs.rm("/FileStore/data_spark_streaming/data/medallion", True)

True

In [0]:
class medallionApproachTestSuite():
    """End-to-end test suite for the Bronze -> Silver streaming pipeline.

    The suite drops the target tables, feeds invoice JSON files into the
    landing directory one at a time, waits for the streams to pick them up,
    and checks the row count of ``invoice_line_items`` after each file.

    It relies on ``spark`` and ``dbutils`` being available as notebook
    globals, and on ``Bronze`` / ``Silver`` classes whose ``process()``
    returns an object with a ``stop()`` method (presumably supplied by the
    notebook loaded through ``%run`` -- confirm against that notebook).
    """

    def __init__(self):
        """Set the root directory under which all test data and checkpoints live."""
        self.base_data_dir = "/FileStore/data_spark_streaming"

    def cleanTests(self):
        """Drop the pipeline tables and remove their storage, checkpoints and data.

        Leaves an empty ``data/medallion/invoices`` landing directory behind so
        that ingestion can start immediately afterwards.
        """
        print("Starting Cleanup...", end="")
        tables = ("invoices_bz", "invoice_line_items")

        for table in tables:
            spark.sql(f"drop table if exists {table}")
        # Dropping the table is followed by an explicit removal of its
        # warehouse directory so no stale files survive between runs.
        for table in tables:
            dbutils.fs.rm(f"/user/hive/warehouse/{table}", True)

        for table in tables:
            dbutils.fs.rm(f"{self.base_data_dir}/checkpoint/{table}", True)

        dbutils.fs.rm(f"{self.base_data_dir}/data/medallion/invoices_archive", True)
        dbutils.fs.rm(f"{self.base_data_dir}/data/medallion/invoices", True)
        dbutils.fs.mkdirs(f"{self.base_data_dir}/data/medallion/invoices")

        print("Done")

    def ingestData(self, itr):
        """Copy ``invoices_<itr>.json`` into the landing directory.

        Args:
            itr: Suffix of the invoice file to ingest (e.g. ``1`` for
                ``invoices_1.json``).
        """
        print("\tStarting Ingestion...", end='')
        dbutils.fs.cp(
            f"{self.base_data_dir}/data/invoices_{itr}.json",
            f"{self.base_data_dir}/data/medallion/invoices/",
        )
        print("Done")

    def assertResult(self, expected_count):
        """Assert that ``invoice_line_items`` holds exactly ``expected_count`` rows.

        Args:
            expected_count: Expected total number of rows in the table.

        Raises:
            AssertionError: If the actual row count differs.
        """
        print("\tStarting validation...", end='')
        actual_count = spark.sql("select count(*) from invoice_line_items").collect()[0][0]
        assert expected_count == actual_count, f"Test failed! actual count is {actual_count}"
        print("Done")

    def waitForMicroBatch(self, sleep=30):
        """Block for ``sleep`` seconds to give the streams time to process new data.

        Args:
            sleep: Number of seconds to wait. Defaults to 30.
        """
        import time
        print(f"\tWaiting for {sleep} seconds...", end='')
        time.sleep(sleep)
        print("Done.")

    def runTests(self):
        """Run the full scenario: cleanup, three ingest/validate rounds, archive check.

        The streaming queries are always stopped, even when a validation fails,
        so a failing test does not leave streams running in the background.

        Raises:
            AssertionError: If a row-count check or the archive check fails.
        """
        self.cleanTests()

        # (label used in the log line, invoice file suffix, expected total row
        # count in invoice_line_items once that file has been processed)
        iterations = (
            ("first", 1, 1253),
            ("second", 2, 2510),
            ("third", 3, 3994),
        )

        started_queries = []
        try:
            # Bronze is started before Silver, matching the pipeline order.
            started_queries.append(Bronze().process())
            started_queries.append(Silver().process())

            for label, itr, expected_count in iterations:
                print(f"Testing {label} iteration of invoice stream...")
                self.ingestData(itr)
                self.waitForMicroBatch()
                self.assertResult(expected_count)
                print("Validation passed.\n")
        finally:
            # Stop whatever was started, regardless of how the block exited.
            for query in started_queries:
                query.stop()

        print("Validating Archive...", end="")
        archived_expected = ["invoices_1.json", "invoices_2.json"]
        # The listed path is the archive root followed by the full source
        # directory path; presumably this mirrors how the Bronze stream lays
        # out archived source files -- confirm against its configuration.
        archived_files = dbutils.fs.ls(f"{self.base_data_dir}/data/medallion/invoices_archive/{self.base_data_dir}/data/medallion/invoices")
        archived_file_names = [f.name for f in archived_files]
        # Only checks that nothing unexpected was archived; it does not
        # require every expected file to be present.
        for file_name in archived_file_names:
            assert file_name in archived_expected, f"Archive Validation failed for {file_name}"
        print("Done")



In [0]:
# Build the test suite and run the whole scenario. A failed check surfaces as
# an AssertionError raised from runTests().
maTS = medallionApproachTestSuite()
maTS.runTests()

Starting Cleanup...Done

Starting Bronze Stream...Done

Starting Silver Stream...Done.

Testing first iteration of invoice stream...
	Starting Ingestion...Done
	Waiting for 30 seconds...Done.
	Starting validation...Done
Validation passed.

Testing second iteration of invoice stream...
	Starting Ingestion...Done
	Waiting for 30 seconds...Done.
	Starting validation...Done
Validation passed.

Testing third iteration of invoice stream...
	Starting Ingestion...Done
	Waiting for 30 seconds...Done.
	Starting validation...Done
Validation passed.

Validating Archive...Done
