In [1]:
# findspark locates the local Spark installation and makes pyspark importable,
# so init() has to run before pyspark is imported further down in this notebook.
import findspark
findspark.init()

In [2]:
# Execute the companion notebook inside this kernel so the names it defines are
# available here.
# NOTE(review): Bronze and Silver are used by the test suite but are not defined
# in this notebook, so they presumably come from this %run -- confirm.
%run ./07-medallion-approach.ipynb

In [3]:
import shutil
import os
import time
from pyspark.sql import SparkSession

In [4]:
# Spark session used by the test suite: local master using all cores ("local[*]"),
# application name "MedallionApproachTest".
# NOTE(review): if the %run notebook above already created a session,
# getOrCreate() presumably hands back that same session -- confirm.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("MedallionApproachTest")
    .getOrCreate()
)
# Inactive alternative kept for reference: same app name, no explicit master,
# plus .config("spark.jars.packages", "io.delta:delta-core_2.12:2.1.0").


In [7]:
class medallionApproachTestSuite():
    """End-to-end test for the Bronze -> Silver invoice streams.

    The suite copies invoice JSON files one at a time into the directory
    ``<base_data_dir>/data/medallion/invoices``, waits a fixed amount of time,
    and then checks the total row count of the ``invoice_line_items`` table.

    This class relies on names it does not define itself: a module-level
    ``spark`` session, and ``Bronze`` / ``Silver`` classes whose ``process()``
    method returns an object exposing ``stop()``.
    """

    # One entry per test iteration:
    # (label used in the progress message, invoice file number,
    #  expected total row count of invoice_line_items after that file is ingested)
    _ITERATIONS = (
        ("first", 1, 1253),
        ("second", 2, 2510),
        ("third", 3, 3994),
    )

    def __init__(self):
        # Root folder holding the invoices-<n>.json input files and the
        # checkpoint / data sub-directories used during the run.
        self.base_data_dir = "data/invoices"
        # Folder whose per-table sub-directories are wiped during cleanup.
        self.spark_warehouse_dir = "spark-warehouse"

    @staticmethod
    def _remove_dir(path):
        """Recursively delete ``path`` if it exists; do nothing otherwise."""
        if os.path.exists(path):
            shutil.rmtree(path)

    def cleanTests(self):
        """Reset tables and directories so the run starts from a clean state.

        Drops the ``invoices_bz`` and ``invoice_line_items`` tables, deletes
        their checkpoint and spark-warehouse directories, deletes the archive
        and input directories, and finally recreates an empty input directory.
        """
        print("Starting Cleanup...", end="")
        spark.sql("DROP TABLE IF EXISTS invoices_bz")
        spark.sql("DROP TABLE IF EXISTS invoice_line_items")

        self._remove_dir(f"{self.base_data_dir}/checkpoint/invoices_bz")
        self._remove_dir(f"{self.base_data_dir}/checkpoint/invoice_line_items")
        self._remove_dir(f"{self.base_data_dir}/data/medallion/invoices_archive")
        self._remove_dir(f"{self.base_data_dir}/data/medallion/invoices")

        # DROP TABLE alone may leave the table directories behind, so remove
        # them from the warehouse folder explicitly as well.
        print("Start removing tables in spark-warehouse")
        self._remove_dir(f"{self.spark_warehouse_dir}/invoices_bz")
        self._remove_dir(f"{self.spark_warehouse_dir}/invoice_line_items")
        print("Done removing tables in spark-warehouse")

        # Recreate the (now empty) directory that ingestData() copies into.
        os.makedirs(f"{self.base_data_dir}/data/medallion/invoices", exist_ok=True)
        print("Done")

    def ingestData(self, itr):
        """Copy ``invoices-<itr>.json`` from the base dir into the input directory.

        Args:
            itr: Number identifying which invoice file to copy.

        Raises:
            FileNotFoundError: If the source file does not exist.
        """
        print("\tStarting Ingestion...", end='')
        shutil.copy(
            f"{self.base_data_dir}/invoices-{itr}.json",
            f"{self.base_data_dir}/data/medallion/invoices/",
        )
        print("Done")

    def assertResult(self, expected_count):
        """Assert that ``invoice_line_items`` holds exactly ``expected_count`` rows.

        Args:
            expected_count: Expected result of ``count(*)`` on the table.

        Raises:
            AssertionError: If the actual row count differs.
        """
        print("\tStarting validation...", end='')
        actual_count = spark.sql("select count(*) from invoice_line_items").collect()[0][0]
        assert expected_count == actual_count, f"Test failed! actual count is {actual_count}"
        print("Done")

    def waitForMicroBatch(self, sleep=30):
        """Block for ``sleep`` seconds.

        This is a fixed delay only; nothing here verifies that the streams
        actually processed anything during the wait.

        Args:
            sleep: Number of seconds to wait (default 30).
        """
        print(f"\tWaiting for {sleep} seconds...", end='')
        time.sleep(sleep)
        print("Done.")

    def runTests(self):
        """Run the whole scenario: cleanup, start streams, ingest/verify, check archive.

        The streaming queries are stopped in a ``finally`` block, so a failed
        ingestion or assertion no longer leaves them running in the kernel.

        Raises:
            AssertionError: If a row-count or archive check fails.
        """
        self.cleanTests()

        # Track every query that was successfully started so that exactly those
        # are stopped, even if a later start-up step or a validation fails.
        started_queries = []
        try:
            started_queries.append(Bronze().process())
            started_queries.append(Silver().process())

            for label, itr, expected_count in self._ITERATIONS:
                print(f"Testing {label} iteration of invoice stream...")
                self.ingestData(itr)
                self.waitForMicroBatch()
                self.assertResult(expected_count)
                print("Validation passed.\n")
        finally:
            for query in started_queries:
                query.stop()

        print("Validating Archive...", end="")
        # NOTE(review): ingestData() copies files named "invoices-<n>.json"
        # (hyphen) while the names expected here use an underscore. The loop
        # also passes trivially when the archive directory is empty. How the
        # Bronze stream archives files is not visible from this notebook, so
        # the check is left unchanged -- confirm the intended file names.
        archived_expected = ["invoices_1.json", "invoices_2.json"]
        archived_files = os.listdir(f"{self.base_data_dir}/data/medallion/invoices_archive")
        for file_name in archived_files:
            assert file_name in archived_expected, f"Archive Validation failed for {file_name}"
        print("Done")



In [8]:
# Instantiate the suite and run the full scenario. runTests() calls
# waitForMicroBatch() three times with its default 30-second sleep, so this
# cell takes at least about 90 seconds to finish.
maTS = medallionApproachTestSuite()
maTS.runTests()

Starting Cleanup...Start removing tables in spark-warehouse
Done removing tables in spark-warehouse
Done

Starting Bronze Stream...Done

Starting Silver Stream...Done.

Testing first iteration of invoice stream...
	Starting Ingestion...Done
	Waiting for 30 seconds...Done.
	Starting validation...Done
Validation passed.

Testing second iteration of invoice stream...
	Starting Ingestion...Done
	Waiting for 30 seconds...Done.
	Starting validation...Done
Validation passed.

Testing third iteration of invoice stream...
	Starting Ingestion...Done
	Waiting for 30 seconds...Done.
	Starting validation...Done
Validation passed.

Validating Archive...Done
