In [1]:
# Execute the invoice stream notebook inside this kernel session.
# The test suite below instantiates `invoiceStream`, which is not defined in this
# notebook; presumably it is brought into scope by this %run (confirm).
%run ./03-invoice-stream.ipynb

In [2]:
import findspark
# Runs before the `pyspark` import in the next cell. findspark.init() is expected to
# make the local Spark installation importable (see the findspark docs to confirm).
findspark.init()

In [3]:
import shutil
import os
import time
from pyspark.sql import SparkSession

In [20]:
from delta import *

In [4]:
# Session used by the validation query in invoiceStreamTestSuite.assertResult.
# The Delta Lake settings stay disabled (commented out), exactly as before; they are
# kept at the position in the chain where they would have to go to take effect.
spark = (
    SparkSession.builder
    .appName("InvoiceStreamApp")
    # .config("spark.jars.packages", "io.delta:delta-core_2.12:2.1.0")
    # .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    # .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)


In [5]:
class invoiceStreamTestSuite():
    """File-driven end-to-end checks for the invoice stream.

    Each iteration copies one ``invoices-<n>.json`` file into
    ``<base_data_dir>/results/invoices``, waits a fixed time, and compares the
    row count of ``invoice_line_items`` with an expected value.

    Two names must be provided by the surrounding notebook session:
    ``spark`` (runs the validation query) and ``invoiceStream`` (the stream
    under test; it is not defined in this notebook).
    """

    # (ordinal used in the progress message, input file number,
    #  expected row count of invoice_line_items once that file is processed)
    _ITERATIONS = (
        ("first", 1, 1253),
        ("second", 2, 2510),
        ("third", 3, 3994),
    )

    def __init__(self, wait_seconds=30):
        """Create the suite.

        Args:
            wait_seconds: Seconds to pause after each ingestion before
                validating. Defaults to 30, the value that used to be
                hard-coded in ``runTests``.
        """
        self.base_data_dir = "data/invoices"
        self.wait_seconds = wait_seconds

    def cleanTests(self):
        """Delete leftovers of earlier runs and recreate an empty target directory."""
        print("Starting Cleanup...", end='')
        # Memory table cleanup is not required as it is temporary and will be reset on application restart

        # Remove files and directories
        for stale_dir in (f"{self.base_data_dir}/checkpoint",
                          f"{self.base_data_dir}/results/invoices"):
            if os.path.exists(stale_dir):
                shutil.rmtree(stale_dir)

        os.makedirs(f"{self.base_data_dir}/results/invoices")
        print("Done\n")

    def ingestData(self, itr):
        """Copy ``invoices-<itr>.json`` from the base directory into ``results/invoices``.

        Args:
            itr: Number that selects the input file.

        Raises:
            FileNotFoundError: If the input file does not exist (from ``shutil.copy``).
        """
        print("\tStarting Ingestion...", end='')
        shutil.copy(f"{self.base_data_dir}/invoices-{itr}.json", f"{self.base_data_dir}/results/invoices/invoices-{itr}.json")
        print("Done")

    def assertResult(self, expected_count):
        """Compare the current row count of ``invoice_line_items`` with ``expected_count``.

        Args:
            expected_count: Row count the table must have.

        Raises:
            AssertionError: If the counts differ.
        """
        print("\tStarting validation...", end='')
        actual_count = spark.sql("select count(*) from invoice_line_items").collect()[0][0]
        print(f"Expected count: {expected_count},\n Actual count: {actual_count}")
        assert expected_count == actual_count, f"Test failed! actual count is {actual_count}"
        print("Done")

    def waitForMicroBatch(self, sleep=30):
        """Block for ``sleep`` seconds so the stream has time to process new input."""
        print(f"\tWaiting for {sleep} seconds...", end='')
        time.sleep(sleep)
        print("Done.")

    def runTests(self):
        """Run every iteration in ``_ITERATIONS`` against a freshly started stream.

        The streaming query is always stopped before this method returns or
        raises, so a failed validation no longer leaves it running in the kernel.

        Raises:
            AssertionError: If a row count does not match its expected value.
        """
        self.cleanTests()
        iStream = invoiceStream()
        streamQuery = iStream.process()

        try:
            for ordinal, file_no, expected_count in self._ITERATIONS:
                print(f"Testing {ordinal} iteration of invoice stream...")
                self.ingestData(file_no)
                self.waitForMicroBatch(self.wait_seconds)
                self.assertResult(expected_count)
                print("Validation passed.\n")
        finally:
            # Stop on failure too; otherwise the query keeps running after the
            # cell has errored out.
            streamQuery.stop()

In [6]:

# # COMMAND ----------

# Instantiate the suite and run its ingestion/validation iterations.
# The suite pauses 30 seconds per iteration by default, so this cell is slow to finish.
isTS = invoiceStreamTestSuite()
isTS.runTests()	

Starting Cleanup...Done

Starting Invoice Processing Stream...Done.

Testing first iteration of invoice stream...
	Starting Ingestion...Done
	Waiting for 30 seconds...Done.
	Starting validation...Expected count: 1253,
 Actual count: 1253
Done
Validation passed.

Testing second iteration of invoice stream...
	Starting Ingestion...Done
	Waiting for 30 seconds...Done.
	Starting validation...Expected count: 2510,
 Actual count: 2510
Done
Validation passed.

Testing third iteration of invoice stream...
	Starting Ingestion...Done
	Waiting for 30 seconds...Done.
	Starting validation...Expected count: 3994,
 Actual count: 3994
Done
Validation passed.

