In [14]:
# Execute the companion notebook inside this kernel.
# NOTE(review): the test suite below instantiates invoiceStreamBatch, which is not
# defined anywhere in this notebook — presumably it is defined by the notebook run here.
%run ./05-streaming-batch.ipynb

In [15]:
# NOTE(review): findspark.init() presumably makes the local Spark installation importable,
# so it should run before the pyspark import in the next cell — confirm.
import findspark
findspark.init()

In [16]:
import shutil
import os
import time
from pyspark.sql import SparkSession

In [20]:
from delta import *

In [17]:
# Obtain (or reuse) the Spark session used by the cells below.
# The Delta Lake options are currently commented out; to enable them, add these
# calls to the builder chain ahead of getOrCreate():
#     .config("spark.jars.packages", "io.delta:delta-core_2.12:2.1.0")
#     .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
#     .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
spark = (
    SparkSession.builder
    .appName("InvoiceStreamApp")
    .getOrCreate()
)


In [18]:
class streamingBatchTestSuite():
    """End-to-end checks for ``invoiceStreamBatch`` in streaming and batch mode.

    Each scenario copies ``invoices-<n>.json`` files from ``base_data_dir`` into the
    ``stream/invoices`` landing directory, waits a fixed interval, and then compares
    the row count read back from ``/tmp/invoice_line_items`` with an expected total.

    Two names are taken from the surrounding notebook rather than defined here:
    the ``spark`` session and the ``invoiceStreamBatch`` class.
    """

    def __init__(self):
        # Root directory holding the sample invoice files and the landing area.
        self.base_data_dir = "data/invoices"

    def cleanTests(self):
        """Drop the target table and reset every directory the tests write to."""
        print("Starting Cleanup...", end='')
        # Cleanup any existing tables
        spark.sql("DROP TABLE IF EXISTS invoice_line_items")

        # Remove leftovers from earlier runs: checkpoints, landing area and output.
        stale_dirs = (
            f"{self.base_data_dir}/checkpoint",
            f"{self.base_data_dir}/stream/invoices",
            "/tmp/invoice_line_items",
            "/tmp/checkpoints",
        )
        for path in stale_dirs:
            if os.path.exists(path):
                shutil.rmtree(path)

        # Recreate an empty landing directory for ingestData() to copy into.
        os.makedirs(f"{self.base_data_dir}/stream/invoices")
        print("Done\n")

    def ingestData(self, itr):
        """Copy ``invoices-<itr>.json`` from the base directory into the landing directory.

        Args:
            itr: Suffix identifying which sample file to copy.
        """
        print("\tStarting Ingestion...", end='')
        file_name = f"invoices-{itr}.json"
        shutil.copy(
            f"{self.base_data_dir}/{file_name}",
            f"{self.base_data_dir}/stream/invoices/{file_name}",
        )
        print("Done")

    def assertResult(self, expected_count):
        """Check that the output at ``/tmp/invoice_line_items`` holds ``expected_count`` rows.

        Args:
            expected_count: Row count the output is expected to contain.

        Raises:
            AssertionError: If the row count read back differs from ``expected_count``.
        """
        print("\tStarting validation...", end='')
        # Read the Parquet output for validation
        actual_count = spark.read.parquet("/tmp/invoice_line_items").count()
        print(f"Expected count: {expected_count},\n Actual count: {actual_count}")
        assert expected_count == actual_count, f"Test failed! actual count is {actual_count}"
        print("Done")

    def waitForMicroBatch(self, sleep=30):
        """Block for ``sleep`` seconds.

        NOTE(review): this is a fixed delay, not a synchronisation with the job;
        presumably it is meant to outlast one processing cycle — confirm the timing.
        """
        print(f"\tWaiting for {sleep} seconds...", end='')
        time.sleep(sleep)
        print("Done.")

    def runStreamTests(self):
        """Run three ingest/wait/validate rounds against a ``"30 seconds"`` process run.

        The object returned by ``process`` is stopped in a ``finally`` block so it does
        not keep running in the kernel when a validation fails.

        Raises:
            AssertionError: If any round's row count does not match.
        """
        self.cleanTests()
        iStream = invoiceStreamBatch()
        streamQuery = iStream.process("30 seconds")

        # (label, file suffix to ingest, expected row count after that round)
        rounds = (
            ("first", 1, 1253),
            ("second", 2, 2510),
            ("third", 3, 3994),
        )
        try:
            for label, itr, expected_count in rounds:
                print(f"Testing {label} iteration of invoice stream...")
                self.ingestData(itr)
                self.waitForMicroBatch()
                self.assertResult(expected_count)
                print("Validation passed.\n")
        finally:
            # Always release the query, even if an assertion above failed.
            streamQuery.stop()

    def runBatchTests(self):
        """Run two ingest/process/validate rounds using ``process("batch")``.

        NOTE(review): the return value of ``process("batch")`` is discarded and the
        test relies on a fixed 30 second wait — verify whether it should be awaited
        or stopped instead.

        Raises:
            AssertionError: If any round's row count does not match.
        """
        self.cleanTests()
        iStream = invoiceStreamBatch()

        # (label, file suffixes to ingest before processing, expected row count)
        rounds = (
            ("first", (1, 2), 2510),
            ("second", (3,), 3994),
        )
        for label, itrs, expected_count in rounds:
            print(f"Testing {label} batch of invoice stream...")
            for itr in itrs:
                self.ingestData(itr)
            iStream.process("batch")
            self.waitForMicroBatch(30)
            self.assertResult(expected_count)
            print("Validation passed.\n")


In [19]:
# Instantiate the suite and run the streaming scenario (three ingest/validate rounds).
sbTS = streamingBatchTestSuite()
sbTS.runStreamTests()	

Starting Cleanup...Done

Starting Invoice Processing Stream...Done.

Testing first iteration of invoice stream...
	Starting Ingestion...Done
	Waiting for 30 seconds...Done.
	Starting validation...Expected count: 1253,
 Actual count: 1253
Done
Validation passed.

Testing second iteration of invoice stream...
	Starting Ingestion...Done
	Waiting for 30 seconds...Done.
	Starting validation...Expected count: 2510,
 Actual count: 2510
Done
Validation passed.

Testing third iteration of invoice stream...
	Starting Ingestion...Done
	Waiting for 30 seconds...Done.
	Starting validation...Expected count: 3994,
 Actual count: 3994
Done
Validation passed.



In [20]:

# Run the batch scenario, reusing the sbTS instance created in the previous cell.

sbTS.runBatchTests()

Starting Cleanup...Done

Testing first batch of invoice stream...
	Starting Ingestion...Done
	Starting Ingestion...Done
Starting Invoice Processing Stream...Done.

	Waiting for 30 seconds...Done.
	Starting validation...Expected count: 2510,
 Actual count: 2510
Done
Validation passed.

Testing second batch of invoice stream...
	Starting Ingestion...Done
Starting Invoice Processing Stream...Done.

	Waiting for 30 seconds...Done.
	Starting validation...Expected count: 3994,
 Actual count: 3994
Done
Validation passed.

