In [None]:
# # cleanup and setup

# base_data_dir = "/FileStore/data_spark_streaming"

# spark.sql("drop table if exists word_count_table")

# dbutils.fs.rm("/user/hive/warehouse/word_count_table", True)

# dbutils.fs.rm(f"{base_data_dir}/checkpoint", True)
# dbutils.fs.rm(f"{base_data_dir}/data/text", True)

# dbutils.fs.mkdirs(f"{base_data_dir}/data/text")


In [None]:
%run ./01-stream-word-count

In [1]:
import findspark
findspark.init()

In [None]:
class streamWCTestSuite():
    def __init__(self):
        self.base_data_dir = "/FileStore/data_spark_streaming"

    def cleanTests(self):
        print(f"Starting Cleanup...", end='')
        spark.sql("drop table if exists word_count_table")
        dbutils.fs.rm("/user/hive/warehouse/word_count_table", True)

        dbutils.fs.rm(f"{self.base_data_dir}/checkpoint", True)
        dbutils.fs.rm(f"{self.base_data_dir}/data/text", True)

        dbutils.fs.mkdirs(f"{self.base_data_dir}/data/text")
        print("Done\n")

    def ingestData(self, itr):
        print(f"\tStarting Ingestion...", end='')
        dbutils.fs.cp(f"{self.base_data_dir}/data/text_data_{itr}.txt", f"{self.base_data_dir}/data/text/")
        print("Done")

    def assertResult(self, expected_count):
        print(f"\tStarting validation...", end='')
        actual_count = spark.sql("select sum(count) from word_count_table where substr(word, 1, 1) == 's'").collect()[0][0]
        print(expected_count)
        print(actual_count)
        assert expected_count == int(actual_count), f"Test failed! actual count is {actual_count}"
        print("Done")

    def runTests(self):
        import time
        sleepTime = 30

        self.cleanTests()
        wc = streamWC()
        sQuery = wc.wordCount() 

        print("Testing first iteration of batch word count...") 
        self.ingestData(1)
        print("\tWaiting for {sleepTime} seconds...")
        time.sleep(sleepTime)
        self.assertResult(25)
        print("First iteration of batch word count completed.\n")

        print("Testing second iteration of batch word count...") 
        self.ingestData(2)
        print("\tWaiting for {sleepTime} seconds...")
        time.sleep(sleepTime)
        self.assertResult(32)
        print("Second iteration of batch word count completed.\n") 

        print("Testing third iteration of batch word count...") 
        self.ingestData(3)
        print("\tWaiting for {sleepTime} seconds...")
        time.sleep(sleepTime)
        self.assertResult(37)
        print("Third iteration of batch word count completed.\n")

        sQuery.stop()

In [None]:
swcTS = streamWCTestSuite()
swcTS.runTests()

Starting Cleanup...Done

	Starting Word Count Stream...Done
Testing first iteration of batch word count...
	Starting Ingestion...Done
	Waiting for {sleepTime} seconds...
	Starting validation...25
25
Done
First iteration of batch word count completed.

Testing second iteration of batch word count...
	Starting Ingestion...Done
	Waiting for {sleepTime} seconds...
	Starting validation...32
32
Done
Second iteration of batch word count completed.

Testing third iteration of batch word count...
	Starting Ingestion...Done
	Waiting for {sleepTime} seconds...
	Starting validation...37
37
Done
Third iteration of batch word count completed.

