In [None]:
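# # Cleanup and setup, Databricks variant (uses dbutils and /FileStore paths). Disabled; the local-filesystem equivalent is streamWCTestSuite.cleanTests below.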

# base_data_dir = "/FileStore/data_spark_streaming"

# spark.sql("drop table if exists word_count_table")

# dbutils.fs.rm("/user/hive/warehouse/word_count_table", True)

# dbutils.fs.rm(f"{base_data_dir}/checkpoint", True)
# dbutils.fs.rm(f"{base_data_dir}/data/text", True)

# dbutils.fs.mkdirs(f"{base_data_dir}/data/text")


In [1]:
# Execute the companion notebook inside this kernel so its definitions become
# available here. The test suite below instantiates `streamWC`, which is not
# defined anywhere in this notebook and is presumably supplied by this %run.
%run ./01-stream-word-count.ipynb

In [2]:
# Make the local Spark installation importable before `pyspark` is imported below.
import findspark
findspark.init()

In [3]:
import shutil
import os
import time
from pyspark.sql import SparkSession

In [4]:
# Obtain the Spark session used by the test suite (an already-active session is reused).
spark = (
    SparkSession.builder
    .appName("streamWC")
    .getOrCreate()
)

# Folder holding the text_data_<n>.txt inputs; the suite resolves its
# checkpoint and stream_data/text paths underneath it.
base_data_dir = "data/text"

class streamWCTestSuite():
    """End-to-end check of the streaming word count against local files.

    Each iteration copies one input file into ``stream_data/text`` (presumably
    the directory the stream under test watches), waits a fixed time, and then
    compares the cumulative count of words starting with 's' in
    ``word_count_table`` with a known value.

    Relies on names provided by the surrounding notebook: ``spark``,
    ``base_data_dir`` and the ``streamWC`` class.
    """

    def __init__(self):
        self.base_data_dir = base_data_dir

    def cleanTests(self):
        """Drop the result table and reset the checkpoint and landing directories."""
        print("Starting Cleanup...", end='')

        # Drop table if exists
        spark.sql("drop table if exists word_count_table")

        # Remove leftovers from a previous run
        for stale_dir in (f"{self.base_data_dir}/checkpoint",
                          f"{self.base_data_dir}/stream_data/text"):
            if os.path.exists(stale_dir):
                shutil.rmtree(stale_dir)

        # Recreate an empty landing directory for ingestData to copy into
        os.makedirs(f"{self.base_data_dir}/stream_data/text", exist_ok=True)
        print("Done\n")

    def ingestData(self, itr):
        """Copy ``text_data_<itr>.txt`` from the base directory into the landing directory.

        Raises:
            FileNotFoundError: if the input file for ``itr`` does not exist.
        """
        print("\tStarting Ingestion...", end='')
        shutil.copy(f"{self.base_data_dir}/text_data_{itr}.txt", f"{self.base_data_dir}/stream_data/text/text_data_{itr}.txt")
        print("Done")

    def assertResult(self, expected_count):
        """Assert that the total count of words starting with 's' equals ``expected_count``.

        Raises:
            AssertionError: if the count in ``word_count_table`` differs.
        """
        print("\tStarting validation...", end='')
        raw_count = spark.sql("select sum(count) from word_count_table where substr(word, 1, 1) == 's'").collect()[0][0]
        # sum() over zero matching rows is NULL (None); treat it as 0 so an empty
        # result fails the assertion below instead of raising TypeError from int().
        actual_count = 0 if raw_count is None else int(raw_count)
        print("Expected Count:", expected_count)
        print("Actual Count:", actual_count)
        assert expected_count == actual_count, f"Test failed! actual count is {actual_count}"
        print("Done")

    def runTests(self, sleepTime=30):
        """Run the three ingest/wait/validate rounds against a fresh stream.

        Args:
            sleepTime: seconds to wait after each ingestion before validating.

        Raises:
            AssertionError: if any round's count does not match.
        """
        # (input file number, expected cumulative count, ordinal used in messages)
        iterations = ((1, 25, "first"), (2, 32, "second"), (3, 37, "third"))

        self.cleanTests()
        wc = streamWC()
        sQuery = wc.wordCount()

        try:
            for itr, expected_count, ordinal in iterations:
                print(f"Testing {ordinal} iteration of batch word count...")
                self.ingestData(itr)
                print(f"\tWaiting for {sleepTime} seconds...")
                time.sleep(sleepTime)
                self.assertResult(expected_count)
                print(f"{ordinal.capitalize()} iteration of batch word count completed.\n")
        finally:
            # Always stop the streaming query, even when a round fails, so it
            # does not keep running in the kernel.
            sQuery.stop()

In [5]:
# Build the suite and run all ingestion/validation rounds. This blocks for the
# fixed waits between rounds (30 seconds each in the captured output below).
swcTS = streamWCTestSuite()
swcTS.runTests()

Starting Cleanup...Done

	Starting Word Count Stream...Done
Testing first iteration of batch word count...
	Starting Ingestion...Done
	Waiting for 30 seconds...
	Starting validation...Expected Count: 25
Actual Count: 25
Done
First iteration of batch word count completed.

Testing second iteration of batch word count...
	Starting Ingestion...Done
	Waiting for 30 seconds...
	Starting validation...Expected Count: 32
Actual Count: 32
Done
Second iteration of batch word count completed.

Testing third iteration of batch word count...
	Starting Ingestion...Done
	Waiting for 30 seconds...
	Starting validation...Expected Count: 37
Actual Count: 37
Done
Third iteration of batch word count completed.

