In [1]:
# Execute the batch word count notebook so that the names it defines are
# available in this notebook. Presumably this is where the batchWC class used
# by the test suite below comes from -- TODO confirm against that notebook.
%run 01-batch-word-count.ipynb

In [2]:
# findspark locates the local Spark installation and makes pyspark importable,
# so it has to run before the pyspark import in the following cell.
# NOTE(review): the %run cell above executes before this one -- confirm that the
# notebook it runs does not itself rely on findspark having been initialised.
import findspark
findspark.init()

In [3]:
# Define the batch word count test suite class
import shutil
import os
from pyspark.sql import SparkSession
# Obtain the Spark session used by every test below (getOrCreate returns the
# already-active session when there is one, otherwise builds a new one with
# the application name "batchWC").
spark = (
    SparkSession.builder
    .appName("batchWC")
    .getOrCreate()
)

# Root directory holding the source text files; the test suite derives its
# checkpoint and landing directories from it.
base_data_dir = "data/text"

class batchWCTestSuite():
    """Integration test suite for the batch word count job.

    The suite resets the test environment, copies the source text files one at
    a time into the landing directory, runs the word count after each copy and
    checks the result stored in ``word_count_table``.

    It relies on the module-level ``spark`` session and ``base_data_dir``
    defined in this notebook, and on a ``batchWC`` class that is not defined in
    this cell (presumably provided by the word count notebook executed at the
    top -- verify there).
    """

    # One entry per test iteration:
    # (ordinal used in the progress messages, number of the input file to
    #  ingest, expected sum of counts for words starting with 's' afterwards).
    _ITERATIONS = (
        ("first", 1, 25),
        ("second", 2, 32),
        ("third", 3, 37),
    )

    def __init__(self):
        """Bind the suite to the notebook-level ``base_data_dir``."""
        self.base_data_dir = base_data_dir

    def cleanTests(self):
        """Reset the environment: drop the result table and recreate the landing directory."""
        # Drop table if exists
        spark.sql("drop table if exists word_count_table")

        landing_dir = f"{self.base_data_dir}/data/text"

        # Remove files and directories left behind by a previous run
        for stale_dir in (f"{self.base_data_dir}/checkpoint", landing_dir):
            if os.path.exists(stale_dir):
                shutil.rmtree(stale_dir)

        # Start every run from an empty landing directory
        os.makedirs(landing_dir)
        print("Done\n")

    def ingestData(self, itr):
        """Copy ``text_data_<itr>.txt`` from the base directory into the landing directory.

        Args:
            itr: Number embedded in the name of the input file to ingest.
        """
        file_name = f"text_data_{itr}.txt"
        print("\tStarting Ingestion...", end='')
        shutil.copy(f"{self.base_data_dir}/{file_name}", f"{self.base_data_dir}/data/text/{file_name}")
        print("Done")

    def assertResult(self, expected_count):
        """Check the summed count of words starting with 's' in ``word_count_table``.

        Args:
            expected_count: Expected value of ``sum(count)`` over the matching rows.

        Raises:
            AssertionError: If the value read from the table differs from
                ``expected_count``.
        """
        print("\tStarting validation...", end='')
        summed = spark.sql("select sum(count) from word_count_table where substr(word, 1, 1) == 's'").collect()[0][0]
        # SUM() yields NULL (None in Python) when no row matches the filter.
        # Treat that as a count of zero so a mismatch is reported as a test
        # failure instead of crashing in int(None) with a TypeError.
        actual_count = 0 if summed is None else int(summed)
        print(expected_count)
        print(actual_count)
        assert expected_count == actual_count, f"Test failed! actual count is {actual_count}"
        print("Done")

    def runTests(self):
        """Run every iteration: ingest one file, run the word count, validate the table.

        Raises:
            AssertionError: From ``assertResult`` on the first iteration whose
                count does not match the expected value.
        """
        self.cleanTests()
        wc = batchWC()

        for label, itr, expected_count in self._ITERATIONS:
            print(f"Testing {label} iteration of batch word count...")
            self.ingestData(itr)
            wc.wordCount()
            self.assertResult(expected_count)
            print(f"{label.capitalize()} iteration of batch word count completed.\n")

In [4]:
# Build the suite and run all iterations end to end; a count that does not
# match its expected value fails the `assert` inside assertResult.
bt = batchWCTestSuite()
bt.runTests()

Done

Testing first iteration of batch word count...
	Starting Ingestion...Done
	Executing Word Count...Done
	Starting validation...25
25
Done
First iteration of batch word count completed.

Testing second iteration of batch word count...
	Starting Ingestion...Done
	Executing Word Count...Done
	Starting validation...32
32
Done
Second iteration of batch word count completed.

Testing third iteration of batch word count...
	Starting Ingestion...Done
	Executing Word Count...Done
	Starting validation...37
37
Done
Third iteration of batch word count completed.

