In [None]:
# # cleanup and setup

# base_data_dir = "/FileStore/data_spark_streaming"

# spark.sql("drop table if exists word_count_table")

# dbutils.fs.rm("/user/hive/warehouse/word_count_table", True)

# dbutils.fs.rm(f"{base_data_dir}/checkpoint", True)
# dbutils.fs.rm(f"{base_data_dir}/data/text", True)

# dbutils.fs.mkdirs(f"{base_data_dir}/data/text")


In [1]:
import os
import shutil
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, trim, lower

In [2]:
# Initialize Spark session
spark = SparkSession.builder \
    .appName("Word Count Test Suite") \
    .getOrCreate()

In [3]:
# Define the base data directory
base_data_dir = "./data"  # assuming you have your data in ./data

In [4]:
# Define the batchWC class
class batchWC():
    def __init__(self):
        self.base_data_dir = base_data_dir

    def getRawData(self):
        lines = (spark.read
                 .format("text")
                 .option("lineSep", ".")
                 .load(f"{self.base_data_dir}/data/text")
                 )
        return lines.select(explode(split(lines.value, " ")).alias("word"))

    def getQualityData(self, rawDF):
        return (rawDF.select(lower(trim(rawDF.word)).alias("word"))
                .where("word is not null")
                .where("word rlike '[a-z]'"))

    def getWordCount(self, qualityDF):
        return qualityDF.groupBy("word").count()

    def overwriteWordCount(self, wordCountDF):
        (wordCountDF.write
         .format("parquet")  # Changed to parquet for local filesystem
         .mode("overwrite")
         .saveAsTable("word_count_table"))

    def wordCount(self):
        print(f"\tExecuting Word Count...", end="")
        rawDF = self.getRawData()
        qualityDF = self.getQualityData(rawDF)
        resultDF = self.getWordCount(qualityDF)
        self.overwriteWordCount(resultDF)
        print("Done")

In [5]:
# Define the batch word count test suite class
class batchWCTestSuite():
    def __init__(self):
        self.base_data_dir = base_data_dir

    def cleanTests(self):
        # Drop table if exists
        spark.sql("drop table if exists word_count_table")

        # Remove files and directories
        if os.path.exists(f"{self.base_data_dir}/checkpoint"):
            shutil.rmtree(f"{self.base_data_dir}/checkpoint")
        if os.path.exists(f"{self.base_data_dir}/data/text"):
            shutil.rmtree(f"{self.base_data_dir}/data/text")

        os.makedirs(f"{self.base_data_dir}/data/text")
        print("Done\n")

    def ingestData(self, itr):
        print(f"\tStarting Ingestion...", end='')
        shutil.copy(f"{self.base_data_dir}/text_data_{itr}.txt", f"{self.base_data_dir}/data/text/text_data_{itr}.txt")
        print("Done")

    def assertResult(self, expected_count):
        actual_count = spark.sql("select sum(count) from word_count_table where substr(word, 1, 1) = 's'").collect()[0][0]
        assert expected_count == actual_count, f"Test failed! actual count is {actual_count}"

    def runTests(self):
        self.cleanTests()
        wc = batchWC()  # Use the batchWC class defined above

        print("Testing first iteration of batch word count...")
        self.ingestData(1)
        wc.wordCount()
        self.assertResult(25)
        print("First iteration of batch word count completed.\n")

        print("Testing second iteration of batch word count...")
        self.ingestData(2)
        wc.wordCount()
        self.assertResult(32)
        print("Second iteration of batch word count completed.\n")

        print("Testing third iteration of batch word count...")
        self.ingestData(3)
        wc.wordCount()
        self.assertResult(37)
        print("Third iteration of batch word count completed.\n")


In [6]:
bwcTS = batchWCTestSuite()
bwcTS.runTests()

Done

Testing first iteration of batch word count...
	Starting Ingestion...Done
	Executing Word Count...

SparkRuntimeException: [LOCATION_ALREADY_EXISTS] Cannot name the managed table as `spark_catalog`.`default`.`word_count_table`, as its associated location 'file:/F:/Data_Engineering/Apache_Spark/Spark_LakeHouse/spark-warehouse/word_count_table' already exists. Please pick a different table name, or remove the existing location first.