In [1]:
# Execute the sibling notebook inside this kernel so its definitions become available here.
# NOTE(review): `Bronze`, which the test suite below instantiates, is not defined in this
# notebook -- presumably it is provided by the notebook run here; confirm.
%run ./10-kafka-to-bronze.ipynb

In [2]:
# Initialise findspark before the pyspark import in the next cell.
# NOTE(review): presumably required so that `pyspark` resolves against the local Spark
# installation -- confirm whether this is still needed in this environment.
import findspark
findspark.init()

In [3]:
import shutil
import os
import time
from datetime import datetime, timedelta
from pyspark.sql import SparkSession

In [4]:
# Local-mode Spark session ("local[*]") named "KafkaBronze"; the spark-sql-kafka
# connector is requested through spark.jars.packages.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("KafkaBronze")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.0")
    .getOrCreate()
)

24/08/06 10:54:30 WARN Utils: Your hostname, Min resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/08/06 10:54:30 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/ddutjnrevenge/.local/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ddutjnrevenge/.ivy2/cache
The jars for the packages stored in: /home/ddutjnrevenge/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-a4d010a6-e4b1-4789-a7b9-d5f13bd3b3b5;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.4.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.4.0 in central
	found org.apache.kafka#kafka-clients;3.3.2 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.9.1 in central
	found org.slf4j#slf4j-api;2.0.6 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
:: resolution report :: resolve 354ms :: artifact

In [7]:
class kafkaToBronzeTestSuite():
    """Scenario tests for the Kafka-to-bronze streaming job.

    The suite relies on two names that must already exist in the notebook's
    global namespace: ``spark`` (used for SQL against ``invoices_bz``) and
    ``Bronze`` (whose ``process()`` result is stopped via ``.stop()``).
    """

    # Row count asserted on invoices_bz after every scenario.
    EXPECTED_COUNT = 1590

    def __init__(self):
        self.base_data_dir = "data"
        self.spark_warehouse_dir = "spark-warehouse"

    @staticmethod
    def _removeDir(path):
        """Delete the directory tree at ``path`` if it exists; otherwise do nothing."""
        if os.path.exists(path):
            shutil.rmtree(path)

    def _checkpointDir(self):
        """Return the checkpoint directory used for the invoices_bz stream."""
        return f"{self.base_data_dir}/checkpoint/invoices_bz"

    def _runScenario(self, stream, *process_args):
        """Start the stream, wait, stop it, then validate the row count.

        Args:
            stream: object exposing ``process(*process_args)`` that returns a
                query with a ``stop()`` method.
            *process_args: forwarded unchanged to ``stream.process``.

        Raises:
            AssertionError: if the table count differs from ``EXPECTED_COUNT``.
        """
        query = stream.process(*process_args)
        try:
            self.waitForMicroBatch()
        finally:
            # Always stop the query, even when the wait is interrupted, so a
            # failed or aborted run does not leave a stream running in the kernel.
            query.stop()
        self.assertResult(self.EXPECTED_COUNT)
        print("Validation passed.\n")

    def getTomorrowTimestamp(self):
        """Return a future local-midnight timestamp in epoch milliseconds.

        NOTE(review): despite the name, the offset is two days, i.e. midnight
        at the start of the day after tomorrow. Presumably this is meant to be
        safely later than any record -- confirm before changing it.
        """
        midnight_today = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
        target = midnight_today + timedelta(days=2)
        return int(time.mktime(target.timetuple()) * 1000)

    def cleanTests(self):
        """Drop the invoices_bz table and delete its warehouse and checkpoint directories."""
        print("Starting Cleanup...", end='')
        spark.sql("drop table if exists invoices_bz")

        # remove dir in spark-warehouse
        print("Start removing tables in spark-warehouse")
        self._removeDir(f"{self.spark_warehouse_dir}/invoices_bz")
        print("Start removing data in checkpoint")
        self._removeDir(self._checkpointDir())
        print("Done")

    def assertResult(self, expected_count):
        """Assert that invoices_bz holds exactly ``expected_count`` rows.

        Raises:
            AssertionError: if the actual count differs.
        """
        print("\tStarting validation...", end='')
        actual_count = spark.sql("select count(*) from invoices_bz").collect()[0][0]
        assert expected_count == actual_count, f"Test failed! actual count is {actual_count}"
        print("Done")

    def waitForMicroBatch(self, sleep=30):
        """Block for ``sleep`` seconds to give the streaming query time to run."""
        print(f"\tWaiting for {sleep} seconds...", end='')
        time.sleep(sleep)
        print("Done.")

    def runTests(self):
        """Run the three scenarios in order after a full cleanup.

        1. Start from the beginning on a new checkpoint.
        2. Restart on the same checkpoint.
        3. Delete the checkpoint and start from a future timestamp.

        Each scenario asserts the same row count (``EXPECTED_COUNT``).
        """
        self.cleanTests()
        bzStream = Bronze()

        print("Testing Scenario - Start from beginning on a new checkpoint...")
        self._runScenario(bzStream)

        print("Testing Scenario - Restart from where it stopped on the same checkpoint...")
        self._runScenario(bzStream)

        # Timestamp from getTomorrowTimestamp(); only the checkpoint is removed
        # here, the table itself is kept, and the asserted count is unchanged.
        tomorrow_timestamp = self.getTomorrowTimestamp()
        print(f"Testing Scenario - Start from {tomorrow_timestamp} on a new checkpoint...")

        print("Start removing data in checkpoint")
        self._removeDir(self._checkpointDir())
        print("Done")

        self._runScenario(bzStream, tomorrow_timestamp)

In [8]:
# Instantiate the suite and run every scenario; an AssertionError here means a
# row-count check on invoices_bz failed. Each scenario sleeps 30 seconds by default.
ts = kafkaToBronzeTestSuite()
ts.runTests()

Starting Cleanup...Start removing tables in spark-warehouse
Start removing data in checkpoint
Done
Testing Scenario - Start from beginning on a new checkpoint...
Starting Bronze Streaming Job...Done
	Waiting for 30 seconds...

24/08/06 10:59:27 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/08/06 10:59:27 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
                                                                                

Done.
	Starting validation...Done
Validation passed.

Testing Scenario - Restart from where it stopped on the same checkpoint...
Starting Bronze Streaming Job...Done
	Waiting for 30 seconds...

24/08/06 10:59:58 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/08/06 10:59:58 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


Done.
	Starting validation...Done
Validation passed.

Testing Scenario - Start from 1723050000000 on a new checkpoint...
Start removing data in checkpoint
Done
Starting Bronze Streaming Job...Done
	Waiting for 30 seconds...

24/08/06 11:00:28 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/08/06 11:00:28 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


Done.
	Starting validation...Done
Validation passed.

