In [1]:
from pyspark.sql.functions import explode, split, trim, lower
from pyspark.sql import SparkSession, DataFrame
from pyspark import SparkConf
from pyspark import SparkContext
import logging
from os.path import abspath
from pathlib import Path
import shutil
from delta import *

In [2]:
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%y-%m-%d %H:%M:%S",
    level=logging.DEBUG
)

In [3]:
class BatchWordCount():
    def __init__(
        self,
        spark: SparkSession
    ):
        self.spark = spark
        logging.info("Initiated SparkSession.")

    def read_text(
        self,
        path: str,
        format: str = "text",
        line_sep: str = "."
    ) -> DataFrame:
        lines = (
            self.spark.read
            .format(format)
            .option("lineSep", line_sep)
            .load(path)
        )
        raw_sdf = lines.select(explode(split(lines.value, " ")).alias("word"))
        logging.info(f"Read Text Files as DataFrame, {raw_sdf.count()} rows, {len(raw_sdf.columns)} columns, schema: {raw_sdf.schema}")
        return raw_sdf

    def process_text(
        self,
        raw_sdf: DataFrame
    ) -> DataFrame:
        processed_sdf = (
            raw_sdf.select(lower(trim(raw_sdf.word)).alias("word"))
            .where("word is not null")
            .where("word rlike '[a-z]'")
        )
        logging.info(f"Processed DataFrame, {processed_sdf.count()} rows, {len(processed_sdf.columns)} columns, schema: {processed_sdf.schema}")
        return processed_sdf

    def count_words(
        self,
        processed_sdf: DataFrame
    ) -> DataFrame:
        sdf = processed_sdf.groupBy("word").count()
        logging.info(f"Grouped DataFrame by word, {sdf.count()} rows, {len(sdf.columns)} columns, schema: {sdf.schema}")
        return sdf

    def write_table(
        self,
        sdf: DataFrame,
        format: str,
        mode: str,
        table_name: str
    ):
        (
            sdf.write
            .format(format)
            .mode(mode)
            .saveAsTable(table_name)
        )
        logging.info(f"Wrote {sdf.count()} rows, {len(sdf.columns)} columns, schema: {sdf.schema}  to table: {table_name}, format: {format}, mode: {mode}")

In [4]:
class StreamWordCount():
    def __init__(
        self,
        spark: SparkSession
    ):
        self.spark = spark

    def read_text(
        self,
        path: str,
        format: str = "text",
        line_sep: str = "."
    ):
        lines = (
            self.spark.readStream
            .format(format)
            .option("lineSep", line_sep)
            .load(path)
        )
        raw_sdf = lines.select(explode(split(lines.value, " ")).alias("word"))
        logging.info(f"Read Text Files Streaming as DataFrame, {raw_sdf.count()} rows, {len(raw_sdf.columns)} columns, schema: {raw_sdf.schema}")
        return raw_sdf

    def process_text(
        self,
        raw_sdf: DataFrame
    ) -> DataFrame:
        processed_sdf = (
            raw_sdf.select(lower(trim(raw_sdf.word)).alias("word"))
            .where("word is not null")
            .where("word rlike '[a-z]'")
        )
        logging.info(f"Processed Streaming DataFrame, {processed_sdf.count()} rows, {len(processed_sdf.columns)} columns, schema: {processed_sdf.schema}")
        return processed_sdf

    def count_words(
        self,
        processed_sdf: DataFrame
    ) -> DataFrame:
        sdf = processed_sdf.groupBy("word").count()
        logging.info(f"Grouped Streaming DataFrame by word, {sdf.count()} rows, {len(sdf.columns)} columns, schema: {sdf.schema}")
        return sdf

    def write_table(
        self,
        sdf: DataFrame,
        format: str,
        output_mode: str,
        table_name: str,
        checkpoint_location: str
    ):
        squery = (
            sdf.writeStream
            .format(format)
            .option("truncate", value=False)
            .option("checkpointLocation", checkpoint_location)
            .outputMode(output_mode)
            # .toTable(table_name)
            .start()
            .awaitTermination()
        )
        logging.info(f"Wrote streaming {sdf.count()} rows, {len(sdf.columns)} columns, schema: {sdf.schema}  to table: {table_name}, format: {format}, outputMode: {output_mode}")
        return squery

In [5]:
table_name = "word_count_table"
warehouse_location = abspath("spark-warehouse")
logging.info(f"spark-warehouse: {warehouse_location}")

25-04-08 08:24:16 - INFO - spark-warehouse: /home/jovyan/work/spark-warehouse


In [6]:
spark = (
    SparkSession.builder
    .appName("streaming_word_count")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-spark_2.13:3.3.1,org.apache.spark:spark-sql_2.12:3.5.3")
    .config("spark.driver.cores", 2)
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "1g")
    .config("spark.submit.deployMode", "client")
    .config("spark.log.level", "ALL")
    # .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate()
)

25-04-08 08:27:10 - DEBUG - GatewayClient.address is deprecated and will be removed in version 1.0. Use GatewayParameters instead.
25-04-08 08:27:10 - DEBUG - Command to send: A
ec90b042d81623381b8ac78c0fdbec1de70505406db0d49a0ed7902b1f33ddc2

25-04-08 08:27:10 - DEBUG - Answer received: !yv
25-04-08 08:27:10 - DEBUG - Command to send: j
i
rj
org.apache.spark.SparkConf
e

25-04-08 08:27:10 - DEBUG - Answer received: !yv
25-04-08 08:27:10 - DEBUG - Command to send: j
i
rj
org.apache.spark.api.java.*
e

25-04-08 08:27:10 - DEBUG - Answer received: !yv
25-04-08 08:27:10 - DEBUG - Command to send: j
i
rj
org.apache.spark.api.python.*
e

25-04-08 08:27:10 - DEBUG - Answer received: !yv
25-04-08 08:27:10 - DEBUG - Command to send: j
i
rj
org.apache.spark.ml.python.*
e

25-04-08 08:27:10 - DEBUG - Answer received: !yv
25-04-08 08:27:10 - DEBUG - Command to send: j
i
rj
org.apache.spark.mllib.api.python.*
e

25-04-08 08:27:10 - DEBUG - Answer received: !yv
25-04-08 08:27:10 - DEBUG - Command t

Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: java.io.FileNotFoundException: File file:/opt/bitnami/spark/spark-events does not exist
	at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:779)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:1100)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:769)
	at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:462)
	at org.apache.spark.deploy.history.EventLogFileWriter.requireLogBaseDirAsDirectory(EventLogFileWriters.scala:77)
	at org.apache.spark.deploy.history.SingleEventLogFileWriter.start(EventLogFileWriters.scala:221)
	at org.apache.spark.scheduler.EventLoggingListener.start(EventLoggingListener.scala:81)
	at org.apache.spark.SparkContext.<init>(SparkContext.scala:632)
	at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
	at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
	at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
	at java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:499)
	at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:480)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:238)
	at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
	at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)


25-04-08 08:27:14 - DEBUG - Command to send: p
ro38
e

25-04-08 08:27:14 - DEBUG - Answer received: !ysjava.io.FileNotFoundException: File file:/opt/bitnami/spark/spark-events does not exist\n	at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:779)\n	at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:1100)\n	at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:769)\n	at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:462)\n	at org.apache.spark.deploy.history.EventLogFileWriter.requireLogBaseDirAsDirectory(EventLogFileWriters.scala:77)\n	at org.apache.spark.deploy.history.SingleEventLogFileWriter.start(EventLogFileWriters.scala:221)\n	at org.apache.spark.scheduler.EventLoggingListener.start(EventLoggingListener.scala:81)\n	at org.apache.spark.SparkContext.<init>(SparkContext.scala:632)\n	at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkCont