In [0]:
from pyspark.sql.functions import *
class BatchWordCount():
    """Batch job that counts words by their first character.

    Reads the text files in the landing zone, splits them into words, and
    overwrites the ``DEV.demo_db.word_count`` Delta table with one row per
    leading character and the number of words that start with it.

    Relies on the notebook-provided ``spark`` session and on the names
    brought into scope by ``from pyspark.sql.functions import *``.
    """

    def __init__(self, landing_zone='/Volumes/dev/demo_db/landing_zone/'):
        """Create the job.

        Args:
            landing_zone: Directory to read the incoming text files from.
                Defaults to the demo volume used by this notebook.
        """
        self.landing_zone = landing_zone

    def load_incoming_files(self):
        """Read the landing zone as text, using "." as the record separator.

        Returns:
            A DataFrame produced by the text reader (one ``value`` column is
            what ``clean_data`` expects).
        """
        # Read from the configured attribute rather than a second hard-coded
        # copy of the path, so changing ``landing_zone`` actually takes effect.
        return (spark.read
                .format("text")
                .option("lineSep", ".")
                .load(self.landing_zone))

    def clean_data(self, raw_df):
        """Split each record into lower-cased, trimmed words.

        Args:
            raw_df: DataFrame with a string column named ``value``.

        Returns:
            A single-column (``word``) DataFrame containing only words that
            have at least one lowercase ASCII letter somewhere in them.
        """
        # One row per space-separated token of the record.
        word_df = raw_df.select(explode(split(col("value"), " ")).alias("word"))
        cleansed_df = (word_df
                       .select(lower(trim(col("word"))).alias("word"))
                       .where("word is not null")
                       # NOTE(review): the pattern is unanchored, so a kept word
                       # may still begin with a non-letter (e.g. a quote or digit).
                       .where("word rlike '[a-z]'"))
        return cleansed_df

    def calculate_word_count(self, cleansed_df):
        """Count words grouped by their first character.

        Args:
            cleansed_df: DataFrame with a ``word`` column.

        Returns:
            A DataFrame with columns ``letter`` and ``count``.
        """
        final_df = (cleansed_df
                    # substring positions are 1-based; take the first character.
                    .select(substring(col("word"), 1, 1).alias("letter"))
                    .groupBy("letter")
                    # count() already names its output column "count".
                    .count())

        return final_df

    def persist(self, final_df):
        """Overwrite the ``DEV.demo_db.word_count`` Delta table with ``final_df``."""
        final_df.write.format("delta").mode("overwrite").saveAsTable("DEV.demo_db.word_count")

    def process(self):
        """Run the full pipeline: load, clean, aggregate, persist."""
        raw_df = self.load_incoming_files()
        cleansed_df = self.clean_data(raw_df)
        final_df = self.calculate_word_count(cleansed_df)
        self.persist(final_df)
        
        

In [0]:
# Run the batch job end to end: read the landing zone, clean the words,
# count them by first character, and overwrite DEV.demo_db.word_count.
app = BatchWordCount()
app.process()

In [0]:
%sql

-- Show the per-letter counts stored in DEV.demo_db.word_count, sorted by letter.
select * from DEV.demo_db.word_count order by letter;

letter,count
a,34
b,6
c,17
d,8
e,21
f,8
g,6
h,3
i,10
j,3


In [0]:
class StreamingWordCountApp:
    """Structured Streaming version of the first-character word count.

    Streams text files from the landing zone, splits them into words, and
    keeps the ``DEV.demo_db.word_count`` Delta table updated with the number
    of words per leading character.

    Relies on the notebook-provided ``spark`` session and on the names
    brought into scope by ``from pyspark.sql.functions import *``.
    """

    def __init__(self):
        # Directory that the streaming reader is pointed at.
        self.landing_zone = '/Volumes/dev/demo_db/landing_zone/'

    def load_raw_data(self):
        """Open a streaming text source on the landing zone ("." separates records)."""
        text_reader = spark.readStream.format("text").option("lineSep", ".")
        return text_reader.load(self.landing_zone)

    def get_quality_data(self, raw_df):
        """Turn raw records into lower-cased, trimmed words.

        Args:
            raw_df: DataFrame with a string column named ``value``.

        Returns:
            A single-column (``word``) DataFrame limited to words that contain
            at least one lowercase ASCII letter.
        """
        # One output row per space-separated token.
        exploded_word = explode(split(col("value"), " ")).alias("word")
        normalized_word = lower(trim(col("word"))).alias("word")
        return (
            raw_df
            .select(exploded_word)
            .select(normalized_word)
            .where("word is not null")
            .where("word rlike '[a-z]'")
        )

    def count_words(self, cleansed_df):
        """Aggregate the words into counts keyed by their first character.

        Args:
            cleansed_df: DataFrame with a ``word`` column.

        Returns:
            A DataFrame with columns ``letter`` and ``count``.
        """
        # NOTE(review): PySpark documents substring as 1-based; a start of 0
        # appears to behave like 1 here -- confirm before changing.
        first_char = substring(col("word"), 0, 1).alias("letter")
        per_letter = cleansed_df.select(first_char).groupBy("letter")
        # The trailing alias is applied to the DataFrame, not to a column.
        return per_letter.count().alias("count")

    def persist(self, agg_df):
        """Start the streaming write to ``DEV.demo_db.word_count``.

        Args:
            agg_df: Streaming DataFrame to write in ``complete`` output mode.

        Returns:
            Whatever ``toTable`` returns (the running query handle), so the
            caller can stop the stream.
        """
        stream_writer = (
            agg_df.writeStream
            .option("checkpointLocation", "/Volumes/dev/demo_db/wc_checkpoint_data")
            .format("delta")
            .outputMode("complete")
        )
        return stream_writer.toTable("DEV.demo_db.word_count")

    def process(self):
        """Wire the stages together and return the started query handle."""
        source_df = self.load_raw_data()
        words_df = self.get_quality_data(source_df)
        counts_df = self.count_words(words_df)
        return self.persist(counts_df)





In [0]:
# Start the streaming job; process() returns the handle produced by
# writeStream...toTable(), kept in `squery` so the stream can be stopped later.
stream_app = StreamingWordCountApp()
squery = stream_app.process()

In [0]:
# Stop the streaming query held in `squery`.
squery.stop()