In [1]:
from pyspark.sql import SparkSession

# Configure a session named "Reading from sockets" that runs on the
# local[*] master, then create it (or reuse an already running one).
session_builder = SparkSession.builder.appName("Reading from sockets").master("local[*]")
spark = session_builder.getOrCreate()

# Leave the session as the last expression so the notebook renders it.
spark

In [2]:
# Start the stream on the socket localhost:9999.
socket_reader = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", "9999")
)
df_stream = socket_reader.load()

# Print the schema of the loaded stream.
df_stream.printSchema()

root
 |-- value: string (nullable = true)



In [None]:
'''
    Spark streaming is executed as a series of microbatches.
    For each microbatch Spark creates a new job with the tasks needed to execute the logic.

    Setting spark.sql.shuffle.partitions to a suitable value lets us optimize the stream.
'''
from pyspark.sql.functions import split, explode, count, col

spark.conf.set("spark.sql.shuffle.partitions", 16)

# Word count over the lines received from the socket, most frequent word first.
df_stream_agg = (
    df_stream
    # split() takes a regular expression: break on runs of whitespace so that
    # repeated spaces or tabs do not produce empty tokens between words.
    .withColumn("words", split("value", r"\s+"))
    .withColumn("word", explode("words"))
    .select("word")
    # Leading/trailing whitespace or a blank line still yields an empty token;
    # drop it so it is not counted as a word.
    .where(col("word") != "")
    .groupBy("word")
    .agg(count("word").alias("word_count"))
    .sort(col("word_count").desc())
)

# Keep the query handle so the stream can be stopped explicitly.
query = (
    df_stream_agg.writeStream
    .format("console")
    .outputMode("complete")
    .start()
)
try:
    # Blocks the cell until the query terminates or the cell is interrupted.
    query.awaitTermination()
finally:
    # Stop the query when the cell is interrupted or fails, so it does not keep
    # running in the background and a re-run does not start a second query.
    query.stop()

In [None]:
'''
        Output modes:
            complete: every microbatch outputs the full result table
            update:   every microbatch outputs only the new records and the records that were updated
                      not every writeStream.format supports update mode
                      data lake, rdbms and console support update mode
                      file does not support update mode
                      
            append:   every microbatch outputs only the newly added records;
                      use it for a writeStream.format that does not support update mode
                       
'''