In [7]:
'''
Install ncat in pyspark container to create a socket session

docker exec -it pyspark /bin/bash
sudo apt-get update

# ncat will be used to open a TCP socket endpoint
sudo apt-get install ncat

# start socket session on port 9999
ncat -l 9999

'''

'\nInstall ncat in pyspark container to create a socket session\n\ndocker exec -it pyspark /bin/bash\nsudo apt-get update\n\n# ncat will be used to open a websocted endpoint \nsudo apt-get install ncat\n\n'

In [1]:
from pyspark.sql import SparkSession

# Reuse the active SparkSession if one exists, otherwise create one that runs
# locally on all available cores ("local[*]").
spark = SparkSession.builder.master("local[*]").appName("Reading from sockets").getOrCreate()

# Bare expression as the last statement so the notebook displays the session.
spark


In [2]:
# Write the logic as batch

from pyspark.sql.functions import split, explode, count, col

# Load the sample file with the text reader; the code below relies on its
# `value` column holding the raw line text.
df = spark.read.text("../../data/simple_text.txt")
df.show()

# Word count: tokenise each line on single spaces, emit one row per token,
# then count occurrences per token and list the most frequent first.
df_words = (
    df
    .select(explode(split("value", " ")).alias("word"))
    .groupBy("word")
    .agg(count("word").alias("word_count"))
    .orderBy("word_count", ascending=False)
)

df_words.show()

+--------------------+
|               value|
+--------------------+
|Simon had a dog a...|
+--------------------+

+-----+----------+
| word|word_count|
+-----+----------+
|  and|         2|
|    a|         2|
|  dog|         2|
|  cat|         2|
| used|         1|
|  had|         1|
| loce|         1|
|Simon|         1|
|  the|         1|
|simon|         1|
|   to|         1|
+-----+----------+



In [None]:
# Write the logic as streaming

# Streaming source: text read from the TCP socket at localhost:9999.
# A listener (e.g. `ncat -l 9999`) should already be running on that port
# before this cell is executed.
df_stream = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", "9999")
    .load()
)

# Same word-count logic as the batch version, applied to the stream:
# split each line on single spaces, one row per word, count per word,
# most frequent words first.
df_stream_agg = (
    df_stream
    .withColumn("words", split("value", " "))
    .withColumn("word", explode("words"))
    .select("word")
    .groupBy("word")
    .agg(count("word").alias("word_count"))
    .sort(col("word_count").desc())
)

# Keep the StreamingQuery handle instead of discarding it, so the query can be
# stopped explicitly. "complete" output mode re-emits the whole aggregated
# result on every trigger (needed here because the result is sorted).
query = (
    df_stream_agg.writeStream
    .format("console")
    .outputMode("complete")
    .start()
)

try:
    # Blocks the cell until the query terminates or the cell is interrupted.
    query.awaitTermination()
finally:
    # Interrupting the kernel only unblocks the Python side; without this the
    # query would keep running in the background and a re-run of this cell
    # would start a second query against the same socket.
    query.stop()