# Chapter 8: Structured Streaming
Christoph Windheuser    
May, 2022   
Python examples of chapter 8 (page 207 ff) in the book *Learning Spark*

In [None]:
# Import required python spark libraries
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, when, concat, lit, avg
from pyspark.sql.types import StructType,StructField, StringType, IntegerType


In [None]:
# Create (or reuse) the SparkSession used by the cells below.
# getOrCreate() hands back an already running session instead of
# building a second one, so this cell is safe to re-run.
spark = (
    SparkSession.builder
    .appName("Chapter_8")  # this notebook covers chapter 8
    .enableHiveSupport()
    .getOrCreate()
)


# Read a stream of data
Create a DataFrame from a text data stream that is received over a socket connection on localhost:

In [None]:
# Streaming DataFrame fed by the text arriving on the socket localhost:9999.
# Host and port are handed to the socket source as string options.
lines = (
    spark.readStream
    .format("socket")
    .options(host="localhost", port="9999")
    .load()
)

## Example:
https://spark.apache.org/docs/latest/streaming-programming-guide.html
and:
https://github.com/apache/spark/blob/v3.2.1/examples/src/main/python/streaming/network_wordcount.py

1. Run the program `nc -lk 9999` in a terminal.    
   It listens on port 9999 and sends every line of text typed in that terminal to the connected client.
2. Run the program `spark-submit network_wordcount.py localhost 9999` in another terminal.
3. Each time words are typed in the first terminal, their counts are printed in the second terminal.

In [None]:
# It also works when the program is executed inside the Jupyter Notebook!
# Run nc -lk 9999 in another terminal and type some text.
# You will see the word count as output of this program in this
# Jupyter Notebook!
#
# This cell blocks until the stream terminates; interrupt the kernel to stop it.
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# `sc` is never defined in this notebook, so fetch the running SparkContext
# (or start one) explicitly instead of relying on a pre-existing global.
sc = SparkContext.getOrCreate()
ssc = StreamingContext(sc, 1)  # second argument is the batch duration in seconds

# Named `socket_lines` so the streaming DataFrame `lines` created earlier
# in the notebook is not overwritten by this DStream.
socket_lines = ssc.socketTextStream("localhost", 9999)
counts = (
    socket_lines
    .flatMap(lambda line: line.split(" "))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
)
counts.pprint()

ssc.start()
try:
    ssc.awaitTermination()
finally:
    # Stop only the streaming context (also on a kernel interrupt) and keep
    # the SparkContext alive so the other cells of the notebook keep working.
    ssc.stop(stopSparkContext=False, stopGraceFully=False)