# Chapter 8: Structured Streaming
Christoph Windheuser    
May, 2022   
Python examples of chapter 8 (page 207 ff) in the book *Learning Spark*

In [6]:
# Import required python spark libraries
import os

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.streaming import StreamingContext


In [7]:
# Create the SparkSession used by every cell below, or reuse the one that is
# already running in this kernel (getOrCreate).
# Note from the original notebook: this requires access to the internet.
# If executed offline, an error is thrown.

spark = (
    SparkSession.builder
    .appName("Chapter_8")
    .getOrCreate()
)


# Create a stream
Run the command `nc -lk 9999` in a terminal window.    
All text you type into the terminal will be sent as a data stream to port 9999 whenever you hit `Return`.    
`nc` stands for *Netcat* and is a simple computer network utility available under Linux, macOS and Windows. 

# Example of Reading a stream of data
This example creates a DataFrame from a text data stream received over a socket connection on localhost, performs a continuous word count on the streaming data, and prints the results to the console.

Whenever text is typed into the `nc` command in the terminal, the text is processed and the word count is printed out in the console of the Jupyter notebook until the spark command `streamingQuery.stop()` is executed.

In [40]:
# Streaming DataFrame fed by the text received from the socket on
# localhost:9999; the text arrives in the column "value".
lines = (spark
         .readStream.format("socket")
         .option("host", "localhost")
         .option("port", "9999")
         .load()
)

# split() produces ONE array of tokens per input row. explode() turns that
# array into one row per token, so the aggregation below really counts words.
# Without explode() the groupBy would count identical token arrays, i.e.
# identical lines, instead of individual words.
words  = lines.select(explode(split(col("value"), "\\s")).alias("word"))
counts = words.groupBy("word").count()

# The directory is created if it does not exist.
# NOTE(review): a checkpoint written by an earlier run that grouped on the
# un-exploded array column is presumably incompatible with this query;
# remove the directory if the query fails to restart.
checkpointDir = "/tmp/checkpoints"


In [41]:
# Start the word-count query: with a processing-time trigger of 2 seconds the
# complete result table is written to the console sink, and the query
# checkpoints to checkpointDir.
streamingQuery = (
    counts.writeStream
    .trigger(processingTime="2 second")
    .start(
        format="console",
        outputMode="complete",
        checkpointLocation=checkpointDir,
    )
)


In [51]:
# Stop the streaming query started in the previous cell.
# Run this once you are done typing text into the `nc` terminal.
streamingQuery.stop()

## Monitoring an Active Stream
Page 223 ff.

In [52]:
# Show the most recent progress report of the streaming query.
# NOTE(review): the original comment said this only shows results while the
# stream is active, yet the output below was produced after the stop() cell
# above had run - presumably the last report recorded before the stop is
# returned. Confirm against the Spark documentation.
streamingQuery.lastProgress

{'id': '89ca0050-b240-4dd2-879d-94c053fb5212',
 'runId': '59c7b124-9d56-4086-9302-941a05a37229',
 'name': None,
 'timestamp': '2022-05-21T20:06:04.004Z',
 'batchId': 55,
 'numInputRows': 0,
 'inputRowsPerSecond': 0.0,
 'processedRowsPerSecond': 0.0,
 'durationMs': {'latestOffset': 0, 'triggerExecution': 0},
 'stateOperators': [{'operatorName': 'stateStoreSave',
   'numRowsTotal': 39,
   'numRowsUpdated': 0,
   'allUpdatesTimeMs': 59,
   'numRowsRemoved': 0,
   'allRemovalsTimeMs': 0,
   'commitTimeMs': 15712,
   'memoryUsedBytes': 97864,
   'numRowsDroppedByWatermark': 0,
   'numShufflePartitions': 200,
   'numStateStoreInstances': 200,
   'customMetrics': {'loadedMapCacheHitCount': 600,
    'loadedMapCacheMissCount': 200,
    'stateOnCurrentVersionSizeBytes': 30952}}],
 'sources': [{'description': 'TextSocketV2[host: localhost, port: 9999]',
   'startOffset': 0,
   'endOffset': 0,
   'latestOffset': 0,
   'numInputRows': 0,
   'inputRowsPerSecond': 0.0,
   'processedRowsPerSecond': 0.

In [53]:
# Show the current status of the streaming query
# (reports 'Stopped' below because the query was stopped in an earlier cell).
streamingQuery.status

{'message': 'Stopped', 'isDataAvailable': False, 'isTriggerActive': False}

## Another Example
See https://spark.apache.org/docs/latest/streaming-programming-guide.html
and:
https://github.com/apache/spark/blob/v3.2.1/examples/src/main/python/streaming/network_wordcount.py

The Python code is in the file `network_wordcount.py`.

1. Run the program `nc -lk 9999` in a terminal.    
   This program sends all text entered in the terminal out via port 9999
2. Run the command `spark-submit network_wordcount.py localhost 9999` in another terminal.
3. Each time words are typed in the first terminal, the words are counted in the second terminal
4. Terminate both commands with `Ctrl-C`.

# Streaming Data Sources and Sinks
Page 226 ff.

In [17]:
# Directory watched by the file stream source in the next cell.
# It is created here if it is missing, so the notebook does not depend on a
# manual `mkdir /tmp/sparkInputDir` step before "Run All".
inputDirectoryOfFiles = "/tmp/sparkInputDir"
os.makedirs(inputDirectoryOfFiles, exist_ok=True)

# Schema of the CSV files: an integer "key" column and a string "value" column.
fileSchema = StructType([
    StructField("key", IntegerType()),
    StructField("value", StringType()),
])


In [18]:
# Streaming DataFrame over the CSV files that appear in inputDirectoryOfFiles.
# maxFilesPerTrigger=1 limits each trigger to a single new file.
inputDF = spark.readStream.load(
    inputDirectoryOfFiles,
    format="csv",
    schema=fileSchema,
    maxFilesPerTrigger=1,
)


In [19]:
# Check that inputDF is a streaming DataFrame (expected: True)
inputDF.isStreaming

True

In [20]:
# Print the schema of the streaming DataFrame
inputDF.printSchema()

root
 |-- key: integer (nullable = true)
 |-- value: string (nullable = true)



In [21]:
# Start a query that writes the rows of inputDF to the console sink
# in append mode.
streamingQuery = inputDF.writeStream.start(format="console", outputMode="append")


Now copy the files `streamingData_xx.csv` one by one from the directory `data/streamingData` to the directory `/tmp/sparkInputDir`. Every time a file is copied, its content is read by the `readStream` and written to the console by the `writeStream`.

In [23]:
# Stop the file-stream query started above
streamingQuery.stop()