# Spark Streaming

In [None]:
import string
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext

In [None]:
# Configure the session step by step, then fetch (or create) the shared session.
# NOTE(review): with master "local" Spark presumably runs a single worker
# thread, so "spark.executor.cores" may have no effect here -- confirm whether
# "local[4]" was the intent.
builder = SparkSession.builder.master("local").appName("BIOS-823")
builder = builder.config("spark.executor.cores", 4)
spark = builder.getOrCreate()

## Low level API

### Window operations

![img](https://spark.apache.org/docs/latest/img/streaming-dstream-window.png)
Source: https://spark.apache.org/docs/latest/img/streaming-dstream-window.png

```python
window()
countByWindow()
reduceByWindow()
reduceByKeyAndWindow()
countByValueAndWindow()
```

In [None]:
# The DStream (low-level) API is driven by the session's SparkContext.
sc = spark.sparkContext
# Micro-batches are formed every `batchDuration` seconds.
ssc = StreamingContext(sparkContext=sc, batchDuration=1)
# Directory where the streaming context writes its checkpoint data.
ssc.checkpoint('checkpoints')

In [None]:
# Stream of text lines read from files that appear under 'data/inputs'.
lines = ssc.textFileStream('data/inputs')

# Build the punctuation-stripping table once instead of once per input line.
strip_punctuation = str.maketrans('', '', string.punctuation)

# Sliding word count: strip punctuation, split into words, then count each
# word over a 10-unit window that advances every 5 units.
windows = (
    lines.map(lambda line: line.translate(strip_punctuation))
    .flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .reduceByKeyAndWindow(
        lambda a, b: a + b,  # fold in counts from batches entering the window
        lambda a, b: a - b,  # remove counts from batches leaving the window
        windowDuration=10,
        slideDuration=5,
        # Without a filter, the incremental (inverse-function) form keeps
        # every word ever seen and reports it with a count of 0 once it has
        # slid out of the window; drop those expired entries.
        filterFunc=lambda pair: pair[1] > 0,
    )
)
windows.pprint()

ssc.start()
# Wait up to 60 for the stream to terminate.
# NOTE(review): a timeout here presumably does not stop the context -- confirm
# that ssc.stop() is called somewhere once the demo is finished.
ssc.awaitTerminationOrTimeout(60)