# Spark Streaming basics

From chapter 21 in [SDG]

In [None]:
from pyspark.sql import SparkSession
# Create the Spark session for this notebook, or reuse one that is already
# active, under the application name 'streaming'.
sessionBuilder = SparkSession.builder.appName('streaming')
spark = sessionBuilder.getOrCreate()

# Base directory of the sample data; later cells append sub-paths to it.
datapath = "../../data/sdg/"

Read a file to infer the schema:

In [None]:
# Batch-read the JSON part file(s) matching the glob so that Spark can infer
# the schema; the streaming reader below is given this schema explicitly.
sampleGlob = datapath + "/activity-data/part-00000*.json"
static = spark.read.json(sampleGlob)
dataSchema = static.schema


### Start a streaming object.
You may have to re-execute this cell before running the later cells, since the stream will already have been consumed by then.

In [None]:
# Streaming DataFrame over the whole activity-data directory, using the schema
# inferred from the static read. "maxFilesPerTrigger" = 1 limits how many new
# files are picked up per trigger, so results build up gradually.
streaming = (
    spark.readStream
    .schema(dataSchema)
    .option("maxFilesPerTrigger", 1)
    .json(datapath + "/activity-data")
)
streaming.printSchema()

In [None]:
# Row count per distinct value of the "gt" column. This only declares the
# aggregation; nothing runs until a streaming query is started on it.
groupedByGt = streaming.groupBy("gt")
activityCounts = groupedByGt.count()

In [None]:
# Start the streaming query: results go to the in-memory sink under the query
# name "activity_counts" (queried via spark.sql below), in "complete" output
# mode.
activityQuery = (
    activityCounts.writeStream
    .queryName("activity_counts")
    .format("memory")
    .outputMode("complete")
    .start()
)


In [None]:
from time import sleep
# Show the current contents of the "activity_counts" table five times, one
# second apart, to watch the counts change while the stream is processed.
for poll in range(5):
    currentCounts = spark.sql("SELECT * FROM activity_counts")
    currentCounts.show()
    sleep(1)

In [None]:
# Display the current status of the running streaming query (shown as the
# cell's output because it is the last expression in the cell).
activityQuery.status

In [None]:
# Stop the "activity_counts" streaming query started above.
activityQuery.stop()

## Transformations

In [None]:
from pyspark.sql.functions import expr
# Boolean column: true when the "gt" value contains the substring 'stairs'.
stairsFlag = expr("gt like '%stairs%'")

# Keep only rows flagged as stairs with a non-null "gt", then project the
# columns needed by the query below.
stairsRows = (
    streaming.withColumn("stairs", stairsFlag)
    .where("stairs")
    .where("gt is not null")
    .select("gt", "model", "arrival_time", "creation_time")
)

# Write the filtered rows to the in-memory table "simple_transform" in
# "append" output mode.
simpleTransform = (
    stairsRows.writeStream
    .queryName("simple_transform")
    .format("memory")
    .outputMode("append")
    .start()
)

In [None]:
# Poll the "simple_transform" in-memory table five times, one second apart,
# printing each snapshot as a pandas DataFrame.
# NOTE(review): the computed column is labelled delta_msec, but if
# arrival_time is in milliseconds and creation_time in nanoseconds the final
# division by 1000 yields seconds -- confirm the units against the dataset.
deltaSql = "SELECT *, (arrival_time - (creation_time/1e6))/ 1000 as delta_msec FROM simple_transform"
try:
    for poll in range(5):
        print(spark.sql(deltaSql).toPandas())
        sleep(1)
finally:
    # Always stop the query, even if the loop raises or the cell is
    # interrupted; otherwise the stream keeps running in the background.
    simpleTransform.stop()

## Aggregations

In [None]:

# Average every numeric column over the cube of ("gt", "model"), then discard
# the averages that are not wanted in the result table.
cubeAverages = streaming.cube("gt", "model").avg()
for unwanted in ("avg(Arrival_time)", "avg(Creation_Time)", "avg(Index)"):
    cubeAverages = cubeAverages.drop(unwanted)

# Publish the aggregation to the in-memory table "device_counts" in
# "complete" output mode.
deviceModelStats = (
    cubeAverages.writeStream
    .queryName("device_counts")
    .format("memory")
    .outputMode("complete")
    .start()
)

In [None]:
# there are 8 gt values and 2 models => 16 combinations.
#spark.sql("select model from device_counts").distinct().toPandas()

In [None]:
# Print three snapshots of the "device_counts" in-memory table, one second
# apart, separated by a divider line.
try:
    for poll in range(3):
        print(spark.sql("SELECT * FROM device_counts").toPandas())
        print("==========")
        sleep(1)
finally:
    # Always stop the query, even if the loop raises or the cell is
    # interrupted: a query left running would keep the name "device_counts"
    # in use and prevent another query with that name from starting.
    deviceModelStats.stop()

Perform a JOIN between a streaming DataFrame and a static DataFrame

In [None]:
# Static side: per-("gt", "model") averages computed from the batch DataFrame.
historicalAgg = static.groupBy("gt", "model").avg()

# Streaming side: drop the columns that should not be averaged, aggregate over
# the cube of ("gt", "model"), and join with the static averages on both keys.
trimmedStream = streaming.drop("Arrival_Time", "Creation_Time", "Index")
streamAverages = trimmedStream.cube("gt", "model").avg()
joinedAverages = streamAverages.join(historicalAgg, ["gt", "model"])

# Publish the joined result to the in-memory table "device_counts" in
# "complete" output mode.
deviceModelStats = (
    joinedAverages.writeStream
    .queryName("device_counts")
    .format("memory")
    .outputMode("complete")
    .start()
)


In [None]:
# Print three snapshots of the joined "device_counts" in-memory table, one
# second apart, separated by a divider line.
try:
    for poll in range(3):
        print(spark.sql("SELECT * FROM device_counts").toPandas())
        print("==========")
        sleep(1)
finally:
    # Always stop the query, even if the loop raises or the cell is
    # interrupted, so the stream does not keep running in the background.
    deviceModelStats.stop()

# Check yourself

* What will happen if you change "complete" to "append" in activityQuery?
* What will happen if you change "append" to "complete" in simpleTransform?
* Can you JOIN two stream tables?