### Assignment 7

Load imports

In [None]:
import pyspark
from pyspark.sql.types import * # for structured data types
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql.functions import from_json, col, avg, when

Read from the simulated temperature stream

In [None]:
# Streaming DataFrame over the "tempstream" topic on the sandbox Kafka broker.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "sandbox.hortonworks.com:6667")
    .option("subscribe", "tempstream")
    .load()
)

Unpack Kafka binary key and value to string

In [None]:
# Kafka hands over key and value as binary; cast both to strings for inspection.
key_as_text = col("key").cast("string")
value_as_text = col("value").cast("string")
data = df.select(key_as_text, value_as_text)

In [None]:
# Echo the DataFrame object so the notebook renders its repr as the cell output.
data

Check to see that we have events streaming in by emitting to console

Setting a timeout as I don't know how we'd interrupt the query stream in jupyter since next cell won't run till this one finishes

In [None]:
# Emit incoming events to the console sink as a quick check that data is flowing.
query = data.writeStream.outputMode("append").format("console").start()

# awaitTermination(timeout) only blocks this cell for up to `timeout` seconds;
# it does not end the query. Stop it explicitly so the sanity-check stream does
# not keep running in the background while the rest of the notebook executes.
try:
    query.awaitTermination(timeout=10)
finally:
    query.stop()

Parse the value as JSON. First, we define the schema

In [None]:
# Shape of one incoming event (sensor values left blank):
# {"server" : "c14-5c2s7",
#  "timestamp" : "2018-03-03 19:31:39",
#  "sensors" : {
#        "sensor1" : ,
#        "sensor2" : ,
#        ...
#        "sensor10" :
# }}

# Nested struct with one integer field per sensor, sensor1 through sensor10.
sensor_fields = StructType(
    [StructField("sensor{}".format(sensor_no), IntegerType()) for sensor_no in range(1, 11)]
)

# Top-level schema: server and timestamp are kept as plain strings.
jsonschema = StructType([
    StructField("server", StringType()),
    StructField("timestamp", StringType()),
    StructField("sensors", sensor_fields),
])

Then parse the json using `from_json`

In [None]:
# Decode the Kafka value bytes as text, then parse that text against jsonschema.
# The parsed struct lands in a single column named "parsed_value".
value_text = col("value").cast("string")
parsed_struct = from_json(value_text, jsonschema).alias("parsed_value")
df_data = df.select(parsed_struct)

Select every field of the parsed JSON to get a DataFrame of all the raw data

In [None]:
# Expand the parsed struct into top-level columns (server, timestamp, sensors).
df_raw = df_data.select(col("parsed_value.*"))

Grab all invalid records (any where at least one sensor is a 0 or negative value)

In [None]:
# A record is invalid when any of sensor1..sensor10 reads below 1 (zero or negative).
# Build the ten-way OR incrementally instead of spelling out every sensor.
any_sensor_invalid = col('sensors.sensor1') < 1
for sensor_no in range(2, 11):
    any_sensor_invalid = any_sensor_invalid | (col('sensors.sensor{}'.format(sensor_no)) < 1)

df_invalid = df_data.select("parsed_value.*").where(any_sensor_invalid)

And just run a few queries to see parsed items in console

In [None]:
# Print the parsed raw records to the console sink for a quick visual check.
query1 = df_raw.writeStream.outputMode("append").format("console").start()

# The timeout only returns control to the notebook after 10 seconds; the query
# itself keeps running unless stopped, so stop it once the check is done.
try:
    query1.awaitTermination(timeout=10)
finally:
    query1.stop()

In [None]:
# Print the invalid records to the console sink for a quick visual check.
query2 = df_invalid.writeStream.outputMode("append").format("console").start()

# The timeout only returns control to the notebook after 10 seconds; the query
# itself keeps running unless stopped, so stop it once the check is done.
try:
    query2.awaitTermination(timeout=10)
finally:
    query2.stop()

Write raw and invalid data as parquet into HDFS

In [None]:
# Continuously append the raw parsed records to HDFS as parquet files.
#
# "startingOffsets" is a Kafka *source* option; passing it to writeStream has
# no effect, so it is not set here. To replay the topic from the beginning it
# has to be set on spark.readStream before .load().
#
# NOTE(review): the checkpoint shares the output directory. A dedicated
# checkpoint path would keep checkpoint metadata apart from the data files.
query3 = (
    df_raw.writeStream
    .format("parquet")
    .option("path", "hdfs://sandbox.hortonworks.com:8020/tmp/temp_raw")
    .option("checkpointLocation", "hdfs://sandbox.hortonworks.com:8020/tmp/temp_raw")
    .start()
)

# Block for up to 30 seconds so some files get written before moving on.
# The query keeps running in the background after the timeout.
query3.awaitTermination(timeout=30)

In [None]:
# Continuously append the invalid records to HDFS as parquet files.
#
# "startingOffsets" is a Kafka *source* option; passing it to writeStream has
# no effect, so it is not set here. To replay the topic from the beginning it
# has to be set on spark.readStream before .load().
#
# NOTE(review): the checkpoint shares the output directory. A dedicated
# checkpoint path would keep checkpoint metadata apart from the data files.
query4 = (
    df_invalid.writeStream
    .format("parquet")
    .option("path", "hdfs://sandbox.hortonworks.com:8020/tmp/temp_invalid")
    .option("checkpointLocation", "hdfs://sandbox.hortonworks.com:8020/tmp/temp_invalid")
    .start()
)

# Block for up to 30 seconds so some files get written before moving on.
# The query keeps running in the background after the timeout.
query4.awaitTermination(timeout=30)

And just pull from the HDFS store to make sure we got records there

In [None]:
# Read back the parquet files written by the raw-data stream, using the same
# `spark` session as the rest of the notebook, and show a few rows so the cell
# actually confirms that records landed in HDFS.
review_raw = spark.read.parquet('hdfs://sandbox.hortonworks.com:8020/tmp/temp_raw')
review_raw.show(5)

In [None]:
# Read back the parquet files written by the invalid-records stream, using the
# same `spark` session as the rest of the notebook, and show a few rows so the
# cell actually confirms that records landed in HDFS.
review_invalid = spark.read.parquet('hdfs://sandbox.hortonworks.com:8020/tmp/temp_invalid')
review_invalid.show(5)

Calculate running averages for Sensors 1 & 2 and make sure the column name for the dataframe is value so we can stream to Kafka topics

(Also just stream to console quickly to make sure it worked)

In [None]:
# Running average of sensor1 over valid readings only.
# The notebook treats a reading below 1 as invalid, so valid means >= 1; the
# previous `> 1` filter also dropped legitimate readings of exactly 1.
# `when` without an otherwise() yields null for non-matching rows, and avg
# skips nulls, so invalid readings do not affect the average.
valid_sensor1 = when(col('sensors.sensor1') >= 1, col('sensors.sensor1'))

# Cast to string and name the column "value" so the result can be sent to Kafka.
sensor1_avg = (
    df_data.select("parsed_value.*")
    .select(avg(valid_sensor1).cast("string").alias("value"))
)

In [None]:
# Print the running sensor1 average to the console sink to confirm it works.
query5 = sensor1_avg.writeStream.outputMode("update").format("console").start()

# The timeout only returns control to the notebook after 12 seconds; the query
# itself keeps running unless stopped, so stop it once the check is done.
try:
    query5.awaitTermination(timeout=12)
finally:
    query5.stop()

In [None]:
# Running average of sensor2 over valid readings only.
# The notebook treats a reading below 1 as invalid, so valid means >= 1; the
# previous `> 1` filter also dropped legitimate readings of exactly 1.
# `when` without an otherwise() yields null for non-matching rows, and avg
# skips nulls, so invalid readings do not affect the average.
valid_sensor2 = when(col('sensors.sensor2') >= 1, col('sensors.sensor2'))

# Cast to string and name the column "value" so the result can be sent to Kafka.
sensor2_avg = (
    df_data.select("parsed_value.*")
    .select(avg(valid_sensor2).cast("string").alias("value"))
)

In [None]:
# Print the running sensor2 average to the console sink to confirm it works.
query6 = sensor2_avg.writeStream.outputMode("update").format("console").start()

# The timeout only returns control to the notebook after 12 seconds; the query
# itself keeps running unless stopped, so stop it once the check is done.
try:
    query6.awaitTermination(timeout=12)
finally:
    query6.stop()

### Bonus 1 - Write averages to another Kafka topic

Might not work due to bug mentioned in assignment, though I might also be sending to the topic incorrectly

In [None]:
# Publish the running sensor1 average to the "avg1" Kafka topic.
# The Kafka sink needs its own checkpoint directory on HDFS.
query8 = (
    sensor1_avg.writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "sandbox.hortonworks.com:6667")
    .option("topic", "avg1")
    .option("checkpointLocation", "hdfs://sandbox.hortonworks.com:8020/tmp/avg1_chkpt")
    .outputMode("update")
    .start()
)

# Hand control back to the notebook after at most 12 seconds.
query8.awaitTermination(timeout=12)

In [None]:
# Publish the running sensor2 average to the "avg2" Kafka topic.
# Bound to query9 rather than reusing query8, so the handle to the avg1 query
# is not overwritten and both streams can still be inspected or stopped.
query9 = (
    sensor2_avg.writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "sandbox.hortonworks.com:6667")
    .option("topic", "avg2")
    .option("checkpointLocation", "hdfs://sandbox.hortonworks.com:8020/tmp/avg2_chkpt")
    .outputMode("update")
    .start()
)

# Hand control back to the notebook after at most 12 seconds.
query9.awaitTermination(timeout=12)