### Assignment 7

Load imports

In [1]:
import pyspark
from pyspark.sql.types import * # for structured data types
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql.functions import from_json, col, avg, when

Read from simulated temerature stream

In [2]:
df = spark.readStream.format("kafka") \
    .option("kafka.bootstrap.servers","sandbox.hortonworks.com:6667") \
    .option("subscribe", "tempstream") \
    .load()

Unpack Kafka binary key and value to string

In [3]:
data = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

In [4]:
data

DataFrame[key: string, value: string]

Check to see that we have events streaming in by emitting to console

Setting a timeout as I don't know how we'd interrupt the query stream in jupyter since next cell won't run till this one finishes

In [5]:
query = data.writeStream.outputMode("append").format("console").start()
query.awaitTermination(timeout=10) 

False

Parse to json, first we want to define the schema

In [6]:
# {"server" : "c14-5c2s7", 
#  "timestamp" : "2018-03-03 19:31:39", 
#  "sensors" : {
#        "sensor1" : , 
#        "sensor2" : , 
#        "sensor3" : , 
#        "sensor4" : , 
#        "sensor5" : , 
#        "sensor6" : , 
#        "sensor7" : , 
#        "sensor8" : ,
#        "sensor9" : , 
#        "sensor10" : 
#}}

jsonschema = StructType().add("server", StringType()) \
                     .add("timestamp", StringType()) \
                     .add("sensors", StructType() \
                          .add("sensor1", IntegerType()) \
                          .add("sensor2", IntegerType()) \
                          .add("sensor3", IntegerType()) \
                          .add("sensor4", IntegerType()) \
                          .add("sensor5", IntegerType()) \
                          .add("sensor6", IntegerType()) \
                          .add("sensor7", IntegerType()) \
                          .add("sensor8", IntegerType()) \
                          .add("sensor9", IntegerType()) \
                          .add("sensor10", IntegerType()) 
                        )

Then parse the json using `from_json`

In [7]:
df_data = df.select(from_json(col("value").cast("string"), jsonschema).alias("parsed_value"))

Grab everything in the json for a dataframe of all, raw data

In [None]:
df_raw = df_data.select("parsed_value.*")

Grab all invalid records (any where at least one sensor is a 0 or negative value)

In [None]:
df_invalid = df_data.select("parsed_value.*").where((col('sensors.sensor1') < 1)|
                                               (col('sensors.sensor2') < 1)|
                                               (col('sensors.sensor3') < 1)|
                                               (col('sensors.sensor4') < 1)|
                                               (col('sensors.sensor5') < 1)|
                                               (col('sensors.sensor6') < 1)|
                                               (col('sensors.sensor7') < 1)|
                                               (col('sensors.sensor8') < 1)|
                                               (col('sensors.sensor9') < 1)|
                                               (col('sensors.sensor10') < 1)
                                               )

And just run a few queries to see parsed items in console

In [None]:
query1 = df_raw.writeStream.outputMode("append").format("console").start()
query1.awaitTermination(timeout=10) #set a timeout as I don't know how we'd interrupt the query stream in jupyter since next cell won't run till this one finishes

In [None]:
query2 = df_invalid.writeStream.outputMode("append").format("console").start()
query2.awaitTermination(timeout=10) #set a timeout as I don't know how we'd interrupt the query stream in jupyter since next cell won't run till this one finishes

Write raw and invalid data as parquet into HDFS

In [None]:
query3 = df_raw.writeStream \
  .format("parquet") \
  .option("startingOffsets", "earliest") \
  .option("path", "hdfs://sandbox.hortonworks.com:8020/tmp/temp_raw") \
  .option("checkpointLocation", "hdfs://sandbox.hortonworks.com:8020/tmp/temp_raw") \
  .start()

query3.awaitTermination(timeout=30)

In [None]:
query4 = df_invalid.writeStream \
  .format("parquet") \
  .option("startingOffsets", "earliest") \
  .option("path", "hdfs://sandbox.hortonworks.com:8020/tmp/temp_invalid") \
  .option("checkpointLocation", "hdfs://sandbox.hortonworks.com:8020/tmp/temp_invalid") \
  .start()

query4.awaitTermination(timeout=30)

And just pull form the HDFS store to make sure we got records there

In [None]:
review_raw = sqlContext.read.parquet('hdfs://sandbox.hortonworks.com:8020/tmp/temp_raw')

In [None]:
review_invalid = sqlContext.read.parquet('hdfs://sandbox.hortonworks.com:8020/tmp/temp_invalid')

Calculate running averages for Sensors 1 & 2 and make sure the column name for the dataframe is value so we can stream to Kafka topics

(Also just stream to console quickly to make sure it worked)

In [18]:
sensor1_avg = df_data.select("parsed_value.*").select(avg(when((col('sensors.sensor1') > 1), col('sensors.sensor1'))).cast("string").alias("value"))

In [None]:
query5 = sensor1_avg.writeStream.outputMode("update").format("console").start()
query5.awaitTermination(timeout=12)

In [19]:
sensor2_avg = df_data.select("parsed_value.*").select(avg(when((col('sensors.sensor2') > 1), col('sensors.sensor2'))).cast("string").alias("value"))

In [None]:
query6 = sensor2_avg.writeStream.outputMode("update").format("console").start()
query6.awaitTermination(timeout=12)

### Bonus 1 - Write averages to another Kafka topic

Might not work due to bug mentioned in assignment, though I might also be sending to the topic incorrectly

In [20]:
query8 = sensor1_avg.writeStream \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "sandbox.hortonworks.com:6667") \
  .option("topic", "avg1") \
  .option("checkpointLocation", "hdfs://sandbox.hortonworks.com:8020/tmp/avg1_chkpt") \
  .outputMode("update") \
  .start()
    
query8.awaitTermination(timeout=12)

StreamingQueryException: 'Job aborted due to stage failure: Task 0 in stage 1.0 failed 1 times, most recent failure: Lost task 0.0 in stage 1.0 (TID 3, localhost, executor driver): java.lang.NoSuchMethodError: org.apache.spark.sql.catalyst.expressions.Cast$.apply$default$3()Lscala/Option;\n\tat org.apache.spark.sql.kafka010.KafkaWriteTask.createProjection(KafkaWriteTask.scala:112)\n\tat org.apache.spark.sql.kafka010.KafkaWriteTask.<init>(KafkaWriteTask.scala:39)\n\tat org.apache.spark.sql.kafka010.KafkaWriter$$anonfun$write$1$$anonfun$apply$mcV$sp$1.apply(KafkaWriter.scala:90)\n\tat org.apache.spark.sql.kafka010.KafkaWriter$$anonfun$write$1$$anonfun$apply$mcV$sp$1.apply(KafkaWriter.scala:89)\n\tat org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:926)\n\tat org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:926)\n\tat org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1951)\n\tat org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1951)\n\tat org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)\n\tat org.apache.spark.scheduler.Task.run(Task.scala:99)\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n\tat java.lang.Thread.run(Thread.java:748)\n\nDriver stacktrace:\n=== Streaming Query ===\nIdentifier: [id = 2f659bb4-8b99-4ba8-ad89-c84f14760c76, runId = f48005d8-147f-49d1-87ab-40473d600131]\nCurrent Committed Offsets: {}\nCurrent Available Offsets: {KafkaSource[Subscribe[tempstream]]: {"tempstream":{"2":388682,"1":388680,"0":388683}}}\n\nCurrent State: ACTIVE\nThread State: RUNNABLE\n\nLogical Plan:\nAggregate [cast(avg(cast(CASE WHEN (sensors#150.sensor1 > 1) THEN sensors#150.sensor1 END as bigint)) as string) AS value#156]\n+- Project [parsed_value#20.server AS server#148, parsed_value#20.timestamp AS timestamp#149, parsed_value#20.sensors AS sensors#150]\n   +- Project [jsontostruct(StructField(server,StringType,true), StructField(timestamp,StringType,true), StructField(sensors,StructType(StructField(sensor1,IntegerType,true), StructField(sensor2,IntegerType,true), StructField(sensor3,IntegerType,true), StructField(sensor4,IntegerType,true), StructField(sensor5,IntegerType,true), StructField(sensor6,IntegerType,true), StructField(sensor7,IntegerType,true), StructField(sensor8,IntegerType,true), StructField(sensor9,IntegerType,true), StructField(sensor10,IntegerType,true)),true), cast(value#1 as string)) AS parsed_value#20]\n      +- StreamingExecutionRelation KafkaSource[Subscribe[tempstream]], [key#0, value#1, topic#2, partition#3, offset#4L, timestamp#5, timestampType#6]\n'

In [22]:
query8 = sensor2_avg.writeStream \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "sandbox.hortonworks.com:6667") \
  .option("topic", "avg2") \
  .option("checkpointLocation", "hdfs://sandbox.hortonworks.com:8020/tmp/avg2_chkpt") \
  .outputMode("update") \
  .start()
    
query8.awaitTermination(timeout=12)

StreamingQueryException: 'Job aborted due to stage failure: Task 0 in stage 3.0 failed 1 times, most recent failure: Lost task 0.0 in stage 3.0 (TID 7, localhost, executor driver): java.lang.NoSuchMethodError: org.apache.spark.sql.catalyst.expressions.Cast$.apply$default$3()Lscala/Option;\n\tat org.apache.spark.sql.kafka010.KafkaWriteTask.createProjection(KafkaWriteTask.scala:112)\n\tat org.apache.spark.sql.kafka010.KafkaWriteTask.<init>(KafkaWriteTask.scala:39)\n\tat org.apache.spark.sql.kafka010.KafkaWriter$$anonfun$write$1$$anonfun$apply$mcV$sp$1.apply(KafkaWriter.scala:90)\n\tat org.apache.spark.sql.kafka010.KafkaWriter$$anonfun$write$1$$anonfun$apply$mcV$sp$1.apply(KafkaWriter.scala:89)\n\tat org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:926)\n\tat org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:926)\n\tat org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1951)\n\tat org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1951)\n\tat org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)\n\tat org.apache.spark.scheduler.Task.run(Task.scala:99)\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n\tat java.lang.Thread.run(Thread.java:748)\n\nDriver stacktrace:\n=== Streaming Query ===\nIdentifier: [id = 2cb16930-12e2-4e78-be64-114cd0578887, runId = 6e608535-e980-4e62-a27e-f2eba9d5370a]\nCurrent Committed Offsets: {}\nCurrent Available Offsets: {KafkaSource[Subscribe[tempstream]]: {"tempstream":{"2":413326,"1":413324,"0":413327}}}\n\nCurrent State: ACTIVE\nThread State: RUNNABLE\n\nLogical Plan:\nAggregate [cast(avg(cast(CASE WHEN (sensors#163.sensor2 > 1) THEN sensors#163.sensor2 END as bigint)) as string) AS value#169]\n+- Project [parsed_value#20.server AS server#161, parsed_value#20.timestamp AS timestamp#162, parsed_value#20.sensors AS sensors#163]\n   +- Project [jsontostruct(StructField(server,StringType,true), StructField(timestamp,StringType,true), StructField(sensors,StructType(StructField(sensor1,IntegerType,true), StructField(sensor2,IntegerType,true), StructField(sensor3,IntegerType,true), StructField(sensor4,IntegerType,true), StructField(sensor5,IntegerType,true), StructField(sensor6,IntegerType,true), StructField(sensor7,IntegerType,true), StructField(sensor8,IntegerType,true), StructField(sensor9,IntegerType,true), StructField(sensor10,IntegerType,true)),true), cast(value#1 as string)) AS parsed_value#20]\n      +- StreamingExecutionRelation KafkaSource[Subscribe[tempstream]], [key#0, value#1, topic#2, partition#3, offset#4L, timestamp#5, timestampType#6]\n'

### Bonus 2 - Perform 5 Second window averages on Sensors 1 and 2

Ran up against a few urgent deadlines all at the same time and was not able to understand this fully enough to make a useful attempt in time.