# Spark Ingestion

This notebook showcases a simple data ingestion from Kafka's `readings_prepared` topic into a BigQuery table.

## Setup

Import all the required libraries and set the stream configuration variables.

In [1]:
import spark.implicits._
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql._

Intitializing Scala interpreter ...

Spark Web UI available at http://spark-alert-1-detect-m:8088/proxy/application_1583148924695_0001
SparkContext available as 'sc' (version = 2.4.5, master = yarn, app id = application_1583148924695_0001)
SparkSession available as 'spark'


import spark.implicits._
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql._


In [2]:
// Kafka source: broker address and the topic that carries the prepared readings.
val kafkaBootstrapServer: String = "kafka-m:9092"
val kafkaReadingsTopic: String = "readings_prepared"

// Event-time watermark delay applied before de-duplicating the readings stream.
val kafkaDedupWatermarkTime: String = "1 minute"

// BigQuery sink: destination table and the GCS location used for temporary files.
val bigQueryTargetTable: String = "smartplugs.readings"
val bigQueryTempBucket: String = "pandora-sde-case/ingest"

// Processing-time trigger interval of the BigQuery write.
val outputTriggerTime: String = "1 minute"

kafkaBootstrapServer: String = kafka-m:9092
kafkaReadingsTopic: String = readings_prepared
kafkaStatsTopic: String = alert_1_stats
kafkaDedupWatermarkTime: String = 1 minute
joinWatermarkTime: String = 1 minute
bigQueryTargetTable: String = smartplugs.alert_1_anomaly
bigQueryTempBucket: String = pandora-sde-case
outputTriggerTime: String = 1 minute


## Define The Required Schema

In [3]:
// Schema applied to the JSON payload of the source `readings_prepared` stream.
// Built field by field with `add`; every field is declared non-nullable.
val readingsSchema = new StructType()
    .add("message_id", StringType, nullable = false)
    .add("reading_ts", TimestampType, nullable = false)
    .add("reading_value", FloatType, nullable = false)
    .add("reading_type", IntegerType, nullable = false)
    .add("plug_id", IntegerType, nullable = false)
    .add("household_id", IntegerType, nullable = false)
    .add("house_id", IntegerType, nullable = false)

readingsSchema: org.apache.spark.sql.types.StructType = StructType(StructField(message_id,StringType,false), StructField(reading_ts,TimestampType,false), StructField(reading_value,FloatType,false), StructField(reading_type,IntegerType,false), StructField(plug_id,IntegerType,false), StructField(household_id,IntegerType,false), StructField(house_id,IntegerType,false))
statsSchema: org.apache.spark.sql.types.StructType = StructType(StructField(house_id,IntegerType,false), StructField(hour,IntegerType,false), StructField(mean,FloatType,false), StructField(m2,FloatType,false), StructField(variance,FloatType,false), StructField(std_dev,FloatType,false), StructField(count,LongType,false), StructField(last_ts,TimestampType,false))


### Read and Parse The Input Data Stream

In [1]:
// Read the Kafka readings topic, parse each record's JSON payload with `readingsSchema`
// and drop duplicate records.
// The watermark (`kafkaDedupWatermarkTime`) bounds the de-duplication: it is necessary
// so that Spark does not store ALL records in the state.
val readings = spark
    .readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafkaBootstrapServer)
    .option("subscribe", kafkaReadingsTopic)
    .load()
    .selectExpr("CAST(value AS STRING)") // Cast the value column to a string so it can be parsed as JSON
    .select(from_json($"value", readingsSchema).as("data"))
    .select($"data.*") // Flatten the parsed struct into top-level columns (reading_ts, reading_type, ...)
    .withWatermark("reading_ts", kafkaDedupWatermarkTime)
    .dropDuplicates()
    .filter($"reading_type" === 1) // Only take the "current load" measurement

Intitializing Scala interpreter ...

Spark Web UI available at http://spark-alert-1-stats-m:8088/proxy/application_1583148921700_0002
SparkContext available as 'sc' (version = 2.4.5, master = yarn, app id = application_1583148921700_0002)
SparkSession available as 'spark'


<console>: 28: error: not found: value kafkaBootstrapServer

#### Peek at The Input Data Stream

##### Readings

In [6]:
// Start a query that writes the readings stream to a memory sink named "readings",
// wait ten seconds, then report the query status.
val readingsQuery = readings
    .writeStream
    .queryName("readings")
    .format("memory")
    .start()
Thread.sleep(10000L)
readingsQuery.status

readingsQuery: org.apache.spark.sql.streaming.StreamingQuery = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@7e4fdee9
res0: org.apache.spark.sql.streaming.StreamingQueryStatus =
{
  "message" : "Getting offsets from KafkaV2[Subscribe[readings_prepared]]",
  "isDataAvailable" : false,
  "isTriggerActive" : true
}


In [24]:
// Print the rows currently held in the "readings" table
// (20 rows, long values truncated: the same as a bare `show()`).
spark
    .sql("select * from readings")
    .show(numRows = 20, truncate = true)

+----------+-------------------+-------------+------------+-------+------------+--------+
|message_id|         reading_ts|reading_value|reading_type|plug_id|household_id|house_id|
+----------+-------------------+-------------+------------+-------+------------+--------+
|  21267584|2013-09-01 01:35:00|       35.099|           0|      1|           0|       1|
|  21371713|2013-09-01 01:36:00|       11.757|           0|      2|           0|       0|
|  22405161|2013-09-01 01:46:20|        0.355|           0|      1|           0|       4|
|  22471510|2013-09-01 01:47:00|          0.0|           1|      0|           0|       4|
|  22636827|2013-09-01 01:48:40|          0.0|           1|      1|           0|       3|
|  23903171|2013-09-01 02:01:20|        0.161|           0|      0|           0|       9|
|  24134854|2013-09-01 02:03:40|        9.853|           1|      2|           0|       0|
|  24302832|2013-09-01 02:05:20|         2.25|           0|      1|           0|       7|
|  2443508

In [8]:
// Uncomment to stop the peek query once it is no longer needed.
// readingsQuery.stop()
// Inspect the most recent progress report of the peek query.
readingsQuery.lastProgress

res2: org.apache.spark.sql.streaming.StreamingQueryProgress =
{
  "id" : "9a99c6a0-e13f-4654-824c-d8fd89919a13",
  "runId" : "9b858307-236e-4829-8378-b927c61ba527",
  "name" : "readings",
  "timestamp" : "2020-03-02T12:48:06.186Z",
  "batchId" : 0,
  "numInputRows" : 33,
  "processedRowsPerSecond" : 0.9723613648417703,
  "durationMs" : {
    "addBatch" : 30895,
    "getBatch" : 11,
    "getEndOffset" : 0,
    "queryPlanning" : 868,
    "setOffsetRange" : 1839,
    "triggerExecution" : 33936,
    "walCommit" : 75
  },
  "eventTime" : {
    "avg" : "2013-09-01T02:30:00.606Z",
    "max" : "2013-09-01T03:18:40.000Z",
    "min" : "2013-09-01T01:35:00.000Z",
    "watermark" : "1970-01-01T00:00:00.000Z"
  },
  "stateOperators" : [ {
    "numRowsTotal" : 33,
    "numRowsUpdated" : 33,
    "memo...

In [9]:
// Check the current status of the peek query.
readingsQuery.status

res3: org.apache.spark.sql.streaming.StreamingQueryStatus =
{
  "message" : "Processing new data",
  "isDataAvailable" : true,
  "isTriggerActive" : true
}


## Write to BigQuery

Write to BigQuery once a minute. BigQuery works better with batched data.

In [54]:
// Start the ingestion query: on every processing-time trigger, append the
// micro-batch of readings to the BigQuery target table.
val ingestQuery = {
    // Appends a single micro-batch to BigQuery; the batch id is not used.
    val appendToBigQuery: (DataFrame, Long) => Unit = (microBatch, _) => {
        val batchWriter = microBatch.write
            .format("bigquery")
            .option("table", bigQueryTargetTable)
            .option("temporaryGcsBucket", bigQueryTempBucket)
            .mode(SaveMode.Append)
        batchWriter.save()
    }

    readings
        .writeStream
        .trigger(Trigger.ProcessingTime(outputTriggerTime))
        .foreachBatch(appendToBigQuery)
        .start()
}

ingestQuery: org.apache.spark.sql.streaming.StreamingQuery = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@379de019


In [None]:
// State-operator metrics of the latest progress report of the ingestion query.
// `lastProgress` is null until the query has reported its first progress update,
// so wrap it in an Option instead of dereferencing it directly (yields None until then).
Option(ingestQuery.lastProgress).map(_.stateOperators.toSeq)

In [56]:
// Check the current status of the ingestion query.
ingestQuery.status

res42: org.apache.spark.sql.streaming.StreamingQueryStatus =
{
  "message" : "Getting offsets from KafkaV2[Subscribe[alert_1_stats]]",
  "isDataAvailable" : false,
  "isTriggerActive" : true
}


In [53]:
// Uncomment to stop the ingestion query.
// ingestQuery.stop()