# Spark Ingestion

This notebook showcases a simple data ingestion from Kafka's `readings_prepared` topic into BigQuery using Spark Structured Streaming.

## Setup

Import all the required libraries and set the stream configuration variables.

In [1]:
import spark.implicits._
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql._

Intitializing Scala interpreter ...

Spark Web UI available at http://spark-ingest-m:8088/proxy/application_1583313192730_0001
SparkContext available as 'sc' (version = 2.4.5, master = yarn, app id = application_1583313192730_0001)
SparkSession available as 'spark'


import spark.implicits._
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql._


In [2]:
// --- Source (Kafka) ---
// Bootstrap server and topic the readings stream is subscribed to.
val kafkaBootstrapServer: String = "kafka-m:9092"
val kafkaReadingsTopic: String = "readings_prepared"
// Event-time watermark delay applied on `reading_ts` before de-duplication.
val kafkaDedupWatermarkTime: String = "1 minute"

// --- Sink (BigQuery) ---
// Target table, and the value passed as the connector's `temporaryGcsBucket` option.
val bigQueryTargetTable: String = "smartplugs.readings"
val bigQueryTempBucket: String = "pandora-sde-case/ingest"
// Processing-time trigger interval of the BigQuery write query.
val outputTriggerTime: String = "1 minute"

kafkaBootstrapServer: String = kafka-m:9092
kafkaReadingsTopic: String = readings_prepared
kafkaDedupWatermarkTime: String = 1 minute
bigQueryTargetTable: String = smartplugs.readings
bigQueryTempBucket: String = pandora-sde-case/ingest
outputTriggerTime: String = 1 minute


## Define The Required Schema

In [3]:
// Schema applied to the JSON payload of the `readings_prepared` stream.
// Every field is declared non-nullable.
// NOTE(review): `from_json` may treat the schema as nullable regardless of these flags — confirm
// against the Spark version in use before relying on non-null guarantees downstream.
val readingsSchema = new StructType()
    .add("message_id", StringType, nullable = false)
    .add("reading_ts", TimestampType, nullable = false)
    .add("reading_value", FloatType, nullable = false)
    .add("reading_type", IntegerType, nullable = false)
    .add("plug_id", IntegerType, nullable = false)
    .add("household_id", IntegerType, nullable = false)
    .add("house_id", IntegerType, nullable = false)

readingsSchema: org.apache.spark.sql.types.StructType = StructType(StructField(message_id,StringType,false), StructField(reading_ts,TimestampType,false), StructField(reading_value,FloatType,false), StructField(reading_type,IntegerType,false), StructField(plug_id,IntegerType,false), StructField(household_id,IntegerType,false), StructField(house_id,IntegerType,false))


### Read and Parse The Input Data Stream

In [4]:
// Parsed, de-duplicated stream of "current load" readings from Kafka.
//
// Pipeline:
//   1. Subscribe to the readings topic and take the Kafka record value as a string.
//   2. Parse the JSON payload with `readingsSchema` and flatten it into top-level columns.
//   3. Attach an event-time watermark on `reading_ts` (delay = `kafkaDedupWatermarkTime`).
//   4. Keep only rows with reading_type == 1 (the "current load" measurement).
//   5. Drop rows that are duplicates across ALL columns.
//
// The watermark bounds the de-duplication state so that Spark does not have to store ALL
// records in the state store. The filter runs BEFORE `dropDuplicates()` so rows that would be
// discarded anyway never enter that state; it stays AFTER `withWatermark` so the watermark
// still advances on every incoming row. Because `reading_type` is one of the de-duplication
// columns, the resulting rows are the same as when filtering after de-duplication.
val readings = spark
    .readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafkaBootstrapServer)
    .option("subscribe", kafkaReadingsTopic)
    .load()
    .selectExpr("CAST(value AS STRING)")
    .select(from_json($"value", readingsSchema).as("data"))
    .select($"data.*")
    .withWatermark("reading_ts", kafkaDedupWatermarkTime)
    .filter($"reading_type" === 1) // Only take the "current load" measurement
    .dropDuplicates()

readings: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [message_id: string, reading_ts: timestamp ... 5 more fields]


#### Peek at The Input Data Streams

##### Readings

In [5]:
// Start a streaming query that writes `readings` to the "memory" sink under the query name
// "readings", so it can be inspected from this notebook.
val readingsQuery = readings
    .writeStream
    .format("memory")
    .queryName("readings")
    .start()

// Pause for 10 seconds before checking on the query (presumably to let it start processing).
Thread.sleep(10000)

// Current status of the query (message, isDataAvailable, isTriggerActive).
readingsQuery.status

readingsQuery: org.apache.spark.sql.streaming.StreamingQuery = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@607d6f30
res0: org.apache.spark.sql.streaming.StreamingQueryStatus =
{
  "message" : "Processing new data",
  "isDataAvailable" : true,
  "isTriggerActive" : true
}


In [13]:
// Print the first rows of `readings`, the name given to the memory-sink query via queryName.
spark.sql("select * from readings").show()

+----------+-------------------+-------------+------------+-------+------------+--------+
|message_id|         reading_ts|reading_value|reading_type|plug_id|household_id|house_id|
+----------+-------------------+-------------+------------+-------+------------+--------+
|  47669710|2013-09-01 05:59:20|          0.0|           1|      1|           0|       5|
|  48068742|2013-09-01 06:03:20|          0.0|           1|      1|           0|       5|
|  49066547|2013-09-01 06:13:20|          0.0|           1|      1|           0|       5|
|  49233526|2013-09-01 06:15:00|          0.0|           1|      0|           0|       9|
|  51762125|2013-09-01 06:40:20|          0.0|           1|      1|           0|       7|
|  50863413|2013-09-01 06:31:20|          0.0|           1|      1|           0|       7|
|  50961779|2013-09-01 06:32:20|       42.532|           1|      2|           0|       0|
|  52861089|2013-09-01 06:51:20|          0.0|           1|      2|           0|       9|
|  5355736

In [7]:
// Uncomment to stop the `readings` peek query:
// readingsQuery.stop()
// Show the most recent progress report (durations, event-time watermark, state operator metrics).
readingsQuery.lastProgress

res2: org.apache.spark.sql.streaming.StreamingQueryProgress =
{
  "id" : "b97e0a73-27c5-4471-85fb-b03a52580ed7",
  "runId" : "8dae5afd-f3d0-4860-9896-b8ce54e1518c",
  "name" : "readings",
  "timestamp" : "2020-03-04T09:19:39.012Z",
  "batchId" : 0,
  "numInputRows" : 0,
  "processedRowsPerSecond" : 0.0,
  "durationMs" : {
    "addBatch" : 28460,
    "getBatch" : 10,
    "getEndOffset" : 0,
    "queryPlanning" : 887,
    "setOffsetRange" : 1637,
    "triggerExecution" : 31260,
    "walCommit" : 64
  },
  "eventTime" : {
    "watermark" : "1970-01-01T00:00:00.000Z"
  },
  "stateOperators" : [ {
    "numRowsTotal" : 0,
    "numRowsUpdated" : 0,
    "memoryUsedBytes" : 44599,
    "customMetrics" : {
      "loadedMapCacheHitCount" : 0,
      "loadedMapCacheMissCount" : 0,
      "stateOnCurre...

In [8]:
// Re-check the status of the `readings` peek query.
readingsQuery.status

res3: org.apache.spark.sql.streaming.StreamingQueryStatus =
{
  "message" : "Processing new data",
  "isDataAvailable" : true,
  "isTriggerActive" : true
}


## Write to BigQuery

Write to BigQuery once a minute. BigQuery works better with batched data.

In [9]:
// Streaming query that appends `readings` to BigQuery, one micro-batch per processing-time
// trigger (`outputTriggerTime`).
val ingestQuery = {
    // Per-batch sink: append the micro-batch to `bigQueryTargetTable` through the "bigquery"
    // data source, passing `bigQueryTempBucket` as its temporary GCS bucket.
    // The batch id is not used.
    val appendToBigQuery: (DataFrame, Long) => Unit = (microBatch, _) =>
        microBatch.write
            .format("bigquery")
            .option("table", bigQueryTargetTable)
            .option("temporaryGcsBucket", bigQueryTempBucket)
            .mode(SaveMode.Append)
            .save()

    readings
        .writeStream
        .trigger(Trigger.ProcessingTime(outputTriggerTime))
        .foreachBatch(appendToBigQuery)
        .start()
}

ingestQuery: org.apache.spark.sql.streaming.StreamingQuery = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@99f47e8


In [14]:
// Show the most recent progress report of the BigQuery ingest query.
ingestQuery.lastProgress

res8: org.apache.spark.sql.streaming.StreamingQueryProgress =
{
  "id" : "bbc4556e-1488-4412-aae4-babad5501d44",
  "runId" : "2e442a93-ba5d-4cbc-a7ac-5f4991694a42",
  "name" : null,
  "timestamp" : "2020-03-04T09:20:13.066Z",
  "batchId" : 0,
  "numInputRows" : 28,
  "processedRowsPerSecond" : 0.28718832374329467,
  "durationMs" : {
    "addBatch" : 96921,
    "getBatch" : 0,
    "getEndOffset" : 1,
    "queryPlanning" : 221,
    "setOffsetRange" : 203,
    "triggerExecution" : 97497,
    "walCommit" : 106
  },
  "eventTime" : {
    "avg" : "2013-09-01T07:24:01.428Z",
    "max" : "2013-09-01T07:41:00.000Z",
    "min" : "2013-09-01T07:08:00.000Z",
    "watermark" : "1970-01-01T00:00:00.000Z"
  },
  "stateOperators" : [ {
    "numRowsTotal" : 28,
    "numRowsUpdated" : 28,
    "memoryUsed...

In [15]:
// Current status of the BigQuery ingest query.
ingestQuery.status

res9: org.apache.spark.sql.streaming.StreamingQueryStatus =
{
  "message" : "Processing new data",
  "isDataAvailable" : true,
  "isTriggerActive" : true
}


In [12]:
// ingestQuery.stop()