# Spark Ingestion

This notebook showcases a simple data ingestion from Kafka's `readings_prepared` topic into a BigQuery table using Spark Structured Streaming.

## Setup

Import all the required libraries and set the stream configuration variables.

In [1]:
import spark.implicits._
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql._

Intitializing Scala interpreter ...

Spark Web UI available at http://spark-ingest-m:8088/proxy/application_1583161289898_0001
SparkContext available as 'sc' (version = 2.4.5, master = yarn, app id = application_1583161289898_0001)
SparkSession available as 'spark'


import spark.implicits._
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql._


In [2]:
// Kafka source: broker address and the topic that carries the prepared readings.
val kafkaBootstrapServer: String = "kafka-m:9092"
val kafkaReadingsTopic: String = "readings_prepared"

// Watermark delay applied to the `reading_ts` column before de-duplicating the stream.
val kafkaDedupWatermarkTime: String = "1 minute"

// BigQuery sink: target table and the bucket passed as the connector's `temporaryGcsBucket`.
val bigQueryTargetTable: String = "smartplugs.readings"
val bigQueryTempBucket: String = "pandora-sde-case/ingest"

// Processing-time trigger interval of the BigQuery ingestion query.
val outputTriggerTime: String = "1 minute"

kafkaBootstrapServer: String = kafka-m:9092
kafkaReadingsTopic: String = readings_prepared
kafkaDedupWatermarkTime: String = 1 minute
bigQueryTargetTable: String = smartplugs.readings
bigQueryTempBucket: String = pandora-sde-case/ingest
outputTriggerTime: String = 1 minute


## Define The Required Schema

In [3]:
// This will be used to give the source `readings_prepared` stream data a schema
// Schema used to parse the JSON payload of the source `readings_prepared` stream.
// Fields are listed as (name, type) pairs in column order; every field is declared non-nullable.
val readingsSchema = StructType(
    Seq[(String, DataType)](
        "message_id"    -> StringType,
        "reading_ts"    -> TimestampType,
        "reading_value" -> FloatType,
        "reading_type"  -> IntegerType,
        "plug_id"       -> IntegerType,
        "household_id"  -> IntegerType,
        "house_id"      -> IntegerType
    ).map { case (fieldName, fieldType) =>
        StructField(fieldName, fieldType, nullable = false)
    }
)

readingsSchema: org.apache.spark.sql.types.StructType = StructType(StructField(message_id,StringType,false), StructField(reading_ts,TimestampType,false), StructField(reading_value,FloatType,false), StructField(reading_type,IntegerType,false), StructField(plug_id,IntegerType,false), StructField(household_id,IntegerType,false), StructField(house_id,IntegerType,false))


### Read and Parse The Input Data Stream

In [5]:
// Build the de-duplicated stream of "current load" readings from Kafka.
//
// Steps:
//   1. Subscribe to `kafkaReadingsTopic` and take each record's value as a JSON string.
//   2. Parse the JSON with `readingsSchema` and flatten the parsed struct into columns.
//   3. Declare `reading_ts` as the event-time column with a watermark of
//      `kafkaDedupWatermarkTime`. The bound is necessary so that Spark does not
//      have to keep ALL records in the de-duplication state.
//   4. Keep only the "current load" measurements (`reading_type` 1).
//   5. Drop rows that are exact duplicates (all columns, including `reading_ts`).
//
// The filter runs before `dropDuplicates()` so that only rows that are actually kept
// enter the de-duplication state. `reading_type` is part of the de-duplication key
// (all columns), so the emitted rows are the same as when filtering afterwards.
val readings = spark
    .readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafkaBootstrapServer)
    .option("subscribe", kafkaReadingsTopic)
    .load()
    .selectExpr("CAST(value AS STRING)")
    .select(from_json($"value", readingsSchema).as("data"))
    .select($"data.*")
    .withWatermark("reading_ts", kafkaDedupWatermarkTime)
    .filter($"reading_type" === 1) // Only take the "current load" measurement
    .dropDuplicates()

readings: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [message_id: string, reading_ts: timestamp ... 5 more fields]


#### Peek at The Input Data Streams

##### Readings

In [6]:
// Start a preview query that writes `readings` to the memory sink under the
// query name "readings", so the parsed stream can be inspected with SQL.
// NOTE(review): the memory sink is meant for debugging; stop this query when done.
val readingsQuery = readings
    .writeStream
    .format("memory")
    .queryName("readings")
    .start()

// Wait ten seconds so the query has had time to start before its status is checked.
Thread.sleep(10000)
readingsQuery.status

readingsQuery: org.apache.spark.sql.streaming.StreamingQuery = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@101d3c83
res0: org.apache.spark.sql.streaming.StreamingQueryStatus =
{
  "message" : "Processing new data",
  "isDataAvailable" : true,
  "isTriggerActive" : true
}


In [14]:
// Print a sample of the rows collected so far by the "readings" memory-sink query.
spark.sql("select * from readings").show()

+----------+-------------------+-------------+------------+-------+------------+--------+
|message_id|         reading_ts|reading_value|reading_type|plug_id|household_id|house_id|
+----------+-------------------+-------------+------------+-------+------------+--------+
| 219686108|2013-09-02 10:48:20|          0.0|           1|      1|           0|       3|
| 220252836|2013-09-02 10:54:00|          0.0|           1|      0|           0|       4|
| 219853298|2013-09-02 10:50:00|          0.0|           1|      1|           0|       9|
| 221116806|2013-09-02 11:02:40|      136.521|           1|      1|           0|       1|
| 222516718|2013-09-02 11:16:40|       67.329|           1|      0|           0|       8|
| 222782967|2013-09-02 11:19:20|       60.253|           1|      0|           0|       8|
| 222849589|2013-09-02 11:20:00|          0.0|           1|      1|           0|       9|
| 224446503|2013-09-02 11:36:00|          0.0|           1|      2|           0|       7|
| 22524570

In [25]:
// Uncomment to stop the preview query once it is no longer needed.
// readingsQuery.stop()
// Inspect the most recent progress report of the preview query.
readingsQuery.lastProgress

res18: org.apache.spark.sql.streaming.StreamingQueryProgress =
{
  "id" : "f9ecee83-8f34-4875-b682-c10c7344564b",
  "runId" : "54100149-2487-44fa-8aff-76654e1807dd",
  "name" : "readings",
  "timestamp" : "2020-03-02T15:38:27.730Z",
  "batchId" : 4,
  "numInputRows" : 3126,
  "inputRowsPerSecond" : 212.01844818231146,
  "processedRowsPerSecond" : 19.3442988155794,
  "durationMs" : {
    "addBatch" : 161285,
    "getBatch" : 0,
    "getEndOffset" : 0,
    "queryPlanning" : 169,
    "setOffsetRange" : 4,
    "triggerExecution" : 161598,
    "walCommit" : 90
  },
  "eventTime" : {
    "avg" : "2013-09-02T17:03:07.236Z",
    "max" : "2013-09-02T17:43:00.000Z",
    "min" : "2013-09-02T16:29:40.000Z",
    "watermark" : "2013-09-02T16:48:20.000Z"
  },
  "stateOperators" : [ {
    "numRowsTotal...

In [26]:
// Check the current status of the preview query.
readingsQuery.status

res19: org.apache.spark.sql.streaming.StreamingQueryStatus =
{
  "message" : "Processing new data",
  "isDataAvailable" : true,
  "isTriggerActive" : true
}


## Write to BigQuery

Write to BigQuery once a minute. BigQuery works better with batched data.

In [10]:
// Appends one micro-batch to the BigQuery target table. The BigQuery connector
// is given `bigQueryTempBucket` as its temporary GCS bucket. The batch id is not used.
val writeBatchToBigQuery: (DataFrame, Long) => Unit = (batchDF, _) => {
    val bigQueryWriter = batchDF.write
        .format("bigquery")
        .option("table", bigQueryTargetTable)
        .option("temporaryGcsBucket", bigQueryTempBucket)
        .mode(SaveMode.Append)
    bigQueryWriter.save()
}

// Start the ingestion query: every `outputTriggerTime`, hand the rows received
// since the previous trigger to `writeBatchToBigQuery`.
// NOTE(review): no `checkpointLocation` option is set on this query; confirm whether
// progress needs to survive a restart before relying on it outside a notebook.
val ingestQuery = readings
    .writeStream
    .trigger(Trigger.ProcessingTime(outputTriggerTime))
    .foreachBatch(writeBatchToBigQuery)
    .start()

ingestQuery: org.apache.spark.sql.streaming.StreamingQuery = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@7c3fcc66


In [27]:
// Inspect the most recent progress report of the BigQuery ingestion query.
ingestQuery.lastProgress

res20: org.apache.spark.sql.streaming.StreamingQueryProgress =
{
  "id" : "14686b54-a3bf-46b6-9561-1af1d282eb8e",
  "runId" : "6fb5db6e-b637-4f38-a55b-b80e0db28ad1",
  "name" : null,
  "timestamp" : "2020-03-02T15:38:13.069Z",
  "batchId" : 1,
  "numInputRows" : 11229,
  "inputRowsPerSecond" : 194.37087812224127,
  "processedRowsPerSecond" : 61.10854126418329,
  "durationMs" : {
    "addBatch" : 183051,
    "getBatch" : 0,
    "getEndOffset" : 0,
    "queryPlanning" : 177,
    "setOffsetRange" : 4,
    "triggerExecution" : 183755,
    "walCommit" : 475
  },
  "eventTime" : {
    "avg" : "2013-09-02T14:37:02.805Z",
    "max" : "2013-09-02T16:49:20.000Z",
    "min" : "2013-09-02T12:34:20.000Z",
    "watermark" : "1970-01-01T00:00:00.000Z"
  },
  "stateOperators" : [ {
    "numRowsTotal" :...

In [28]:
// Check the current status of the BigQuery ingestion query.
ingestQuery.status

res21: org.apache.spark.sql.streaming.StreamingQueryStatus =
{
  "message" : "Processing new data",
  "isDataAvailable" : true,
  "isTriggerActive" : true
}


In [None]:
// ingestQuery.stop()