## Baseline receiver stateless monitoring 

In [1]:
import os

# Kafka connector artifacts that the PySpark launcher should pull in.
# This is set before the SparkSession is built so the packages are resolved
# when the JVM starts.
_KAFKA_PACKAGES = ",".join(
    [
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.6",
        "org.apache.spark:spark-streaming-kafka-0-10_2.12:3.5.6",
    ]
)
os.environ["PYSPARK_SUBMIT_ARGS"] = f"--packages {_KAFKA_PACKAGES} pyspark-shell"

In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("read_test_stream")
    .getOrCreate()
)

:: loading settings :: url = jar:file:/home/guest/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/guest/.ivy2/cache
The jars for the packages stored in: /home/guest/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.apache.spark#spark-streaming-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-2f18f364-bf11-423e-82da-d04c2eeb2333;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.6 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.6 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.5 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in c

In [3]:
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType

# Broker address passed to the "kafka.bootstrap.servers" option of the reader.
KAFKA_BOOTSTRAP_SERVERS = "127.0.0.1:9092"
# Topic the streaming reader subscribes to.
KAFKA_TOPIC = "sensors"


# Nested "info" payload of each message; every field is kept as a raw string
# and converted later in the notebook.
info_schema = StructType(
    [
        StructField("timestamp_sent", StringType()),
        StructField("timestamp_received", StringType()),
        StructField("obs", StringType()),
        StructField("drift", StringType()),
    ]
)

# Top-level JSON layout of a Kafka message value.
schema = StructType(
    [
        StructField("sensor", StringType()),
        StructField("info", info_schema),
    ]
)

# Subscribe to the Kafka topic, reading from the earliest available offsets.
kafka_stream = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS)
    .option("subscribe", KAFKA_TOPIC)
    .option("startingOffsets", "earliest")
    .load()
)

# Decode the message value as JSON using `schema`.
parsed_stream = kafka_stream.select(
    from_json(col("value").cast("string"), schema).alias("parsed_value")
)

# Flatten the nested struct into one column per field.
df = parsed_stream.select(
    col("parsed_value.sensor"),
    col("parsed_value.info.timestamp_sent"),
    col("parsed_value.info.timestamp_received"),
    col("parsed_value.info.obs"),
    col("parsed_value.info.drift"),
)

In [4]:
# Show the flattened schema; every field is still a string at this point.
df.printSchema()

root
 |-- sensor: string (nullable = true)
 |-- timestamp_sent: string (nullable = true)
 |-- timestamp_received: string (nullable = true)
 |-- obs: string (nullable = true)
 |-- drift: string (nullable = true)



In [5]:
# Lateness threshold given to withWatermark() on "timestamp_sent".
WATERMARK_SIZE="1 minute"
# Width of the window() used to group observations per sensor.
WINDOW_SIZE="1 minute"

In [6]:
from pyspark.sql.functions import to_timestamp, to_utc_timestamp

# Parse both timestamp strings, treat them as Europe/Rome wall-clock time and
# convert to UTC, then compute the delay between sending and receiving as the
# difference of the two timestamps cast to double.
# NOTE: this cell rebinds `df`; re-running it without re-running the reader
# cell applies the UTC conversion a second time.
df = (
    df
    .withColumn(
        "timestamp_received",
        to_utc_timestamp(to_timestamp("timestamp_received"), "Europe/Rome"),
    )
    .withColumn(
        "timestamp_sent",
        to_utc_timestamp(to_timestamp("timestamp_sent"), "Europe/Rome"),
    )
    .withColumn(
        "time_diff",
        col("timestamp_received").cast("double") - col("timestamp_sent").cast("double"),
    )
    # Event-time watermark on the send timestamp.
    .withWatermark("timestamp_sent", WATERMARK_SIZE)
)

## Process-in-control parameters

In [7]:
# In-control mean and standard deviation used for "sensor-1".
S1_MEAN = 2
S1_STD = 3

# In-control mean and standard deviation used for "sensor-2".
S2_MEAN = 3.5
S2_STD = 3

# In-control mean and standard deviation used for "sensor-3".
S3_MEAN = 1
S3_STD = 3

In [8]:
# Width of the warning band, in standard deviations from the mean.
WARN_LEVEL = 2.5
# Width of the alarm band, in standard deviations from the mean.
ALARM_LEVEL = 3

In [9]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

@udf(IntegerType())
def is_out_control(sensor, avg_val):
    """Classify a window average against the sensor's SPC control limits.

    Args:
        sensor: Sensor identifier ("sensor-1", "sensor-2" or "sensor-3").
        avg_val: Average observation of the window, or None.

    Returns:
        0 when the value is inside the warning band, the sensor is unknown,
        or the value is missing; 1 when it is outside the warning band but
        inside the alarm band; 2 when it is outside the alarm band.
    """
    thresholds = {
        "sensor-1": (S1_MEAN, S1_STD),
        "sensor-2": (S2_MEAN, S2_STD),
        "sensor-3": (S3_MEAN, S3_STD),
    }
    if sensor not in thresholds or avg_val is None:
        # Return an int, not False: the UDF is declared IntegerType and Spark
        # does not coerce a Python bool into it (the value would become null).
        return 0
    mean, std = thresholds[sensor]
    # Warning band: mean +/- WARN_LEVEL standard deviations.
    upper_warn = mean + WARN_LEVEL * std
    lower_warn = mean - WARN_LEVEL * std
    # Alarm band: mean +/- ALARM_LEVEL standard deviations.
    upper_alarm = mean + ALARM_LEVEL * std
    lower_alarm = mean - ALARM_LEVEL * std
    if avg_val > upper_alarm or avg_val < lower_alarm:
        return 2
    if avg_val > upper_warn or avg_val < lower_warn:
        return 1
    return 0

In [10]:
from pyspark.sql.functions import window, avg, count, desc, stddev, min, max
from pyspark.sql.functions import isnull, col, sum, isnan, unix_timestamp, round
from pyspark.sql.functions import date_format, when, lit

# "obs" is parsed from JSON as a string. Aggregating the raw column makes
# max()/min() compare lexicographically (e.g. "9.5" > "10.2"), so the numeric
# statistics are computed on an explicit double cast instead.
obs_value = col("obs").cast("double")

# Per-sensor statistics for each event-time window, plus the SPC state of the
# window average and the date/hour columns used to partition the output.
eventSensors = (
    df.groupBy(window(df.timestamp_sent, WINDOW_SIZE), df.sensor)
    .agg(
        count("obs").alias("count"),
        round(avg(obs_value), 3).alias("avg"),
        round(stddev(obs_value), 3).alias("std"),
        round(max(obs_value), 3).alias("max"),
        round(min(obs_value), 3).alias("min"),
        round(max(obs_value) - min(obs_value), 3).alias("range"),
        # Observations that are null or NaN within the window.
        round(sum((isnull("obs") | isnan(obs_value)).cast("int")), 3).alias("count_miss"),
        round(max("time_diff"), 3).alias("max_delay"),
    )
    .withColumn("is_out_control", is_out_control(col("sensor"), col("avg")))
    .withColumn("window_start", col("window.start"))
    .withColumn("window_end", col("window.end"))
    .withColumn("date", date_format(col("window_start"), "dd"))
    .withColumn("hour", date_format(col("window_start"), "HH"))
    .drop("window")
)

In [None]:
# Start the CSV sink and keep the StreamingQuery handle. awaitTermination()
# is called separately: chaining it onto start() would bind `streaming` to its
# return value instead of the query, leaving nothing to call stop() on.
streaming = (
    eventSensors.writeStream
    .outputMode("append")
    .format("csv")
    .option("path", "/home/guest/notebooks/data/output/")
    .option("checkpointLocation", "/home/guest/notebooks/data/checkpoints")
    .option("header", "true")
    .partitionBy("date", "hour")
    .start()
)

# Block this cell while the query runs; interrupt the cell to move on.
streaming.awaitTermination()
#

25/07/04 12:30:07 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/07/04 12:30:07 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.

In [None]:
# Stop every streaming query still active on this session. Looking the queries
# up through the session avoids relying on the `streaming` variable, which may
# not be bound to a query handle if the launching cell was interrupted.
for active_query in spark.streams.active:
    active_query.stop()

## Coalesce

In [None]:
# Read back every CSV part written by the streaming sink. The sink partitions
# by "date" and "hour", so the directories are named date=*/hour=* (the
# previous "day=*" glob matched nothing).
final_df = spark.read.csv(
    "/home/guest/notebooks/data/output/date=*/hour=*",
    header=True,
    inferSchema=True,
)
final_df.take(5)

In [None]:
# Write a single CSV file ordered by window start. The rows are first gathered
# into one partition and then sorted inside it: a global orderBy followed by
# coalesce(1) does not guarantee that the merged partitions keep their order.
(
    final_df
    .repartition(1)
    .sortWithinPartitions(col("window_start").asc())
    .write
    .option("header", "true")
    .mode("overwrite")
    .csv("/home/guest/notebooks/final_output/")
)