## Baseline receiver stateless monitoring 

In [1]:
import os

os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--packages "
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.6,"
    "org.apache.spark:spark-streaming-kafka-0-10_2.12:3.5.6 "
    "pyspark-shell"
)

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
        .appName("read_test_straeam") \
        .getOrCreate()

:: loading settings :: url = jar:file:/home/guest/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/guest/.ivy2/cache
The jars for the packages stored in: /home/guest/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.apache.spark#spark-streaming-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-996d2afd-397b-4584-986b-7db8376b1193;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.6 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.6 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.5 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in c

In [3]:
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType

KAFKA_BOOTSTRAP_SERVERS = "127.0.0.1:9092"
KAFKA_TOPIC = "sensors"


schema = StructType().add("sensor", StringType())\
                     .add("info", StructType()\
                          .add("timestamp_sent", StringType())\
                          .add("timestamp_received", StringType())\
                          .add("obs", StringType())\
                          .add("drift", StringType()))

df = spark \
  .readStream \
  .format("kafka") \
  .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS) \
  .option("subscribe", KAFKA_TOPIC) \
  .option("startingOffsets", "earliest") \
  .load()\
  .select(from_json(col("value").cast("string"), schema).alias("parsed_value"))\
            .select(col("parsed_value.sensor"), \
                    col("parsed_value.info.timestamp_sent"),\
                    col("parsed_value.info.timestamp_received"),\
                    col("parsed_value.info.obs"),\
                    col("parsed_value.info.drift"))

In [4]:
df.printSchema()

root
 |-- sensor: string (nullable = true)
 |-- timestamp_sent: string (nullable = true)
 |-- timestamp_received: string (nullable = true)
 |-- obs: string (nullable = true)
 |-- drift: string (nullable = true)



In [7]:
from pyspark.sql.functions import window, avg, count, desc, stddev, min, max, isnull, col, sum, isnan, unix_timestamp, round

# Add time difference column first
df = df.withColumn(
    "time_diff",
    unix_timestamp("timestamp_received") - unix_timestamp("timestamp_sent")
)

eventAvg = df.groupBy(
    window(df.timestamp_received, "30 minutes"),
    df.sensor
).agg(
    count("obs").alias("count_obs"),
    round(avg("obs"),3).alias("avg_obs"),
    round(stddev("obs"),3).alias("+/-std_obs"),
    round(max("obs"),3).alias("max_obs"),
    round(min("obs"),3).alias("min_obs"),
    round(sum((isnull("obs") | isnan("obs")).cast("int")),3).alias("sum_miss_obs"),
    round(avg("time_diff"),3).alias("avg_time_delay")
    ).sort(desc("window"))

In [8]:
results = eventAvg.writeStream \
         .outputMode('complete')\
         .format("console") \
         .start() \
         .awaitTermination()

25/06/20 13:10:53 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-08b35d46-37d7-4a07-bbcb-72448c15ed43. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/06/20 13:10:53 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/06/20 13:10:53 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+--------------------+--------+---------+-------+----------+-------+-------+------------+---------+
|              window|  sensor|count_obs|avg_obs|+/-std_obs|max_obs|min_obs|sum_miss_obs|avg_delay|
+--------------------+--------+---------+-------+----------+-------+-------+------------+---------+
|{2025-06-20 12:30...|sensor-1|       90|  5.838|     9.651|  7.821| -0.087|           0|    0.056|
|{2025-06-20 12:30...|sensor-2|       42|  3.653|     2.865|  8.972| -1.129|          48|    0.056|
|{2025-06-20 12:30...|sensor-3|       90|   5.31|     8.857|  7.875| -0.148|           0|    0.056|
|{2025-06-20 12:00...|sensor-1|       26|  9.107|    12.195|  7.364| -0.425|           0|      0.0|
|{2025-06-20 12:00...|sensor-2|       26|  3.484|     3.202|   8.19| -1.129|           0|      0.0|
|{2025-06-20 12:00...|sensor-3|       26|   5.86|     7.709|  5.362| -0.577|           0|      0.0|
|{2

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/guest/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/guest/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/guest/anaconda3/lib/python3.12/socket.py", line 720, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
results.stop()