In [1]:
#Cargamos las librerias
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, max, to_timestamp, window
from pyspark.sql.types import DoubleType, StringType, StructField, StructType

In [2]:
# Create (or reuse) the Spark session used by every later cell.
spark = (
    SparkSession.builder
    .master("local[3]")  # local mode with three worker threads
    .appName("Sliding Window Demo")
    # Let in-flight streaming work finish when the JVM shuts down.
    .config("spark.streaming.stopGracefullyOnShutdown", "true")
    # A single shuffle partition is enough for this local demo.
    .config("spark.sql.shuffle.partitions", 1)
    .getOrCreate()
)

In [3]:
# Schema of the JSON payload each sensor message carries in the Kafka value:
# a timestamp kept as text (parsed in a later cell) and a numeric reading.
schema = (
    StructType()
    .add("CreatedTime", StringType())
    .add("Reading", DoubleType())
)

In [4]:
# Streaming reader over the "sensor" Kafka topic on the local broker,
# starting from the earliest available offsets.
kafka_source_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "sensor")
    .option("startingOffsets", "earliest")
    .load()
)

In [5]:
# Decode the raw Kafka record: the key becomes the sensor id, and the value
# is parsed from JSON into a struct column using the sensor schema.
value_df = kafka_source_df.select(
    col("key").cast("string").alias("SensorID"),
    from_json(col("value").cast("string"), schema).alias("value"),
)

In [6]:
# Flatten the parsed struct into top-level columns, then convert the textual
# CreatedTime into a real timestamp so it can be used as event time.
sensor_df = (
    value_df
    .select("SensorID", "value.*")
    .withColumn(
        "CreatedTime",
        to_timestamp(col("CreatedTime"), "yyyy-MM-dd HH:mm:ss"),
    )
)

In [7]:
# Maximum reading per sensor over sliding event-time windows:
# each window is 15 minutes long and a new one starts every 5 minutes.
# Events are watermarked on CreatedTime with a 30-minute delay threshold.
agg_df = (
    sensor_df
    .withWatermark("CreatedTime", "30 minute")
    .groupBy(
        col("SensorID"),
        window(col("CreatedTime"), "15 minute", "5 minute"),
    )
    .agg(max("Reading").alias("MaxReading"))
)

In [8]:
# Expose the window bounds as plain columns next to the aggregate.
output_df = agg_df.select(
    "SensorID",
    "window.start",
    "window.end",
    "MaxReading",
)

In [10]:
# Start the streaming query that prints the windowed aggregates to the console
# every 30 seconds.
#
# "update" output mode is used instead of "complete": complete mode must keep
# every aggregate forever, so the watermark declared on the aggregation would
# never be able to drop old window state and memory would grow without bound.
# In update mode only the windows changed in each micro-batch are emitted and
# expired windows can be cleaned up.
# NOTE(review): if ./checkpoint/sliding-window already holds state from a run
# in a different output mode, confirm it restarts cleanly or clear it first.
window_query = (
    output_df.writeStream
    .format("console")
    .outputMode("update")
    .option("checkpointLocation", "./checkpoint/sliding-window")
    .trigger(processingTime="30 second")
    .start()
)

In [None]:
# Block this cell until the streaming query terminates. If the kernel is
# interrupted, stop the query explicitly so it does not keep running in the
# background of the Spark session.
try:
    window_query.awaitTermination()
except KeyboardInterrupt:
    window_query.stop()