# In [None]:

from pyspark.sql import functions as F
from pyspark.sql import types as T

# Streaming source: read the raw Kafka records of the `temperature` topic.
kafka_reader = spark.readStream.format("kafka")
kafka_reader = kafka_reader.option("kafka.bootstrap.servers", "localhost:9092")  # broker address
kafka_reader = kafka_reader.option("subscribe", "temperature")  # topic to consume
kafka_reader = kafka_reader.option("startingOffsets", "earliest")  # begin at the oldest available offset
df = kafka_reader.load()

# Schema of the JSON payload carried in each Kafka record value.
# `eventTime` is declared as a string here; it is cast to a timestamp after parsing.
temperature_fields = [
    T.StructField("eventTime", T.StringType(), nullable=True),
    T.StructField("temperatura", T.FloatType(), nullable=True),
]
temperature_schema = T.StructType(temperature_fields)

# Decode the Kafka `value` column as a JSON string, parse it with
# `temperature_schema`, and keep the two payload fields with `eventTime`
# converted to a timestamp.
json_payload = F.col("value").cast("string")
parsed_rows = df.select(
    F.from_json(json_payload, temperature_schema).alias("json_value")
)
df = parsed_rows.select(
    F.col("json_value.eventTime").cast("timestamp").alias("eventTime"),
    F.col("json_value.temperatura").alias("temperatura"),
)

# Average temperature per sliding event-time window.
# `F.window` produces a struct column with `start` and `end` fields; the
# window `start` is used here as the representative `eventTime` of each row.
#
# `F.window` takes (timeColumn, windowDuration, slideDuration) and Spark
# rejects a slide that is longer than the window, so the durations are given
# as a 60-minute window advancing every 5 minutes.
# NOTE(review): the durations were previously passed as ("5 minutes",
# "60 minutes"), which is not a valid window/slide pair — confirm that a
# 60-minute window with a 5-minute slide is the intended aggregation.

windowedAvg = (
    df.withWatermark("eventTime", "5 minutes")
      # `window` must be referenced through `F`: only the `functions` module
      # alias is imported, a bare `window(...)` raises NameError.
      .groupBy(F.window(F.col("eventTime"), "60 minutes", "5 minutes").alias("eventTimeWindow"))
      .agg(F.avg("temperatura").alias("avgtemperature"))
      .select(
          F.col("eventTimeWindow.start").alias("eventTime"),
          F.col("avgtemperature")
      )
)

# Console sink: print the windowed averages for inspection.
# In `complete` output mode the whole aggregated result table is written
# on every trigger.
query = (
    windowedAvg.writeStream
    .format('console')
    .outputMode('complete')
    .option('truncate', 'true')
    .start()
)


# Kafka sink: publish the windowed averages to the `avgtemperature` topic.
# The Kafka sink expects string/binary `key` and `value` columns; as an
# example the window start (`eventTime`) is the key and the average is the value.
kafka_records = windowedAvg.select(
    F.col("eventTime").cast("string").alias("key"),
    F.col("avgtemperature").cast("string").alias("value"),
)

qk = (
    kafka_records.writeStream
    .format("kafka")
    .outputMode("complete")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("topic", "avgtemperature")
    .option("checkpointLocation", "/home/kafka/Documenti/confluent/examples-6.1.0-post/clients/cloud/python/kafkaStream")
    .start()
)

# Block the driver on the console query; `qk` keeps running in the background
# but is not waited on here.
query.awaitTermination()