In [0]:
import json

import pyspark.sql.functions as F
from pyspark.sql.types import *

In [0]:
%run ./GenerateStreamingData

### Unity Catalog (UC) setup

In [0]:
%sql
-- Select the working catalog and make sure both medallion schemas exist.
USE CATALOG learn_adb_fikrat;
CREATE SCHEMA IF NOT EXISTS bronze;
CREATE SCHEMA IF NOT EXISTS silver;
-- Make bronze the current schema for unqualified table names in later cells.
USE bronze;

In [0]:
%sql
-- Reset the demo by dropping the tables this notebook (re)creates.
-- Every name is schema-qualified so the cell does not depend on the
-- session's current schema having been set by an earlier cell.
DROP TABLE IF EXISTS bronze.iot_measurements_tmp;
DROP TABLE IF EXISTS bronze.iot_measurements;
DROP TABLE IF EXISTS silver.iot_measurements_aggregated;


In [0]:
# Root folder (inside a Unity Catalog volume) that holds all streaming checkpoints.
checkpoint_root_path = "/Volumes/learn_adb_fikrat/bronze/ext_landing_volume/streaming-checkpoints/eventhub"

# One checkpoint folder per streaming query; each path below is passed to a
# different call later in the notebook.
checkpoint_path_sensor = checkpoint_root_path + "/iot_measurements"
checkpoint_path_sensor1 = checkpoint_root_path + "/iot_measurements1"
checkpoint_path_sensor2 = checkpoint_root_path + "/iot_measurements2"
checkpoint_path_sensor3 = checkpoint_root_path + "/iot_measurements3"

In [0]:
# Recursively delete every checkpoint folder so each streaming query in this
# notebook starts without state from a previous run.
for stale_checkpoint in (
    checkpoint_path_sensor,
    checkpoint_path_sensor1,
    checkpoint_path_sensor2,
    checkpoint_path_sensor3,
):
    dbutils.fs.rm(stale_checkpoint, True)

### Event Hub configurations

In [0]:
# Read the connection string for the source Event Hub from the secret scope
# instead of hard-coding it in the notebook.
connection_string_ehs = dbutils.secrets.get(
    scope="fikrats_study_scope",
    key="eh-dbr-source-connstr",
)

In [0]:
# Event Hub namespace and the two hubs used as streaming source and sink.
event_hub_namespace = "eh-dbr"
source_event_hub_name = "dbr-source"
sink_event_hub_name = "dbr-target"

# Unity Catalog catalog name and the temporary table handed to the data generator.
uc_name = "learn_adb_fikrat"
temp_table_name = 'bronze.iot_measurements_tmp'

In [0]:
# Produce sample sensor events for 2025-01-01 .. 2025-02-15.
# NOTE(review): neither `generate_sensor_data` nor `datetime` is defined or
# imported in this notebook's own cells; presumably both come from the
# `%run ./GenerateStreamingData` cell above - confirm.
generate_sensor_data(
    datetime(2025, 1, 1),
    datetime(2025, 2, 15),
    connection_string_ehs,
    source_event_hub_name,
    checkpoint_path_sensor,
    temp_table_name,
)

### Reading from Event Hub

Common Event Hub properties:
- connectionString
- maxEventsPerTrigger
- startingPosition
- endingPosition 
- consumerGroup

See the [azure-event-hubs-spark PySpark structured streaming guide](https://github.com/Azure/azure-event-hubs-spark/blob/master/docs/PySpark/structured-streaming-pyspark.md) for more details.

In [0]:
# Connector configuration for the source Event Hub. The connection string is
# passed through the connector's EventHubsUtils.encrypt helper on the JVM side.
ehConf = {
    'eventhubs.connectionString': sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(connection_string_ehs),
    'eventhubs.eventHubName': source_event_hub_name,
}


In [0]:
# Open a streaming read on the source Event Hub and preview the raw events.
df = (
    spark.readStream
    .format("eventhubs")
    .options(**ehConf)
    .load()
)
display(df)

In [0]:
# Read a bounded slice of the source Event Hub, selected by sequence number.
# NOTE(review): the sequence numbers are hard-coded, presumably taken from an
# earlier run - confirm they fall inside the hub's current retention window.
startingEventPosition = {
  "offset": None,         # not in use
  "seqNo": 1711618,       # first sequence number to read
  "enqueuedTime": None,   # not in use
  "isInclusive": True
}

endingEventPosition = {
  "offset": None,         # not in use
  "seqNo": 1711634,       # last sequence number to read
  "enqueuedTime": None,   # not in use
  "isInclusive": True
}

# Work on a copy so the shared source configuration used by the other cells
# is not altered by this experiment.
ehConfPart = dict(ehConf)

# The connector reads positions from the "eventhubs."-prefixed options and
# expects them as JSON strings; a bare key with a Python dict is not picked up.
ehConfPart['eventhubs.startingPosition'] = json.dumps(startingEventPosition)
ehConfPart['eventhubs.endingPosition'] = json.dumps(endingEventPosition)

# An ending position only applies to batch queries, so the bounded range is
# loaded with spark.read rather than spark.readStream.
dfPart = spark.read.format("eventhubs").options(**ehConfPart).load()
display(dfPart)

In [0]:
# Count the incoming events per Event Hub partition and partition key.
events_per_partition = df.groupBy("partition", "partitionKey").count()
display(events_per_partition)

In [0]:
# Keep only the event metadata needed downstream and expose the message body,
# cast to a string, as `event_payload`.
df1 = df.select(
    "partition",
    "offset",
    "enqueuedTime",
    F.col("body").cast("string").alias("event_payload"),
)
display(df1)


In [0]:
# Schema of the JSON document carried in `event_payload`: one event holds an
# event time, an office and an array of sensor measurements.
measurement_schema = (
    StructType()
    .add('Sensor', StringType(), True)
    .add('Measurement', StringType(), True)
    .add('Value', DoubleType(), True)
)
iot_schema = (
    StructType()
    .add('EventTime', StringType(), True)
    .add('Office', StringType(), True)
    .add('Measurements', ArrayType(measurement_schema), True)
)

# Columns written to the bronze table: Event Hub metadata, the processing
# timestamp, the event header fields and the flattened measurement fields.
bronze_columns = [
    "enqueuedTime",
    "current_timestamp() as ProcessingTime",
    "offset as ProcessingOffset",
    "partition",
    "offset",
    "EventPayload.EventTime as EventTime",
    "EventPayload.Office as Office",
    "Measurements.*",
]

# Parse the JSON payload, then emit one row per entry of the Measurements array.
parsed_events = df1.withColumn("EventPayload", F.from_json("event_payload", iot_schema))
exploded_measurements = parsed_events.withColumn("Measurements", F.explode("EventPayload.Measurements"))

# Start the streaming write into the bronze Delta table. Note that `df2` is
# bound to what toTable() returns (the running query handle), not a DataFrame.
df2 = (
    exploded_measurements.selectExpr(*bronze_columns)
    .writeStream.format('delta')
    .option("checkpointLocation", checkpoint_path_sensor1)
    .toTable('bronze.iot_measurements')
)

In [0]:
%sql
-- Inspect the rows the bronze streaming write has landed so far.
SELECT * FROM bronze.iot_measurements

In [0]:
# Stream the bronze measurements into hourly averages per office, sensor and
# measurement, and keep the silver table in sync with the full result.
# The source table is schema-qualified so this cell does not depend on the
# session's current schema, matching the name used by the bronze writer.
(
    spark.readStream.table('bronze.iot_measurements')
    .groupBy('Office', 'Sensor', 'Measurement', F.window('EventTime', '1 hour'))
    .agg(F.avg('Value').alias('AvgValue'))
    .select(
        'Office', 'Sensor', 'Measurement',
        F.col('window.start').alias("EventTimeWindowStart"),
        F.round('AvgValue', 1).alias('AvgValue'),
    )
    .writeStream.format('delta')
    # Complete mode: the whole aggregated result is rewritten on every trigger.
    .outputMode('complete')
    .option("checkpointLocation", checkpoint_path_sensor2)
    .toTable('silver.iot_measurements_aggregated')
)

In [0]:
%sql
-- Inspect the hourly averages written to the silver table.
SELECT * FROM silver.iot_measurements_aggregated

In [0]:
# Shape the aggregated rows as Event Hub messages: a JSON `body` plus the
# office as `partitionKey`.
# NOTE(review): the source table is written with outputMode('complete'), i.e.
# rewritten on each trigger; a Delta streaming read may reject such non-append
# commits unless an option like skipChangeCommits is set - confirm.
aggregated_stream = spark.readStream.table('silver.iot_measurements_aggregated')
message_body = F.to_json(
    F.struct("EventTimeWindowStart", "Office", "Measurement", "AvgValue")
).alias("body")
df3 = aggregated_stream.select(message_body, F.col('Office').alias("partitionKey"))

display(df3)

In [0]:
# Connection string for the target Event Hub, read from the secret scope.
connection_string_eht = dbutils.secrets.get(
    scope="fikrats_study_scope",
    key="eh-dbr-target-connstr",
)

# Rebind `ehConf` to the sink configuration; from here on it no longer
# describes the source Event Hub.
ehConf = {
    'eventhubs.connectionString': sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(connection_string_eht),
    'eventhubs.eventHubName': sink_event_hub_name,
}

In [0]:
# Stream the prepared messages to the target Event Hub.
# The sink format is set once; the repeated .format("eventhubs") call was redundant.
(
    df3.writeStream
    .format("eventhubs")
    .options(**ehConf)
    .option("checkpointLocation", checkpoint_path_sensor3)
    .start()
)