**Reading from Event Hub and persisting in the Bronze table**

In [5]:
!pip install azure-eventhub

StatementMeta(, 665538dc-c0b2-4654-816b-fcb45f6b2bc0, 7, Finished, Available)

Collecting azure-eventhub
  Downloading azure_eventhub-5.11.5-py3-none-any.whl (315 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.5/315.5 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: azure-eventhub
Successfully installed azure-eventhub-5.11.5


In [4]:
# Event Hubs connection string; the Endpoint/EntityPath values are left blank here.
connectionString = "Endpoint=;EntityPath="

# The connection string is passed through EventHubsUtils.encrypt (on the JVM side)
# and stored under the 'eventhubs.connectionString' option key.
ehConf = {
    'eventhubs.connectionString': sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(connectionString),
}


StatementMeta(, aea86997-8120-46ab-be89-c04b5e1a433d, 6, Finished, Available)

In [6]:
from azure.eventhub import EventHubConsumerClient
from notebookutils import mssparkutils
import pyspark.sql.functions as f
from pyspark.sql.types import * 

StatementMeta(, 665538dc-c0b2-4654-816b-fcb45f6b2bc0, 8, Finished, Available)

In [58]:
# Bronze layer: land the raw Event Hubs stream in a Delta table.
tableName = 'CarTraffic_Raw'
tablePath = f'Tables/{tableName}'
checkpointFolder = f"Files/Checkpoints/{tableName}"

# Streaming source, configured by ehConf.
dfRaw = (
    spark.readStream
    .format("eventhubs")
    .options(**ehConf)
    .load()
)

# Add a string-cast copy of `body`, then append each micro-batch to the Delta path.
strmHandle = (
    dfRaw
    .withColumn("bodyAsString", f.col("body").cast("string"))
    .writeStream
    .queryName('BrzTrafficStream')
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", checkpointFolder)
    .option("path", tablePath)
    .start()
)

StatementMeta(, aea86997-8120-46ab-be89-c04b5e1a433d, 60, Finished, Available)

**Read stream's status and last progress**

In [60]:
# Report whether the Bronze query is still running, plus its current status.
print(
    'Is the stream active?:{}'.format(strmHandle.isActive),
    ', Stream status:{}'.format(strmHandle.status),
)
# Last expression of the cell, so the recent progress reports are displayed.
strmHandle.recentProgress
# strmHandle.stop()

StatementMeta(, aea86997-8120-46ab-be89-c04b5e1a433d, 62, Finished, Available)

Is the stream active?:True , Stream status:{'message': 'Processing new data', 'isDataAvailable': True, 'isTriggerActive': True}


[{'id': '39acb6f8-b3ce-4897-ad74-3cb1d917e1ba',
  'runId': '64db24ff-abe0-471e-973e-753e79cb0304',
  'name': 'BrzTrafficStream',
  'timestamp': '2024-02-07T18:48:53.668Z',
  'batchId': 12,
  'numInputRows': 6,
  'inputRowsPerSecond': 0.0,
  'processedRowsPerSecond': 1.5155342258145996,
  'durationMs': {'addBatch': 2723,
   'getBatch': 6,
   'getOffset': 9,
   'queryPlanning': 5,
   'triggerExecution': 3959,
   'walCommit': 494},
  'stateOperators': [],
  'sources': [{'description': 'org.apache.spark.sql.eventhubs.EventHubsSource@6ca0f4b8',
    'startOffset': {'fa-eh': {'0': 60448}},
    'endOffset': {'fa-eh': {'0': 60454}},
    'latestOffset': {'fa-eh': {'0': 60454}},
    'numInputRows': 6,
    'inputRowsPerSecond': 0.0,
    'processedRowsPerSecond': 1.5155342258145996}],
  'sink': {'description': 'DeltaSink[Tables/CarTraffic_Raw]',
   'numOutputRows': -1}},
 {'id': '39acb6f8-b3ce-4897-ad74-3cb1d917e1ba',
  'runId': '64db24ff-abe0-471e-973e-753e79cb0304',
  'name': 'BrzTrafficStream',


In [61]:
%%sql
-- Row count of the Bronze table.
SELECT COUNT(*)
FROM CarTraffic_Raw

StatementMeta(, aea86997-8120-46ab-be89-c04b5e1a433d, 63, Finished, Available)

<Spark SQL result set with 1 rows and 1 fields>

In [62]:
%%sql
-- Inspect the raw rows landed in the Bronze table.
SELECT *
FROM CarTraffic_Raw

StatementMeta(, aea86997-8120-46ab-be89-c04b5e1a433d, 64, Finished, Available)

<Spark SQL result set with 1000 rows and 10 fields>

### **Streaming transformations**

**Parsing the JSON payload, applying a filter transformation, and writing to another table**

In [63]:
# Nested struct describing the car; used as the type of the "CarModel" field below.
_car_model_type = StructType([
    StructField("Make", StringType(), nullable=True),
    StructField("Model", StringType(), nullable=True),
    StructField("VehicleType", IntegerType(), nullable=True),
    StructField("VehicleWeight", IntegerType(), nullable=True),
])

# Schema applied to the JSON event payload; every field is declared nullable.
events_schema = StructType([
    StructField("EntryTime", TimestampType(), nullable=True),
    StructField("CarModel", _car_model_type, nullable=True),
    StructField("State", StringType(), nullable=True),
    StructField("TollAmount", IntegerType(), nullable=True),
    StructField("Tag", IntegerType(), nullable=True),
    StructField("TollId", IntegerType(), nullable=True),
    StructField("LicensePlate", StringType(), nullable=True),
    StructField("EventProcessedUtcTime", TimestampType(), nullable=True),
    StructField("PartitionId", IntegerType(), nullable=True),
    StructField("EventEnqueuedUtcTime", TimestampType(), nullable=True),
])

StatementMeta(, aea86997-8120-46ab-be89-c04b5e1a433d, 65, Finished, Available)

In [51]:
# mssparkutils.fs.rm (checkpointFolder,True)

StatementMeta(, aea86997-8120-46ab-be89-c04b5e1a433d, 53, Finished, Available)

True

In [68]:
# Silver layer: parse the JSON body, flatten it, keep TX/NJ rows, write to Delta.
tableName = 'CarTraffic_Silver'
deltaTablePath = f"Tables/{tableName}"
checkpointFolder = f"Files/Checkpoints/{tableName}"

strmHandleSlv = (
    spark.readStream
    .table("CarTraffic_Raw")
    # Parse the string payload into a struct using events_schema.
    .withColumn("eventsParsed", f.from_json(f.col('bodyAsString'), events_schema))
    # Promote the parsed fields to top-level columns, then the nested CarModel fields.
    .select('eventsParsed.*')
    .select('*', 'CarModel.*')
    .filter("State='TX' or State='NJ'")
    # The struct column is redundant once its fields have been expanded.
    .drop('CarModel')
    .writeStream
    .queryName('SlvTrafficStream')
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", checkpointFolder)
    .option("path", deltaTablePath)
    .start()
)

StatementMeta(, aea86997-8120-46ab-be89-c04b5e1a433d, 70, Finished, Available)

In [67]:
# print (f'Is the stream active?:{strmHandleSlv.isActive}',f', Stream status:{strmHandleSlv.status}')
# strmHandleSlv.stop()

StatementMeta(, aea86997-8120-46ab-be89-c04b5e1a433d, 69, Finished, Available)

In [69]:
%%sql
-- Inspect the parsed and filtered rows in the Silver table.
SELECT *
FROM CarTraffic_Silver

StatementMeta(, aea86997-8120-46ab-be89-c04b5e1a433d, 71, Finished, Available)

<Spark SQL result set with 1000 rows and 13 fields>

### **Aggregations**

**Simple aggregations. Notice the use of the 'complete' output mode.**

In [70]:
# Gold layer: running count of cars per Make, rewritten in full on every trigger.
tableName = 'CarTraffic_Gold'
deltaTablePath = f"Tables/{tableName}"
checkpointFolder = f"Files/Checkpoints/{tableName}"

strmHandleGld = (
    spark.readStream
    .table("CarTraffic_Silver")
    .groupBy('Make')
    .agg(f.count('Model').alias('CarCount'))
    .writeStream
    .queryName('GldTrafficStream')
    .format("delta")
    .outputMode("complete")
    .option("checkpointLocation", checkpointFolder)
    .option("path", deltaTablePath)
    .start()
)

StatementMeta(, aea86997-8120-46ab-be89-c04b5e1a433d, 72, Finished, Available)

In [71]:
%%sql
-- Per-Make car counts produced by the Gold stream.
SELECT *
FROM CarTraffic_Gold

StatementMeta(, aea86997-8120-46ab-be89-c04b5e1a433d, 73, Finished, Available)

<Spark SQL result set with 5 rows and 2 fields>

**Using Window functions**

In [None]:
from notebookutils import mssparkutils
# Reset the checkpoint of the windowed-aggregation stream by its explicit path.
# `checkpointFolder` is reassigned by every streaming cell, so at this point it
# can still hold another stream's checkpoint location (the preceding cell that
# sets it targets CarTraffic_Gold); deleting that would wipe a different
# stream's progress instead of the one being reset here.
mssparkutils.fs.rm('Files/Checkpoints/CarTraffic_Gold2', True)

In [1]:
# Brings `window` and `count` into scope for this cell.
# NOTE(review): the star import also exposes names such as sum/min/max that
# shadow Python builtins; consider importing only what is needed.
from pyspark.sql.functions import *

tableName = 'CarTraffic_Gold2'
deltaTablePath = f"Tables/{tableName}"
checkpointFolder = f"Files/Checkpoints/{tableName}"

# Count cars per Make over 6-second windows that slide every 3 seconds.
strmHandleGld2 = (
    spark.readStream
    .table("CarTraffic_Silver")
    .groupBy("Make", window('EntryTime', "6 seconds", "3 seconds"))
    .agg(count('Model').alias('CarCount'))
    .writeStream
    .queryName('GldTrafficStream2')
    .format("delta")
    .outputMode("complete")
    .option("checkpointLocation", checkpointFolder)
    .option("path", deltaTablePath)
    .start()
)

StatementMeta(, ab0772f9-3dfc-43ba-9d8b-45b1ee9d5a35, 3, Finished, Available)

In [96]:
# strmHandleGld2.stop()

StatementMeta(, 1de2e52b-90a4-4973-aa61-510d11bc639c, 98, Finished, Available)

In [2]:
%%sql
-- Windowed car counts per Make, ordered by Make and window start.
SELECT Make, window.start, window.end, CarCount
FROM CarTraffic_Gold2
ORDER BY Make, start

StatementMeta(, ab0772f9-3dfc-43ba-9d8b-45b1ee9d5a35, 4, Finished, Available)

<Spark SQL result set with 22 rows and 4 fields>

**Aggregations with watermarking**

In [106]:
# Brings `window` and `count` into scope for this cell.
from pyspark.sql.functions import *

tableName = 'CarTraffic_Gold3'
deltaTablePath = f"Tables/{tableName}"
checkpointFolder = f"Files/Checkpoints/{tableName}"

# Same sliding-window count as the previous aggregation, with a 2-second
# watermark declared on EntryTime.
# NOTE(review): per the Structured Streaming guide, "complete" output mode keeps
# all aggregate state, so the watermark is presumably not used to drop old
# windows here -- confirm whether "append" was intended for this example.
strmHandleGld3 = (
    spark.readStream
    .table("CarTraffic_Silver")
    .withWatermark("EntryTime", "2 seconds")
    .groupBy("Make", window('EntryTime', "6 seconds", "3 seconds"))
    .agg(count('Model').alias('CarCount'))
    .writeStream
    .queryName('GldTrafficStream3')
    .format("delta")
    .outputMode("complete")
    .option("checkpointLocation", checkpointFolder)
    .option("path", deltaTablePath)
    .start()
)

StatementMeta(, 1de2e52b-90a4-4973-aa61-510d11bc639c, 108, Finished, Available)

In [108]:
%%sql
-- Windowed car counts per Make from the watermarked stream.
SELECT Make, window.start, window.end, CarCount
FROM CarTraffic_Gold3
ORDER BY Make, start

StatementMeta(, 1de2e52b-90a4-4973-aa61-510d11bc639c, 110, Finished, Available)

<Spark SQL result set with 22 rows and 4 fields>

In [10]:
# strmHandle=query.start()
# Stop the watermarked aggregation query (GldTrafficStream3).
strmHandleGld3.stop()

StatementMeta(, 8de4c5a6-9fac-411c-8fb6-19f4383a8a66, 12, Finished, Available)

### **JOINS**

In [None]:
from notebookutils import mssparkutils

# Recursively remove the joined stream's checkpoint folder so the query below
# starts from scratch.
mssparkutils.fs.rm('Files/Checkpoints/CarTraffic_Silver_Joined', True)

StatementMeta(, , , Waiting, )

True

In [9]:
# Stream-stream self-join of the Silver table on LicensePlate.
tableName = 'CarTraffic_Silver_Joined'
deltaTablePath = f"Tables/{tableName}"
checkpointFolder = f"Files/Checkpoints/{tableName}"

# Two independent streaming reads of the same table; only the ct2 side
# declares a watermark (3 seconds on EntryTime).
df1 = spark.readStream.table("CarTraffic_Silver").alias('ct1')
df2 = (
    spark.readStream
    .table("CarTraffic_Silver")
    .withWatermark("EntryTime", "3 seconds")
    .alias('ct2')
)

CT_Joined_handle = (
    df1
    .join(df2, f.expr("""ct1.LicensePlate=ct2.LicensePlate
   AND ct1.EntryTime<= ct2.EntryTime + interval 1 seconds"""))
    .selectExpr('ct1.TollId as TollId1', 'ct2.TollId as TollId2', 'ct1.Make', 'ct1.TollAmount')
    .writeStream
    .queryName(f"{tableName}-Stream")
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", checkpointFolder)
    .option("path", deltaTablePath)
    .start()
)

StatementMeta(, bf5d4d9e-1048-497a-a6f0-a204282114ca, 11, Finished, Available)

In [10]:
# CT_Joined_handle.status
# CT_Joined_handle.stop()

StatementMeta(, bf5d4d9e-1048-497a-a6f0-a204282114ca, 12, Finished, Available)

In [None]:
%%sql
-- Inspect the rows produced by the stream-stream join.
SELECT *
FROM CarTraffic_Silver_Joined

StatementMeta(, , , Waiting, )

<Spark SQL result set with 1000 rows and 4 fields>

### **Exploring Trigger options**

In [70]:
from notebookutils import mssparkutils
# mssparkutils.fs.rm ('Files/Checkpoints/CarTraffic_Silver2',True)
# mssparkutils.fs.rm ('Files/Checkpoints/CarTraffic_Silver3',True)
# mssparkutils.fs.rm ('Files/Checkpoints/CarTraffic_Pivoted4',True)

In [7]:
# Streaming read of the Silver table; reused as the source by the trigger examples below.
dfStrm = spark.readStream.table("CarTraffic_Silver")

StatementMeta(, 665538dc-c0b2-4654-816b-fcb45f6b2bc0, 9, Finished, Available)

**Fixed time intervals**

In [10]:
# Fixed-interval trigger: a micro-batch is started every 10 seconds.
tableName = 'CarTraffic_Silver5'
deltaTablePath = f"Tables/{tableName}"
checkpointFolder = f"Files/Checkpoints/{tableName}"

strmHandle2 = (
    dfStrm
    # Stamp each row with current_timestamp() so rows can be grouped by it afterwards.
    .withColumn('ProcessingTime', f.current_timestamp())
    .writeStream
    .queryName('CarTraffic_Silver2')
    .trigger(processingTime="10 second")
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", checkpointFolder)
    .option("path", deltaTablePath)
    .start()
)

StatementMeta(, 665538dc-c0b2-4654-816b-fcb45f6b2bc0, 12, Finished, Available)

In [13]:
%%sql
-- Number of rows written per ProcessingTime value.
SELECT ProcessingTime, COUNT(*) AS RowCnt
FROM CarTraffic_Silver5
GROUP BY ProcessingTime
ORDER BY ProcessingTime

StatementMeta(, 665538dc-c0b2-4654-816b-fcb45f6b2bc0, 15, Finished, Available)

<Spark SQL result set with 1 rows and 2 fields>

**Optional: blocking further execution until streaming completes**

In [73]:
# Block this cell for at most 30 seconds. strmHandle2 runs on a processing-time
# trigger and does not finish on its own, so an unbounded awaitTermination()
# hangs the notebook (and every cell after it) until the cell is cancelled.
# With a timeout the call returns whether the query terminated within the wait.
strmHandle2.awaitTermination(timeout=30)

StatementMeta(, cef74fa2-fd7f-43d4-9701-1dbacfbaa8db, 75, Finished, Cancelled)

**Run once and stop**

In [16]:
# Run-once style trigger: started with availableNow=True instead of a fixed interval.
tableName = 'CarTraffic_Silver3'
deltaTablePath = f"Tables/{tableName}"
checkpointFolder = f"Files/Checkpoints/{tableName}"

strmHandle3 = (
    dfStrm.writeStream
    .queryName('runOnce')
    .trigger(availableNow=True)
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", checkpointFolder)
    .option("path", deltaTablePath)
    .start()
)

StatementMeta(, 665538dc-c0b2-4654-816b-fcb45f6b2bc0, 18, Finished, Available)

In [17]:
# Report whether the run-once query is still active, plus its current status.
print(
    'Is the stream active?:{}'.format(strmHandle3.isActive),
    ', Stream status:{}'.format(strmHandle3.status),
)
# strmHandle3.stop()

StatementMeta(, 665538dc-c0b2-4654-816b-fcb45f6b2bc0, 19, Finished, Available)

Is the stream active?:False , Stream status:{'message': 'Stopped', 'isDataAvailable': False, 'isTriggerActive': False}


**Using ForeachBatch** 

In [18]:
def pivotEachBatch(df, batch_id):
    """Pivot one micro-batch and append it to the CarTraffic_Pivoted4 table.

    Groups the batch by State, pivots on Make summing TollAmount, adds a
    ProcessedTime column and appends the result as Delta.

    Args:
        df: The micro-batch DataFrame handed over by foreachBatch.
        batch_id: Micro-batch identifier supplied by foreachBatch (unused).

    Returns:
        None.
    """
    pivoted = df.groupBy('State').pivot('Make').sum('TollAmount')
    # Use the `f` alias from the notebook's import cell rather than a bare
    # current_timestamp, which only resolves if a star import of
    # pyspark.sql.functions happens to have run earlier in the session.
    stamped = pivoted.withColumn('ProcessedTime', f.current_timestamp())
    # The pivot columns come from the Make values present in this batch, so a
    # later batch can carry a column the table does not have yet; mergeSchema
    # lets the append add that column instead of failing on a schema mismatch.
    (
        stamped.write
        .format('delta')
        .mode('append')
        .option('mergeSchema', 'true')
        .saveAsTable('CarTraffic_Pivoted4')
    )

StatementMeta(, 665538dc-c0b2-4654-816b-fcb45f6b2bc0, 20, Finished, Available)

In [19]:
# foreachBatch sink: every micro-batch is handed to pivotEachBatch.
tableName = 'CarTraffic_Pivoted4'
deltaTablePath = f"Tables/{tableName}"
checkpointFolder = f"Files/Checkpoints/{tableName}"

strmHandle4 = (
    dfStrm.writeStream
    .queryName('PivotingStrm')
    .trigger(processingTime='1 seconds')
    .option("checkpointLocation", checkpointFolder)
    .foreachBatch(pivotEachBatch)
    .start()
)

StatementMeta(, 665538dc-c0b2-4654-816b-fcb45f6b2bc0, 21, Finished, Available)

In [20]:
%%sql
-- Inspect the pivoted rows appended by the foreachBatch stream.
SELECT *
FROM CarTraffic_Pivoted4

StatementMeta(, 665538dc-c0b2-4654-816b-fcb45f6b2bc0, 22, Finished, Available)

<Spark SQL result set with 3 rows and 7 fields>

**Stop all active streams**

In [21]:
# Print the name of every query still active in this Spark session, then stop it.
active_streams = spark.streams.active
for running_query in active_streams:
    print(running_query.name)
    running_query.stop()

StatementMeta(, 665538dc-c0b2-4654-816b-fcb45f6b2bc0, 23, Finished, Available)

CarTraffic_Silver2
