In [2]:
!pip install azure-eventhub==5.11.4

StatementMeta(, 010193a5-b999-432b-8127-d1e2c74c227b, 4, Finished, Available)



**Utility function to return streaming handle**

In [3]:
import pyspark.sql.functions as F
from notebookutils import mssparkutils

def streamTableToEventHub(filePath,fileFormat,fileSchema,checkpointDir,removeCheckpointDirFlg,connString,interval,maxBytesTrigger):
    global ehConf
    if removeCheckpointDirFlg:
        mssparkutils.fs.rm(checkpointDir,True)    
    ehConf = {}
    ehConf['eventhubs.connectionString'] = sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(connString)
    
    if fileFormat !='delta':     # fileSchema parameter is mandatory, if source is not delta table
        dfs =spark.readStream.format(fileFormat).schema(fileSchema).option('maxBytesPerTrigger',maxBytesTrigger).load(filePath)
    else:        
        dfs =spark.readStream.format(fileFormat).option('maxBytesPerTrigger',maxBytesTrigger).load(filePath)
    
    query=dfs.withColumn('body', F.to_json(F.struct(*dfs.columns),options={"ignoreNullFields": False}))\
        .select('body')\
        .writeStream\
        .trigger(processingTime=f'{interval} second')\
        .format("eventhubs")\
        .options(**ehConf)\
        .queryName('SendToEventHubs')\
        .option("checkpointLocation",checkpointDir)
    strmHandle=query.start()
    return strmHandle

StatementMeta(, 010193a5-b999-432b-8127-d1e2c74c227b, 5, Finished, Available)

**Streaming to Event streams**

**Assigning parameter values**

In [4]:
#Copy your Event Hubs conenction string here:
ehConnectionString = ""

StatementMeta(, 010193a5-b999-432b-8127-d1e2c74c227b, 6, Finished, Available)

**Option1**: File source

In [10]:
df=spark.read.format('parquet').load('Files/Landing/PARQUET/Sales')
vFileSchema=df.schema
vFileFormat='parquet'
vFileName='Sales'
vFilePath = "Files/Landing/PARQUET/Sales"
vCheckPointDir=f"Files/Checkpoints/{vFileName}"
vBatchFrequency=30

StatementMeta(, 1a95294a-a58e-4a46-924d-3d2ce58fc451, 12, Finished, Available)

**Option2**: Table source

In [5]:
vFileSchema=""    #schema is not required for delta tables
vFileFormat='delta'
vFileName='StockExchangeSample'
vFilePath = "Tables/StockExchangeSample"
vCheckPointDir=f"Files/Checkpoints/{vFileName}"
vBatchFrequency=30

StatementMeta(, 010193a5-b999-432b-8127-d1e2c74c227b, 7, Finished, Available)

**Start the streaming**

In [6]:
handle=streamTableToEventHub(filePath=vFilePath,\
    fileFormat=vFileFormat,\
    fileSchema=vFileSchema,\
    checkpointDir=vCheckPointDir,\
    removeCheckpointDirFlg=True,\
    connString=ehConnectionString,\
    interval=vBatchFrequency,\
    maxBytesTrigger=5000)

StatementMeta(, 010193a5-b999-432b-8127-d1e2c74c227b, 8, Finished, Available)

In [9]:
handle.status

StatementMeta(, 010193a5-b999-432b-8127-d1e2c74c227b, 11, Finished, Available)

{'message': 'Waiting for next trigger',
 'isDataAvailable': True,
 'isTriggerActive': False}

In [None]:
handle.lastProgress

In [None]:
import time

while handle.status['message'] in ['Waiting for next trigger','Processing new data'] and handle.lastProgress:
    print('batchId:',handle.lastProgress['batchId']\
    ,',timestamp:',handle.lastProgress['timestamp']\
    ,',numInputRows:',handle.lastProgress['numInputRows']\
    ,',inputRowsPerSecond:',round(handle.lastProgress['inputRowsPerSecond'])\
    ,',processedRowsPerSecond:',round(handle.lastProgress['processedRowsPerSecond'])\
    )
    time.sleep(vBatchFrequency)

In [12]:
handle.stop()

StatementMeta(, 010193a5-b999-432b-8127-d1e2c74c227b, 14, Finished, Available)

**Using foreachBatch destination for additional logging**

In [13]:

import pyspark.sql.functions as F
from notebookutils import mssparkutils

def logBatchProgress(df,batchId):
    df.write.format("eventhubs").options(**ehConf).save()
        
    df.agg(F.count('*').alias('rowCnt'))\
        .withColumn('BatchId',F.lit(batchId))\
        .withColumn('batchTimeStamp',F.current_timestamp())\
        .write.mode('append')\
        .format('delta').saveAsTable('streamingprogress')
    pass

def streamTableToEventHubV2(filePath,fileFormat,fileSchema,checkpointDir,removeCheckpointDirFlg,connString,interval,maxBytesTrigger):
    global ehConf
    if removeCheckpointDirFlg:
        mssparkutils.fs.rm(checkpointDir,True)    
    ehConf = {}
    ehConf['eventhubs.connectionString'] = sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(connString)
    if fileFormat !='delta':     # fileSchema parameter is mandatory, if source is not delta table
        dfs =spark.readStream.format(fileFormat).schema(fileSchema).option('maxBytesPerTrigger',maxBytesTrigger).load(filePath)
    else:        
        dfs =spark.readStream.format(fileFormat).option('maxBytesPerTrigger',maxBytesTrigger).load(filePath)

    query=dfs.withColumn('body', F.to_json(F.struct(*dfs.columns),options={"ignoreNullFields": False}))\
        .select('body')\
        .writeStream\
        .trigger(processingTime=f'{interval} second')\
        .foreachBatch(logBatchProgress)\
        .queryName('SendToEventHubs')\
        .option("checkpointLocation",checkpointDir)

    strmHandle=query.start()
    return strmHandle

StatementMeta(, 010193a5-b999-432b-8127-d1e2c74c227b, 15, Finished, Available)

In [14]:
vFileSchema=""    #schema is not required
vFileFormat='delta'
vFileName='StockExchangeSample'
vFilePath = "Tables/StockExchangeSample"
vCheckPointDir=f"Files/Checkpoints/{vFileName}"
vBatchFrequency=30

StatementMeta(, 010193a5-b999-432b-8127-d1e2c74c227b, 16, Finished, Available)

**Streaming with logged event generator**

In [15]:
handle=streamTableToEventHubV2(filePath=vFilePath,\
    fileFormat=vFileFormat,\
    fileSchema=vFileSchema,
    checkpointDir=vCheckPointDir,\
    removeCheckpointDirFlg=True,\
    connString=ehConnectionString,\
    interval=vBatchFrequency,\
    maxBytesTrigger=50000)

StatementMeta(, 010193a5-b999-432b-8127-d1e2c74c227b, 17, Finished, Available)

In [16]:
handle.status

StatementMeta(, 010193a5-b999-432b-8127-d1e2c74c227b, 18, Finished, Available)

{'message': 'Processing new data',
 'isDataAvailable': True,
 'isTriggerActive': True}

In [None]:
handle.lastProgress

In [20]:
handle.stop()

StatementMeta(, 010193a5-b999-432b-8127-d1e2c74c227b, 22, Finished, Available)

In [19]:
%%sql
select * from streamingprogress order by batchid;
-- delete from streamingprogress

StatementMeta(, 010193a5-b999-432b-8127-d1e2c74c227b, 21, Finished, Available)

<Spark SQL result set with 3 rows and 3 fields>

In [9]:
%%sql
delete from streamingprogress

StatementMeta(, 1a95294a-a58e-4a46-924d-3d2ce58fc451, 11, Finished, Available)

<Spark SQL result set with 1 rows and 1 fields>