## Ingesting from File sources ##

Reading the data schema- optional (requires files in the source folder)

In [19]:
df = spark.read.format("parquet").load("Files/Landing/PARQUET/Sales/*")
sales_schema=df.schema
print (sales_schema)

StatementMeta(, 66e6f8d2-ebb3-4d39-ac2c-7ae4eb29db71, 21, Finished, Available)

StructType([StructField('SaleKey', LongType(), True), StructField('CityKey', IntegerType(), True), StructField('CustomerKey', IntegerType(), True), StructField('BillToCustomerKey', IntegerType(), True), StructField('StockItemKey', IntegerType(), True), StructField('InvoiceDateKey', TimestampType(), True), StructField('DeliveryDateKey', TimestampType(), True), StructField('SalespersonKey', IntegerType(), True), StructField('WWIInvoiceID', IntegerType(), True), StructField('Description', StringType(), True), StructField('Package', StringType(), True), StructField('Quantity', IntegerType(), True), StructField('UnitPrice', DecimalType(18,2), True), StructField('TaxRate', DecimalType(18,3), True), StructField('TotalExcludingTax', DecimalType(18,2), True), StructField('TaxAmount', DecimalType(18,2), True), StructField('Profit', DecimalType(18,2), True), StructField('TotalIncludingTax', DecimalType(18,2), True), StructField('TotalDryItems', IntegerType(), True), StructField('TotalChillerItems

Assigning schema variable

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType,LongType,TimestampType,DecimalType
sales_schema=StructType([StructField('SaleKey', LongType(), True), StructField('CityKey', IntegerType(), True), StructField('CustomerKey', IntegerType(), True), StructField('BillToCustomerKey', IntegerType(), True), StructField('StockItemKey', IntegerType(), True), StructField('InvoiceDateKey', TimestampType(), True), StructField('DeliveryDateKey', TimestampType(), True), StructField('SalespersonKey', IntegerType(), True), StructField('WWIInvoiceID', IntegerType(), True), StructField('Description', StringType(), True), StructField('Package', StringType(), True), StructField('Quantity', IntegerType(), True), StructField('UnitPrice', DecimalType(18,2), True), StructField('TaxRate', DecimalType(18,3), True), StructField('TotalExcludingTax', DecimalType(18,2), True), StructField('TaxAmount', DecimalType(18,2), True), StructField('Profit', DecimalType(18,2), True), StructField('TotalIncludingTax', DecimalType(18,2), True), StructField('TotalDryItems', IntegerType(), True), StructField('TotalChillerItems', IntegerType(), True), StructField('LineageKey', IntegerType(), True)])


StatementMeta(, , , Waiting, )

Cleaning checkpoint folder if exists

In [20]:
from notebookutils import mssparkutils
mssparkutils.fs.rm ('Files/Checkpoints/Sales',True)

StatementMeta(, 66e6f8d2-ebb3-4d39-ac2c-7ae4eb29db71, 22, Finished, Available)

True

Run this only if table exists, to have a fresh start

In [49]:
%%sql
DROP TABLE IF EXISTS Sales

StatementMeta(, , , Waiting, )

<Spark SQL result set with 0 rows and 0 fields>

**Reading from source and writing into Lakehouse using streaming**

In [21]:
from pyspark.sql.functions import current_timestamp,input_file_name
tableName = "Sales"
deltaTablePath = "Tables/" + tableName

dfStrm = spark.readStream\
.schema(sales_schema)\
.option("maxFilesPerTrigger", "1")\
.parquet("Files/Landing/PARQUET/Sales/*.parquet")\
.withColumn('IngestionTime',current_timestamp())\
.withColumn('SourceFileName',input_file_name())

query = dfStrm.writeStream\
.queryName('SalesStream')\
.outputMode("append")\
.format("delta")\
.option("path", deltaTablePath)\
.option("checkpointLocation", "Files/Checkpoints/"+tableName)

StatementMeta(, , , Waiting, )

**Starting the stream**

In [None]:
# strmHandle=query.start()
strmHandle.stop()

StatementMeta(, 3daff2a8-dff1-4d66-9689-ebb4151299b2, 20, Finished, Available)

**Reading stream's status and last progress**

In [54]:
print (f'Is the stream active?:{strmHandle.isActive}',f', Stream status:{strmHandle.status}')
print (f'Last progress: {strmHandle.lastProgress}')

StatementMeta(, , , Waiting, )

Is the stream active?:True , Stream status:{'message': 'Processing new data', 'isDataAvailable': True, 'isTriggerActive': True}
Last progress: {'id': 'd666aa7c-59ed-491b-a921-31e1098bc978', 'runId': '0f9bde9d-dd41-4ea2-bda1-e20d4334cc3c', 'name': 'SalesStream', 'timestamp': '2024-02-03T22:47:46.961Z', 'batchId': 5, 'numInputRows': 1000000, 'inputRowsPerSecond': 10638297.872340426, 'processedRowsPerSecond': 147015.5836518671, 'durationMs': {'addBatch': 6012, 'getBatch': 44, 'latestOffset': 310, 'queryPlanning': 9, 'triggerExecution': 6802, 'walCommit': 209}, 'stateOperators': [], 'sources': [{'description': 'FileStreamSource[abfss://46df290c-a4bf-4f21-ae5b-b2a4f313e3d8@onelake.dfs.fabric.microsoft.com/07a9d45a-7cef-4ec8-aa73-60477164a94a/Files/Landing/PARQUET/Sales/*.parquet]', 'startOffset': {'logOffset': 4}, 'endOffset': {'logOffset': 5}, 'latestOffset': None, 'numInputRows': 1000000, 'inputRowsPerSecond': 10638297.872340426, 'processedRowsPerSecond': 147015.5836518671}], 'sink': {'de

In [55]:
print (f'Recent progress: {strmHandle.recentProgress}')

StatementMeta(, , , Waiting, )

Recent progress: [{'id': 'd666aa7c-59ed-491b-a921-31e1098bc978', 'runId': '0f9bde9d-dd41-4ea2-bda1-e20d4334cc3c', 'name': 'SalesStream', 'timestamp': '2024-02-03T22:43:57.121Z', 'batchId': 0, 'numInputRows': 0, 'inputRowsPerSecond': 0.0, 'processedRowsPerSecond': 0.0, 'durationMs': {'latestOffset': 27, 'triggerExecution': 83}, 'stateOperators': [], 'sources': [{'description': 'FileStreamSource[abfss://46df290c-a4bf-4f21-ae5b-b2a4f313e3d8@onelake.dfs.fabric.microsoft.com/07a9d45a-7cef-4ec8-aa73-60477164a94a/Files/Landing/PARQUET/Sales/*.parquet]', 'startOffset': None, 'endOffset': None, 'latestOffset': None, 'numInputRows': 0, 'inputRowsPerSecond': 0.0, 'processedRowsPerSecond': 0.0}], 'sink': {'description': 'DeltaSink[Tables/Sales]', 'numOutputRows': -1}}, {'id': 'd666aa7c-59ed-491b-a921-31e1098bc978', 'runId': '0f9bde9d-dd41-4ea2-bda1-e20d4334cc3c', 'name': 'SalesStream', 'timestamp': '2024-02-03T22:44:07.204Z', 'batchId': 0, 'numInputRows': 0, 'inputRowsPerSecond': 0.0, 'processedRo

**Listing active streams**

In [None]:
active_streams = spark.streams.active

for stream in active_streams:
    print(stream.name)
    # stream.stop()

StatementMeta(, 3daff2a8-dff1-4d66-9689-ebb4151299b2, 21, Finished, Available)

Validating destination table

In [61]:
%%sql
select SourceFileName,count(*) As RowCnt from sales group by SourceFileName

StatementMeta(, , , Waiting, )

<Spark SQL result set with 38 rows and 2 fields>

**Automating recent progress reads**

In [None]:
import json
from pyspark.sql.functions  import col
progressJson=json.dumps(strmHandle.recentProgress)
df=spark.read.json(sc.parallelize([progressJson]))\
 .select('batchId','numInputRows',col('inputRowsperSecond').cast('int'),
  col('processedRowsperSecond').cast('int'),
 'timestamp')\
 .orderBy('timestamp') 
display(df)

StatementMeta(, 3daff2a8-dff1-4d66-9689-ebb4151299b2, 19, Finished, Available)

SynapseWidget(Synapse.DataFrame, 84b5b4e6-c822-4230-ada3-a4d88de95e7f)

### Ingesting from Event Hub ###

In [1]:
!pip install azure-eventhub

StatementMeta(, e7f2248d-fd59-45ca-abeb-3bf38d466139, 3, Finished, Available)

Collecting azure-eventhub
  Downloading azure_eventhub-5.11.6-py3-none-any.whl (320 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.4/320.4 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: azure-eventhub
Successfully installed azure-eventhub-5.11.6


In [5]:
from azure.eventhub import EventHubConsumerClient

StatementMeta(, 4a99fd23-74df-47eb-81cb-f117dd4c3818, 8, Finished, Available)

In [2]:
connectionString = "Endpoint=XXX"
ehConf = {}
ehConf['eventhubs.connectionString'] = sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(connectionString)


StatementMeta(, e7f2248d-fd59-45ca-abeb-3bf38d466139, 4, Finished, Available)

In [None]:
from notebookutils import mssparkutils
mssparkutils.fs.rm ('Files/checkpoint/stocks',True)

StatementMeta(, 3daff2a8-dff1-4d66-9689-ebb4151299b2, 30, Finished, Available)

True

In [None]:
import pyspark.sql.functions as f 
from pyspark.sql.types import * 

df = spark.readStream.format("eventhubs")\
  .options(**ehConf)\
  .load()  

rawData = df\
  .withColumn("bodyAsString", f.col("body").cast("string"))\
  .writeStream\
  .format("delta")\
  .option("checkpointLocation", "Files/checkpoint/stocks")\
  .outputMode("append")\
  .option("path", 'Tables/stock_events')


StatementMeta(, 3daff2a8-dff1-4d66-9689-ebb4151299b2, 31, Finished, Available)

**Start the stream and validate status**

In [None]:
ehStrm=rawData.start()


StatementMeta(, 3daff2a8-dff1-4d66-9689-ebb4151299b2, 33, Finished, Available)

Is the stream active?:True , Stream status:{'message': 'Initializing sources', 'isDataAvailable': False, 'isTriggerActive': False}
Last progress: None


In [None]:
print (f'Is the stream active?:{ehStrm.isActive}',f', Stream status:{ehStrm.status}')
print (f'Last progress: {ehStrm.recentProgress}')

StatementMeta(, 3daff2a8-dff1-4d66-9689-ebb4151299b2, 45, Finished, Available)

Is the stream active?:True , Stream status:{'message': 'Getting offsets from org.apache.spark.sql.eventhubs.EventHubsSource@5aebb419', 'isDataAvailable': False, 'isTriggerActive': True}
Last progress: [{'id': 'db571257-3512-42af-8b3c-71a7094b70b1', 'runId': 'bc18e507-c30e-4337-a5b4-3759b819922a', 'name': None, 'timestamp': '2024-02-03T23:25:34.221Z', 'batchId': 3, 'numInputRows': 0, 'inputRowsPerSecond': 0.0, 'processedRowsPerSecond': 0.0, 'durationMs': {'getOffset': 7, 'triggerExecution': 230}, 'stateOperators': [], 'sources': [{'description': 'org.apache.spark.sql.eventhubs.EventHubsSource@5aebb419', 'startOffset': {'fa-eh': {'0': 13268}}, 'endOffset': {'fa-eh': {'0': 13268}}, 'latestOffset': {'fa-eh': {'0': 13268}}, 'numInputRows': 0, 'inputRowsPerSecond': 0.0, 'processedRowsPerSecond': 0.0}], 'sink': {'description': 'DeltaSink[Tables/stock_events]', 'numOutputRows': -1}}, {'id': 'db571257-3512-42af-8b3c-71a7094b70b1', 'runId': 'bc18e507-c30e-4337-a5b4-3759b819922a', 'name': None, '

**Validate data**

In [None]:
%%sql
-- delete from stock_events;
select * from stock_events

StatementMeta(, 3daff2a8-dff1-4d66-9689-ebb4151299b2, 46, Finished, Available)

<Spark SQL result set with 1000 rows and 10 fields>

In [100]:
%%sql
select * from stock_events


StatementMeta(, 4a99fd23-74df-47eb-81cb-f117dd4c3818, 106, Finished, Available)

<Spark SQL result set with 1000 rows and 10 fields>

In [None]:
ehStrm.stop()

StatementMeta(, 3daff2a8-dff1-4d66-9689-ebb4151299b2, 29, Finished, Available)

In [None]:
event_schema = StructType([StructField("Symbol", StringType(), True), 
        StructField("Price",
            StructType([StructField("Currency", StringType(), True),StructField("Value", DoubleType(), True)]),True), 
        StructField("Volume", LongType(), True), 
        StructField("Market_Cap", 
            StructType([StructField("Currency", StringType(), False),StructField("Value", StringType(), True)]), True), 
        StructField("Open", DoubleType(), True),
        StructField("High", DoubleType(), True),
        StructField("Low", DoubleType(), True),
        StructField("Close", DoubleType(), True)
        ]) 

StatementMeta(, 3daff2a8-dff1-4d66-9689-ebb4151299b2, 47, Finished, Available)

In [None]:
from notebookutils import mssparkutils
mssparkutils.fs.rm ('Files/checkpoint/stocks_brz',True)

In [None]:
df3=spark.table('stock_events')\
   .select(f.from_json("bodyAsString", event_schema).alias("events")).select('events.*')
display(df3)  

StatementMeta(, 3daff2a8-dff1-4d66-9689-ebb4151299b2, 48, Finished, Available)

SynapseWidget(Synapse.DataFrame, 423e5e4e-cff3-4429-ab98-5a4a171b2b8e)

In [101]:
from notebookutils import mssparkutils
mssparkutils.fs.rm ('Files/checkpoint/stocks_brz',True)

StatementMeta(, 4a99fd23-74df-47eb-81cb-f117dd4c3818, 107, Finished, Available)

True

In [104]:
df2=spark.readStream.table('stock_events')\
  .select(f.from_json("bodyAsString", event_schema).alias("events")).select('events.*')

df2.writeStream.format("delta")\
  .option("checkpointLocation", "Files/checkpoint/stocks_brz")\
  .outputMode("append")\
  .toTable("stock_events_brz")

StatementMeta(, 4a99fd23-74df-47eb-81cb-f117dd4c3818, 111, Finished, Available)

<pyspark.sql.streaming.StreamingQuery at 0x7f2163e9a8c0>

In [1]:
%%sql
-- delete from stock_events_brz;
select * from stock_events_brz

StatementMeta(, 8f00f14d-494f-4672-8fdc-eee0a6b131c1, 2, Finished, Available)

<Spark SQL result set with 1000 rows and 8 fields>

In [62]:
csv_schema='_c1 string,_c2 string,_c3 string'
dfSales = spark.readStream\
    .format("csv")\
    .schema(csv_schema)\
    .load('Files/Landing/CSV/Sales') 
# .option('maxFilesPerTrigger',1)\

strm=dfSales.writeStream.format("delta")\
  .option("checkpointLocation", "Files/checkpoint/sales4")\
  .outputMode("append")\
  .toTable("sales_stream") 


StatementMeta(, 7642e7f9-cab6-4ae6-b757-21ce691af731, 64, Finished, Available)

In [63]:
%%sql
select count(*) from sales_stream

StatementMeta(, 7642e7f9-cab6-4ae6-b757-21ce691af731, 65, Finished, Available)

<Spark SQL result set with 1 rows and 1 fields>