# Preparing the data:
1. Upload the Parquet files to the unmanaged (Files) section, under `Files/Landing/PARQUET/Stocks/`
2. Read them into a DataFrame and write it to the `Stocks` Delta table



In [102]:
# Load the landed Parquet files and persist them as the Delta table `Stocks`,
# replacing whatever the table held before.
df = spark.read.parquet("Files/Landing/PARQUET/Stocks/")
df.write.saveAsTable('Stocks', format='delta', mode='overwrite')

StatementMeta(, 6fc46c8a-2aa2-4b6b-9ce7-26af6d07c6a6, 124, Finished, Available)

**Explore the data**

In [103]:
%%sql
-- Preview up to 100 rows of a single symbol, ordered by symbol, time and seq
SELECT *
FROM Stocks
WHERE symbol = 'NSFT'
ORDER BY symbol, time, seq
LIMIT 100

StatementMeta(, 6fc46c8a-2aa2-4b6b-9ce7-26af6d07c6a6, 125, Finished, Available)

<Spark SQL result set with 100 rows and 19 fields>

**Create and populate lookup table**

In [104]:
%%sql
-- Rebuild the currency lookup from scratch so this cell can be re-run safely
DROP TABLE IF EXISTS ExchangeRates;

-- Rate is DOUBLE rather than FLOAT: a single-precision 1.35 is stored as roughly
-- 1.3500000238, which leaks into Round(bidPrice*Rate,2) in the cells that join this table
CREATE TABLE IF NOT EXISTS ExchangeRates
(Currency STRING, Rate DOUBLE)
USING DELTA;

-- Multiplier used downstream to derive bidPrice_CAD from bidPrice
INSERT INTO ExchangeRates VALUES ('USD', 1.35)

StatementMeta(, , -1, Finished, Available)

<Spark SQL result set with 0 rows and 0 fields>

<Spark SQL result set with 0 rows and 0 fields>

<Spark SQL result set with 0 rows and 0 fields>

Reading lookup table

In [105]:
# Keep the lookup rows in `dfr`; later cells join against it on Currency.
dfr = spark.read.table('ExchangeRates')
display(dfr)

StatementMeta(, 6fc46c8a-2aa2-4b6b-9ce7-26af6d07c6a6, 129, Finished, Available)

SynapseWidget(Synapse.DataFrame, 1dcfbb5d-56db-4c76-a255-4247d5cdb7df)

**Applying sample transformations and writing historical data into destination table**

In [106]:
# Attach the exchange rate to every stock row, derive the converted bid price,
# and overwrite the destination table with the result.
stocks_with_rate = spark.read.format("delta").table("Stocks").join(dfr, 'Currency')
stocks_converted = stocks_with_rate.selectExpr(
    'symbol',
    'bidPrice',
    'time',
    'seq',
    'Currency',
    'Round(bidPrice*Rate,2) as bidPrice_CAD',
)
stocks_converted.write.format('delta').mode('overwrite').saveAsTable('StocksProcessed')

# Show what was written, ordered by symbol, time and seq.
processed_preview = spark.table('StocksProcessed').orderBy('symbol', 'time', 'seq')
display(processed_preview)

StatementMeta(, 6fc46c8a-2aa2-4b6b-9ce7-26af6d07c6a6, 130, Finished, Available)

SynapseWidget(Synapse.DataFrame, a3d13e29-3d0f-4586-bad7-8f521c7ce9af)

# Applying Change Data Feed
**Check out this video to learn more about Change Data Feed and Time Travel:** https://youtu.be/XGVvEYor14g

In [107]:
%%sql
-- Turn on the Delta change data feed for the Stocks table
ALTER TABLE Stocks
SET TBLPROPERTIES (delta.enableChangeDataFeed = true);

StatementMeta(, 6fc46c8a-2aa2-4b6b-9ce7-26af6d07c6a6, 131, Finished, Available)

<Spark SQL result set with 0 rows and 0 fields>

**Get the min table version that CDF is valid from**

In [108]:
%%sql
-- List the commit history of Stocks to find the version from which the change data feed is available
DESCRIBE HISTORY Stocks

StatementMeta(, 6fc46c8a-2aa2-4b6b-9ce7-26af6d07c6a6, 132, Finished, Available)

<Spark SQL result set with 2 rows and 15 fields>

In [109]:
# Starting table version for every change-feed read in this notebook.
# NOTE(review): presumably the version of the ALTER TABLE commit shown by DESCRIBE HISTORY;
# this is a hand-picked value - confirm it against the history output before re-running.
cdfStVersion=1

StatementMeta(, 6fc46c8a-2aa2-4b6b-9ce7-26af6d07c6a6, 133, Finished, Available)

In [110]:
%%sql
-- Produce an update commit: raise the bid price of every NSFT row
UPDATE Stocks
SET bidPrice = bidPrice*1.1
WHERE Symbol = 'NSFT';

-- Produce a delete commit: remove one specific NSFT row
DELETE FROM Stocks
WHERE symbol = 'NSFT' AND seq = 2164;

-- Produce an insert commit: copy ten NSFT rows with seq shifted by 100
INSERT INTO Stocks
SELECT
    `time`, symbol, sector, securityType, bidPrice, bidSize, askPrice,
    askSize, lastUpdated, lastSalePrice, lastSaleSize, lastSaleTime, volume, marketPercent, seq+100,
    EventProcessedUtcTime, PartitionId, EventEnqueuedUtcTime, Currency
FROM Stocks
WHERE Symbol = 'NSFT'
LIMIT 10

StatementMeta(, , -1, Finished, Available)

<Spark SQL result set with 1 rows and 1 fields>

<Spark SQL result set with 1 rows and 1 fields>

<Spark SQL result set with 0 rows and 0 fields>

**Reviewing CDF in batch format**

In [111]:
import pyspark.sql.functions as F

# Batch-read the change feed of Stocks from cdfStVersion onwards, newest commits first.
change_feed_reader = (
    spark.read.format("delta")
    .option("readChangeFeed", "true")
    .option("startingVersion", cdfStVersion)
)
dfc = change_feed_reader.table("Stocks").orderBy(F.desc("_commit_version"))
display(dfc)

StatementMeta(, 6fc46c8a-2aa2-4b6b-9ce7-26af6d07c6a6, 137, Finished, Available)

SynapseWidget(Synapse.DataFrame, d0c5aa06-8ce8-41d1-b18b-9ae41b12c9d2)

Define the destination table name and its path

In [112]:
# Destination table name and the path derived from it under Tables/
tableName = 'StocksProcessed'
deltaTablePath = f'Tables/{tableName}'

StatementMeta(, 6fc46c8a-2aa2-4b6b-9ce7-26af6d07c6a6, 138, Finished, Available)

**Optional: run this command to clean the checkpoint folder; it is only needed when running the stream repeatedly**

In [113]:
from notebookutils import mssparkutils

# Remove the stream's checkpoint folder (second argument: recursive) so a repeated
# run of the stream does not resume from an old checkpoint.
# Best-effort: a failure is reported rather than raised, so the notebook keeps running.
checkpointPath = f'Files/Checkpoints/{tableName}'
try:
    mssparkutils.fs.rm(checkpointPath, True)
except Exception as err:
    # Catch Exception instead of a bare except so KeyboardInterrupt/SystemExit still
    # propagate, and print the real error instead of assuming the path is missing.
    print(f"Could not remove {checkpointPath} (it may not exist): {err}")

StatementMeta(, 6fc46c8a-2aa2-4b6b-9ce7-26af6d07c6a6, 139, Finished, Available)

**De-duplicate transaction versions: keep only the most recent change per row**

In [114]:
from pyspark.sql.window import Window

# Orders the changes of each (symbol, time, seq) key with the newest commit first.
latest_change_first = Window.partitionBy('symbol','time','seq')\
    .orderBy(F.col("_commit_version").desc())

dfs = (
    spark.read.format("delta")
    .option("readChangeFeed", "true")
    .option("startingVersion", cdfStVersion)
    .table("Stocks")
    # Drop update pre-images; only the post-image of an update is kept.
    .filter("_change_type !='update_preimage'")
    .join(dfr, 'Currency')
    .withColumn('rowNum', F.row_number().over(latest_change_first))
    # Keep the newest change per key BEFORE projecting: rowNum is not part of the
    # selected columns, so filtering afterwards would depend on the analyzer
    # re-adding a column the projection has already dropped.
    .filter('rowNum=1')
    .selectExpr('symbol','bidPrice','time','seq','Currency',
                'Round(bidPrice*Rate,2) as bidPrice_CAD','_change_type','_commit_version')
    .alias('source')
)
display(dfs)

StatementMeta(, 6fc46c8a-2aa2-4b6b-9ce7-26af6d07c6a6, 140, Finished, Available)

SynapseWidget(Synapse.DataFrame, e6606503-972b-4ea1-b02c-e922b8260204)

## Streaming CDF by Spark Structured Streaming
**Check out these tutorials to learn more about Spark Structured Streaming:** 
https://youtu.be/kg_UvdXgH80, 
https://youtu.be/Dp3FhnMVhiY

**ForEachBatch processing function**

In [115]:
from delta.tables import *
from pyspark.sql.window import Window

# Merge target: the processed table, addressed by path and aliased for the merge condition.
dlttarget=DeltaTable.forPath(spark,'Tables/stocksprocessed').alias('target')

def mergeBatch(df,batchID):
    """Apply one micro-batch of change-feed rows to the target Delta table.

    Args:
        df: Micro-batch DataFrame. Must contain symbol, time, seq,
            _change_type and _commit_version columns.
        batchID: Micro-batch identifier supplied by foreachBatch (unused).

    Behaviour:
        * Only the newest change (highest _commit_version) per
          (symbol, time, seq) key is kept.
        * Matched key + 'delete'           -> target row is deleted.
        * Matched key + 'update_postimage' -> target row is overwritten.
        * Unmatched key                    -> row is inserted, unless its newest
          change is a 'delete' (a deleted row must not be inserted into the target).
    """
    latest_change_first = Window.partitionBy('symbol','time','seq')\
        .orderBy(F.col("_commit_version").desc())

    # Alias here as well, so the merge condition does not depend on the caller
    # having aliased the stream as 'source'.
    latest = df.withColumn('rowNum', F.row_number().over(latest_change_first))\
        .filter('rowNum=1')\
        .alias('source')

    dlttarget.merge(latest,'source.symbol=target.symbol and source.time=target.time and source.seq=target.seq')\
        .whenMatchedDelete(condition = "source.`_change_type`='delete'")\
        .whenMatchedUpdateAll(condition = "source.`_change_type`='update_postimage'")\
        .whenNotMatchedInsertAll(condition = "source.`_change_type`!='delete'")\
        .execute()

StatementMeta(, 6fc46c8a-2aa2-4b6b-9ce7-26af6d07c6a6, 141, Finished, Available)

**Streaming changes with forEachBatch destination**

In [116]:
import pyspark.sql.functions as F

# Streaming read of the Stocks change feed starting at cdfStVersion, without update pre-images.
stock_changes = (
    spark.readStream
    .format("delta")
    .option("readChangeFeed", "true")
    .option("startingVersion", cdfStVersion)
    .table("Stocks")
    .filter("_change_type !='update_preimage'")
)

# Attach the exchange rate and project the columns handed to the merge function.
converted_changes = (
    stock_changes
    .join(dfr, 'Currency')
    .selectExpr(
        'symbol', 'bidPrice', 'time', 'seq', 'Currency',
        'Round(bidPrice*Rate,2) as bidPrice_CAD', '_change_type', '_commit_version',
    )
    .alias('source')
)

# Every micro-batch is passed to mergeBatch; the checkpoint lives under Files/Checkpoints.
strm = (
    converted_changes.writeStream
    .queryName(tableName)
    .format("delta")
    .foreachBatch(mergeBatch)
    .option("checkpointLocation", f"Files/Checkpoints/{tableName}")
)
strmHandle = strm.start()

StatementMeta(, 6fc46c8a-2aa2-4b6b-9ce7-26af6d07c6a6, 142, Finished, Available)

In [117]:
# Show the current status of the streaming query (message, isDataAvailable, isTriggerActive)
strmHandle.status

StatementMeta(, 6fc46c8a-2aa2-4b6b-9ce7-26af6d07c6a6, 143, Finished, Available)

{'message': 'Processing new data',
 'isDataAvailable': True,
 'isTriggerActive': True}

In [118]:
%%sql
-- Raise the NSFT bid prices once more while the stream is running
UPDATE Stocks
SET bidPrice = bidPrice*1.1
WHERE Symbol = 'NSFT'

StatementMeta(, 6fc46c8a-2aa2-4b6b-9ce7-26af6d07c6a6, 144, Finished, Available)

<Spark SQL result set with 1 rows and 1 fields>

In [119]:
%%sql
-- Source side: current values of the sample NSFT rows in Stocks
SELECT symbol, time, seq, bidPrice, Currency
FROM Stocks
WHERE Symbol = 'NSFT' AND seq = 1105
ORDER BY time, seq
LIMIT 100

StatementMeta(, 6fc46c8a-2aa2-4b6b-9ce7-26af6d07c6a6, 145, Finished, Available)

<Spark SQL result set with 3 rows and 5 fields>

In [120]:
%%sql
-- Destination side: the same sample rows in StocksProcessed, including the converted price
SELECT symbol, time, seq, bidPrice, Currency, bidPrice_CAD
FROM StocksProcessed
WHERE Symbol = 'NSFT' AND seq = 1105
ORDER BY time, seq
LIMIT 100

StatementMeta(, 6fc46c8a-2aa2-4b6b-9ce7-26af6d07c6a6, 146, Finished, Available)

<Spark SQL result set with 3 rows and 6 fields>

**Stop the stream**

In [121]:
# Stop the streaming query started from `strm`
strmHandle.stop()

StatementMeta(, 6fc46c8a-2aa2-4b6b-9ce7-26af6d07c6a6, 147, Finished, Available)