**Prepare the data:**
1. Upload the CSV file into the unmanaged (Files) section of the lakehouse
2. Read into dataframe and write into table



In [13]:
# Read the landing CSV (first row holds the column names) and persist it as a Delta table.
df = spark.read.option("header", "true").csv("Files/Landing/CSV/SalesOrderHeader.csv")
df.write.saveAsTable('SalesOrderHeader', format='delta')

StatementMeta(, 70ba7a89-cdb4-4348-b980-e0d3c30970ab, 15, Finished, Available)

Read from Delta table and display

In [14]:
# Load the table through the catalog; `dfso` is also read by updateSales further down.
dfso = spark.read.table('SalesOrderHeader')
display(dfso)

StatementMeta(, 70ba7a89-cdb4-4348-b980-e0d3c30970ab, 16, Finished, Available)

SynapseWidget(Synapse.DataFrame, 5c4452c6-474c-43cb-a872-a296a013e482)

# Delta Lake Time Travel

Applying Update and Merge Into transformation

In [16]:
from delta.tables import *
import  pyspark.sql.functions as F

def updateSales (pCustomerID):
  """Mark one customer's orders as delayed and merge them back into the Delta table.

  Takes the rows of the notebook-level ``dfso`` DataFrame whose CustomerID equals
  ``pCustomerID``, sets ``Comment`` to 'Order delayed' and ``ModifiedDate`` to the
  current timestamp, then merges them into the Delta table at
  'Tables/salesorderheader', matching on SalesOrderID.

  Args:
    pCustomerID: customer identifier interpolated into the filter expression.

  Returns:
    None.
  """
  # Build the merge source step by step from the shared dfso DataFrame.
  customer_orders = dfso.filter(f'CustomerID={pCustomerID}')
  delayed_orders = customer_orders.withColumn('Comment', F.lit('Order delayed'))
  stamped_orders = delayed_orders.withColumn('ModifiedDate', F.current_timestamp())
  merge_source = stamped_orders.alias('source')

  # Matched rows are updated with all source columns; unmatched rows are inserted.
  merge_target = DeltaTable.forPath(spark, 'Tables/salesorderheader').alias('target')
  merge_builder = merge_target.merge(merge_source, 'source.SalesOrderID=target.SalesOrderID')
  merge_builder = merge_builder.whenMatchedUpdateAll().whenNotMatchedInsertAll()
  merge_builder.execute()

StatementMeta(, 70ba7a89-cdb4-4348-b980-e0d3c30970ab, 18, Finished, Available)

Apply two update/merge transformations

In [17]:
# Mark the orders of two customers as delayed, one merge call per customer.
for customer_id in (29485, 30113):
  updateSales(customer_id)

StatementMeta(, 70ba7a89-cdb4-4348-b980-e0d3c30970ab, 19, Finished, Available)

**Browsing change history**

In [18]:
# Show the table's commit history.
table_history = spark.sql('DESCRIBE HISTORY SalesOrderHeader')
display(table_history)

StatementMeta(, 70ba7a89-cdb4-4348-b980-e0d3c30970ab, 20, Finished, Available)

SynapseWidget(Synapse.DataFrame, e15392f0-d596-40ed-8fff-d98674834734)

Reading current value

In [19]:
# Read the table without any time-travel option and keep one customer's rows.
current_rows = spark.read.format("delta").load('Tables/salesorderheader')
display(current_rows.filter('CustomerID=29485'))

StatementMeta(, 70ba7a89-cdb4-4348-b980-e0d3c30970ab, 21, Finished, Available)

SynapseWidget(Synapse.DataFrame, 64b07270-6904-43d6-b94f-c80ced24c201)

In [20]:
# Time travel by version number: read the table as of version 0 for one customer.
df = (
  spark.read
  .format("delta")
  .option("versionAsOf", "0")
  .load('Tables/salesorderheader')
  .filter('CustomerID=29485')
)
display(df)


StatementMeta(, 70ba7a89-cdb4-4348-b980-e0d3c30970ab, 22, Finished, Available)

SynapseWidget(Synapse.DataFrame, 06c7a502-c2c4-439a-9a0a-266ab51df94c)

In [9]:
# Time travel by timestamp: read the table as of a fixed point in time for one customer.
# NOTE(review): the timestamp is hard-coded from an earlier run; presumably it has to be
# replaced with a timestamp taken from DESCRIBE HISTORY when the notebook is re-run.
snapshot_at_timestamp = (
  spark.read
  .format("delta")
  .option("timestampAsOf", "2024-04-07 23:35:55.638")
  .load('Tables/salesorderheader')
)
display(snapshot_at_timestamp.filter('CustomerID=29485'))

StatementMeta(, 70ba7a89-cdb4-4348-b980-e0d3c30970ab, 11, Finished, Available)

SynapseWidget(Synapse.DataFrame, 116ff0e0-69e7-4b96-8f7c-0ce251885d8a)

**Restoring to previous version**

In [21]:
# Roll the table back to version 0, then re-read one customer's rows to confirm the result.
spark.sql('RESTORE TABLE salesorderheader TO VERSION AS OF 0')
restored_rows = spark.read.format("delta").load('Tables/salesorderheader')
display(restored_rows.filter('CustomerID=29485'))

StatementMeta(, 70ba7a89-cdb4-4348-b980-e0d3c30970ab, 23, Finished, Available)

SynapseWidget(Synapse.DataFrame, e045c605-5cad-41c2-bec4-5135cc691032)

**Using time travel to capture ETL logging**

In [22]:
# Reduce the commit history to the timestamp, the operation and its row-count metrics.
history_columns = [
  'Timestamp',
  'Operation',
  'operationMetrics.numOutputRows',
  'operationMetrics.numTargetRowsInserted',
  'operationMetrics.numTargetRowsUpdated',
  'operationMetrics.numTargetRowsDeleted',
]
df = spark.sql('DESCRIBE HISTORY SalesOrderHeader').selectExpr(*history_columns)
display(df)

StatementMeta(, 70ba7a89-cdb4-4348-b980-e0d3c30970ab, 24, Finished, Available)

SynapseWidget(Synapse.DataFrame, 7e764bc9-5a79-4315-ba34-7d95b2e71dd7)

**Exploring Vacuum command**

Vacuuming with a retention period above the 168-hour minimum

In [24]:
%%sql
-- Vacuum with a 178-hour retention period, i.e. above the 168-hour minimum that the retention check accepts.
VACUUM salesorderheader RETAIN 178 HOURS 

StatementMeta(, 70ba7a89-cdb4-4348-b980-e0d3c30970ab, 26, Finished, Available)

<Spark SQL result set with 1 rows and 1 fields>

Vacuuming with a retention period below the 168-hour minimum (rejected by the default safety check)

In [27]:
# Deliberately request a retention period below the 168-hour minimum to demonstrate
# Delta's retention safety check. The expected IllegalArgumentException is caught and
# printed so that "Run All" can continue past this demonstration cell; any other
# error still propagates.
from pyspark.sql.utils import IllegalArgumentException

try:
  spark.sql('VACUUM salesorderheader RETAIN 1 HOURS')
except IllegalArgumentException as exc:
  print(f'{type(exc).__name__}: {exc}')

StatementMeta(, 70ba7a89-cdb4-4348-b980-e0d3c30970ab, 29, Finished, Available)

IllegalArgumentException: requirement failed: Are you sure you would like to vacuum files with such a low retention period? If you have
writers that are currently writing to this table, there is a risk that you may corrupt the
state of your Delta table.

If you are certain that there are no operations being performed on this table, such as
insert/upsert/delete/optimize, then you may turn off this check by setting:
spark.databricks.delta.retentionDurationCheck.enabled = false

If you are not sure, please use a value not less than "168 hours".
       

In [28]:
# Switch off the retention-duration safety check only for this one VACUUM, then put the
# previous setting back so the rest of the session keeps the protection against
# accidentally vacuuming with a short retention period.
retention_check_key = "spark.databricks.delta.retentionDurationCheck.enabled"
previous_setting = spark.conf.get(retention_check_key, "true")  # the check is on unless configured otherwise

spark.conf.set(retention_check_key, "false")
try:
  vacuum_result = spark.sql('VACUUM salesorderheader RETAIN 1 HOURS')
finally:
  # Restore the earlier value even when the VACUUM statement fails.
  spark.conf.set(retention_check_key, previous_setting)

# Keep the command's result as the cell output.
vacuum_result

StatementMeta(, 70ba7a89-cdb4-4348-b980-e0d3c30970ab, 30, Finished, Available)

DataFrame[path: string]

In [None]:
# Show the table's commit history again after the VACUUM commands.
history_after_vacuum = spark.sql('DESCRIBE HISTORY SalesOrderHeader')
display(history_after_vacuum)

StatementMeta(, , , Waiting, )

SynapseWidget(Synapse.DataFrame, 092738b3-7a70-4e8c-8010-52f81f675927)

# Delta Lake Change Data feed

**Important: change the Spark runtime version in the default environment settings to 1.2 or above**

**Creating a Delta table with change data feed**

In [None]:
-- Syntax example only, left commented out: creating a new Delta table with the
-- change data feed table property enabled from the start.
-- %%sql
-- CREATE TABLE test 
-- (c1 int, c2 int)
-- USING DELTA
-- TBLPROPERTIES (delta.enableChangeDataFeed = true);

**Applying change data feed to existing table**

In [4]:
%%sql
-- Enable the change data feed on the existing table by setting a table property.
ALTER TABLE SalesOrderHeader SET TBLPROPERTIES (delta.enableChangeDataFeed = true);

StatementMeta(, f60811bd-8f71-43ec-931f-0b399a719451, 6, Finished, Available)

<Spark SQL result set with 0 rows and 0 fields>

In [5]:
%%sql
-- Produce one change of each kind for the change data feed.
-- Update: set the comment on a single order.
UPDATE SalesOrderHeader SET Comment='Delivered' WHERE SalesOrderID=71774;
-- Delete: remove a single order.
DELETE FROM SalesOrderHeader WHERE SalesOrderID=71780;
-- Insert: re-insert the existing row(s) of order 71831, which duplicates them in the table.
INSERT INTO SalesOrderHeader  SELECT * FROM SalesOrderHeader WHERE SalesOrderID=71831;

StatementMeta(, , -1, Finished, Available)

<Spark SQL result set with 1 rows and 1 fields>

<Spark SQL result set with 1 rows and 1 fields>

<Spark SQL result set with 0 rows and 0 fields>

In [6]:
# Show the commit history including the UPDATE / DELETE / INSERT statements.
history_after_changes = spark.sql('DESCRIBE HISTORY SalesOrderHeader')
display(history_after_changes)

StatementMeta(, f60811bd-8f71-43ec-931f-0b399a719451, 10, Finished, Available)

SynapseWidget(Synapse.DataFrame, 597c3a30-502f-47c7-bb1f-bc259fbde14d)

In [7]:
# Batch-read the change data feed from version 4 onwards.
# NOTE(review): 4 is hard-coded; presumably it is the version at which the change data
# feed was enabled in the original run - confirm against DESCRIBE HISTORY.
dfc = (
  spark.read
  .format("delta")
  .option("readChangeFeed", "true")
  .option("startingVersion", 4)
  .table("SalesOrderHeader")
)
display(dfc)

StatementMeta(, f60811bd-8f71-43ec-931f-0b399a719451, 11, Finished, Available)

SynapseWidget(Synapse.DataFrame, af19e97f-32f8-440e-b10d-b895de85b550)

**Streaming change feed to Delta Lake destination**

In [8]:
# Destination table name and its Delta path.
tableName='SalesStream'
deltaTablePath='Tables/'+tableName

# Source: the change data feed of SalesOrderHeader, read as a stream from version 4.
change_feed = (
  spark.readStream
  .format("delta")
  .option("readChangeFeed", "true")
  .option("startingVersion", 4)
  .table("SalesOrderHeader")
)

# Keep only the post-update row images.
updated_rows = change_feed.filter("_change_type='update_postimage'")

# Sink: append to a Delta path, with a checkpoint location per destination table.
strm = (
  updated_rows.writeStream
  .queryName('SalesStream')
  .outputMode("append")
  .format("delta")
  .option("path", deltaTablePath)
  .option("checkpointLocation", "Files/Checkpoints/"+tableName)
)
strmHandle = strm.start()

StatementMeta(, f60811bd-8f71-43ec-931f-0b399a719451, 12, Finished, Available)

In [None]:
# Inspect the current status of the streaming query held in strmHandle.
strmHandle.status

StatementMeta(, , , Waiting, )

{'message': 'Getting offsets from DeltaSource[abfss://46df290c-a4bf-4f21-ae5b-b2a4f313e3d8@onelake.dfs.fabric.microsoft.com/07a9d45a-7cef-4ec8-aa73-60477164a94a/Tables/salesorderheader]',
 'isDataAvailable': False,
 'isTriggerActive': True}

In [None]:
# Left commented out, presumably so the stream keeps running for the cells that follow.
# Uncomment and run this once finished to stop the streaming query:
# strmHandle.stop()

StatementMeta(, , , Waiting, )

In [None]:
# Run one more merge for another customer; presumably this produces the update rows
# that the running change feed stream is filtering for.
updateSales(29929)

StatementMeta(, , , Waiting, )

In [None]:
%%sql
-- Query the SalesStream table, the destination of the change feed stream.
select * from SalesStream

StatementMeta(, , , Waiting, )

<Spark SQL result set with 8 rows and 25 fields>

**SQL Syntax to pull changes**

In [11]:
%%sql
-- Read the changes from version 4 onwards, first addressing the table by name ...
SELECT * FROM table_changes('salesorderheader',4);
-- ... and then addressing the same table by its path.
SELECT * FROM table_changes_by_path('Tables/salesorderheader',4)

StatementMeta(, , -1, Finished, Available)

<Spark SQL result set with 4 rows and 25 fields>

<Spark SQL result set with 4 rows and 25 fields>