# Delta Lake table maintenance
- Delta log analysis
- Table history
- Vacuum
- V-Order
- Optimizations

In [None]:
# Preview the first ten rows of lakehouse_gold.sales to confirm the table is reachable.
preview_query = "SELECT * FROM lakehouse_gold.sales LIMIT 10"
df = spark.sql(preview_query)
display(df)

# Delta log analysis

In [None]:
# List the entries stored under the sales table folder.
# The abfss URI is hard-coded; adjust it when running against another workspace or lakehouse.
sales_table_path = "abfss://wwi_03@onelake.dfs.fabric.microsoft.com/lakehouse_gold.Lakehouse/Tables/sales"
files_list = mssparkutils.fs.ls(sales_table_path)
display(files_list)

In [None]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,LongType

# Flatten every listing entry into a plain dict holding only the attributes shown below.
data = [
    {
        "path": entry.path,
        "name": entry.name,
        "size": entry.size,
        "modifyTime": entry.modifyTime,
    }
    for entry in files_list
]

# Explicit schema: two string columns followed by two long columns, all nullable.
schema = (
    StructType()
    .add("path", StringType(), True)
    .add("name", StringType(), True)
    .add("size", LongType(), True)
    .add("modifyTime", LongType(), True)
)

# Turn the listing into a Spark DataFrame and render it.
df = spark.createDataFrame(data, schema=schema)
display(df)

# Table history

In [None]:
%%sql
-- Show the Delta transaction history of the sales table.
-- The table is qualified with its lakehouse so this cell targets the same table as the
-- preview query (lakehouse_gold.sales) regardless of which default lakehouse is attached.
DESCRIBE HISTORY lakehouse_gold.sales

# Vacuum
Vacuum deletes old data files that are no longer referenced by the Delta table's transaction log.
The default file retention threshold is seven days.
When you run Vacuum with the default retention period, unreferenced files older than that threshold are deleted; files still referenced by the current table version are kept.

In [None]:
%%sql
-- Delete data files that the Delta log no longer references and that are older than the
-- retention threshold. No RETAIN clause is given, so the table's configured retention applies.
-- VACUUM removes files permanently, so the target is spelled out with its lakehouse name
-- (matching the preview query) instead of relying on the attached default lakehouse.
VACUUM lakehouse_gold.sales

In [None]:
%%sql
-- Retention settings reference: https://docs.delta.io/latest/delta-batch.html#data-retention
-- delta.logRetentionDuration: controls how long the history for a table is kept. The default is interval 30 days.
-- delta.deletedFileRetentionDuration: controls how long ago a file must have been deleted before it is a candidate for VACUUM. The default is interval 7 days.
--
-- Example statements, intentionally left commented out (uncomment to change the retention of the sales table):
-- ALTER TABLE sales SET TBLPROPERTIES ('delta.logRetentionDuration'='2 days')
-- ALTER TABLE sales SET TBLPROPERTIES ('delta.deletedFileRetentionDuration'='30 days')
-- NOTE(review): the earlier form of these examples wrapped the statements in Python spark.sql(...) calls
-- and read "ALTER TABLE delta sales"; the extra "delta" token looks like a typo. Confirm the intended
-- durations before enabling either statement.

# V-Order
V-Order is a write-time optimization applied to the Parquet files of a table. V-Order sorting increases average write time by about 15%, but allows up to 50% more compression and faster data access.

In [None]:
%%sql
-- Check whether V-Order is enabled for the current Spark session.
-- SET with only a property name (no value) displays the current value and changes nothing.
SET spark.sql.parquet.vorder.enabled 

# Optimizations
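To keep tables in optimum condition for best performance, the OPTIMIZE command compacts many small files into larger, consolidated Parquet files.
OPTIMIZE can be combined with a `ZORDER BY` clause, which co-locates rows with similar values of the given columns, and with the `VORDER` option, which applies V-Order to the rewritten files.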

In [None]:
%%sql
-- Compact the sales table into larger Parquet files, Z-ordering the data by CityKey and
-- applying V-Order to the rewritten files.
-- The table is qualified with its lakehouse (matching the preview query) so the rewrite
-- does not depend on which default lakehouse is attached to the notebook.
OPTIMIZE lakehouse_gold.sales ZORDER BY CityKey VORDER;