In [0]:
from delta.tables import DeltaTable


In [0]:
# data load
delta_path = "/Volumes/workspace/ecommerce/ecommerce_data/delta/events_octnov"
delta_table = DeltaTable.forPath(spark, delta_path)


In [0]:
df_incremental = spark.createDataFrame([
        ("2019-11-06T08:10:35.000+00:00", "view", 1304849,2053013558920217191,'computers.notebook','acer',2312.13,531136086,'8ba7b208-cf77-4d2d-b4c0-195b635d1853'),   # new event to insert
        ("2019-11-01T00:00:02.000+00:00", "view", 1004258,2053013555631882655,'electronics.smartphone','apple',732.07,532647354,'d2d3d2c6-631d-489e-9fb5-06f340b85be0')          # existing event â†’ should update or skip
    ],
    ["event_time", "event_type", "product_id","category_id","category_code","brand","price","user_id","user_session"]   
)

#MERGE into the Delta table

merge_result = delta_table.alias("t").merge(
        df_incremental.alias("s"),
        "t.event_time = s.event_time AND t.user_session = s.user_session AND t.event_type = s.event_type"
    ).whenMatchedUpdateAll() \
    .whenNotMatchedInsertAll() \
    .execute()


DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]


In [0]:
merge_result.show()

+-----------------+----------------+----------------+-----------------+
|num_affected_rows|num_updated_rows|num_deleted_rows|num_inserted_rows|
+-----------------+----------------+----------------+-----------------+
|                2|               1|               0|                1|
+-----------------+----------------+----------------+-----------------+



In [0]:
from pyspark.sql.functions import col

result_df = spark.read.format("delta").load(delta_path) \
    .filter(col("user_session") == "8ba7b208-cf77-4d2d-b4c0-195b635d1853") \
    .orderBy(col("event_time").desc())

display(result_df)


event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
2019-11-08T08:17:28.000Z,view,1307340,2053013558920217191,computers.notebook,asus,360.34,531136086,8ba7b208-cf77-4d2d-b4c0-195b635d1853
2019-11-08T08:16:16.000Z,view,1307340,2053013558920217191,computers.notebook,asus,360.34,531136086,8ba7b208-cf77-4d2d-b4c0-195b635d1853
2019-11-08T08:14:03.000Z,view,1304849,2053013558920217191,computers.notebook,acer,2312.13,531136086,8ba7b208-cf77-4d2d-b4c0-195b635d1853
2019-11-08T08:12:45.000Z,view,1304849,2053013558920217191,computers.notebook,acer,2312.13,531136086,8ba7b208-cf77-4d2d-b4c0-195b635d1853
2019-11-08T08:07:22.000Z,view,1306175,2053013558920217191,computers.notebook,acer,474.14,531136086,8ba7b208-cf77-4d2d-b4c0-195b635d1853
2019-11-08T08:07:04.000Z,cart,1306175,2053013558920217191,computers.notebook,acer,474.14,531136086,8ba7b208-cf77-4d2d-b4c0-195b635d1853
2019-11-08T08:07:01.000Z,view,1306175,2053013558920217191,computers.notebook,acer,474.14,531136086,8ba7b208-cf77-4d2d-b4c0-195b635d1853
2019-11-08T08:06:15.000Z,view,1306175,2053013558920217191,computers.notebook,acer,474.14,531136086,8ba7b208-cf77-4d2d-b4c0-195b635d1853
2019-11-08T07:55:19.000Z,view,1307004,2053013558920217191,computers.notebook,lenovo,290.6,531136086,8ba7b208-cf77-4d2d-b4c0-195b635d1853
2019-11-08T07:53:43.000Z,view,1307067,2053013558920217191,computers.notebook,lenovo,261.55,531136086,8ba7b208-cf77-4d2d-b4c0-195b635d1853


In [0]:
delta_table.history().show(50, truncate=False)

+-------+-------------------+--------------+--------------------------+---------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----+------------------+------------------------+-----------+-----------------+-------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
# timetravel gives the first version
df_old = spark.read.format("delta").option("versionAsOf", 0).load(delta_path)
df_old.show()

+-------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|   brand|  price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+
|2019-11-15 13:39:42|      view|   6200987|2053013552293216471|appliances.enviro...|    NULL| 188.91|529566948|79c74259-bd2d-4c4...|
|2019-11-15 13:39:42|      view|  12720600|2053013553559896355|                NULL| joyroad|   38.1|560797597|959ed83a-58f3-415...|
|2019-11-15 13:39:42|      view|   2702420|2053013563911439225|appliances.kitche...|dauscher| 270.25|554869281|e44552eb-e03f-4f1...|
|2019-11-15 13:39:42|      view|  18300526|2053013558945383017|     accessories.bag|   trust|  28.29|513395415|a3090ecc-c02d-4e7...|
|2019-11-15 13:39:42|      view|   1201504|2172371436436455782|  elec

In [0]:
df_old.count()
df_new = spark.read.format("delta").load(delta_path)
df_new.count()

109950744

In [0]:
#optimize - compact many small files into few large files improve read performance. 
spark.sql(f"OPTIMIZE delta.`{delta_path}`")

DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,clusteringStats:struct<inputZCubeFiles:struct<numFiles:bigint,size:bigint>,inputOtherFiles:struct<numFiles:bigint,size:bigint>,inputNumZCubes:bigint,mergedFiles:struct<numFiles:bigint,size:bigint>,numOutputZCubes:bigint>,numBins:bigint,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,

In [0]:
# zorder organizes the data inside the files so that rows with similar values of event_time are stored close together
 
spark.sql(f"OPTIMIZE delta.`{delta_path}` ZORDER BY (event_time)")

DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,clusteringStats:struct<inputZCubeFiles:struct<numFiles:bigint,size:bigint>,inputOtherFiles:struct<numFiles:bigint,size:bigint>,inputNumZCubes:bigint,mergedFiles:struct<numFiles:bigint,size:bigint>,numOutputZCubes:bigint>,numBins:bigint,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,

In [0]:
# check delta history to see the optimize and zorder operations
#from delta.tables import DeltaTable
DeltaTable.forPath(spark, delta_path).history().show(truncate=False)

+-------+-------------------+--------------+--------------------------+---------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----+------------------+------------------------+-----------+-----------------+-------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
#  clean up old physical files 

spark.sql(f"VACUUM delta.`{delta_path}`")

DataFrame[path: string]