In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from delta.tables import *
from pyspark.sql.functions import *
from pyspark.sql.functions import sum as _sum

from config import (
    MINIO_ACCESS_KEY,
    MINIO_SECRET_KEY,
    MINIO_SERVER_HOST,
)

In [2]:
# Build the Spark configuration in one chained expression (SparkConf.set
# returns the conf itself, so every call below applies to the same object).
conf = (
    SparkConf()
    # --- S3A filesystem: endpoint and credentials come from config.py ---
    .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .set("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .set("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    .set("spark.hadoop.fs.s3a.endpoint", MINIO_SERVER_HOST)
    # Path-style bucket addressing, and SSL disabled for the connection.
    .set("spark.hadoop.fs.s3a.path.style.access", "true")
    .set("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    # Use the static access/secret key pair set above.
    .set(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
    )
    # --- Maven coordinates resolved at session start-up ---
    # NOTE(review): the resolver output recorded in this notebook lists
    # aws-java-sdk-bundle at 1.11.901 rather than the 1.11.375 pinned here;
    # confirm which version is intended.
    .set(
        "spark.jars.packages",
        ",".join(
            [
                "io.delta:delta-core_2.12:2.0.0",
                "org.apache.hadoop:hadoop-aws:3.2.3",
                "com.amazonaws:aws-java-sdk:1.11.375",
                "com.amazonaws:aws-java-sdk-bundle:1.11.375",
                "software.amazon.awssdk:url-connection-client:2.15.40",
            ]
        ),
    )
    # --- Delta Lake integration: SQL extension and catalog ---
    .set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .set(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    # Enable Delta's repartition-before-write option for MERGE.
    .set("spark.databricks.delta.merge.repartitionBeforeWrite.enabled", "true")
)

# Create (or reuse) a local session that uses every available core.
spark = SparkSession.builder.config(conf=conf).master("local[*]").getOrCreate()

23/05/14 17:13:04 WARN Utils: Your hostname, ducdn-G3-3579 resolves to a loopback address: 127.0.1.1; using 192.168.2.108 instead (on interface wlo1)
23/05/14 17:13:04 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/ducdn/Documents/spark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ducdn/.ivy2/cache
The jars for the packages stored in: /home/ducdn/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
software.amazon.awssdk#url-connection-client added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-cd3c38ea-ed41-42e1-8a02-f639acc59b9c;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.0.0 in central
	found io.delta#delta-storage;2.0.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
	found org.apache.hadoop#hadoop-aws;3.2.3 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.901 in central
	found com.amazonaws#aws-java-sdk;1.11.375 in central
	found com.amazonaws#aws-java-sdk-dlm;1.11.375 in central
	found com.amazonaws#aws-java-sdk-core;1.11.375 in central
	fou

In [3]:
# Common S3A prefix for the Delta tables used below. Table names are appended
# to it by plain string concatenation, so the trailing slash is required.
# NOTE(review): "sliver" may be a misspelling of "silver"; it is left as-is
# because it has to match the path that actually exists in the bucket.
base_path = "s3a://datalake/sliver/"

#### Delta table history

#### Get history of table

In [4]:
def get_full_history_table(table_name: str):
    """Return the commit history of a Delta table as a DataFrame.

    Args:
        table_name: Name of the table; it is appended to ``base_path`` to
            form the table location.

    Returns:
        The DataFrame produced by ``DeltaTable.history()``, called without a
        limit so the full history is returned.
    """
    table_location = base_path + table_name
    return DeltaTable.forPath(spark, table_location).history()

# Fetch the commit history of the "cdc.myshop.orders" table and print it with
# every column shown in full (no truncation of long cell values).
df_history = get_full_history_table("cdc.myshop.orders")
df_history.show(truncate=False)

23/05/14 17:14:21 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

+-------+-------------------+------+--------+---------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----+--------+---------+-----------+--------------+-------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+-----------------------------------+
|version|timestamp          |userId|userName|operation|operationParameters                                                                                                                                                                          

In [5]:
# Print only the version, timestamp and operation columns of the history.
df_history.select("version", "timestamp", "operation").show(truncate=False)

+-------+-------------------+---------+
|version|timestamp          |operation|
+-------+-------------------+---------+
|8      |2023-05-14 17:14:23|MERGE    |
|7      |2023-05-14 17:13:24|MERGE    |
|6      |2023-05-14 17:12:23|MERGE    |
|5      |2023-05-14 17:11:27|MERGE    |
|4      |2023-05-14 17:10:56|MERGE    |
|3      |2023-05-14 17:07:24|MERGE    |
|2      |2023-05-14 17:06:26|MERGE    |
|1      |2023-05-14 17:05:58|MERGE    |
|0      |2023-05-14 17:04:43|WRITE    |
+-------+-------------------+---------+



#### Time travel query by table version

In [6]:
def time_travel_query_version(table_name: str, version: str):
    """Load a Delta table as it was at a specific version.

    Args:
        table_name: Name of the table; it is appended to ``base_path`` to
            form the table location.
        version: Value passed to the Delta reader as the ``versionAsOf``
            option.

    Returns:
        The DataFrame loaded from the table location with ``versionAsOf``
        set to ``version``.
    """
    delta_reader = spark.read.format("delta")
    delta_reader = delta_reader.option("versionAsOf", version)
    return delta_reader.load(base_path + table_name)

# Read the table at version "0" (the initial WRITE in the history listing
# printed earlier), print its rows without truncation, and leave the row
# count as the cell's displayed result.
df = time_travel_query_version("cdc.myshop.orders", "0")
df.show(truncate=False)
df.count()

                                                                                

+---+-------------+---+-------+-----------+---------+----------+---------+----+-----+---+
|op |ts_ms        |id |user_id|payment    |status_id|created_at|before_id|year|month|day|
+---+-------------+---+-------+-----------+---------+----------+---------+----+-----+---+
|c  |1684056264812|14 |9081   |credit_card|1        |1681464144|14       |2023|4    |14 |
|c  |1684056405473|28 |8540   |cash       |1        |1681464144|28       |2023|4    |14 |
|c  |1684058528483|96 |3564   |credit_card|1        |1681466407|96       |2023|4    |14 |
|c  |1684056425578|30 |5709   |credit_card|2        |1683537744|30       |2023|5    |8  |
|c  |1684056927912|80 |7767   |credit_card|3        |1683537744|80       |2023|5    |8  |
|c  |1684056787319|66 |8124   |instalment |3        |1682932944|66       |2023|5    |1  |
|c  |1684056817464|69 |8862   |cash       |3        |1682932944|69       |2023|5    |1  |
|c  |1684056636466|51 |3393   |instalment |2        |1682846544|51       |2023|4    |30 |
|c  |16840

                                                                                

106

#### Time travel query with timestamp

In [None]:
def time_travel_query_timestamp(table_name: str, timestamp: str):
    """Load a Delta table as it was at a specific point in time.

    Args:
        table_name: Name of the table; it is appended to ``base_path`` to
            form the table location.
        timestamp: Value passed to the Delta reader as the ``timestampAsOf``
            option.

    Returns:
        The DataFrame loaded from the table location with ``timestampAsOf``
        set to ``timestamp``.
    """
    delta_reader = spark.read.format("delta")
    delta_reader = delta_reader.option("timestampAsOf", timestamp)
    return delta_reader.load(base_path + table_name)

# Read the table as of a moment inside its recorded commit history.
# timestampAsOf is matched against Delta commit timestamps (the `timestamp`
# column of the history listing), not against the orders' own
# created_at/year/month/day values. The history captured in this notebook
# starts at 2023-05-14 17:04:43, so the previously used "2023-04-16" predates
# the first commit and Delta rejects such a read instead of returning data.
# NOTE(review): pick another timestamp from the history listing if the
# table's commit history differs from the one recorded here.
df = time_travel_query_timestamp("cdc.myshop.orders", "2023-05-14 17:11:00")
df.show(truncate=False)
df.count()