## Sample Time Travel

In [1]:
import os

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Build the Spark configuration for this notebook: S3A object-store access,
# the Delta Lake SQL extension/catalog, and the Hive metastore URI.
conf = SparkConf()

conf.setAppName("Sample Time Travel")

# The endpoint can be overridden with S3A_ENDPOINT; the fallback is the
# address this notebook has always pointed at.
conf.set(
    "spark.hadoop.fs.s3a.endpoint",
    os.environ.get("S3A_ENDPOINT", "http://172.21.121.140:9000"),
)

# Credentials are taken from the environment so they are never stored in the
# notebook. A missing variable raises KeyError naming the variable, which is
# preferable to silently starting a session that cannot authenticate.
conf.set("spark.hadoop.fs.s3a.access.key", os.environ["AWS_ACCESS_KEY_ID"])
conf.set("spark.hadoop.fs.s3a.secret.key", os.environ["AWS_SECRET_ACCESS_KEY"])

# Passed as an explicit string instead of relying on the Python bool being
# stringified on its way into the JVM configuration.
conf.set("spark.hadoop.fs.s3a.path.style.access", "true")
conf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
conf.set(
    "spark.hadoop.fs.s3a.aws.credentials.provider",
    "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
)

# Delta Lake integration.
conf.set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
conf.set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

# Hive metastore location.
conf.set("hive.metastore.uris", "thrift://metastore:9083")

# Create (or reuse) the session with Hive support enabled.
spark = SparkSession.builder.config(conf=conf).enableHiveSupport().getOrCreate()

## Add data for dataframe

In [2]:
# Sample rows in the order (first name, last name, gender, salary).
data2 = [
    ("James", "Smith", "M", 3000),
    ("Michael", "Rose", "M", 6000),
    ("Robert", "Willians", "M", 5500),
    ("Maria", "Anne", "F", 7000),
]

## Add schema for dataframe

In [3]:
# Four nullable string columns, in the same order as the tuples in the data.
# NOTE(review): "salary" is declared as a string although the sample rows
# supply integers, and "firsname" looks like a typo for "firstname". Both are
# kept as-is because these names/types are what the table is written with.
schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ("firsname", "lastname", "gender", "salary")
])

In [4]:
# Materialise the sample rows as a DataFrame using the explicit schema.
df = spark.createDataFrame(data2, schema)

In [5]:
# Print the DataFrame's rows as a text table.
df.show()

+--------+--------+------+------+
|firsname|lastname|gender|salary|
+--------+--------+------+------+
|   James|   Smith|     M|  3000|
| Michael|    Rose|     M|  6000|
|  Robert|Willians|     M|  5500|
|   Maria|    Anne|     F|  7000|
+--------+--------+------+------+



## Write the Delta table to MinIO

In [14]:
# Append the rows to the Delta table in the "bronze" bucket. Because the mode
# is "append", every execution of this cell adds the rows again.
(
    df.write
    .format("delta")
    .mode("append")
    .save("s3a://bronze/tb_time_travel")
)

## Inspect the table's version history

In [7]:
from delta.tables import DeltaTable
from pyspark.sql import SparkSession

In [8]:
# Obtain a session handle for the history queries below.
# NOTE(review): getOrCreate() is expected to reuse the session configured at
# the top of the notebook when it is still active — confirm this cell is needed.
spark = (
    SparkSession.builder
    .appName("DeltaTableHistory")
    .getOrCreate()
)

In [23]:
# Location of the Delta table; later cells reuse this variable.
table_path = "s3a://bronze/tb_time_travel"

# Bind a DeltaTable handle to that path and fetch its history as a DataFrame.
delta_table = DeltaTable.forPath(spark, table_path)
history_df = delta_table.history()

In [24]:
# Print the history rows without truncating long column values.
history_df.show(truncate=False)

+-------+-------------------+------+--------+---------+--------------------------------------+----+--------+---------+-----------+--------------+-------------+-----------------------------------------------------------+------------+-----------------------------------+
|version|timestamp          |userId|userName|operation|operationParameters                   |job |notebook|clusterId|readVersion|isolationLevel|isBlindAppend|operationMetrics                                           |userMetadata|engineInfo                         |
+-------+-------------------+------+--------+---------+--------------------------------------+----+--------+---------+-----------+--------------+-------------+-----------------------------------------------------------+------------+-----------------------------------+
|2      |2024-08-15 21:25:56|null  |null    |WRITE    |{mode -> Overwrite, partitionBy -> []}|null|null    |null     |1          |Serializable  |false        |{numFiles -> 4, numOutputRows -> 4

## Read first version

In [17]:
# Time travel: load the table as of version 0 and print it.
(
    spark.read
    .format("delta")
    .option("versionAsOf", 0)
    .load(table_path)
    .show()
)

+--------+--------+------+------+
|firsname|lastname|gender|salary|
+--------+--------+------+------+
|  Robert|Willians|     M|  5500|
| Michael|    Rose|     M|  6000|
|   James|   Smith|     M|  3000|
|   Maria|    Anne|     F|  7000|
+--------+--------+------+------+



## Read second version

In [28]:
# Time travel: load the table as of version 1 and print it.
(
    spark.read
    .format("delta")
    .option("versionAsOf", 1)
    .load(table_path)
    .show()
)

+--------+--------+------+------+
|firsname|lastname|gender|salary|
+--------+--------+------+------+
|  Robert|Willians|     M|  5500|
|  Robert|Willians|     M|  5500|
| Michael|    Rose|     M|  6000|
| Michael|    Rose|     M|  6000|
|   James|   Smith|     M|  3000|
|   James|   Smith|     M|  3000|
|   Maria|    Anne|     F|  7000|
|   Maria|    Anne|     F|  7000|
+--------+--------+------+------+



## Rollback to version 0

In [21]:
# Roll back by loading the version-0 snapshot and overwriting the table with it.
# NOTE(review): DeltaTable.restoreToVersion(0) may be a more direct way to do
# this — confirm the installed Delta Lake release provides it before switching.
df_version_0 = (
    spark.read
    .format("delta")
    .option("versionAsOf", 0)
    .load(table_path)
)
(
    df_version_0.write
    .format("delta")
    .mode("overwrite")
    .save(table_path)
)

## Read the table at its current version

In [22]:
# Load the table at its latest version. The DataFrame is assigned first and
# displayed in a separate statement: show() only prints and returns None, so
# chaining it onto the assignment would leave `df` bound to None.
df = spark.read.format("delta").load('s3a://bronze/tb_time_travel')
df.show()

+--------+--------+------+------+
|firsname|lastname|gender|salary|
+--------+--------+------+------+
|  Robert|Willians|     M|  5500|
| Michael|    Rose|     M|  6000|
|   James|   Smith|     M|  3000|
|   Maria|    Anne|     F|  7000|
+--------+--------+------+------+

