## Sample Time Travel

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from dotenv import load_dotenv
import os

In [2]:
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

HOST_ADDRESS = os.getenv("HOST_ADDRESS")
MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY")
MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY")

# Fail fast on missing settings: an unset HOST_ADDRESS would otherwise be
# formatted into the endpoint as "http://None:9000" and only fail later,
# at the first S3A access, with a much less obvious error.
_missing_settings = [
    name
    for name, value in (
        ("HOST_ADDRESS", HOST_ADDRESS),
        ("MINIO_ACCESS_KEY", MINIO_ACCESS_KEY),
        ("MINIO_SECRET_KEY", MINIO_SECRET_KEY),
    )
    if not value
]
if _missing_settings:
    raise EnvironmentError(
        "Missing required environment variables: " + ", ".join(_missing_settings)
    )

conf = SparkConf()

conf.setAppName("Sample Time Travel")

# S3A settings: point the Hadoop S3A filesystem at the MinIO endpoint on port 9000.
conf.set("spark.hadoop.fs.s3a.endpoint", f"http://{HOST_ADDRESS}:9000")
conf.set("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
conf.set("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
conf.set("spark.hadoop.fs.s3a.path.style.access", True)
conf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
conf.set('spark.hadoop.fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider')

# Delta Lake settings: register the Delta SQL extension and catalog.
conf.set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
conf.set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

# Hive metastore Thrift endpoint used by enableHiveSupport() below.
conf.set("hive.metastore.uris", "thrift://metastore:9083")

spark = SparkSession.builder.config(conf=conf).enableHiveSupport().getOrCreate()

## Define the data for the DataFrame

In [3]:
# Sample employee rows: (first name, last name, gender, salary).
data2 = [
    ("James", "Smith", "M", 3000),
    ("Michael", "Rose", "M", 6000),
    ("Robert", "Willians", "M", 5500),
    ("Maria", "Anne", "F", 7000),
]

## Define the schema for the DataFrame

In [4]:
# Every column is declared as a nullable string, in this order.
# NOTE(review): salary is declared as a string although the sample rows hold
# integers — confirm this is intended before changing it, since the column
# type becomes part of the written table's schema.
_column_names = ("firsname", "lastname", "gender", "salary")
schema = StructType(
    [StructField(column_name, StringType(), True) for column_name in _column_names]
)

In [5]:
# Build the DataFrame from the sample rows using the explicit schema defined above.
df = spark.createDataFrame(data=data2, schema=schema)

In [6]:
# Print the DataFrame contents as a text table.
df.show()

+--------+--------+------+------+
|firsname|lastname|gender|salary|
+--------+--------+------+------+
|   James|   Smith|     M|  3000|
| Michael|    Rose|     M|  6000|
|  Robert|Willians|     M|  5500|
|   Maria|    Anne|     F|  7000|
+--------+--------+------+------+



## Write the Delta table to MinIO

In [7]:
# Append the sample rows to the Delta table in the bronze bucket.
(
    df.write
    .format("delta")
    .mode("append")
    .save('s3a://bronze/tb_time_travel')
)

## Inspect the table's version history

In [None]:
from delta.tables import DeltaTable
from pyspark.sql import SparkSession

In [None]:
# getOrCreate() returns the already-running session when one exists.
spark = (
    SparkSession.builder
    .appName("DeltaTableHistory")
    .getOrCreate()
)

In [None]:
# Location of the Delta table whose history is inspected and time-travelled below.
# NOTE(review): this is not the 's3a://bronze/tb_time_travel' table written
# earlier in this notebook — confirm this is the intended table.
table_path = 's3a://bronze/adventure_works/bronze_humanresources_department'
# Handle to the Delta table stored at table_path.
delta_table = DeltaTable.forPath(spark, table_path)
# Result of DeltaTable.history() for that table.
history_df = delta_table.history()

In [None]:
# Print the history without truncating long column values.
history_df.show(truncate=False)

## Read first version

In [None]:
# Time travel: load the table as of version 0 and print it.
(
    spark.read
    .format("delta")
    .option("versionAsOf", 0)
    .load(table_path)
    .show()
)

## Read second version

In [None]:
# Time travel: load the table as of version 1 (the second version, since
# versions are numbered from 0) and print it.
spark.read.format("delta").option("versionAsOf", 1).load(table_path).show()

## Roll back to version 0

In [None]:
# Roll back by loading the version-0 snapshot and overwriting the table with it.
# The overwrite is itself a new commit whose contents match version 0.
df_version_0 = spark.read.format("delta").option("versionAsOf", 0).load(table_path)
df_version_0.write.format("delta").mode("overwrite").save(table_path)

## Read the table at its current version

In [None]:
# show() prints the rows and returns None, so keep the loaded DataFrame in its
# own variable rather than binding the result of show() (which would replace
# `df` with None).
current_df = spark.read.format("delta").load('s3a://bronze/tb_time_travel')
current_df.show()