# Delta Lake Time Travel

In [1]:
import shutil

import delta
import pyspark
from delta import configure_spark_with_delta_pip

In [2]:
# Start from a plain session builder, then register Delta Lake's SQL
# extension and its catalog implementation on it, one setting at a time.
builder = pyspark.sql.SparkSession.builder.appName("MyApp")
builder = builder.config(
    "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
)
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

In [3]:
# Pass the builder through the Delta pip helper first, then start (or reuse)
# the session from the builder it returns.
delta_ready_builder = configure_spark_with_delta_pip(builder)
spark = delta_ready_builder.getOrCreate()

:: loading settings :: url = jar:file:/Users/matthew.powers/opt/miniconda3/envs/pyspark-330-delta-210/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/matthew.powers/.ivy2/cache
The jars for the packages stored in: /Users/matthew.powers/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-a3d5ac44-640c-4a59-8ad8-923f7468bebd;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.1.0 in central
	found io.delta#delta-storage;2.1.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
:: resolution report :: resolve 362ms :: artifacts dl 24ms
	:: modules in use:
	io.delta#delta-core_2.12;2.1.0 from central in [default]
	io.delta#delta-storage;2.1.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	org.codehaus.jackson#jackson-core-asl;1.9.13 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number|

23/02/03 14:39:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


## Create Delta table

In [4]:
# Rows for the initial table write: a single `id` column holding 0, 1, 2.
df = spark.range(start=0, end=3)

In [5]:
# Start from a clean slate so the notebook survives Restart & Run All.
# The write below uses the default save mode, which raises if the path already
# exists. Switching to "overwrite" would instead add new versions on top of the
# old table, so the version numbers read back later would no longer match.
# NOTE: this deletes the demo table directory under the working directory.
shutil.rmtree("tmp/some_nums", ignore_errors=True)

# First commit: creates the Delta table (version 0) as a single data file.
df.repartition(1).write.format("delta").save("tmp/some_nums")

                                                                                

In [6]:
# Rows for the append step: ids 8, 9, 10.
df = spark.range(start=8, end=11)

In [7]:
# Second commit: append the new ids to the existing table (becomes version 1).
(
    df.repartition(1)
    .write.format("delta")
    .mode("append")
    .save("tmp/some_nums")
)

                                                                                

In [8]:
# Rows that will replace the table contents in the overwrite step.
replacement_rows = [(55,), (66,), (77,)]
df = spark.createDataFrame(replacement_rows).toDF("id")

In [9]:
# Third commit: overwrite the table contents (becomes version 2). Earlier
# versions remain readable through time travel, as the reads below show.
(
    df.repartition(1)
    .write.format("delta")
    .mode("overwrite")
    .save("tmp/some_nums")
)

                                                                                

## Read different versions of the data

In [10]:
# No version option is given, so this reads the latest snapshot of the table.
latest_snapshot = spark.read.format("delta").load("tmp/some_nums")
latest_snapshot.show()

+---+
| id|
+---+
| 55|
| 66|
| 77|
+---+



In [11]:
# Time travel to version 0: the table as it was after the initial write.
version_0_reader = spark.read.format("delta").option("versionAsOf", "0")
version_0_reader.load("tmp/some_nums").show()

                                                                                

+---+
| id|
+---+
|  0|
|  1|
|  2|
+---+



In [12]:
# Time travel to version 1: the initial rows plus the appended ones.
version_1_reader = spark.read.format("delta").option("versionAsOf", "1")
version_1_reader.load("tmp/some_nums").show()

                                                                                

+---+
| id|
+---+
|  8|
|  9|
| 10|
|  0|
|  1|
|  2|
+---+



In [13]:
# Time travel to version 2: the table after the overwrite.
version_2_reader = spark.read.format("delta").option("versionAsOf", "2")
version_2_reader.load("tmp/some_nums").show()

+---+
| id|
+---+
| 55|
| 66|
| 77|
+---+



In [14]:
from delta.tables import DeltaTable

In [15]:
# Get a DeltaTable handle for the table stored at the path written above.
table_path = "tmp/some_nums"
delta_table = DeltaTable.forPath(spark, table_path)

In [16]:
# Show one row per commit, limited to the columns that identify each version.
history_columns = ["version", "timestamp", "operation"]
commit_history = delta_table.history().select(*history_columns)
commit_history.show(truncate=False)

+-------+-----------------------+---------+
|version|timestamp              |operation|
+-------+-----------------------+---------+
|2      |2023-02-03 14:40:25.46 |WRITE    |
|1      |2023-02-03 14:40:18.877|WRITE    |
|0      |2023-02-03 14:40:05.617|WRITE    |
+-------+-----------------------+---------+

