# Chapter 3
## Essential Operations

This is a secondary notebook dedicated mostly to using the DeltaTable API.

This depends on previously running the first chapter 3 notebook.

In [1]:
# Reduce logging
sc.setLogLevel("ERROR")

In [2]:
# Create a DeltaTable object to interact with
from delta.tables import DeltaTable

delta_table = DeltaTable.forPath(spark, "countries.delta")
delta_table.toDF()


DataFrame[id: bigint, country: string, capital: string]

In [3]:
# View the table history
delta_table.history().show()

+-------+--------------------+------+--------+------------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+
|version|           timestamp|userId|userName|   operation| operationParameters| job|notebook|clusterId|readVersion|isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+--------------------+------+--------+------------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+
|      9|2024-05-29 12:57:...|  NULL|    NULL|       MERGE|{predicate -> ["(...|NULL|    NULL|     NULL|          8|  Serializable|        false|{numTargetRowsCop...|        NULL|Apache-Spark/3.5....|
|      8|2024-05-29 12:55:...|  NULL|    NULL|      DELETE|{predicate -> ["(...|NULL|    NULL|     NULL|          7|  Serializable|        false|{numRemovedFiles ...|        NULL|Apache-Spark/3.5.

In [4]:
# With a specific timestamp you can restore time specific versions
# delta_table.restoreToTimestamp('2024-05-24').show()

In [5]:
# You can also restore by version number (easier in this case since we just made the tables)
delta_table.restoreToVersion(1)

                                                                                

DataFrame[table_size_after_restore: bigint, num_of_files_after_restore: bigint, num_removed_files: bigint, num_restored_files: bigint, removed_files_size: bigint, restored_files_size: bigint]

In [6]:
# With Pyspark you can use time travel to read from the table, update the value accordingly
spark.read.option("timestampAsOf", "2024-05-29 12:31:00").load("countries.delta").count()

9

In [7]:
# Pyspark time travel by version number
spark.read.option("versionAsOf", "1").load("countries.delta").select("id").distinct().show()

+---+
| id|
+---+
|  4|
|  1|
|  2|
+---+



In [8]:
# Delteing values with Pyspark functions
from pyspark.sql.functions import col

delta_table.delete(col("id") == 1)
df = delta_table.toDF()
df.show()

+---+-------+-------+
| id|country|capital|
+---+-------+-------+
|  2| Canada|Toronto|
+---+-------+-------+



In [9]:
# permissions error? works correctly though
(
spark
.createDataFrame(
    [
    (1, 'India', 'New Delhi'),
    (4, 'Australia', 'Canberra')
    ],
    schema=["id", "country", "capital"]
    )
.write
.format("delta")
.mode("overwrite")
.save("exampleDB.delta")
)

                                                                                

In [10]:
# not working right with the derby metastore but if you need to replace with the DeltaTable API this is how it is done
# delta_table2 = (
#     DeltaTable.replace(spark)
#     .tableName("countries.delta")
#     .addColumns(df.schema)
#     .execute()
#     )

In [11]:
# Using the mergeBuilder to upsert values
idf = (
    spark
    .createDataFrame([
        (1, 'India', 'New Delhi'),
        (4, 'Australia', 'Canberra')],
        schema=["id", "country", "capital"]
        )
    )

delta_table.alias("target").merge(
    source = idf.alias("source"),
    condition = "source.id = target.id"
  ).whenMatchedUpdate(set =
    {
      "country": "source.country",
      "capital": "source.capital"
    }
  ).whenNotMatchedInsert(values =
    {
      "id": "source.id",
      "country": "source.country",
      "capital": "source.capital"
    }
  ).execute()

In [12]:
# Check the final results
delta_table.toDF().show()

+---+---------+---------+
| id|  country|  capital|
+---+---------+---------+
|  4|Australia| Canberra|
|  1|    India|New Delhi|
|  2|   Canada|  Toronto|
+---+---------+---------+

