# Chapter 3
## Essential Operations

This is a basic how-to guide following the contents of Chapter 3 of *Delta Lake: The Definitive Guide*.

Consider it a quick field guide to running all your CRUD operations.

In [1]:
# Keep Spark's log output down to errors so the cell outputs stay readable
log_level = "ERROR"
sc.setLogLevel(log_level)

In [2]:
# All example tables live in the exampleDB database; it is only created when missing
create_database_sql = "create database if not exists exampleDB"
spark.sql(create_database_sql)

DataFrame[]

In [3]:
# Create some data
data = [
    (1, "United Kingdom", "London"),
    (2, "Canada", "Toronto"),
]

# Column names for the dataframe
schema = ["id", "country", "capital"]

# Create a dataframe
sdf = spark.createDataFrame(data, schema=schema)

# Save as a csv to use later.
# index=False keeps the pandas row index out of the file, so each line holds
# exactly the three schema columns; header=False writes no header row.
(sdf
 .toPandas()
 .to_csv("countries.csv", header=False, index=False)
)

# Save as a parquet to use later (overwrites the output of any previous run)
sdf.write.format("parquet").mode("overwrite").save("countries.parquet")

                                                                                

In [4]:
# Make sure none of the tables created in this notebook already exist.
# countries_loc and countries2 are created further down without IF NOT EXISTS,
# so they must be dropped here too for the notebook to survive a re-run.
spark.sql("DROP TABLE IF EXISTS exampleDB.countries_loc")
spark.sql("DROP TABLE IF EXISTS exampleDB.countries2")
spark.sql("""
DROP TABLE if exists exampleDB.countries
""")

DataFrame[]

In [5]:
# Define an empty Delta table (no explicit LOCATION) with the three country columns
create_countries_sql = """
CREATE TABLE exampleDB.countries (
  id LONG,
  country STRING,
  capital STRING
) USING DELTA;
"""
spark.sql(create_countries_sql)

                                                                                

DataFrame[]

In [6]:
# Print the current working directory; the next cell uses it as the table LOCATION prefix
!pwd

/opt/spark/work-dir


In [7]:
# Same columns as exampleDB.countries, but stored at an explicit path in the
# working directory so the files (including _delta_log) can be inspected
create_countries_loc_sql = """
CREATE TABLE exampleDB.countries_loc (
  id LONG,
  country STRING,
  capital STRING
) USING DELTA
LOCATION '/opt/spark/work-dir/countries.delta';
"""
spark.sql(create_countries_loc_sql)

DataFrame[]

In [8]:
# Append two literal rows to the table with INSERT INTO ... VALUES
insert_values_sql = """
INSERT INTO exampleDB.countries VALUES
(1, 'United Kingdom', 'London'),
(2, 'Canada', 'Toronto')
"""
spark.sql(insert_values_sql)

DataFrame[]

In [9]:
# Append every row of the parquet directory written earlier to the table
insert_from_parquet_sql = """
INSERT INTO exampleDB.countries
SELECT * FROM parquet.`countries.parquet`;
"""
spark.sql(insert_from_parquet_sql)

DataFrame[]

In [11]:
# CREATE TABLE AS SELECT: build countries2 from the current rows of countries
ctas_sql = """
CREATE TABLE exampleDB.countries2 as
select * from exampleDB.countries
"""
spark.sql(ctas_sql)

DataFrame[]

In [12]:
# Convert the parquet directory written earlier into a Delta table in place
convert_sql = """
CONVERT TO DELTA parquet.`countries.parquet`
"""
spark.sql(convert_sql)

DataFrame[]

#### Check out the `_delta_log` directory and view the contents of the JSON file(s)

In [12]:
# List the transaction log files of the table stored at countries.delta
!tree countries.delta/_delta_log

[01;34mcountries.delta/_delta_log[0m
└── [00m00000000000000000000.json[0m

0 directories, 1 file


In [13]:
import pandas as pd

# Load every file in the transaction log directory as JSON and print the rows untruncated
delta_log_df = spark.read.json("countries.delta/_delta_log/*")
delta_log_df.show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------+
|commitInfo                                                                                                                                         |metaData                                                                                                                                                                                                                                                                                      |protocol|
+-----------------------------------------------------------------------------------------------------------

In [14]:
# Display the rows currently stored in the table
current_countries = spark.table("exampleDB.countries")
current_countries.show()

+---+--------------+-------+
| id|       country|capital|
+---+--------------+-------+
|  1|United Kingdom| London|
|  1|United Kingdom| London|
|  2|        Canada|Toronto|
|  2|        Canada|Toronto|
+---+--------------+-------+



In [15]:
# Project two columns with SQL; the resulting DataFrame is the cell's output
select_id_capital_sql = """
SELECT
    id,
    capital
FROM
    exampleDB.countries
"""
spark.sql(select_id_capital_sql)

DataFrame[id: bigint, capital: string]

In [16]:
# The same projection expressed with the DataFrame API instead of SQL
countries_table = spark.table("exampleDB.countries")
countries_table.select("id", "capital")

DataFrame[id: bigint, capital: string]

In [17]:
# Query the table as it was at version 1 of its history
version_query = """
SELECT DISTINCT id FROM exampleDB.countries VERSION AS OF 1
"""
spark.sql(version_query).show()

+---+
| id|
+---+
|  1|
|  2|
+---+



In [20]:
# Time travel based on timestamp: the timestamp has to be "in bounds", i.e. it
# cannot predate or postdate the available versions of the table.
# The TIMESTAMP AS OF clause is therefore commented out inside the query;
# adjust it to a valid timestamp for your table before enabling it.
timestamp_query = """
SELECT count(1) FROM exampleDB.countries -- |TIMESTAMP AS OF "2024-04-20"
"""
spark.sql(timestamp_query).show()

+--------+
|count(1)|
+--------+
|       4|
+--------+



In [24]:
# Replace the contents of countries_loc with the de-duplicated rows of countries
overwrite_from_table_sql = """
INSERT OVERWRITE TABLE exampleDB.countries_loc
select distinct * from exampleDB.countries
"""
spark.sql(overwrite_from_table_sql)

DataFrame[]

In [25]:
# Change the country name on every row whose id is 1
update_country_sql = """
UPDATE exampleDB.countries
SET country = 'U.K.'
WHERE id = 1;
"""
spark.sql(update_country_sql)

DataFrame[num_affected_rows: bigint]

In [26]:
# No row has id 3 at this point, so this UPDATE is safe but changes nothing
# (the upsert example later in the notebook shows the alternative)
update_missing_sql = """
UPDATE exampleDB.countries
SET country = 'U.S.', capital = 'Washington, D.C.'
WHERE id = 3;
"""
spark.sql(update_missing_sql)

DataFrame[num_affected_rows: bigint]

In [27]:
# A DELETE with no WHERE clause removes every row of the table
delete_all_sql = """
DELETE FROM exampleDB.countries
"""
spark.sql(delete_all_sql)

DataFrame[num_affected_rows: bigint]

In [29]:
# Confirm the table is now empty
select_all_sql = """
select * from exampleDB.countries
"""
spark.sql(select_all_sql).show()

+---+-------+-------+
| id|country|capital|
+---+-------+-------+
+---+-------+-------+



In [30]:
# INSERT OVERWRITE replaces the whole table content with the given row;
# any rows that were already present would be removed as well
overwrite_values_sql = """
INSERT OVERWRITE exampleDB.countries
VALUES (3, 'U.S.', 'Washington, D.C.');
"""
spark.sql(overwrite_values_sql)

DataFrame[]

In [31]:
# Delete with a filtering condition.
# id is a LONG column, so it is compared against a numeric literal rather than
# a date string; only rows with an id below 3 are removed.
spark.sql("""
DELETE FROM exampleDB.countries where id < 3
""")

DataFrame[num_affected_rows: bigint]

In [32]:
# Display the table after the filtered delete
remaining_rows_sql = """
select * from exampleDB.countries
"""
spark.sql(remaining_rows_sql).show()

+---+-------+----------------+
| id|country|         capital|
+---+-------+----------------+
|  3|   U.S.|Washington, D.C.|
+---+-------+----------------+



#### Upsert
Insert records into the table if they don't exist yet, or update them to match the source if they do.

In [33]:
# Upsert the rows of countries.parquet (alias B) into the table (alias A), matching on id:
# matched rows take country and capital from B, unmatched rows are inserted.
upsert_sql = """
MERGE INTO exampleDB.countries A
USING (select * from parquet.`countries.parquet`) B
ON A.id = B.id
WHEN MATCHED THEN
  UPDATE SET
	id = A.id,
	country = B.country,
	capital = B.capital
WHEN NOT MATCHED
  THEN INSERT (
	id,
	country,
	capital
  )
  VALUES (
	B.id,
	B.country,
	B.capital
  )

"""
spark.sql(upsert_sql)

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [34]:
# Display the table after the upsert
merged_rows_sql = """
select * from exampleDB.countries
"""
spark.sql(merged_rows_sql).show()

+---+--------------+----------------+
| id|       country|         capital|
+---+--------------+----------------+
|  1|United Kingdom|          London|
|  3|          U.S.|Washington, D.C.|
|  2|        Canada|         Toronto|
+---+--------------+----------------+

