# Quickstart: Delta Lake



In [2]:
from delta import *
from pyspark.sql import SparkSession

builder = (
    SparkSession.builder.appName("MyApp")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Create a Delta table
#data = spark.range(3, 8)
#data.write.format("delta").save("/tmp/delta-table3")

# Read data from the Delta table
df = spark.read.format("delta").load("/tmp/delta-table3")
df.show()

24/12/08 18:37:58 WARN Utils: Your hostname, Steves-Mac-mini.local resolves to a loopback address: 127.0.0.1; using 10.0.0.21 instead (on interface en1)
24/12/08 18:37:58 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/slin/.ivy2/cache
The jars for the packages stored in: /Users/slin/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-39c5a79c-f5c4-4ab8-bc55-4a092f73ebea;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.2.0 in central
	found io.delta#delta-storage;3.2.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central


:: loading settings :: url = jar:file:/opt/homebrew/lib/python3.13/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


:: resolution report :: resolve 90ms :: artifacts dl 4ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.2.0 from central in [default]
	io.delta#delta-storage;3.2.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |   0   |   0   ||   3   |   0   |
	---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submit-parent-39c5a79c-f5c4-4ab8-bc55-4a092f73ebea
	confs: [default]
	0 artifacts copied, 3 already retrieved (0kB/3ms)
24/12/08 18:37:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log l

+---+
| id|
+---+
|  3|
|  7|
|  5|
|  4|
|  6|
+---+



## Creating a Delta Lake Table

-- SQL
CREATE TABLE exampleDB.countries (
 id LONG,
 country STRING,
 capital STRING
) USING DELTA;

In [7]:
# Python
from pyspark.sql.types import *
from delta.tables import *

delta_table = (
    DeltaTable
    #.create(spark)
    .createIfNotExists(spark)
    .tableName("countries")
    .addColumn("id", dataType=LongType(), nullable=False)
    .addColumn("country", dataType=StringType(), nullable=False)
    .addColumn("capital", dataType=StringType(), nullable=False)
    .execute()
)

INSERT INTO

-- SQL
INSERT INTO countries VALUES
(1, 'United Kingdom', 'London'),
(2, 'Canada', 'Toronto')
With PySpark DataFrame syntax, you just need to specify that inserting records into
a  specific  table  is  the  destination  of  a  write  operation  with  insertInto  (note  that
columns are aligned positionally, so column names will be ignored with this method):

In [8]:
data = [
    (1, "United Kingdom", "London"),
    (2, "Canada", "Toronto")
    ]
schema = ["id", "country", "capital"]
df = spark.createDataFrame(data, schema=schema)
(
df
.write
.format("delta")
.insertInto("countries")
)

                                                                                

Append

In [9]:
data = [(3, "United States", "Washington, D.C.")]
# Define the schema for the Delta table
schema = ["id", "country", "capital"]

# Create a DataFrame from the sample data and schema
df = spark.createDataFrame(data, schema=schema)
# Write the DataFrame to a Delta table in append mode
# (if the table doesn't exist, it will be created)
(df.write.format("delta").mode("append").saveAsTable("countries"))

CREATE TABLE AS SELECT

## Querying Data

a  high-level
understanding of how partition filtering works (which is explored much more deeply
in Chapters 5 and 10) and how the transaction log allows querying views of the data
from previous versions with time travel

In [11]:
from delta.tables import DeltaTable

delta_table = DeltaTable.forName(spark, "countries")
df=delta_table.toDF()
df.show()

+---+--------------+----------------+
| id|       country|         capital|
+---+--------------+----------------+
|  3| United States|Washington, D.C.|
|  1|United Kingdom|          London|
|  2|        Canada|         Toronto|
+---+--------------+----------------+



Filter Query
-- SQL
SELECT * FROM exampleDB.countries
WHERE capital = "London"

In [14]:
filtered_df=df.filter(df.capital == "London")
filtered_df.show()

+---+--------------+-------+
| id|       country|capital|
+---+--------------+-------+
|  1|United Kingdom| London|
+---+--------------+-------+



-- SQL
SELECT
    id,
    capital
FROM
    countries

In [16]:
select_df=df.select("id", "capital")
select_df.show()

+---+----------------+
| id|         capital|
+---+----------------+
|  3|Washington, D.C.|
|  1|          London|
|  2|         Toronto|
+---+----------------+



-- SQL
SELECT DISTINCT id FROM countries VERSION AS OF 1

In [20]:
read_df = (
    spark.read.option("versionAsOf", "1")
    .load("countries")
    .select("id")
    .distinct()
)
read_df.show()

AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/Volumes/InvestAI/investai/investai-binder/notebook/countries.

In [None]:
#-- SQL
#SELECT count(1) FROM exampleDB.countries TIMESTAMP AS OF "2024-04-20"
# Python
(
spark
.read
.option("timestampAsOf", "2024-04-20")
.load("countries.delta")
.count()
)

Update

-- SQL
UPDATE countries
SET { country = 'U.K.' }
WHERE id = 1;

In [24]:
delta_table.update(condition="id = 1", set={"country": "'U.K'"})
df.show()

+---+-------------+----------------+
| id|      country|         capital|
+---+-------------+----------------+
|  3|United States|Washington, D.C.|
|  2|       Canada|         Toronto|
|  1|          U.K|          London|
+---+-------------+----------------+



Delete

-- SQL
DELETE FROM countries
WHERE id = 1;

In [25]:
from pyspark.sql.functions import col

delta_table.delete("id = 1")  # uses SQL expression
delta_table.delete(col("id") == 2)  # uses PySpark expression

df.show()

+---+-------------+----------------+
| id|      country|         capital|
+---+-------------+----------------+
|  3|United States|Washington, D.C.|
+---+-------------+----------------+



Overwrite

In [26]:
(
    spark.createDataFrame(
        [(1, "India", "New Delhi"), (4, "Australia", "Canberra")],
        schema=["id", "country", "capital"],
    )
    .write.format("delta")
    .mode("overwrite")  # specify the output mode
    .saveAsTable("countries")
)

df.show()

+---+---------+---------+
| id|  country|  capital|
+---+---------+---------+
|  4|Australia| Canberra|
|  1|    India|New Delhi|
+---+---------+---------+



Merge

-- SQL
MERGE INTO countries A
USING (select * from parquet.`countries.parquet`) B
ON A.id = B.id
WHEN MATCHED THEN
  UPDATE SET
    id = A.id,
    country = B.country,
    capital = B.capital
WHEN NOT MATCHED
  THEN INSERT (
    id,
    country,
    capital
  )
  VALUES (
    B.id,
    B.country,
    B.capital
  )

In [28]:
idf = (
    spark
    .createDataFrame([
        (1, 'India', 'New Delhi'),
        (4, 'Australia', 'Canberra')],
        schema=["id", "country", "capital"]
        )
    )
delta_table.alias("target").merge(
    source=idf.alias("source"), condition="source.id = target.id"
).whenMatchedUpdate(
    set={"country": "source.country", "capital": "source.capital"}
).whenNotMatchedInsert(
    values={"id": "source.id", "country": "source.country", "capital": "source.capital"}
).execute()

df.show()

+---+---------+---------+
| id|  country|  capital|
+---+---------+---------+
|  1|    India|New Delhi|
|  4|Australia| Canberra|
+---+---------+---------+



## Conversion

-- SQL
CONVERT TO DELTA parquet.`countries.parquet`

In [31]:
from delta.tables import DeltaTable

'''delta_table = (
    DeltaTable
    .convertToDelta(
        spark, 
        "parquet.`countries.parquet`"
        )
    )'''
delta_table.detail()
delta_table.history()
# df.show()

DataFrame[version: bigint, timestamp: timestamp, userId: string, userName: string, operation: string, operationParameters: map<string,string>, job: struct<jobId:string,jobName:string,jobRunId:string,runId:string,jobOwnerId:string,triggerType:string>, notebook: struct<notebookId:string>, clusterId: string, readVersion: bigint, isolationLevel: string, isBlindAppend: boolean, operationMetrics: map<string,string>, userMetadata: string, engineInfo: string]