## Quickstart

Code snippets are from [this guide](https://docs.delta.io/latest/quick-start.html).

In [1]:
import shutil

import pyspark
from delta import *
from delta.tables import DeltaTable
from pyspark.sql.functions import col, expr

In [2]:
# Describe the session: app name plus the two settings that enable Delta Lake's
# SQL extension and make the Delta catalog the session catalog.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("MyApp")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

In [3]:
# Let the pip-installed delta package add its Delta Lake dependency to the
# builder, then start the session (or reuse one that is already running).
spark = (
    configure_spark_with_delta_pip(builder)
    .getOrCreate()
)

22/05/30 12:31:49 WARN Utils: Your hostname, Matthews-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.1.2 instead (on interface en0)
22/05/30 12:31:49 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/Users/powers/.sdkman/candidates/spark/3.2.0/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/powers/.ivy2/cache
The jars for the packages stored in: /Users/powers/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-4c6d32a1-bd27-4039-ad75-47fac8cfbd51;1.0
	confs: [default]
	found io.delta#delta-core_2.12;1.2.1 in central
	found io.delta#delta-storage;1.2.1 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
downloading https://repo1.maven.org/maven2/io/delta/delta-core_2.12/1.2.1/delta-core_2.12-1.2.1.jar ...
	[SUCCESSFUL ] io.delta#delta-core_2.12;1.2.1!delta-core_2.12.jar (486ms)
downloading https://repo1.maven.org/maven2/io/delta/delta-storage/1.2.1/delta-storage-1.2.1.jar ...
	[SUCCESSFUL ] io.delta#delta-storage;1.2.1!delta-storage.jar (69ms)
downloading https://repo1.maven.org/maven2/org/antlr/antlr4-runtime/4.8/antlr4-runtime-4.8.jar ...
	[SUCCESSFUL ] org.antlr#antlr4-runtime;4.8!antlr4-runtime.jar

## Create a table

In [4]:
# Start from a clean slate so the notebook survives Restart & Run All: a plain
# save() uses Spark's default "errorifexists" mode and raises when the table
# directory is left over from a previous run. Recreating the table also keeps
# version 0 equal to this first write.
# NOTE: shutil only touches the local filesystem; adjust this cleanup if the
# path resolves to a different storage system in your environment.
shutil.rmtree("tmp/delta-table", ignore_errors=True)

# Write the ids 0-4 as a new Delta table.
data = spark.range(0, 5)
data.write.format("delta").save("tmp/delta-table")

                                                                                

## Read a table

In [5]:
# Load the Delta table's current contents into a DataFrame.
df = (
    spark.read
    .format("delta")
    .load("tmp/delta-table")
)

In [7]:
# Print the rows. No ordering is requested, so the ids may appear in any order.
df.show()

+---+
| id|
+---+
|  3|
|  0|
|  2|
|  1|
|  4|
+---+



## Update table - overwrite

In [8]:
# Replace the table's contents with the ids 5-9.
data = spark.range(5, 10)
(
    data.write
    .format("delta")
    .mode("overwrite")
    .save("tmp/delta-table")
)

                                                                                

In [9]:
# Re-read the table to confirm it now holds the overwritten rows.
df = (
    spark.read
    .format("delta")
    .load("tmp/delta-table")
)
df.show()

+---+
| id|
+---+
|  7|
|  5|
|  9|
|  6|
|  8|
+---+



## Conditional update without overwrite

In [11]:
# DeltaTable, col and expr come from the explicit imports in the first cell.
# Wildcard-importing pyspark.sql.functions would shadow Python builtins such as
# sum, min and max for the rest of the session.

# Handle to the existing Delta table so it can be modified in place.
deltaTable = DeltaTable.forPath(spark, "tmp/delta-table")

# Update every even value by adding 100 to it
deltaTable.update(
    condition=expr("id % 2 == 0"),
    set={"id": expr("id + 100")},
)

# Delete every even value
deltaTable.delete(condition=expr("id % 2 == 0"))

# Upsert (merge) new data: ids already in the table are updated, the rest inserted
newData = spark.range(0, 20)

(
    deltaTable.alias("oldData")
    .merge(newData.alias("newData"), "oldData.id = newData.id")
    .whenMatchedUpdate(set={"id": col("newData.id")})
    .whenNotMatchedInsert(values={"id": col("newData.id")})
    .execute()
)

                                                                                

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
| 16|
| 17|
| 18|
| 19|
+---+



In [12]:
# Display the table's contents after the update, delete, and merge.
deltaTable.toDF().show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
| 16|
| 17|
| 18|
| 19|
+---+



## Read older versions of data using time travel

In [14]:
# Time travel: versionAsOf=0 reads the table as it was after its first write.
df = (
    spark.read
    .format("delta")
    .option("versionAsOf", 0)
    .load("tmp/delta-table")
)
df.show()

                                                                                

+---+
| id|
+---+
|  3|
|  0|
|  2|
|  1|
|  4|
+---+

