# Delta Lake Operations on Spark 3

Check the Spark version.

In [2]:
# Report which Spark release this session runs on.
# `sc` is not defined in this notebook; it is expected to be provided by the kernel session.
print(f"Spark version: {sc.version}")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Spark version: 3.0.1-amzn-0

# Setup Delta Lake Engine Layer

__The following Spark configuration is essential: it makes the Spark engine use the Delta Lake storage layer.__

In [6]:
%%configure -f
{
    "conf":  { 
             "spark.jars":"s3://chen115y-jar-deltalake/delta-core_2.12-0.7.0.jar",
             "spark.sql.extensions":"io.delta.sql.DeltaSparkSessionExtension",
             "spark.sql.catalog.spark_catalog":"org.apache.spark.sql.delta.catalog.DeltaCatalog"
           } 
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
10,application_1611346943813_0014,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
10,application_1611346943813_0014,pyspark,idle,Link,Link,✔


In [7]:
# Register the Delta Lake jar as a Python dependency of the Spark context.
# NOTE(review): presumably this is what makes `delta.tables` importable in the
# next cell (the jar is assumed to bundle the Python package) - confirm.
delta_core_jar = "s3://chen115y-jar-deltalake/delta-core_2.12-0.7.0.jar"
sc.addPyFile(delta_core_jar)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Import Delta Lake Library

In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, lit
from delta.tables import DeltaTable

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Delta Lake Operations - Table Save

In [6]:
# Create a small demo DataFrame (ids 0 through 4) and persist it as a Delta table.
data = spark.range(0, 5)

# Use an explicit save mode: the DataFrameWriter default ("errorifexists") makes
# this cell fail on every re-run once the table path already exists, which breaks
# "Restart & Run All". With "overwrite" the cell is re-runnable; Delta records the
# overwrite as a new table version, so earlier versions stay available for the
# time-travel reads later in the notebook.
(
    data.write
    .format("delta")
    .mode("overwrite")
    .save("s3a://chen115y-test/deltalake-target-data/data-extract-EMR")
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Delta Lake Operations - Table Read

In [9]:
# Load the Delta table from its S3 location and print its rows.
# `df` is kept as a notebook-level name because later cells reuse it.
df = (
    spark.read
    .format("delta")
    .load("s3a://chen115y-test/deltalake-target-data/data-extract-EMR")
)
df.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+
| id|
+---+
| 11|
|  3|
|  4|
|  2|
+---+

# Delta Lake Operations - Table Linkage

In [10]:
# Obtain a DeltaTable handle for the table stored at this path; the update and
# merge cells below operate through this handle.
deltaTable = DeltaTable.forPath(
    spark,
    "s3a://chen115y-test/deltalake-target-data/data-extract-EMR",
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Delta Lake Operations - Table Update

In [11]:
# Rewrite every row whose id equals 1 so that its id becomes 11.
# The values in `set` are SQL expression strings, hence "11" rather than 11.
deltaTable.update(
    condition="id = 1",
    set={"id": "11"},
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Delta Lake Operations - Table Merge

In [12]:
# Merge `df` (source, alias "s") into the Delta table (target, alias "t"),
# matching rows on id. `df` was loaded from the same path as `deltaTable`,
# so this merges the table with itself.
#   - matched rows whose source id is 0 are deleted;
#   - the remaining matched rows have id set to the source id
#     (the same value, given the s.id = t.id join condition).
(
    deltaTable.alias("t")
    .merge(df.alias("s"), "s.id = t.id")
    .whenMatchedDelete(condition="s.id = '0'")
    .whenMatchedUpdate(set={"id": "s.id"})
    .execute()
)

# Generate a symlink-format manifest for the table.
deltaTable.generate("symlink_format_manifest")

# Print the rows of `df`, the DataFrame loaded from the table path in an earlier cell.
df.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+
| id|
+---+
|  3|
|  4|
|  2|
| 11|
+---+

# Delta Lake Operations - Table Time Travel

In [13]:
# Time travel: read the table as of version 0 and print its rows.
df1 = (
    spark.read
    .format("delta")
    .option("versionAsOf", 0)
    .load("s3a://chen115y-test/deltalake-target-data/data-extract-EMR")
)
df1.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+
| id|
+---+
|  1|
|  4|
|  0|
|  3|
|  2|
+---+

In [14]:
# Time travel: read the table as of version 1 and print its rows.
df2 = (
    spark.read
    .format("delta")
    .option("versionAsOf", 1)
    .load("s3a://chen115y-test/deltalake-target-data/data-extract-EMR")
)
df2.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+
| id|
+---+
|  4|
| 11|
|  0|
|  3|
|  2|
+---+