# PySpark / Ibis / Delta Lake

In [1]:
import ibis
import pyspark
from delta import *

In [2]:
# Prepare (but do not start) a Spark session builder for the "MyApp"
# application, then attach the two Delta-related settings one at a time.
builder = pyspark.sql.SparkSession.builder.appName("MyApp")

# Class registered under Spark's SQL extensions setting.
builder = builder.config(
    "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
)

# Class used as the implementation of the `spark_catalog` catalog.
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

In [3]:
# Pass the builder through the delta pip helper, then start the session
# (or pick up one that already exists).
spark = (
    configure_spark_with_delta_pip(builder)
    .getOrCreate()
)

:: loading settings :: url = jar:file:/Users/matthew.powers/opt/miniconda3/envs/pyspark-332-delta-230/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/matthew.powers/.ivy2/cache
The jars for the packages stored in: /Users/matthew.powers/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c336f34c-d466-4ef0-969d-9ea7a4819c54;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.3.0 in central
	found io.delta#delta-storage;2.3.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
:: resolution report :: resolve 103ms :: artifacts dl 3ms
	:: modules in use:
	io.delta#delta-core_2.12;2.3.0 from central in [default]
	io.delta#delta-storage;2.3.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |  

23/05/06 19:30:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


## Create a Delta table

In [4]:
# Three starter rows; column names are assigned afterwards via toDF.
df = (
    spark.createDataFrame(
        [
            (0, "Bob", 75),
            (1, "Sue", 25),
            (2, "Jim", 27),
        ]
    )
    .toDF("id", "name", "age")
)

In [6]:
# Print the starter rows as a text table to confirm the column names.
df.show()

+---+----+---+
| id|name|age|
+---+----+---+
|  0| Bob| 75|
|  1| Sue| 25|
|  2| Jim| 27|
+---+----+---+



In [5]:
# Write the starter rows as a Delta table under tmp/fun_people.
#
# mode("overwrite") keeps this cell re-runnable: with no mode set, Spark's
# default save mode raises an error once tmp/fun_people already exists, so a
# second "Restart & Run All" would stop here. Overwriting an existing Delta
# table adds a new version rather than deleting history, so version 0 (used by
# the time-travel section) still holds these same three rows.
df.write.format("delta").mode("overwrite").save("tmp/fun_people")

                                                                                

23/05/06 19:30:20 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

## Append to the Delta table

In [7]:
# Two additional rows with the same column layout as the starter rows.
# NOTE: this rebinds `df`; the earlier three-row DataFrame is no longer
# reachable under that name after this cell runs.
df = (
    spark.createDataFrame(
        [
            (8, "Larry", 19),
            (9, "Jerry", 69),
        ]
    )
    .toDF("id", "name", "age")
)

In [8]:
# Print the rows that are about to be appended.
df.show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  8|Larry| 19|
|  9|Jerry| 69|
+---+-----+---+



In [9]:
# Append the current `df` to the Delta table at tmp/fun_people.
# Re-running this cell on its own appends the same rows again.
(
    df.write
    .format("delta")
    .mode("append")
    .save("tmp/fun_people")
)

## Create an Ibis table from the Delta table

In [14]:
# Read the Delta table back and print it sorted by id, so the rows from
# both writes appear in a stable order.
(
    spark.read
    .format("delta")
    .load("tmp/fun_people")
    .orderBy("id")
    .show()
)

+---+-----+---+
| id| name|age|
+---+-----+---+
|  0|  Bob| 75|
|  1|  Sue| 25|
|  2|  Jim| 27|
|  8|Larry| 19|
|  9|Jerry| 69|
+---+-----+---+



In [15]:
# Register the Delta table as the temp view "fun_people" so it can be
# looked up by name on this Spark session.
(
    spark.read
    .format("delta")
    .load("tmp/fun_people")
    .createOrReplaceTempView("fun_people")
)

In [16]:
# Open an Ibis PySpark connection on top of the existing Spark session.
con = ibis.pyspark.connect(spark)

In [17]:
# Ibis table expression for the "fun_people" temp view.
table = con.table("fun_people")

In [18]:
# Keep only people aged 50 or over and run the query; the result is the
# cell's displayed value.
(
    table
    .filter(table.age >= 50)
    .execute()
)

Unnamed: 0,id,name,age
0,9,Jerry,69
1,0,Bob,75


In [19]:
# Same query as the previous cell, but the result is copied to the system
# clipboard instead of being displayed.
# NOTE(review): this is an authoring convenience with a side effect outside
# the notebook; it presumably needs a working clipboard on the host — confirm
# before running headless.
(
    table
    .filter(table.age >= 50)
    .execute()
    .to_clipboard()
)

In [14]:
# Preview two rows of the current table.
# NOTE(review): no ordering is applied here, so which two rows come back
# may differ between runs — confirm.
(
    table
    .head(2)
    .execute()
)

Unnamed: 0,id,name,age
0,9,Jerry,69
1,1,Sue,25


In [20]:
# Copy the two-row preview to the system clipboard; nothing is displayed.
# NOTE(review): authoring convenience; presumably needs a working clipboard
# on the host — confirm before running headless.
(
    table
    .head(2)
    .execute()
    .to_clipboard()
)

## Time travel back to version 0 of the data

In [29]:
# Load the table with the "versionAsOf" read option set to "0" and
# register that snapshot as the temp view "fun_people_v0".
(
    spark.read
    .format("delta")
    .option("versionAsOf", "0")
    .load("tmp/fun_people")
    .createOrReplaceTempView("fun_people_v0")
)

In [30]:
# Ibis table expression for the "fun_people_v0" temp view (the table as
# read with versionAsOf "0").
table_v0 = con.table("fun_people_v0")

In [31]:
# Age >= 50 filter against the version-0 snapshot; the result is the
# cell's displayed value.
(
    table_v0
    .filter(table_v0.age >= 50)
    .execute()
)

Unnamed: 0,id,name,age
0,0,Bob,75


In [32]:
# Same version-0 query, copied to the system clipboard instead of displayed.
# NOTE(review): authoring convenience; presumably needs a working clipboard
# on the host — confirm before running headless.
(
    table_v0
    .filter(table_v0.age >= 50)
    .execute()
    .to_clipboard()
)

In [33]:
# Preview two rows of the version-0 snapshot.
# NOTE(review): no ordering is applied here, so which two rows come back
# may differ between runs — confirm.
(
    table_v0
    .head(2)
    .execute()
)

Unnamed: 0,id,name,age
0,2,Jim,27
1,0,Bob,75


In [34]:
# Copy the two-row version-0 preview to the system clipboard; nothing is
# displayed.
# NOTE(review): authoring convenience; presumably needs a working clipboard
# on the host — confirm before running headless.
(
    table_v0
    .head(2)
    .execute()
    .to_clipboard()
)