In [47]:
import os

from delta import configure_spark_with_delta_pip
from delta.tables import DeltaTable
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

In [48]:
# Maven coordinates of the JARs this session needs in addition to Delta Lake.
# They are passed through `extra_packages` rather than set on the builder:
#   * repeated .config() calls with the same key overwrite each other, so
#     setting "spark.jars.packages" twice keeps only the last value, and
#   * configure_spark_with_delta_pip sets "spark.jars.packages" itself
#     (Delta artifact + extra_packages), replacing whatever the builder held.
extra_packages = [
    "uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.5",
    "org.apache.spark:spark-avro_2.12:3.5.0",
]

# Local 4-thread session with the Delta SQL extension and Delta catalog enabled.
builder = (
    SparkSession.builder
    .appName("Data with Nikk the Greek Spark Session")
    .master("local[4]")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

spark = configure_spark_with_delta_pip(builder, extra_packages=extra_packages).getOrCreate()




In [49]:
# Handle to the SparkContext behind the session created above
# (not referenced again in the cells visible in this part of the notebook).
sc = spark.sparkContext

Spark DataFrameWriter: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameWriter.mode.html#pyspark.sql.DataFrameWriter.mode

Spark to Delta mapping: https://docs.delta.io/latest/releases.html
Table protocol (reader and writer) versions: https://docs.delta.io/latest/versioning.html

Get Started: https://delta.io/learn/getting-started/

Deletes, Updates, Merges: https://docs.delta.io/latest/delta-update.html#language-python

In [50]:
# Location of the demo Delta table. Set the DELTA_TEST_PATH environment
# variable to run the notebook on another machine; when it is unset the
# original local path is used, so existing behaviour is unchanged.
path_delta = os.environ.get("DELTA_TEST_PATH", "D:/Data/delta_test")

# Create a table and overwrite

In [51]:
# ids 1..5 (the range end is exclusive) in a single partition, each row
# tagged with the constant name "Nikk".
sdf = (
    spark.range(start=1, end=6, numPartitions=1)
    .withColumn("name", f.lit("Nikk"))
)

# "overwrite" replaces whatever is currently stored at path_delta.
delta_writer = sdf.write.format("delta").mode("overwrite")
delta_writer.save(path_delta)

In [52]:
# Table contents, read back through the DataFrame reader.
spark.read.format("delta").load(path_delta).show()

# Table metadata and history, up to 20 rows each with no column truncation.
deltaTable = DeltaTable.forPath(spark, path_delta)
deltaTable.detail().show(n=20, truncate=False)
deltaTable.history().show(n=20, truncate=False)

+---+----+
| id|name|
+---+----+
|  1|Nikk|
|  2|Nikk|
|  3|Nikk|
|  4|Nikk|
|  5|Nikk|
+---+----+

+------+------------------------------------+----+-----------+------------------------+-----------------------+-----------------------+----------------+--------+-----------+----------+----------------+----------------+------------------------+
|format|id                                  |name|description|location                |createdAt              |lastModified           |partitionColumns|numFiles|sizeInBytes|properties|minReaderVersion|minWriterVersion|tableFeatures           |
+------+------------------------------------+----+-----------+------------------------+-----------------------+-----------------------+----------------+--------+-----------+----------+----------------+----------------+------------------------+
|delta |be104872-c059-4164-9bc4-a0ed0a890f7a|NULL|NULL       |file:/D:/Data/delta_test|2024-05-07 20:16:41.302|2024-05-07 20:16:41.535|[]              |1       |773    

# Append data

In [53]:
# ids 6..10 (the range end is exclusive) in a single partition, each row
# tagged with the constant name "Marko".
sdf = (
    spark.range(start=6, end=11, numPartitions=1)
    .withColumn("name", f.lit("Marko"))
)

# "append" adds these rows to the data already stored at path_delta.
delta_writer = sdf.write.format("delta").mode("append")
delta_writer.save(path_delta)

In [54]:
# Table contents after the append.
spark.read.format("delta").load(path_delta).show()

# Re-open the table, then print its detail and history rows untruncated.
deltaTable = DeltaTable.forPath(spark, path_delta)
deltaTable.detail().show(n=20, truncate=False)
deltaTable.history().show(n=20, truncate=False)

+---+-----+
| id| name|
+---+-----+
|  6|Marko|
|  7|Marko|
|  8|Marko|
|  9|Marko|
| 10|Marko|
|  1| Nikk|
|  2| Nikk|
|  3| Nikk|
|  4| Nikk|
|  5| Nikk|
+---+-----+

+------+------------------------------------+----+-----------+------------------------+-----------------------+-----------------------+----------------+--------+-----------+----------+----------------+----------------+------------------------+
|format|id                                  |name|description|location                |createdAt              |lastModified           |partitionColumns|numFiles|sizeInBytes|properties|minReaderVersion|minWriterVersion|tableFeatures           |
+------+------------------------------------+----+-----------+------------------------+-----------------------+-----------------------+----------------+--------+-----------+----------+----------------+----------------+------------------------+
|delta |be104872-c059-4164-9bc4-a0ed0a890f7a|NULL|NULL       |file:/D:/Data/delta_test|2024-05-07 2

# Delete data

In [55]:
deltaTable = DeltaTable.forPath(spark, path_delta)

# Variant 1 (active): predicate written as a SQL expression string
deltaTable.delete(condition="id == 1")

# Variant 2: the same predicate built with Spark SQL column functions
#deltaTable.delete(f.col("id") == 1)

# Variant 3: a SQL DELETE statement against the path-based table
#spark.sql(f"DELETE FROM delta.`{path_delta}` WHERE id == 1")


In [56]:
# Table contents after the delete.
spark.read.format("delta").load(path_delta).show()

# Re-open the table, then print its detail and history rows untruncated.
deltaTable = DeltaTable.forPath(spark, path_delta)
deltaTable.detail().show(n=20, truncate=False)
deltaTable.history().show(n=20, truncate=False)

+---+-----+
| id| name|
+---+-----+
|  6|Marko|
|  7|Marko|
|  8|Marko|
|  9|Marko|
| 10|Marko|
|  2| Nikk|
|  3| Nikk|
|  4| Nikk|
|  5| Nikk|
+---+-----+

+------+------------------------------------+----+-----------+------------------------+-----------------------+-----------------------+----------------+--------+-----------+----------+----------------+----------------+------------------------+
|format|id                                  |name|description|location                |createdAt              |lastModified           |partitionColumns|numFiles|sizeInBytes|properties|minReaderVersion|minWriterVersion|tableFeatures           |
+------+------------------------------------+----+-----------+------------------------+-----------------------+-----------------------+----------------+--------+-----------+----------+----------------+----------------+------------------------+
|delta |be104872-c059-4164-9bc4-a0ed0a890f7a|NULL|NULL       |file:/D:/Data/delta_test|2024-05-07 20:16:41.302|

# Update data

In [57]:
deltaTable = DeltaTable.forPath(spark, path_delta)

# Variant 1 (active): condition and new value written as SQL expression strings.
# The new value is itself a SQL expression, hence the inner single quotes
# that make it a string literal.
deltaTable.update(
    condition="id = 7",
    set={"name": "'Strahinja'"},
)

# Variant 2: the same update built with Spark SQL column functions
#deltaTable.update(condition = f.col("id") == 7, set = { "name": f.lit("Strahinja")})

# Variant 3: a SQL UPDATE statement against the path-based table
#spark.sql(f"UPDATE delta.`{path_delta}` SET name = 'Strahinja' WHERE id == 7")

In [58]:
# Table contents after the update.
spark.read.format("delta").load(path_delta).show()

# Re-open the table, then print its detail and history rows untruncated.
deltaTable = DeltaTable.forPath(spark, path_delta)
deltaTable.detail().show(n=20, truncate=False)
deltaTable.history().show(n=20, truncate=False)

+---+---------+
| id|     name|
+---+---------+
|  6|    Marko|
|  7|Strahinja|
|  8|    Marko|
|  9|    Marko|
| 10|    Marko|
|  2|     Nikk|
|  3|     Nikk|
|  4|     Nikk|
|  5|     Nikk|
+---+---------+

+------+------------------------------------+----+-----------+------------------------+-----------------------+-----------------------+----------------+--------+-----------+----------+----------------+----------------+------------------------+
|format|id                                  |name|description|location                |createdAt              |lastModified           |partitionColumns|numFiles|sizeInBytes|properties|minReaderVersion|minWriterVersion|tableFeatures           |
+------+------------------------------------+----+-----------+------------------------+-----------------------+-----------------------+----------------+--------+-----------+----------+----------------+----------------+------------------------+
|delta |be104872-c059-4164-9bc4-a0ed0a890f7a|NULL|NULL     