In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql import types as t
from delta import configure_spark_with_delta_pip
from delta.tables import DeltaTable

In [2]:
# Extra JVM libraries (Maven coordinates) the session needs in addition to Delta Lake.
# They are passed to configure_spark_with_delta_pip() instead of being set on the
# builder: that helper writes "spark.jars.packages" itself, so a value configured
# beforehand is replaced, and repeating .config() with the same key keeps only the
# last value anyway.
extra_packages = [
    "uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.5",
    "org.apache.spark:spark-avro_2.12:3.5.0",
]

# Local 4-thread session with the Delta SQL extension and catalog enabled.
builder = (
    SparkSession.builder
    .appName("Data with Nikk the Greek Spark Session")
    .master("local[4]")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Adds the pip-installed Delta artifact plus the extra packages, then starts the session.
spark = configure_spark_with_delta_pip(builder, extra_packages=extra_packages).getOrCreate()




In [3]:
# Handle on the SparkContext behind the session.
# NOTE(review): `sc` is not referenced in the cells visible here - confirm it is still needed.
sc = spark.sparkContext

Spark DataFrameWriter: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameWriter.mode.html#pyspark.sql.DataFrameWriter.mode

Delta Lake to Apache Spark version compatibility: https://docs.delta.io/latest/releases.html
Delta table protocol (reader/writer) versions: https://docs.delta.io/latest/versioning.html

In [4]:
# Target directories for the two demo tables: one written as Delta, one as plain Parquet.
# NOTE(review): these are absolute, machine-specific paths - adjust them before running elsewhere.
path_delta = "D:/Data/delta_test"
path_parquet = "D:/Data/parquet_test"

# Create a table

In [5]:
# Seed both locations with the same single-partition frame holding ids 1..5.
# "overwrite" replaces whatever a previous run left behind.
sdf = spark.range(start=1, end=6, numPartitions=1)
sdf.write.save(path_parquet, format="parquet", mode="overwrite")
sdf.write.save(path_delta, format="delta", mode="overwrite")

In [6]:
# Read each copy back with its own reader and print it; both should list the same ids.
spark.read.load(path_parquet, format="parquet").show()
spark.read.load(path_delta, format="delta").show()

+---+
| id|
+---+
|  1|
|  2|
|  3|
|  4|
|  5|
+---+

+---+
| id|
+---+
|  1|
|  2|
|  3|
|  4|
|  5|
+---+



# Append data

In [7]:
# Add ids 6..10 on top of the existing rows in both locations.
sdf = spark.range(start=6, end=11, numPartitions=1)
sdf.write.save(path_parquet, format="parquet", mode="append")
sdf.write.save(path_delta, format="delta", mode="append")

In [8]:
# Print both copies again to confirm the appended rows are visible in each format.
spark.read.load(path_parquet, format="parquet").show()
spark.read.load(path_delta, format="delta").show()

+---+
| id|
+---+
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
+---+

+---+
| id|
+---+
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
+---+



# Overwrite

In [9]:
# Replace the contents of both locations with a fresh frame holding ids 1..7.
sdf = spark.range(start=1, end=8, numPartitions=1)
sdf.write.save(path_parquet, format="parquet", mode="overwrite")
sdf.write.save(path_delta, format="delta", mode="overwrite")

In [10]:
# Print both copies after the overwrite; each reader should now return only the new rows.
spark.read.load(path_parquet, format="parquet").show()
spark.read.load(path_delta, format="delta").show()

+---+
| id|
+---+
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
+---+

+---+
| id|
+---+
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
+---+



# Convert Parquet to Delta

In [42]:
# Convert the Parquet directory at `path_parquet` into a Delta table, addressing it
# with the parquet.`<path>` identifier form. The call is the cell's last expression,
# so the returned DeltaTable handle is what the notebook displays.
DeltaTable.convertToDelta(spark, f"parquet.`{path_parquet}`")

<delta.tables.DeltaTable at 0x255ce8605b0>

In [41]:
# The former Parquet directory is now loaded through the Delta reader.
spark.read.load(path_parquet, format="delta").show()

+---+
| id|
+---+
|  1|
|  2|
|  3|
|  4|
|  5|
+---+



# Read Delta as Parquet

In [11]:
# Load the Delta directory with the plain Parquet reader instead of the Delta reader.
# As the printed result shows, this also returns rows the overwrite had replaced, so
# it does not reflect the table's current state.
spark.read.load(path_delta, format="parquet").show()

+---+
| id|
+---+
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
+---+



# Delta details

In [29]:
# Open the Delta table by path and print its detail() frame
# (up to 20 rows, columns not truncated).
deltaTable = DeltaTable.forPath(spark, path_delta)
deltaTable.detail().show(n=20, truncate=False)


+------+------------------------------------+----+-----------+------------------------+-----------------------+-----------------------+----------------+--------+-----------+----------+----------------+----------------+------------------------+
|format|id                                  |name|description|location                |createdAt              |lastModified           |partitionColumns|numFiles|sizeInBytes|properties|minReaderVersion|minWriterVersion|tableFeatures           |
+------+------------------------------------+----+-----------+------------------------+-----------------------+-----------------------+----------------+--------+-----------+----------+----------------+----------------+------------------------+
|delta |ee1f1565-df31-4fbd-9533-f7943bfcec4f|NULL|NULL       |file:/D:/Data/delta_test|2024-04-03 22:00:59.482|2024-04-03 22:03:09.383|[]              |1       |522        |{}        |1               |2               |[appendOnly, invariants]|
+------+----------------

# Delta history

In [30]:
# Re-open the table by path and print its history() frame
# (up to 20 rows, columns not truncated).
deltaTable = DeltaTable.forPath(spark, path_delta)
deltaTable.history().show(n=20, truncate=False)

+-------+-----------------------+------+--------+---------+--------------------------------------+----+--------+---------+-----------+--------------+-------------+----------------------------------------------------------+------------+-----------------------------------+
|version|timestamp              |userId|userName|operation|operationParameters                   |job |notebook|clusterId|readVersion|isolationLevel|isBlindAppend|operationMetrics                                          |userMetadata|engineInfo                         |
+-------+-----------------------+------+--------+---------+--------------------------------------+----+--------+---------+-----------+--------------+-------------+----------------------------------------------------------+------------+-----------------------------------+
|2      |2024-04-03 22:03:09.383|NULL  |NULL    |WRITE    |{mode -> Overwrite, partitionBy -> []}|NULL|NULL    |NULL     |1          |Serializable  |false        |{numFiles -> 1, numOu

In [33]:
# Time travel: load the table as of version 2, which the history output lists as the
# overwrite, and print it.
spark.read.load(path_delta, format="delta", versionAsOf=2).show()

+---+
| id|
+---+
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
+---+

