# PySpark Append and Overwrite in Delta Lake

In [62]:
import pyspark
from delta import *

In [63]:
# Register Delta Lake's SQL extension and catalog on the session builder.
# The settings are applied one per statement instead of in a single chain.
builder = pyspark.sql.SparkSession.builder.appName("MyApp")
builder = builder.config(
    "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
)
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

In [64]:
# Hand the builder to the delta helper, then start (or reuse) the Spark session.
spark = (
    configure_spark_with_delta_pip(builder)
    .getOrCreate()
)

## Append and Overwrite with Parquet

In [65]:
# Column names shared by every example DataFrame in this notebook.
columns = [
    "singer",
    "country",
]

In [66]:
# Single-row DataFrame used as the initial contents of the example tables.
# createDataFrame builds it directly from the local list, so no intermediate
# RDD (sparkContext.parallelize + toDF) is needed.
data1 = [("feid", "colombia")]
df1 = spark.createDataFrame(data1, columns)

In [67]:
# Write df1 to tmp/singers1 as a single Parquet part file.
# No save mode is set here, so the writer's default mode applies.
(
    df1.repartition(1)
    .write.format("parquet")
    .save("tmp/singers1")
)

In [68]:
# List the files produced by the first Parquet write.
!tree tmp/singers1

[01;34mtmp/singers1[0m
├── [00m_SUCCESS[0m
└── [00mpart-00000-ffcc616b-4009-462a-a60d-9e2bd7130083-c000.snappy.parquet[0m

0 directories, 2 files


In [69]:
# Single-row DataFrame used for the append / ignore examples.
# createDataFrame builds it directly from the local list, so no intermediate
# RDD (sparkContext.parallelize + toDF) is needed.
data2 = [("annita", "brasil")]
df2 = spark.createDataFrame(data2, columns)

In [70]:
# Append df2 to the existing Parquet directory as one additional part file.
(
    df2.repartition(1)
    .write.mode("append")
    .format("parquet")
    .save("tmp/singers1")
)

In [71]:
# Read the Parquet directory back and display its rows after the append.
(
    spark.read.format("parquet")
    .load("tmp/singers1")
    .show()
)

+------+--------+
|singer| country|
+------+--------+
|annita|  brasil|
|  feid|colombia|
+------+--------+



In [72]:
# List the Parquet directory again now that the append has run.
!tree tmp/singers1

[01;34mtmp/singers1[0m
├── [00m_SUCCESS[0m
├── [00mpart-00000-49da366f-fd15-481b-a3a4-8b3bd26ef2c7-c000.snappy.parquet[0m
└── [00mpart-00000-ffcc616b-4009-462a-a60d-9e2bd7130083-c000.snappy.parquet[0m

0 directories, 3 files


In [73]:
# Single-row DataFrame used for the overwrite examples.
# createDataFrame builds it directly from the local list, so no intermediate
# RDD (sparkContext.parallelize + toDF) is needed.
data3 = [("rihanna", "barbados")]
df3 = spark.createDataFrame(data3, columns)

In [74]:
# Overwrite the Parquet directory with df3.
(
    df3.repartition(1)
    .write.mode("overwrite")
    .format("parquet")
    .save("tmp/singers1")
)

In [75]:
# Read the Parquet directory back and display its rows after the overwrite.
(
    spark.read.format("parquet")
    .load("tmp/singers1")
    .show()
)

+-------+--------+
| singer| country|
+-------+--------+
|rihanna|barbados|
+-------+--------+



In [76]:
# List the Parquet directory after the overwrite to see which files remain on disk.
!tree tmp/singers1

[01;34mtmp/singers1[0m
├── [00m_SUCCESS[0m
└── [00mpart-00000-63531918-401d-4983-8848-7b99fff39713-c000.snappy.parquet[0m

0 directories, 2 files


## Append and Overwrite with Delta Lake

In [77]:
# Create the Delta table at tmp/singers2 from df1, written as a single part file.
(
    df1.repartition(1)
    .write.format("delta")
    .save("tmp/singers2")
)

                                                                                

In [78]:
# List the Delta table directory (data files plus the _delta_log folder) after the first write.
!tree tmp/singers2

[01;34mtmp/singers2[0m
├── [01;34m_delta_log[0m
│   └── [00m00000000000000000000.json[0m
└── [00mpart-00000-946ae20f-fa5a-4e92-b1c9-49322594609a-c000.snappy.parquet[0m

1 directory, 2 files


In [79]:
# Read the Delta table back and display its rows after the initial write.
(
    spark.read.format("delta")
    .load("tmp/singers2")
    .show()
)

+------+--------+
|singer| country|
+------+--------+
|  feid|colombia|
+------+--------+



In [80]:
# Append df2 to the existing Delta table.
(
    df2.repartition(1)
    .write.mode("append")
    .format("delta")
    .save("tmp/singers2")
)

In [81]:
# List the Delta table directory again now that the append has run.
!tree tmp/singers2

[01;34mtmp/singers2[0m
├── [01;34m_delta_log[0m
│   ├── [00m00000000000000000000.json[0m
│   └── [00m00000000000000000001.json[0m
├── [00mpart-00000-946ae20f-fa5a-4e92-b1c9-49322594609a-c000.snappy.parquet[0m
└── [00mpart-00000-adda870a-83a2-4f5c-82a0-c6ecc60d9d2e-c000.snappy.parquet[0m

1 directory, 4 files


In [82]:
# Read the Delta table back and display its rows after the append.
(
    spark.read.format("delta")
    .load("tmp/singers2")
    .show()
)

+------+--------+
|singer| country|
+------+--------+
|annita|  brasil|
|  feid|colombia|
+------+--------+



In [83]:
# Overwrite the Delta table's contents with df3.
(
    df3.repartition(1)
    .write.mode("overwrite")
    .format("delta")
    .save("tmp/singers2")
)

In [84]:
# List the Delta table directory after the overwrite to see which files remain on disk.
!tree tmp/singers2

[01;34mtmp/singers2[0m
├── [01;34m_delta_log[0m
│   ├── [00m00000000000000000000.json[0m
│   ├── [00m00000000000000000001.json[0m
│   └── [00m00000000000000000002.json[0m
├── [00mpart-00000-2d176e2d-66e0-44b6-8922-6bc3a15a6b96-c000.snappy.parquet[0m
├── [00mpart-00000-946ae20f-fa5a-4e92-b1c9-49322594609a-c000.snappy.parquet[0m
└── [00mpart-00000-adda870a-83a2-4f5c-82a0-c6ecc60d9d2e-c000.snappy.parquet[0m

1 directory, 6 files


In [85]:
# Read the Delta table back and display its rows after the overwrite.
(
    spark.read.format("delta")
    .load("tmp/singers2")
    .show()
)

+-------+--------+
| singer| country|
+-------+--------+
|rihanna|barbados|
+-------+--------+



## PySpark error / errorifexists save mode

In [87]:
# First write with mode "error": creates the Delta table at tmp/singers3 from df1.
(
    df1.repartition(1)
    .write.mode("error")
    .format("delta")
    .save("tmp/singers3")
)

                                                                                

In [88]:
# tmp/singers3 was created by the previous cell, so "error" mode refuses this write
# and raises an AnalysisException. The exception is the point of the demonstration,
# so catch and print it: an uncaught error would stop Restart & Run All here and the
# remaining cells (including the cleanup of tmp/) would never execute.
try:
    (
        df2.repartition(1)
        .write.mode("error")
        .format("delta")
        .save("tmp/singers3")
    )
except pyspark.sql.utils.AnalysisException as exc:
    print(f"{type(exc).__name__}: {exc}")

AnalysisException: Cannot write to already existent path file:/Users/matthew.powers/Documents/code/my_apps/delta-examples/notebooks/pyspark/tmp/singers3 without setting OVERWRITE = 'true'.

## PySpark ignore save mode

In [89]:
# First write with mode "ignore": creates the Delta table at tmp/singers4 from df1.
(
    df1.repartition(1)
    .write.mode("ignore")
    .format("delta")
    .save("tmp/singers4")
)

                                                                                

In [90]:
# Read tmp/singers4 back and display its rows after the first "ignore"-mode write.
(
    spark.read.format("delta")
    .load("tmp/singers4")
    .show()
)

+------+--------+
|singer| country|
+------+--------+
|  feid|colombia|
+------+--------+



In [91]:
# Second write with mode "ignore", this time with df2, against the already-existing table.
(
    df2.repartition(1)
    .write.mode("ignore")
    .format("delta")
    .save("tmp/singers4")
)

                                                                                

In [93]:
# Display df2 itself, i.e. the row that the "ignore"-mode write above was given.
df2.show()

+------+-------+
|singer|country|
+------+-------+
|annita| brasil|
+------+-------+



In [92]:
# Read tmp/singers4 back and display its rows after the second "ignore"-mode write.
(
    spark.read.format("delta")
    .load("tmp/singers4")
    .show()
)

+------+--------+
|singer| country|
+------+--------+
|  feid|colombia|
+------+--------+



In [94]:
# List the tmp/singers4 table directory after both "ignore"-mode writes.
!tree tmp/singers4

[01;34mtmp/singers4[0m
├── [01;34m_delta_log[0m
│   ├── [00m00000000000000000000.json[0m
│   └── [00m00000000000000000001.json[0m
└── [00mpart-00000-e992ef60-1c11-423a-820c-50397e2f9ab1-c000.snappy.parquet[0m

1 directory, 3 files


In [96]:
# Print the second transaction-log entry (version 1) of the tmp/singers4 table.
!cat tmp/singers4/_delta_log/00000000000000000001.json

{"commitInfo":{"timestamp":1664801261318,"operation":"WRITE","operationParameters":{"mode":"Ignore","partitionBy":"[]"},"readVersion":0,"isolationLevel":"SnapshotIsolation","isBlindAppend":true,"operationMetrics":{},"engineInfo":"Apache-Spark/3.3.0 Delta-Lake/2.1.0","txnId":"dcb6992b-24cb-4f3e-bcd3-203837c7986e"}}


## Cleanup

In [86]:
# Delete the tmp/ directory holding every table written by this notebook.
!rm -rf tmp