# Delta Lake Delete Rows

In [3]:
import delta
import pyspark.sql.functions as F
import pyspark

In [4]:
# Assemble the session builder one setting at a time: first the app name,
# then Delta's SQL extension, then the Delta catalog as `spark_catalog`.
builder = pyspark.sql.SparkSession.builder.appName("MyApp")
builder = builder.config(
    "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
)
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

In [5]:
# Pass the builder through Delta's pip helper, then start the session
# (getOrCreate reuses a session if one is already running).
spark = delta.configure_spark_with_delta_pip(builder).getOrCreate()

:: loading settings :: url = jar:file:/Users/matthew.powers/opt/miniconda3/envs/pyspark-340-delta-240/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/matthew.powers/.ivy2/cache
The jars for the packages stored in: /Users/matthew.powers/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-87baa461-ea9e-418c-87b8-0f992e7e47f2;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 109ms :: artifacts dl 5ms
	:: modules in use:
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default    

## Create Delta Table

In [23]:
# First batch of rows; the column names are given as the schema directly,
# so no separate rename step is needed.
df = spark.createDataFrame(
    [("bob", 3), ("sue", 5)],
    schema=["first_name", "age"],
)

In [24]:
# Collapse to one partition before writing so the table starts out with a
# single Parquet data file.
(
    df.repartition(1)
    .write.format("delta")
    .save("tmp/sunny-table")
)

                                                                                

In [25]:
# List the table directory: one commit file in _delta_log and one Parquet data file.
!tree tmp/sunny-table

[01;34mtmp/sunny-table[0m
├── [01;34m_delta_log[0m
│   └── [00m00000000000000000000.json[0m
└── [00mpart-00000-bbea1957-356a-4f89-b544-62e687ec0766-c000.snappy.parquet[0m

1 directory, 2 files


In [26]:
# Second batch of rows, built with the same two-column layout as the first.
df = spark.createDataFrame(
    [("ingrid", 58), ("luisa", 87)],
    schema=["first_name", "age"],
)

In [27]:
# Append the second batch to the existing table, again as a single
# Parquet data file.
(
    df.repartition(1)
    .write.mode("append")
    .format("delta")
    .save("tmp/sunny-table")
)

                                                                                

In [28]:
# List the table directory after the append: a second commit file and a second Parquet data file.
!tree tmp/sunny-table

[01;34mtmp/sunny-table[0m
├── [01;34m_delta_log[0m
│   ├── [00m00000000000000000000.json[0m
│   └── [00m00000000000000000001.json[0m
├── [00mpart-00000-bbea1957-356a-4f89-b544-62e687ec0766-c000.snappy.parquet[0m
└── [00mpart-00000-f529dc4b-b649-45d6-9b9a-40dc1b2eb5a0-c000.snappy.parquet[0m

1 directory, 4 files


In [29]:
# Read the table back; rows from both writes should be present.
spark.read.load("tmp/sunny-table", format="delta").show()

+----------+---+
|first_name|age|
+----------+---+
|    ingrid| 58|
|     luisa| 87|
|       bob|  3|
|       sue|  5|
+----------+---+



## Delete Rows from Table

In [30]:
# Get a DeltaTable handle for the table at this path; the delete below runs through it.
dt = delta.DeltaTable.forPath(spark, "tmp/sunny-table")

In [31]:
# Delete every row whose age is greater than 75 (of the rows shown above, only luisa, 87).
dt.delete(F.col("age") > 75)

                                                                                

In [32]:
# Read the table again to confirm the delete: only rows with age <= 75 remain.
spark.read.load("tmp/sunny-table", format="delta").show()

+----------+---+
|first_name|age|
+----------+---+
|    ingrid| 58|
|       bob|  3|
|       sue|  5|
+----------+---+



In [33]:
# List the table directory after the delete: a third commit file and a third Parquet file,
# with both earlier data files still on disk.
!tree tmp/sunny-table

[01;34mtmp/sunny-table[0m
├── [01;34m_delta_log[0m
│   ├── [00m00000000000000000000.json[0m
│   ├── [00m00000000000000000001.json[0m
│   └── [00m00000000000000000002.json[0m
├── [00mpart-00000-38a10c12-1b5f-4dc0-a4bc-0aacd9a71f0a-c000.snappy.parquet[0m
├── [00mpart-00000-bbea1957-356a-4f89-b544-62e687ec0766-c000.snappy.parquet[0m
└── [00mpart-00000-f529dc4b-b649-45d6-9b9a-40dc1b2eb5a0-c000.snappy.parquet[0m

1 directory, 6 files


In [35]:
# Pretty-print the commit written by the delete: it records a `remove` of one existing
# data file and an `add` of a newly written one.
!jq . tmp/sunny-table/_delta_log/00000000000000000002.json

[1;39m{
  [0m[34;1m"remove"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"path"[0m[1;39m: [0m[0;32m"part-00000-f529dc4b-b649-45d6-9b9a-40dc1b2eb5a0-c000.snappy.parquet"[0m[1;39m,
    [0m[34;1m"deletionTimestamp"[0m[1;39m: [0m[0;39m1669903430340[0m[1;39m,
    [0m[34;1m"dataChange"[0m[1;39m: [0m[0;39mtrue[0m[1;39m,
    [0m[34;1m"extendedFileMetadata"[0m[1;39m: [0m[0;39mtrue[0m[1;39m,
    [0m[34;1m"partitionValues"[0m[1;39m: [0m[1;39m{}[0m[1;39m,
    [0m[34;1m"size"[0m[1;39m: [0m[0;39m746[0m[1;39m
  [1;39m}[0m[1;39m
[1;39m}[0m
[1;39m{
  [0m[34;1m"add"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"path"[0m[1;39m: [0m[0;32m"part-00000-38a10c12-1b5f-4dc0-a4bc-0aacd9a71f0a-c000.snappy.parquet"[0m[1;39m,
    [0m[34;1m"partitionValues"[0m[1;39m: [0m[1;39m{}[0m[1;39m,
    [0m[34;1m"size"[0m[1;39m: [0m[0;39m747[0m[1;39m,
    [0m[34;1m"modificationTime"[0m[1;39m: [0m[0;39m1669903430331[0m[1;39m,
    [0m[34;1m"dataChange

## Cleanup

In [19]:
# Remove the whole tmp directory, and with it the table this notebook created.
!rm -rf tmp