# MiniBench: Total time DV vs. CoW as % of file deleted

This example runs the DELETE operation with Deletion Vectors (DV) vs. traditional copy-on-write (CoW) across two dimensions: the number of files touched and the percentage of each file deleted.

In [None]:
from delta.tables import *
from pyspark.sql.functions import lit, expr, col
from dataclasses import dataclass
import time

In [None]:
# Storage paths of the two benchmark tables: t1 later has Deletion Vectors
# enabled, t2 is left as the non-DV baseline.
t1, t2 = (
    "/tmp/tables/t10_1000000",
    "/tmp/tables/t10_1000000_nondv",
)

In [None]:
def write_num_files_parquet(path, num_files, num_rows):
    """Append ``num_files`` parquet writes of ``num_rows`` rows each to ``path``.

    Every write is a single-partition ``spark.range(0, num_rows)`` extended with
    a ``data`` column (``uuid()``) and a ``file`` column holding the 1-based
    index of the write it belongs to.

    Args:
        path: destination directory of the parquet data.
        num_files: number of append writes to perform.
        num_rows: number of rows generated for each write.
    """
    for file_index in range(1, num_files + 1):
        # One partition per write, presumably so each append yields one file.
        frame = spark.range(0, num_rows, 1, 1)
        frame = frame.withColumn("data", expr("uuid()"))
        frame = frame.withColumn("file", lit(file_index))

        writer = frame.write.format("parquet").mode("append")
        writer.save(path)


def delete_and_read(dt, num_files, num_rows):
    """Time a DELETE on ``dt`` followed by a full read of the table.

    The DELETE removes rows whose ``id`` is below ``num_rows`` and whose
    ``file`` value is at most ``int(num_files)``. The read aggregates
    ``sum(id)`` over the table and collects the result.

    Args:
        dt: table handle exposing ``delete(condition)`` and ``toDF()``.
        num_files: highest ``file`` value affected (converted with ``int()``).
        num_rows: exclusive upper bound on ``id``; used in the predicate as given.

    Returns:
        Tuple ``(time_to_delete, time_to_read)``, both in nanoseconds.
    """
    # perf_counter_ns() is monotonic, so the measured durations cannot be
    # skewed (or go negative) if the system wall clock is adjusted mid-run.
    delete_started = time.perf_counter_ns()
    dt.delete((col("id") < num_rows) & (col("file") <= int(num_files)))
    time_to_delete = time.perf_counter_ns() - delete_started

    read_started = time.perf_counter_ns()
    dt.toDF().selectExpr("sum(id)").collect()
    time_to_read = time.perf_counter_ns() - read_started

    return (time_to_delete, time_to_read)

In [None]:
# Size of each benchmark table: number of parquet writes, and rows per write.
num_files, num_rows = 10, 1_000_000

In [None]:
# Generate the same layout of parquet data at both table locations.
for table_path in (t1, t2):
    write_num_files_parquet(table_path, num_files, num_rows)

In [None]:
# Convert both parquet directories in place into Delta tables.
parquet_identifier = "parquet.`%s`"
dt1 = DeltaTable.convertToDelta(spark, parquet_identifier % t1)
dt2 = DeltaTable.convertToDelta(spark, parquet_identifier % t2)

In [None]:
# Turn on the Deletion Vectors table feature for t1 only; t2 stays without it.
enable_dv_statement = (
    "ALTER TABLE delta.`%s` SET TBLPROPERTIES ('delta.enableDeletionVectors' = true)"
)
spark.sql(enable_dv_statement % t1)

In [None]:
@dataclass
class TestResult:
    """One timed delete-then-read measurement of the benchmark sweep."""

    # Label of the table variant that was measured ("MoR" or "CoW").
    mode: str
    # Percentage of each touched file's rows that the DELETE targeted.
    percent_file_deleted: int
    # Number of files the DELETE predicate covered.
    num_files_touched: int
    # Zero-based index of the repetition used to smooth out noise.
    run: int
    # Elapsed time of the DELETE, in nanoseconds.
    time_to_delete: float
    # Elapsed time of the follow-up read, in nanoseconds.
    time_to_read: float

In [None]:
# Sweep axes of the benchmark.
percent_of_file_range = [*range(1, 31, 2)]  # odd percentages 1, 3, ..., 29
num_file_range = [*range(1, 11)]  # 1, 2, ..., 10 files touched
# Repetitions per (files, percent) combination, to smooth out noise.
amount_of_iterations_to_smooth = 3
# Collected measurements; filled in by the benchmark loop.
results = []

In [None]:
# Show the version/operation log of both tables before the benchmark starts.
for delta_table in (dt1, dt2):
    delta_table.history().select("version", "operation").show()

In [None]:
# Sweep both dimensions (number of files touched, percent of each file deleted)
# and time delete+read on the DV table and on the non-DV table.
# The loop variable is deliberately NOT called `num_files`: that name holds the
# table-size setting used to generate the data and must not be rebound here.
for files_touched in num_file_range:
    for percent in percent_of_file_range:
        # Same for every repetition and for both tables, so compute it once.
        # NOTE(review): this is a float bound; confirm that rounding never
        # pulls one extra row into the DELETE.
        rows_to_delete = (percent * 0.01) * num_rows

        for run in range(0, amount_of_iterations_to_smooth):
            # Repeat each combination to smooth out noise.

            # Measure the DV-enabled table first, then the non-DV table.
            for mode, table in (("MoR", dt1), ("CoW", dt2)):
                (ted, ter) = delete_and_read(table, files_touched, rows_to_delete)
                tr = TestResult(
                    mode=mode,
                    percent_file_deleted=percent,
                    num_files_touched=files_touched,
                    run=run,
                    time_to_delete=ted,
                    time_to_read=ter,
                )
                results.append(tr)
                print(tr)

            # Undo the deletes so the next measurement starts from the same data.
            dt1.restoreToVersion(1)  # restore to setting DV feature flag (DV table)
            dt2.restoreToVersion(0)  # restore to convert (non DV table)