In [22]:
import pathlib
from datetime import datetime
from typing import List, Tuple, Union, Dict

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import Column

from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql import Window

In [23]:
def get_simple_delete_condition(df: DataFrame, col_name: str) -> str:
    """
    Generate a SQL-like condition string to use in a DELETE statement
    based on distinct values in a DataFrame column.

    Parameters:
        df (DataFrame): The input DataFrame.
        col_name (str): The name of the column in the DataFrame.

    Returns:
        str: A condition string in the format "{col_name} IN (value1, value2, ...)".
    """
    distinct_values = df.select(col_name).distinct().collect()
    condition = (
        f"{col_name} IN ("
        + ", ".join(f"'{row[col_name]}'" for row in distinct_values)
        + ")"
    )
    return condition

In [24]:
def generate_delete_condition(df: DataFrame) -> str:
    """
    Generate a combined SQL-like condition string to use in a DELETE statement
    based on distinct values in multiple columns of a DataFrame.

    If the DataFrame contains only one column, a simple delete condition for that column is returned.

    Parameters:
        df (DataFrame): The input DataFrame.

    Returns:
        str: A combined condition string using AND for multiple columns, or a simple condition string for a single column.

    """
    if len(df.columns) == 1:
        return get_simple_delete_condition(df, df.columns[0])
    else:
        where_condition = get_simple_delete_condition(df, df.columns[0])
        i = 1
        while i < len(df.columns):
            where_condition += " AND " + get_simple_delete_condition(df, df.columns[i])
            i += 1

        return where_condition

In [25]:
# Initialize a SparkSession
spark = SparkSession.builder.appName("DeltaLakeDeleteCondition").getOrCreate()

In [26]:
# Sample data
data = [
    ("value1", "A"),
    ("value1", "A"),
    ("value1", "B"),
    ("value2", "D"),
    ("value2", "E"),
    ("value3", "Z"),
]

# Assuming you have a DataFrame with two columns named 'col1' and 'col2'
df = spark.createDataFrame(data, ["col1", "col2"])

df.show()

+------+----+
|  col1|col2|
+------+----+
|value1|   A|
|value1|   A|
|value1|   B|
|value2|   D|
|value2|   E|
|value3|   Z|
+------+----+



In [27]:
# Generate the WHERE condition
where_condition = generate_delete_condition(df)

# Output the WHERE condition
print(where_condition)

col1 IN ('value1', 'value2', 'value3') AND col2 IN ('A', 'B', 'D', 'E', 'Z')


In [28]:
# Sample data
data_simple = [("value1",), ("value1",), ("value2",), ("value3",)]

# Assuming you have a DataFrame with one column named 'col'
df_simple = spark.createDataFrame(data_simple, ["col"])

df_simple.show()

+------+
|   col|
+------+
|value1|
|value1|
|value2|
|value3|
+------+



In [29]:
where_condition_simple = generate_delete_condition(df_simple)

print(where_condition_simple)

col IN ('value1', 'value2', 'value3')


In [30]:
data = [(1, "A"), (2, "B"), (3, "C"), (1, "A")]
columns = ["id", "letter"]

df_test = spark.createDataFrame(data, columns)

df_test.show()

+---+------+
| id|letter|
+---+------+
|  1|     A|
|  2|     B|
|  3|     C|
|  1|     A|
+---+------+



In [31]:
where_condition_test = generate_delete_condition(df_test)

print(where_condition_test)

id IN ('1', '2', '3') AND letter IN ('A', 'B', 'C')


In [36]:
data_schema = T.StructType(
    [
        T.StructField("id", T.IntegerType(), True),
        T.StructField("posted_dt", T.DateType(), True),
        T.StructField("name", T.StringType(), True),
    ]
)

data = (
    [
        {"id": 1, "posted_dt": "2023-08-01", "name": "John1"},
        {"id": 1, "posted_dt": "2023-08-01", "name": "John2"},
        {"id": 2, "posted_dt": "2023-08-02", "name": "Mary"},
        {"id": 3, "posted_dt": "2023-08-01", "name": "Teresa"},
    ],
)

# Convert the date strings to DateType
for d in data:
    d["posted_dt"] = F.to_date(d["posted_dt"])

initial_df = spark.createDataFrame(data, data_schema)

initial_df.show()

TypeError: list indices must be integers or slices, not str

In [42]:
data_schema = T.StructType(
    [
        T.StructField("id", T.IntegerType(), True),
        T.StructField("posted_dt", T.StringType(), True),
        T.StructField("name", T.StringType(), True),
    ]
)

initial_data = [
    {"id": 1, "posted_dt": "2023-08-01", "name": "John1"},
    {"id": 1, "posted_dt": "2023-08-01", "name": "John2"},
    {"id": 2, "posted_dt": "2023-08-02", "name": "Mary"},
    {"id": 3, "posted_dt": "2023-08-01", "name": "Teresa"},
]


initial_df = spark.createDataFrame(initial_data, data_schema)

initial_df = initial_df.withColumn(
    "posted_dt", F.to_date("posted_dt", format="yyyy-MM-dd")
)

initial_df.show()
initial_df.printSchema()

+---+----------+------+
| id| posted_dt|  name|
+---+----------+------+
|  1|2023-08-01| John1|
|  1|2023-08-01| John2|
|  2|2023-08-02|  Mary|
|  3|2023-08-01|Teresa|
+---+----------+------+

root
 |-- id: integer (nullable = true)
 |-- posted_dt: date (nullable = true)
 |-- name: string (nullable = true)

