In [0]:
import dlt
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType
from pyspark.sql.types import BooleanType

In [0]:
@dlt.table
def incoming_employees():
    """Return the hard-coded batch of incoming employee rows.

    Each row is ``(emp_id, name, department)``. Only column names are
    supplied, so Spark infers the column types from the Python values.
    """
    column_names = ['emp_id', 'name', 'department']
    rows = [
        (1, 'Alice', 'HR'),          # identical to the seeded employee_dim row
        (2, 'Bob', 'Marketing'),     # department differs from employee_dim (Finance)
        (3, 'Charlie', 'Finance'),   # department differs from employee_dim (IT)
    ]
    return spark.createDataFrame(rows, schema=column_names)
# Here is the incoming/updated data.

In [0]:

@dlt.table
def employee_dim():
    """Return the hard-coded seed rows of the employee dimension.

    Every seeded row is flagged as current, has no end date, and stores
    its start date as a string.
    """
    seed_rows = [
        (1, 'Alice', 'HR', '2023-01-01', None, True),
        (2, 'Bob', 'Finance', '2023-01-01', None, True),
        (3, 'Charlie', 'IT', '2023-01-01', None, True),
    ]

    # emp_id is the only non-nullable column; both date columns are strings.
    dim_schema = (
        StructType()
        .add("emp_id", IntegerType(), False)
        .add("name", StringType(), True)
        .add("department", StringType(), True)
        .add("start_date", StringType(), True)
        .add("end_date", StringType(), True)
        .add("is_current", BooleanType(), True)
    )

    return spark.createDataFrame(seed_rows, schema=dim_schema)


In [0]:
@dlt.table
def changed_employees():
    """Return incoming rows that are new or differ from the current dimension row.

    An incoming row is kept when no current ``employee_dim`` row shares its
    ``emp_id``, or when its ``name`` or ``department`` differs from that
    current row. Only the columns of ``incoming_employees`` are returned.
    """
    incoming = dlt.read("incoming_employees").alias("incoming")
    current = (
        dlt.read("employee_dim")
        .filter(col("is_current") == True)
        .alias("current")
    )

    # Left join keeps incoming rows that have no current dimension row.
    joined = incoming.join(
        current,
        col("incoming.emp_id") == col("current.emp_id"),
        "left",
    )

    is_new = col("current.emp_id").isNull()

    # A plain `!=` evaluates to NULL when either side is NULL, and filter()
    # drops NULL predicates, so a value changing to or from NULL would be
    # missed. eqNullSafe treats NULL as an ordinary comparable value.
    name_changed = ~col("incoming.name").eqNullSafe(col("current.name"))
    department_changed = ~col("incoming.department").eqNullSafe(
        col("current.department")
    )

    return joined.filter(is_new | name_changed | department_changed).select(
        "incoming.*"
    )


In [0]:
@dlt.table
def expired_records():
    """Return the current dimension rows superseded by a changed employee.

    Each current ``employee_dim`` row whose ``emp_id`` appears in
    ``changed_employees`` is emitted with its original attributes and
    ``start_date``, an ``end_date`` of today, and ``is_current`` set to False.
    """
    changed_rows = dlt.read("changed_employees")
    active_dim = dlt.read("employee_dim").filter("is_current = true")

    # Inner join: only changed employees that already have a current row.
    matched = changed_rows.join(active_dim, "emp_id")

    # Carry the attribute values from the dimension side, not the incoming side.
    carried_over = [
        active_dim[column_name]
        for column_name in ("emp_id", "name", "department", "start_date")
    ]

    return matched.select(
        *carried_over,
        current_date().alias("end_date"),
        lit(False).alias("is_current"),
    )


In [0]:
@dlt.table
def new_records():
    """Return the changed employees as freshly opened dimension rows.

    Every row of ``changed_employees`` gets ``start_date`` set to today,
    a null ``end_date`` typed as a date, and ``is_current`` set to True.
    """
    changed_rows = dlt.read("changed_employees")

    with_start = changed_rows.withColumn("start_date", current_date())
    # The explicit cast gives the null column a date type.
    with_end = with_start.withColumn("end_date", lit(None).cast("date"))

    return with_end.withColumn("is_current", lit(True))

In [0]:
@dlt.table
def employee_dim_scd2():
    """Assemble the SCD Type 2 employee dimension.

    The result is the union, by column name, of:

    * ``expired_records`` - current rows closed out by this batch,
    * ``new_records`` - rows opened by this batch,
    * current ``employee_dim`` rows whose ``emp_id`` is not in
      ``changed_employees``,
    * every ``employee_dim`` row that is not flagged current.
    """
    dim = dlt.read("employee_dim")
    expired = dlt.read("expired_records")
    new = dlt.read("new_records")
    changed_ids = dlt.read("changed_employees").select("emp_id")

    # Current rows untouched by this batch: anti-join removes changed emp_ids.
    unchanged = dim.filter("is_current = true").join(
        changed_ids, "emp_id", "left_anti"
    )

    # Rows not flagged current were previously dropped from the result, which
    # erased the history an SCD2 dimension exists to keep. Carry them forward
    # untouched. The null-safe test also keeps rows whose flag is NULL.
    history = dim.filter("NOT (is_current <=> true)")

    # NOTE(review): start_date/end_date are strings in employee_dim but dates
    # in new_records (and end_date is a date in expired_records). unionByName
    # leaves the reconciliation to Spark's union type coercion - confirm the
    # resulting column type is the one intended.
    return (
        expired
        .unionByName(new)
        .unionByName(unchanged)
        .unionByName(history)
    )
