In [0]:
import pandas as pd
from delta.tables import DeltaTable
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

In [0]:

# Sample data for customers (initial snapshot + updates)
data = [
    (1, "Alice", "Sydney", "2023-01-01T10:00:00"),
    (2, "Bob", "Perth", "2023-02-01T09:00:00"),
    (1, "Alice", "Melbourne", "2023-03-01T12:00:00"),  # Update city for Alice
    (3, "Charlie", "Brisbane", "2023-04-01T14:30:00"), # New customer
]

columns = ["customer_id", "name", "city", "update_timestamp"]

# Create a Pandas DataFrame
pdf = pd.DataFrame(data, columns=columns)

# Convert to Spark DataFrame
df = spark.createDataFrame(pdf)

# Write to Volume as CSV (overwrite for demo)
df.write.mode("overwrite").option("header", "true").csv("/Volumes/sivaadbuc/default/batch18test/customers_raw/")


In [0]:
%sql
show volumes

In [0]:
# Location under /Volumes where the SCD2 Delta table is checked for and written.
target_path = "/Volumes/sivaadbuc/default/batch18test/customers_scd2"

In [0]:
# Add SCD2 metadata columns to the first snapshot:
#   __START_AT   - update_timestamp cast to a timestamp
#   __END_AT     - NULL timestamp (version is still open)
#   __IS_CURRENT - True for every row
# NOTE(review): every row in `df` is flagged current, so a customer_id that occurs
# more than once in `df` ends up with several "current" versions. Confirm that a
# later step closes the superseded rows, or derive __END_AT/__IS_CURRENT here.
scd2_df = (df.withColumn("__START_AT", df["update_timestamp"].cast("timestamp"))
             .withColumn("__END_AT", lit(None).cast("timestamp"))
             .withColumn("__IS_CURRENT", lit(True)))

# Create the Delta table only when none exists at target_path.
# An explicit existence check is used instead of try/except: a blanket `except`
# also hides unrelated failures (for example a NameError from a missing import)
# and would then silently overwrite an existing table.
if DeltaTable.isDeltaTable(spark, target_path):
    print(f"⚠️ Target table already exists at {target_path}")
else:
    scd2_df.write.format("delta").mode("overwrite").save(target_path)
    print(f"✅ Target SCD2 table created at {target_path}")