In [0]:
# transform_silver_autoloader.py
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Schema that holds every cleaned ("silver") table, and the shared table that
# collects rows rejected by validation.
SILVER_DB = "silver"
QUARANTINE_TABLE = f"{SILVER_DB}.quarantine"

# Fertilizer types accepted by validation, spelled in Title Case.
VALID_FERT_TYPES = ["Heavy Metal", "Organic", "Inorganic", "Dry", "Toxic"]

# IF NOT EXISTS turns both statements into no-ops when the objects already exist.
_create_db_sql = f"CREATE DATABASE IF NOT EXISTS {SILVER_DB}"
spark.sql(_create_db_sql)

_create_quarantine_sql = f"""
CREATE TABLE IF NOT EXISTS {QUARANTINE_TABLE} (
    source_table STRING,
    pk_value STRING,
    fk_values STRING,
    full_row STRING,
    failure_reason STRING
) USING DELTA
"""
spark.sql(_create_quarantine_sql)

def write_to_quarantine(df, source_table, pk_col, fk_expr=None, failure_reason="invalid"):
    """Append rejected rows to the shared quarantine Delta table.

    Every row of ``df`` becomes one quarantine record holding the source table
    name, the primary key rendered as a string, an optional foreign-key
    description, a string rendering of the whole source row and the failure
    reason.

    Args:
        df: DataFrame containing the rejected rows.
        source_table: Logical name of the table the rows came from.
        pk_col: Name of the column in ``df`` that holds the primary key.
        fk_expr: Optional Column expression describing the row's foreign
            keys. When omitted, ``fk_values`` is stored as NULL.
        failure_reason: Explanation stored alongside every quarantined row.
    """
    # Build the FK value as an expression evaluated inside the select instead
    # of adding it to `df` first: struct("*") expands against the DataFrame
    # being selected from, so a helper column added beforehand would leak into
    # `full_row` and the snapshot would no longer mirror the source row.
    if fk_expr is None:
        fk_values = F.lit(None).cast("string")
    else:
        # The quarantine column is declared STRING; cast so a non-string
        # expression cannot cause a schema mismatch on append.
        fk_values = fk_expr.cast("string")

    df_to_write = df.select(
        F.lit(source_table).alias("source_table"),
        F.col(pk_col).cast("string").alias("pk_value"),
        fk_values.alias("fk_values"),
        F.struct("*").cast("string").alias("full_row"),
        F.lit(failure_reason).alias("failure_reason"),
    )

    (
        df_to_write.write.format("delta")
        .mode("append")
        .option("mergeSchema", "true")
        .saveAsTable(QUARANTINE_TABLE)
    )

# Read the four raw ("bronze") source tables that feed the silver transforms.
(
    bronze_consumer,
    bronze_purchase,
    bronze_avocado,
    bronze_fertilizer,
) = map(
    spark.table,
    ("bronze.consumer", "bronze.purchase", "bronze.avocado", "bronze.fertilizer"),
)

# -----------------------
# Consumer
# -----------------------
# Normalize the raw columns: numeric id, Title Case sex, integer age.
# NOTE(review): with ANSI mode off, a value that cannot be cast becomes NULL
# instead of raising -- confirm the cluster setting.
consumer = (
    bronze_consumer
    .withColumn("consumer_id", F.col("consumerid").cast("long"))
    .withColumn("sex", F.initcap(F.col("Sex")))
    .withColumn("age", F.col("Age").cast("int"))
)

# A row is valid only when it has an id and an age within 0..130.  When age is
# NULL the predicate evaluates to NULL (not False), and filter() drops NULL
# rows.  "Malformed" is therefore defined as "not provably valid" via
# coalesce(..., False); simply negating the range checks would also yield NULL
# and such rows would silently disappear from both outputs.
consumer_is_valid = F.col("consumer_id").isNotNull() & F.col("age").between(0, 130)
consumer_valid = consumer.filter(consumer_is_valid)
consumer_malformed = consumer.filter(~F.coalesce(consumer_is_valid, F.lit(False)))

# Full refresh of the silver table (this is an overwrite, not a merge/upsert).
(
    consumer_valid.write.format("delta")
    .mode("overwrite")  # can change to merge logic if needed for incremental
    .option("mergeSchema", "true")
    .saveAsTable(f"{SILVER_DB}.validated_consumer")
)
# Only touch the quarantine table when at least one row was rejected.
if consumer_malformed.limit(1).count() > 0:
    write_to_quarantine(consumer_malformed, "consumer", "consumer_id", failure_reason="invalid consumer")

# -----------------------
# Purchase
# -----------------------
# Typed view of the raw rows: numeric ids and a parsed graphed_date.
purchase = (
    bronze_purchase
    .withColumn("purchase_id", F.col("purchaseid").cast("long"))
    .withColumn("consumer_id", F.col("consumerid").cast("long"))
    .withColumn("graphed_date", F.to_date(F.col("graphed_date"), "yyyy-MM-dd"))
)

# Both keys must be present; a row missing either one is quarantined.
has_purchase_id = F.col("purchase_id").isNotNull()
has_consumer_id = F.col("consumer_id").isNotNull()
purchase_malformed = purchase.filter(~has_purchase_id | ~has_consumer_id)
purchase_valid = purchase.filter(has_purchase_id & has_consumer_id)

# Keep one row per purchase_id: the one with the most recent graphed_date
# (rows without a date sort last).
w = Window.partitionBy("purchase_id").orderBy(F.col("graphed_date").desc_nulls_last())
purchase_valid = (
    purchase_valid
    .withColumn("_rn", F.row_number().over(w))
    .filter(F.col("_rn") == 1)
    .drop("_rn")
)

(
    purchase_valid.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(f"{SILVER_DB}.validated_purchase")
)

# Only touch the quarantine table when at least one row was rejected.
if purchase_malformed.limit(1).count() > 0:
    purchase_fk = F.concat(F.lit("consumer_id="), F.col("consumerid"))
    write_to_quarantine(
        purchase_malformed,
        "purchase",
        "purchase_id",
        fk_expr=purchase_fk,
        failure_reason="invalid purchase",
    )

# -----------------------
# Avocado
# -----------------------
# Typed view of the raw rows: numeric ids, parsed dates, integer ripeness.
avocado = (
    bronze_avocado
    .withColumn("purchase_id", F.col("purchaseid").cast("long"))
    .withColumn("consumer_id", F.col("consumerid").cast("long"))
    .withColumn("born_date", F.to_date("born_date", "yyyy-MM-dd"))
    .withColumn("picked_date", F.to_date("picked_date", "yyyy-MM-dd"))
    .withColumn("sold_date", F.to_date("sold_date", "yyyy-MM-dd"))
    .withColumn("avocado_ripe_index", F.col("ripe_index_when_picked").cast("int"))
)

# Dates must be chronological: born <= picked <= sold.  A comparison against a
# NULL date evaluates to NULL and filter() drops NULL rows, so "malformed" is
# defined as "not provably valid" via coalesce(..., False).  Negating the
# individual comparisons instead would lose every row with a missing or
# unparseable date from both outputs.
avocado_dates_ordered = (
    (F.col("picked_date") >= F.col("born_date"))
    & (F.col("sold_date") >= F.col("picked_date"))
)
avocado_valid = avocado.filter(avocado_dates_ordered)
avocado_malformed = avocado.filter(~F.coalesce(avocado_dates_ordered, F.lit(False)))

# Full refresh of the silver table with the curated column set.
(
    avocado_valid.select(
        "purchase_id",
        "consumer_id",
        "avocado_bunch_id",
        "born_date",
        "picked_date",
        "sold_date",
        "avocado_ripe_index",
    )
    .write.format("delta")
    .mode("overwrite")
    .saveAsTable(f"{SILVER_DB}.validated_avocado")
)

# Only touch the quarantine table when at least one row was rejected.
if avocado_malformed.limit(1).count() > 0:
    write_to_quarantine(
        avocado_malformed,
        "avocado",
        "purchase_id",
        fk_expr=F.concat(F.lit("consumer_id="), F.col("consumerid")),
        failure_reason="invalid dates"
    )

# -----------------------
# Fertilizer
# -----------------------
# Typed view of the raw rows: numeric ids and a Title Case fertilizer type so
# it can be compared against VALID_FERT_TYPES.
fert = (
    bronze_fertilizer
    .withColumn("purchase_id", F.col("purchaseid").cast("long"))
    .withColumn("consumer_id", F.col("consumerid").cast("long"))
    .withColumn("fertilizer_id", F.col("fertilizerid").cast("long"))
    .withColumn("fertilizer_type", F.initcap(F.col("type")))
)

# isin() on a NULL type evaluates to NULL, and so does its negation, so a
# plain `~isin` filter would drop rows with a missing type from both outputs.
# coalesce(..., False) treats "unknown" as "not valid" and routes those rows
# to quarantine instead.
fert_type_is_valid = F.col("fertilizer_type").isin(*VALID_FERT_TYPES)
fert_valid = fert.filter(fert_type_is_valid)
fert_malformed = fert.filter(~F.coalesce(fert_type_is_valid, F.lit(False)))

# Full refresh of the silver table with the curated column set.
(
    fert_valid.select("purchase_id", "consumer_id", "fertilizer_id", "fertilizer_type", "mg")
    .write.format("delta")
    .mode("overwrite")
    .saveAsTable(f"{SILVER_DB}.validated_fertilizer")
)

# Only touch the quarantine table when at least one row was rejected.
if fert_malformed.limit(1).count() > 0:
    write_to_quarantine(
        fert_malformed,
        "fertilizer",
        "fertilizer_id",
        fk_expr=F.concat(F.lit("purchase_id="), F.col("purchaseid"), F.lit(",consumer_id="), F.col("consumerid")),
        failure_reason="invalid fertilizer type"
    )
