# In [0]:
# generate_gold_output.py
from pyspark.sql import functions as F
from datetime import datetime
from pyspark.sql.window import Window
import yaml
import re

# Fully qualified name of the gold table that this job rebuilds.
gold_table = "gold.gold_output"

# Start every run from an empty table: when the table already exists its rows
# are removed here, and the fresh result set is appended further down.
gold_table_exists = spark.catalog.tableExists(gold_table)
if gold_table_exists:
    spark.sql(f"TRUNCATE TABLE {gold_table}")


# -------------------------
# Config
# -------------------------
# Job settings are read from a YAML file stored in the workspace.
config_path = "/Workspace/Users/clarkscoberly@gmail.com/_Projects/Data Engineering/weekly_avocado_etl/0_config.yaml"
with open(config_path) as config_file:
    config = yaml.safe_load(config_file)

# "s3_bucket" is mandatory; trailing slashes are stripped so that paths can be
# assembled with "/" separators later on.
S3_BUCKET = config["s3_bucket"].rstrip("/")
# The remaining settings are optional and fall back to defaults.
OUTPUT_PREFIX = config.get("output_prefix", "outputs")
ITERATION = config.get("iteration", 1)

# -------------------------
# Load Silver Tables
# -------------------------
# One DataFrame per validated silver table.
pur = spark.table("silver.validated_purchase")
con = spark.table("silver.validated_consumer")
avo = spark.table("silver.validated_avocado")
fer = spark.table("silver.validated_fertilizer")

# -------------------------
# Join chain (left joins)
# -------------------------
# Consumers are the driving side, so every consumer row survives even when it
# has no matching purchase, avocado or fertilizer record. The aliases are what
# the later column selection uses to qualify column names.
joined_df = con.alias("con")
joined_df = joined_df.join(pur.alias("pur"), on="consumer_id", how="left")
joined_df = joined_df.join(avo.alias("avo"), on="purchase_id", how="left")
joined_df = joined_df.join(fer.alias("fer"), on="purchase_id", how="left")

# -------------------------
# Select output columns with calculations
# -------------------------
# Both day counts are measured from the avocado's born_date.
born_date = F.col("avo.born_date")

output_columns = [
    F.col("con.consumer_id").alias("consumer_id"),
    F.col("con.sex").alias("Sex"),
    F.col("con.age").alias("age"),
    F.col("pur.avocado_bunch_id").alias("avocado_bunch_id"),
    # Days between born_date and sold_date.
    F.datediff(F.col("avo.sold_date"), born_date).alias("avocado_days_sold"),
    F.col("avo.avocado_ripe_index").alias("avocado_ripe_index"),
    # Days between born_date and picked_date.
    F.datediff(F.col("avo.picked_date"), born_date).alias("avocado_days_picked"),
    F.col("fer.fertilizer_type").alias("fertilizer_type"),
]
final_df = joined_df.select(*output_columns)

# -------------------------
# Deduplicate on (consumer_id, avocado_bunch_id, fertilizer_type)
# -------------------------
# Exactly one row is kept per key. purchase_id is NOT part of the key: it is
# not among the columns of final_df at this point.
#
# The surviving row is picked by a deterministic ordering over all remaining
# columns (populated values win over NULLs). final_df is lazily evaluated and
# is consumed twice further down (Delta write and CSV export); with an
# arbitrary tie-break the two outputs could keep different rows for the same
# key, and re-runs over identical input would not be reproducible.
# NOTE(review): this assumes every non-key column has an orderable type.
dedupe_cols = ["consumer_id", "avocado_bunch_id", "fertilizer_type"]
tie_break_cols = [
    F.col(name).asc_nulls_last()
    for name in final_df.columns
    if name not in dedupe_cols
]
# Fall back to a constant ordering only when there is nothing left to order by
# (row_number() requires an ordered window).
window_spec = Window.partitionBy(*dedupe_cols).orderBy(*(tie_break_cols or [F.lit(1)]))
final_df = (
    final_df
    .withColumn("_rn", F.row_number().over(window_spec))
    .filter(F.col("_rn") == 1)
    .drop("_rn")
)

# -------------------------
# Persist Gold Delta table (authoritative)
# -------------------------
# Make sure the target database exists before writing into it.
spark.sql("CREATE DATABASE IF NOT EXISTS gold")

# Rows are appended in Delta format with the mergeSchema option turned on.
gold_writer = final_df.write.format("delta")
gold_writer = gold_writer.mode("append").option("mergeSchema", "true")
gold_writer.saveAsTable("gold.gold_output")

# -------------------------
# CSV Output: file spec
# -------------------------
# File name pattern: target_<iteration>_<YYYYMMDD>.csv, stamped with the
# current local date.
date_str = datetime.now().strftime("%Y%m%d")
filename = f"target_{ITERATION}_{date_str}.csv"

output_dir = f"{S3_BUCKET}/{OUTPUT_PREFIX}"
final_output_path = f"{output_dir}/{filename}"
# Staging directory that sits next to the final file: "<final path>.tmp/".
output_staging = f"{final_output_path}.tmp/"

# Columns in the order they appear in the CSV; avocado_bunch_id is left out.
csv_columns = [
    "consumer_id",
    "Sex",
    "age",
    "avocado_days_sold",
    "avocado_ripe_index",
    "avocado_days_picked",
    "fertilizer_type",
]
ordered_df = final_df.select(*csv_columns)

# Rows that are identical across the exported columns are collapsed into one.
single_csv_df = ordered_df.dropDuplicates()

# -------------------------
# Write CSV
# -------------------------
# Spark writes a directory of part files rather than a single file, so the
# data goes to a staging directory first and the part file is then moved to
# the final CSV path. Any failure is reported, the gold table is dumped for
# inspection, and the error is re-raised so the job still fails.
try:
    # coalesce(1) forces a single output partition. Without it Spark may emit
    # several part-*.csv files; only one file is promoted below, so the rows
    # in the other files would be silently lost when staging is deleted.
    (
        single_csv_df.coalesce(1).write
        .option("header", "true")
        .option("sep", "|")
        .option("quote", '"')
        .option("escape", '"')
        .option("encoding", "ASCII")
        .option("lineSep", "\n")
        .mode("overwrite")
        .csv(output_staging)
    )

    # Move part file to final CSV location
    files = dbutils.fs.ls(output_staging)
    if not files:
        raise RuntimeError(f"No files found in staging path: {output_staging}")

    part_files = [f.path for f in files if re.search(r"part-.*\.csv$", f.name)]
    if not part_files:
        raise RuntimeError(f"No part CSV found in {output_staging}; list: {[f.name for f in files]}")
    if len(part_files) > 1:
        # Guard against partial exports: promoting one of several part files
        # would drop data without any visible error.
        raise RuntimeError(
            f"Expected exactly one part CSV in {output_staging}, found {len(part_files)}: {part_files}"
        )
    part_file = part_files[0]

    # Remove existing CSV if present. This stays best-effort, but a failure is
    # reported instead of being swallowed silently; if the old file really is
    # in the way, the move below surfaces the problem.
    try:
        dbutils.fs.rm(final_output_path)
    except Exception as rm_err:
        print(f"Warning: could not remove existing CSV {final_output_path}: {rm_err}")

    dbutils.fs.mv(part_file, final_output_path)
    dbutils.fs.rm(output_staging, recurse=True)
    print(f"Wrote CSV to: {final_output_path}")

except Exception as e:
    print(f"Error writing CSV: {e}")
    # Dump up to 100 rows of the gold table to help diagnose the failure.
    spark.sql("SELECT * FROM gold.gold_output").show(100, truncate=False)
    raise
