# In [0]:
# generate_gold_output.py
from pyspark.sql import functions as F
from datetime import datetime
import yaml
import re

# -------------------------
# Config
# -------------------------
# Location of the YAML job configuration in the Databricks workspace.
config_path = "/Workspace/Users/clarkscoberly@gmail.com/config.yaml"
# Explicit encoding so parsing does not depend on the cluster's locale default.
with open(config_path, encoding="utf-8") as f:
    # safe_load returns None for an empty document; fall back to an empty
    # mapping so a missing setting surfaces as KeyError("s3_bucket") instead
    # of an opaque TypeError on None.
    config = yaml.safe_load(f) or {}

# Required: destination root. Trailing slashes are dropped because the output
# paths built later insert their own "/" separators.
S3_BUCKET = config["s3_bucket"].rstrip("/")
# Optional: folder under the bucket that receives the CSV (default "outputs").
OUTPUT_PREFIX = config.get("output_prefix", "outputs")
# Optional: iteration number embedded in the CSV filename (default 1).
ITERATION = config.get("iteration", 1)

# -------------------------
# Load Silver Tables
# -------------------------
# Each silver table is read once and tagged with a short alias so that later
# column references can be qualified ("p.", "c.", "a.", "f.").
_SILVER_SOURCES = (
    ("silver.validated_purchase", "p"),
    ("silver.validated_consumer", "c"),
    ("silver.validated_avocado", "a"),
    ("silver.validated_fertilizer", "f"),
)
pur, con, avo, fer = (
    spark.table(table_name).alias(tag) for table_name, tag in _SILVER_SOURCES
)

# -------------------------
# Joins (explicit, unambiguous)
# -------------------------
# Every intermediate frame gets its own alias ("j", "ja") and every column is
# referenced through an alias. After a join both inputs expose a `purchase_id`
# column, so an unqualified F.col("purchase_id") / select("purchase_id") cannot
# be resolved to one side and fails analysis as an ambiguous reference.
#
# consumer_id is read from the consumer side; the inner join only keeps rows
# where it matches purchase.consumer_id.
joined = (
    pur.join(con, F.col("p.consumer_id") == F.col("c.consumer_id"), how="inner")
    .select(
        F.col("p.purchase_id").alias("purchase_id"),
        F.col("c.consumer_id").alias("consumer_id"),
        F.col("c.sex").alias("Sex"),  # capitalised to match the CSV header
        F.col("c.age").alias("age"),
        F.col("p.graphed_date").alias("graphed_date"),
        F.col("p.avocado_bunch_id").alias("avocado_bunch_id"),
    )
    .alias("j")
)

# Left join: purchases without a matching avocado row are kept, with null
# avocado columns.
joined_avo = (
    joined.join(avo, on=F.col("j.purchase_id") == F.col("a.purchase_id"), how="left")
    .select(
        F.col("j.purchase_id"),
        F.col("j.consumer_id"),
        F.col("j.Sex"),
        F.col("j.age"),
        F.col("j.graphed_date"),
        F.col("j.avocado_bunch_id"),
        F.col("a.born_date").alias("born_date"),
        F.col("a.picked_date").alias("picked_date"),
        F.col("a.sold_date").alias("sold_date"),
        F.col("a.avocado_ripe_index").alias("avocado_ripe_index"),
    )
    .alias("ja")
)

# Left join: purchases without a matching fertilizer row are kept, with a null
# fertilizer_type.
final_df = (
    joined_avo.join(fer, on=F.col("ja.purchase_id") == F.col("f.purchase_id"), how="left")
    .select(
        F.col("ja.purchase_id"),
        F.col("ja.consumer_id"),
        F.col("ja.Sex"),
        F.col("ja.age"),
        # Derived metrics (datediff yields null when either date is null).
        F.datediff(F.col("ja.sold_date"), F.col("ja.born_date")).alias("avocado_days_sold"),
        F.col("ja.avocado_ripe_index"),
        F.datediff(F.col("ja.picked_date"), F.col("ja.born_date")).alias("avocado_days_picked"),
        F.col("f.fertilizer_type"),
    )
)

# -------------------------
# Persist Gold Delta table (authoritative)
# -------------------------
spark.sql("CREATE DATABASE IF NOT EXISTS gold")
# DataFrameWriter.mode() only accepts append / overwrite / error /
# errorifexists / ignore; "merge" is rejected when the write runs.
# final_df is recomputed from the full silver tables on every run, so the gold
# table is replaced wholesale, which also keeps re-runs idempotent.
# NOTE(review): if a row-level upsert was intended, this needs a Delta
# MERGE INTO rather than a save mode.
(
    final_df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")  # tolerate columns added since the last run
    .saveAsTable("gold.gold_output")
)

# -------------------------
# CSV Output: file spec
# -------------------------

# Run date as YYYYMMDD, e.g. 20251121.
date_str = datetime.now().strftime("%Y%m%d")
filename = "target_{}_{}.csv".format(ITERATION, date_str)

# Spark writes into the staging folder first; the single part file is then
# renamed to the final path.
_output_root = "{}/{}".format(S3_BUCKET, OUTPUT_PREFIX)
final_output_path = "{}/{}".format(_output_root, filename)
output_staging = "{}.tmp/".format(final_output_path)

# The column order below is the CSV header order required by the spec:
# consumer_id|Sex|age|avocado_days_sold|avocado_ripe_index|avocado_days_picked|fertilizer_type
_CSV_COLUMNS = [
    "consumer_id",
    "Sex",
    "age",
    "avocado_days_sold",
    "avocado_ripe_index",
    "avocado_days_picked",
    "fertilizer_type",
]
ordered_df = final_df.select(*_CSV_COLUMNS)

# Collapse to one partition so a single part file is produced. Only suitable
# while the dataset is small; NOT recommended for large data.
single_csv_df = ordered_df.coalesce(1)

# Pipe-delimited, double-quote quoted/escaped, ASCII, LF line endings, with a
# header row; any previous staging output is replaced.
_csv_writer = single_csv_df.write.options(
    header="true",
    sep="|",
    quote='"',
    escape='"',
    encoding="ASCII",
    lineSep="\n",
)
_csv_writer.mode("overwrite").csv(output_staging)

# Spark writes a folder of part files, so promote the single part-*.csv to the
# target filename and then drop the staging folder.
# WARNING: dbutils is Databricks-specific.
try:
    files = dbutils.fs.ls(output_staging)
    # Path of the first entry named like a CSV part file, or None if absent.
    part_file = next(
        (entry.path for entry in files if re.search(r"part-.*\.csv$", entry.name)),
        None,
    )
    if part_file is None:
        raise RuntimeError(f"No part csv found in {output_staging}; list: {files}")

    # Overwrite behaviour: clear any previous output with the same name.
    # Best effort only - a failure here is reported but does not abort, since
    # the move below will fail loudly if the target really is in the way.
    try:
        dbutils.fs.rm(final_output_path)
    except Exception as rm_error:
        print(f"Could not remove existing {final_output_path}: {rm_error}")

    dbutils.fs.mv(part_file, final_output_path)
    # Clean up the temporary staging folder.
    dbutils.fs.rm(output_staging, recurse=True)
except Exception as publish_error:
    # Report which target failed, then propagate so the job is marked failed.
    print(f"Failed to publish CSV to {final_output_path}: {publish_error}")
    raise
else:
    print(f"Wrote CSV to: {final_output_path}")

# TODO: send a notification email to the designated owners once the ETL succeeds.
# No `email` helper is defined or imported in this file, so calling it
# unconditionally would raise NameError after an otherwise successful run.
# Call it only when one has been provided in the session; otherwise say so.
email_hook = globals().get("email")
if callable(email_hook):
    email_hook()
else:
    print("ETL finished; email notification skipped (no `email` helper is defined).")
