In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit
from pyspark.sql.types import *

# Obtain the Spark session used by every read/write below; getOrCreate()
# returns the already-active session when there is one instead of building a new one.
spark = SparkSession.builder.getOrCreate()

# Storage account and the containers/prefix this job reads from and writes to.
STORAGE_ACCOUNT_NAME = "mailchimpspnetwork"
SILVER_CONTAINER = "silver"
GOLD_CONTAINER = "gold"
INPUT_PREFIX = "mailchimp_transformed"

# abfss:// roots: SILVER_BASE is the input folder holding the per-list tables,
# GOLD_BASE is the container root under which the output tables are saved.
SILVER_BASE = "abfss://{}@{}.dfs.core.windows.net/{}".format(
    SILVER_CONTAINER, STORAGE_ACCOUNT_NAME, INPUT_PREFIX
)
GOLD_BASE = "abfss://{}@{}.dfs.core.windows.net".format(
    GOLD_CONTAINER, STORAGE_ACCOUNT_NAME
)

# 📂 Load and union all delta files
# Every sub-directory directly under SILVER_BASE is loaded as one Delta table
# and the tables are stacked into a single DataFrame, `all_data`.
df_paths = dbutils.fs.ls(SILVER_BASE)
delta_paths = [f.path for f in df_paths if f.isDir()]

if not delta_paths:
    # Without this guard `all_data` would stay None and the count() below
    # would fail with an unhelpful AttributeError.
    raise ValueError(f"No Delta table directories found under {SILVER_BASE}")

all_data = None
for path in delta_paths:
    # dbutils.fs.ls reports directory paths with a trailing "/", so strip it
    # before taking the last segment; otherwise the folder name comes out as "".
    list_name_source = path.rstrip("/").split("/")[-1]
    df = spark.read.format("delta").load(path)
    # Tag each row with the folder it was read from.
    df = df.withColumn("list_name_source", lit(list_name_source))
    # unionByName matches columns by name rather than by position.
    all_data = df if all_data is None else all_data.unionByName(df)

# NOTE: count() runs a Spark job over the full union just to report the size.
print(f"✅ Loaded {all_data.count()} records from {len(delta_paths)} lists")

# -----------------------------------
# 1️⃣ dim_contact (based on unique_email_id)
# -----------------------------------
# Contact attributes projected from the combined data.
contact_columns = [
    "unique_email_id",
    "email_address",
    "full_name",
    "merge_FNAME",
    "merge_LNAME",
    "merge_PHONE",
    "address_addr1",
    "address_addr2",
    "address_city",
    "address_state",
    "address_zip",
    "address_country",
    "language",
    "vip",
]

# Keep a single row per unique_email_id; dropDuplicates does not specify
# which of several duplicate rows is the one retained.
dim_contact = (
    all_data
    .select(*contact_columns)
    .dropDuplicates(["unique_email_id"])
)

# Replace any existing gold table with the freshly built dimension.
(
    dim_contact.write
    .format("delta")
    .mode("overwrite")
    .save(GOLD_BASE + "/dim_contact")
)
print("✅ dim_contact written to gold")

# -----------------------------------
# 2️⃣ dim_list (based on list_id)
# -----------------------------------
# List attributes projected from the combined data.
list_columns = [
    "list_id",
    "list_name",
    "location_country_code",
    "location_region",
    "location_timezone",
]

# Keep a single row per list_id; dropDuplicates does not specify which of
# several duplicate rows is the one retained.
dim_list = (
    all_data
    .select(*list_columns)
    .dropDuplicates(["list_id"])
)

# Replace any existing gold table with the freshly built dimension.
(
    dim_list.write
    .format("delta")
    .mode("overwrite")
    .save(GOLD_BASE + "/dim_list")
)
print("✅ dim_list written to gold")

# -----------------------------------
# 3️⃣ fact_list_membership (new FK = unique_email_id)
# -----------------------------------
# Membership attributes keyed by the (unique_email_id, list_id) pair.
membership_columns = [
    "unique_email_id",
    "list_id",
    "status",
    "stats_avg_open_rate",
    "stats_avg_click_rate",
    "timestamp_signup",
    "timestamp_opt",
    "last_changed",
    "member_rating",
    "consents_to_one_to_one_messaging",
    "email_client",
    "email_type",
    "web_id",
]

# Keep a single row per (unique_email_id, list_id) pair; dropDuplicates does
# not specify which of several duplicate rows is the one retained.
fact_membership = (
    all_data
    .select(*membership_columns)
    .dropDuplicates(["unique_email_id", "list_id"])
)

# Replace any existing gold table with the freshly built fact table.
(
    fact_membership.write
    .format("delta")
    .mode("overwrite")
    .save(GOLD_BASE + "/fact_list_membership")
)
print("fact_list_membership written to gold")
