
# Data Processing Pipeline for Mailchimp Data

This script loads and unions multiple Delta tables from a specified silver storage container,
transforms the data, and writes three Delta tables to a gold storage container.

The main steps include:
- Loading every Delta table (one directory per list) from the silver container and adding the source directory name as a new column (`list_name_source`).
- Creating a contact dimension table (`dim_contact`) based on unique email IDs.
- Creating a list dimension table (`dim_list`) based on list IDs.
- Creating a fact table for list membership (`fact_list_membership`) with additional membership data.


In [0]:
# Import necessary modules and functions from PySpark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit
from pyspark.sql.types import *

# Reuse the active SparkSession if one exists; otherwise start a new one
spark = SparkSession.builder.getOrCreate()

# Storage account, containers, and the folder prefix the silver input lives under
STORAGE_ACCOUNT_NAME = "mailchimpspnetwork"
GOLD_CONTAINER = "gold"
SILVER_CONTAINER = "silver"
INPUT_PREFIX = "mailchimp_transformed"

# Both layers live in the same storage account, so they share one DFS host
_DFS_HOST = f"{STORAGE_ACCOUNT_NAME}.dfs.core.windows.net"

# Root URIs: gold points at the container root, silver at the input prefix
GOLD_BASE = f"abfss://{GOLD_CONTAINER}@{_DFS_HOST}"
SILVER_BASE = f"abfss://{SILVER_CONTAINER}@{_DFS_HOST}/{INPUT_PREFIX}"

# -------------------------------------------
# Load and union all delta tables from the silver layer
# -------------------------------------------

# List everything directly under the silver base path and keep only the
# directories (each directory represents a delta table for a list)
df_paths = dbutils.fs.ls(SILVER_BASE)
delta_paths = [f.path for f in df_paths if f.isDir()]

# Fail fast with a clear message: with no directories there is nothing to
# union, and every later step would otherwise fail on a None DataFrame.
if not delta_paths:
    raise ValueError(f"No Delta table directories found under {SILVER_BASE}")

# Accumulator for the union of all per-list DataFrames
all_data = None

for path in delta_paths:
    # Load the delta table stored in the current directory
    df = spark.read.format("delta").load(path)
    # dbutils.fs.ls reports directory paths with a trailing "/", so strip it
    # before taking the last segment; otherwise the extracted name is "".
    list_dir_name = path.rstrip("/").split("/")[-1]
    # Tag every row with the directory (list) it was loaded from
    df = df.withColumn("list_name_source", lit(list_dir_name))
    # unionByName aligns columns by name rather than by position
    all_data = df if all_data is None else all_data.unionByName(df)

# Log the total record count and number of lists processed
print(f"Loaded {all_data.count()} records from {len(delta_paths)} lists")

# -------------------------------------------
# 1. Contact dimension 'dim_contact': one row per unique_email_id
# -------------------------------------------
# Contact-level attributes carried into the dimension
contact_columns = [
    "unique_email_id",
    "email_address",
    "full_name",
    "merge_FNAME",
    "merge_LNAME",
    "merge_PHONE",
    "address_addr1",
    "address_addr2",
    "address_city",
    "address_state",
    "address_zip",
    "address_country",
    "language",
    "vip",
]

# Project down to the contact columns, then keep a single row per contact key
contact_projection = all_data.select(*contact_columns)
dim_contact = contact_projection.dropDuplicates(["unique_email_id"])

# Persist to the gold layer in Delta format, overwriting what is already there
contact_writer = dim_contact.write.format("delta").mode("overwrite")
contact_writer.save(f"{GOLD_BASE}/dim_contact")
print("dim_contact written to gold")

# -------------------------------------------
# 2. List dimension 'dim_list': one row per list_id
# -------------------------------------------
# List-level attributes carried into the dimension
list_columns = [
    "list_id",
    "list_name",
    "location_country_code",
    "location_region",
    "location_timezone",
]

# Project down to the list columns, then keep a single row per list key
list_projection = all_data.select(*list_columns)
dim_list = list_projection.dropDuplicates(["list_id"])

# Persist to the gold layer in Delta format, overwriting what is already there
list_writer = dim_list.write.format("delta").mode("overwrite")
list_writer.save(f"{GOLD_BASE}/dim_list")
print("dim_list written to gold")

# -------------------------------------------
# 3. Fact table 'fact_list_membership': one row per (unique_email_id, list_id)
# -------------------------------------------
# Key pair that identifies a membership row
membership_keys = ["unique_email_id", "list_id"]

# Membership attributes stored alongside the keys
membership_attributes = [
    "status",
    "stats_avg_open_rate",
    "stats_avg_click_rate",
    "timestamp_signup",
    "timestamp_opt",
    "last_changed",
    "member_rating",
    "consents_to_one_to_one_messaging",
    "email_client",
    "email_type",
    "web_id",
]

# Project keys first, then attributes, and keep a single row per key pair
membership_projection = all_data.select(*membership_keys, *membership_attributes)
fact_membership = membership_projection.dropDuplicates(membership_keys)

# Persist to the gold layer in Delta format, overwriting what is already there
membership_writer = fact_membership.write.format("delta").mode("overwrite")
membership_writer.save(f"{GOLD_BASE}/fact_list_membership")
print("fact_list_membership written to gold")
