# Mailchimp Silver ETL Notebook

This notebook implements the Bronze → Silver ETL pipeline. It reads raw JSON data files from the Bronze layer in ADLS,
flattens and cleans the Mailchimp member records, and then writes the processed data as CSV files into the Silver layer.
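For each Mailchimp list, the pipeline reads only the most recently modified `ingestion_date=` folder under its `listName=` partition, logs progress to stdout, and creates the Silver output directory in Azure Data Lake Storage if it does not already exist.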


In [0]:
import os
import io
import json
import logging
import datetime
import re
import traceback
from typing import List

import pandas as pd
from pyspark.sql import SparkSession

# Azure
from azure.identity import DefaultAzureCredential
from azure.storage.filedatalake import DataLakeServiceClient, FileSystemClient
from azure.core.exceptions import ResourceNotFoundError

# ------------------------------------------------------------------------------
# 1) Spark & Logging Setup (Databricks-Friendly)
# ------------------------------------------------------------------------------
import sys

# Obtain the Spark session under the application name "MailchimpSilverETL"
# (getOrCreate returns an existing session when there is one).
spark = SparkSession.builder.appName("MailchimpSilverETL").getOrCreate()

# Dedicated, named logger for this notebook, emitting INFO and above.
logger = logging.getLogger("MailchimpSilverETL")
logger.setLevel(logging.INFO)

# Start from a clean slate: drop any handlers already attached to this logger.
# Clearing an empty handler list is a no-op, so no guard is required.
logger.handlers.clear()

# Send records to stdout as "<timestamp> <LEVEL> - <message>".
formatter = logging.Formatter('%(asctime)s %(levelname)s - %(message)s')
stream_handler = logging.StreamHandler(sys.stdout)
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)

logger.info("Logger initialized successfully.")

# ------------------------------------------------------------------------------
# 2) Configuration
# ------------------------------------------------------------------------------
# Storage account whose DFS endpoint the ADLS service client connects to.
STORAGE_ACCOUNT_NAME = "mailchimpspnetwork"

# Container (file system) names: Bronze is the source, Silver the destination.
BRONZE_CONTAINER, SILVER_CONTAINER = "bronze", "silver"

# Root directories inside the Bronze and Silver containers respectively.
BRONZE_PREFIX, SILVER_PREFIX = "mailchimp_members", "mailchimp_clean"

# ------------------------------------------------------------------------------
# 3) Initialize ADLS
# ------------------------------------------------------------------------------
# Announce the start of the authentication / client setup step.
logger.info("Authenticating with Azure and initializing ADLS...")
# Credential object handed to the Data Lake service client below.
credential = DefaultAzureCredential()
# Service-level client addressed through the storage account's DFS endpoint.
service_client = DataLakeServiceClient(
    account_url=f"https://{STORAGE_ACCOUNT_NAME}.dfs.core.windows.net",
    credential=credential
)
# One file-system client per container: Bronze is read from, Silver is written to.
bronze_fs = service_client.get_file_system_client(BRONZE_CONTAINER)
silver_fs = service_client.get_file_system_client(SILVER_CONTAINER)
# NOTE(review): nothing in this section checks that the containers exist or are
# reachable; the message below presumably only reflects that the client objects
# were constructed - confirm whether an explicit connectivity check is wanted.
logger.info("Connected to ADLS. Containers mounted successfully.")

# ------------------------------------------------------------------------------
# 4) Ensure Silver Directory Exists
# ------------------------------------------------------------------------------
def ensure_directory_exists(fs_client: FileSystemClient, directory_name: str):
    """Create ``directory_name`` in the given file system unless it is already there.

    Existence is probed by requesting the directory's properties. A
    ``ResourceNotFoundError`` from that probe is treated as "directory is
    missing", in which case the directory is created. Any other exception
    propagates to the caller.

    Args:
        fs_client: File-system (container) client to operate on.
        directory_name: Path of the directory inside that file system.
    """
    probe = fs_client.get_directory_client(directory_name)
    try:
        probe.get_directory_properties()
    except ResourceNotFoundError:
        logger.info(f"Creating directory: {directory_name}")
        fs_client.create_directory(directory_name)
    else:
        logger.info(f"Directory exists: {directory_name}")

# ------------------------------------------------------------------------------
# 5) List Mailchimp Lists (from Bronze layout)
# ------------------------------------------------------------------------------
def list_mailchimp_lists() -> List[str]:
    """Return the sorted, de-duplicated Mailchimp list names found in Bronze.

    Everything under ``BRONZE_PREFIX`` is listed recursively. For each
    directory entry, every path segment of the form ``listName=<name>``
    contributes ``<name>`` to the result.

    Returns:
        Alphabetically sorted list names, or an empty list when
        ``BRONZE_PREFIX`` does not exist in the Bronze container.
    """
    partition_key = "listName="
    logger.info(f"Scanning for Mailchimp lists under '{BRONZE_PREFIX}'...")

    try:
        # get_paths() hands back a lazy pager: the service is only contacted
        # once iteration starts. The listing is therefore consumed inside the
        # try so that a missing prefix is caught here instead of escaping
        # from the loop below (read_bronze_json does the same).
        paths = list(bronze_fs.get_paths(path=BRONZE_PREFIX, recursive=True))
    except ResourceNotFoundError:
        logger.warning(f"No path found at '{BRONZE_PREFIX}'")
        return []

    list_names = set()
    for path in paths:
        if not path.is_directory:
            continue
        for part in path.name.split('/'):
            if part.startswith(partition_key):
                # Strip only the leading key; str.replace would also remove
                # any later occurrence of the key inside the list name itself.
                list_names.add(part[len(partition_key):])

    return sorted(list_names)

# ------------------------------------------------------------------------------
# 6) Read and Merge All JSON Pages for a List
# ------------------------------------------------------------------------------
def read_bronze_json(list_name: str) -> pd.DataFrame:
    """Load every JSON record from the newest ingestion folder of one list.

    The folder ``{BRONZE_PREFIX}/listName={list_name}`` is listed
    non-recursively and, among its ``ingestion_date=`` sub-directories, the
    one with the greatest ``last_modified`` value is selected. Every
    ``.json`` file beneath that folder is downloaded and parsed:

    * a file holding a JSON object contributes one record;
    * a file holding a JSON array contributes one record per element;
    * any other payload, or a file that fails to download or parse, is
      logged and skipped so that one bad page does not abort the list.

    Args:
        list_name: Value of the ``listName=`` partition to read.

    Returns:
        A DataFrame with one row per record plus a ``list_name`` column for
        lineage. An empty DataFrame is returned when the list path or its
        ingestion folders are missing, or when listing the files fails.
    """
    logger.info(f"Reading latest ingestion JSONs for list '{list_name}'...")

    base_path = f"{BRONZE_PREFIX}/listName={list_name}"
    try:
        # Consume the listing inside the try so a missing path is caught here.
        paths = list(bronze_fs.get_paths(path=base_path, recursive=False))
    except ResourceNotFoundError:
        logger.warning(f"No path found at '{base_path}'")
        return pd.DataFrame()

    # Keep only the ingestion_date=... folders directly under the list.
    ingestion_dirs = [
        p for p in paths if p.is_directory and "ingestion_date=" in p.name
    ]
    if not ingestion_dirs:
        logger.warning(f"No ingestion folders found for '{list_name}'")
        return pd.DataFrame()

    # "Latest" is decided by the folder's modification time, not by the date
    # embedded in its name.
    latest_dir = max(ingestion_dirs, key=lambda p: p.last_modified)
    logger.info(f"Using latest ingestion folder: {latest_dir.name}")

    # Read JSONs within that latest folder
    all_records = []
    try:
        files = bronze_fs.get_paths(path=latest_dir.name, recursive=True)
        for entry in files:
            if entry.is_directory or not entry.name.lower().endswith(".json"):
                continue
            try:
                file_client = bronze_fs.get_file_client(entry.name)
                content = file_client.download_file().readall()
                records = json.loads(content)
                if isinstance(records, dict):
                    records = [records]
                elif not isinstance(records, list):
                    # e.g. a bare JSON string would otherwise be spread into
                    # the record list one character at a time.
                    logger.error(
                        f"Unexpected JSON payload in {entry.name}: expected an "
                        f"object or array, got {type(records).__name__}"
                    )
                    continue
                all_records.extend(records)
            except Exception as e:
                # Best effort: report the bad file and carry on with the rest.
                logger.error(f"Failed to parse JSON in {entry.name}: {e}")
    except Exception as e:
        logger.error(f"Error accessing files in {latest_dir.name}: {e}")
        return pd.DataFrame()

    logger.info(f"Loaded {len(all_records)} records from {latest_dir.name}")
    df = pd.DataFrame(all_records)
    # Inject list_name for lineage
    df["list_name"] = list_name
    return df


# ------------------------------------------------------------------------------
# 7) Flatten and Clean Member Records
# ------------------------------------------------------------------------------
def flatten_and_clean(df: pd.DataFrame) -> pd.DataFrame:
    """Flatten nested member fields, order the columns and de-duplicate by email.

    Processing steps:

    1. For every record, a dict stored under ``merge_fields``, ``location``
       or ``stats`` is expanded into ``merge_<key>``, ``location_<key>`` or
       ``stats_<key>`` entries and the nested key is removed. Values that
       are not dicts are left untouched.
    2. Columns are arranged with ``list_name`` first (when present) followed
       by all remaining columns in sorted order. No column is dropped.
    3. When both ``email_address`` and ``last_changed`` are present,
       ``last_changed`` is parsed to datetime (unparseable values become
       ``NaT``) and only the most recently changed row per email address is
       kept.

    Args:
        df: Raw member records, one row per record.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    logger.info("Flattening and cleaning member data...")

    # (nested key, prefix given to the columns produced from it)
    nested_fields = (
        ("merge_fields", "merge_"),
        ("location", "location_"),
        ("stats", "stats_"),
    )

    def flatten_row(row: dict) -> dict:
        """Hoist the entries of each nested dict to top-level prefixed keys."""
        for field, prefix in nested_fields:
            nested = row.get(field)
            if isinstance(nested, dict):
                for k, v in nested.items():
                    row[f"{prefix}{k}"] = v
                # Removed last so a produced key that happens to equal the
                # nested key's own name is discarded with it.
                row.pop(field, None)
        return row

    df_flat = pd.DataFrame(
        [flatten_row(r) for r in df.to_dict(orient="records")]
    )

    # list_name leads (if it exists); everything else follows alphabetically.
    lead_cols = ["list_name"] if "list_name" in df_flat.columns else []
    other_cols = sorted(c for c in df_flat.columns if c != "list_name")
    df_flat = df_flat[lead_cols + other_cols]

    if "email_address" in df_flat.columns and "last_changed" in df_flat.columns:
        df_flat["last_changed"] = pd.to_datetime(df_flat["last_changed"], errors="coerce")
        # Newest first, so keep="first" retains the latest row per address.
        df_flat = (
            df_flat
            .sort_values(by="last_changed", ascending=False)
            .drop_duplicates(subset=["email_address"], keep="first")
        )

    df_flat = df_flat.reset_index(drop=True)
    logger.info(f"Flattened DataFrame shape: {df_flat.shape}")
    return df_flat

# ------------------------------------------------------------------------------
# 8) Write to Silver (overwrite)
# ------------------------------------------------------------------------------
def write_to_silver(df: pd.DataFrame, list_name: str):
    """Serialise ``df`` as CSV and upload it to ``{SILVER_PREFIX}/{list_name}.csv``.

    The CSV is written without the index column and uploaded with
    ``overwrite=True``.

    Args:
        df: Cleaned records to persist.
        list_name: Mailchimp list name; used as the CSV file's base name.
    """
    output_path = f"{SILVER_PREFIX}/{list_name}.csv"

    # With no target given, to_csv returns the CSV text directly.
    csv_text = df.to_csv(index=False)

    silver_fs.get_file_client(output_path).upload_data(csv_text, overwrite=True)
    logger.info(f"Wrote {len(df)} cleaned records to Silver: {output_path}")

# ------------------------------------------------------------------------------
# 9) Orchestrator: Process All Lists
# ------------------------------------------------------------------------------
def process_all_lists():
    """Run the Bronze-to-Silver pipeline for every discovered Mailchimp list.

    Ensures the Silver output directory exists, discovers the list names
    from the Bronze layout, then for each list reads its Bronze records,
    cleans them and writes the result to Silver. Lists that yield no data
    are logged and skipped.
    """
    ensure_directory_exists(silver_fs, SILVER_PREFIX)

    discovered = list_mailchimp_lists()
    logger.info(f"Found {len(discovered)} list(s): {discovered}")

    for name in discovered:
        logger.info(f"Processing list: {name}")
        raw = read_bronze_json(name)

        if not raw.empty:
            write_to_silver(flatten_and_clean(raw), name)
        else:
            logger.warning(f"No data found for '{name}', skipping.")

    logger.info("Bronze-to-Silver ETL complete.")

# ------------------------------------------------------------------------------
# 10) Execute
# ------------------------------------------------------------------------------
# Run the full pipeline immediately when this cell/module is executed.
process_all_lists()
