# Mailchimp Bronze ETL Notebook

This notebook orchestrates a full extraction of Mailchimp lists and their members.
It connects to Mailchimp's API, fetches all lists and their members, uploads the member data as JSON to Azure Data Lake Storage,
and removes ingestion folders that are older than the configured retention period.


In [0]:
# mailchimp_bronze_etl.ipynb

# 1) Notebook Configuration & Setup

import os
import json
import datetime
import re
import logging
import sys
import traceback
from concurrent.futures import ThreadPoolExecutor
from typing import List, Dict

import mailchimp_marketing as MailchimpMarketing
from mailchimp_marketing.api_client import ApiClientError

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

from azure.identity import DefaultAzureCredential
from azure.storage.filedatalake import DataLakeServiceClient

# Set up the notebook-wide logger. The level is INFO, so the logger.debug(...)
# calls made by the helper functions are not emitted unless this is lowered.
logger = logging.getLogger("MailchimpBronzeETL")
logger.setLevel(logging.INFO)

# Prevent duplicate handlers if the cell is re-run: getLogger() returns the same
# object each time, so a second run would otherwise attach a second handler.
if logger.hasHandlers():
    logger.handlers.clear()

# Log output to notebook/stdout as "<time> <level> <logger name> - <message>"
stream_handler = logging.StreamHandler(sys.stdout)
formatter = logging.Formatter('%(asctime)s %(levelname)s %(name)s - %(message)s')
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)

# Emit one record immediately so a misconfigured handler is noticed right away.
logger.info("Logger initialized and working in Databricks notebook.")


# 2) Configurable Parameters
#
# Module-level constants read by the helper functions and the orchestrator below.

MAILCHIMP_REGION = "us4"  # Mailchimp server region, passed as "server" to the Mailchimp client
# Get the API key from the secret store (Databricks dbutils).
# NOTE(review): `dbutils` is not imported in this notebook; presumably it is
# injected by the Databricks runtime — confirm before running elsewhere.
MAILCHIMP_API_KEY = dbutils.secrets.get(scope="MailchimpSpnetwork", key="MailChimp-API-key")

RETENTION_DAYS = 30  # Ingestion folders whose newest file is older than this many days are cleaned up
MAX_LISTS_PAGE_SIZE = 1000  # Number of lists requested per get_all_lists call
PAGE_SIZE = 1000  # Number of list members requested per get_list_members_info call
MAX_WORKERS = 5  # Size of the thread pool used to process lists in parallel

ACCOUNT_NAME = "mailchimpspnetwork"  # Azure Data Lake account name (used to build the dfs endpoint URL)
BRONZE_CONTAINER = "bronze"  # Azure container (file system) name for bronze-level data

BRONZE_MAILCHIMP_DIR_BASE = "mailchimp_members"  # Base directory for storing Mailchimp data in ADLS



# ## 3) Initialize Azure Data Lake & Mailchimp Clients
#
# We create authenticated clients for Azure Data Lake Storage and Mailchimp API.
# `container_client` and `client` are module-level globals used by every helper below.

logger.info("Initializing ADLS credentials and client.")
# Use DefaultAzureCredential for Azure authentication
credential = DefaultAzureCredential()
# Create a DataLakeServiceClient using the ADLS account URL and credential
service_client = DataLakeServiceClient(
    account_url=f"https://{ACCOUNT_NAME}.dfs.core.windows.net",
    credential=credential
)
# Get a client for the bronze container
container_client = service_client.get_file_system_client(BRONZE_CONTAINER)
# NOTE(review): no storage operation is visibly issued in this cell, so this
# message may only mean the client objects were constructed — confirm.
logger.info("ADLS connection established.")

logger.info("Initializing Mailchimp client.")
# Initialize the Mailchimp client with the provided API key and region
client = MailchimpMarketing.Client()
client.set_config({
    "api_key": MAILCHIMP_API_KEY,
    "server": MAILCHIMP_REGION
})
logger.info(f"Mailchimp client initialized for region '{MAILCHIMP_REGION}'.")



##  Helper Functions

The following functions support various tasks in the ETL:
- **upload_raw_json:** Uploads JSON data to a specified folder in ADLS.
- **cleanup_old_folders:** Deletes ingestion-date folders whose newest file is older than the retention threshold.
- **fetch_all_lists:** Retrieves all Mailchimp lists.
- **safe_sanitize_list_name:** Cleans list names to be file system friendly.
- **delete_existing_files:** Removes files from a folder before new data is written.
- **fetch_list_members:** Fetches members for a given list, handling pagination and storing results.
- **fetch_and_store_members:** Runs the member extraction in parallel across lists.


In [0]:

def cleanup_old_folders():
    """Delete ingestion-date folders whose newest file is older than RETENTION_DAYS.

    Lists every file under BRONZE_MAILCHIMP_DIR_BASE, groups the files by their
    ``<base>/listName=.../ingestion_date=...`` folder and, for each folder whose
    most recently modified file predates the retention cutoff, deletes the
    files and then the directory itself.

    Failures while deleting an individual file or directory are logged and do
    not abort the cleanup. Returns nothing; the number of folders processed for
    deletion is reported in the final log line.
    """
    logger.info(f"Cleaning up ingestion folders older than {RETENTION_DAYS} days.")
    # Timezone-aware "now" so the comparison below works regardless of whether
    # the SDK reports naive or aware timestamps.
    cutoff_time = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=RETENTION_DAYS)

    # An ingestion folder is <base>/listName=X/ingestion_date=Y, i.e. two
    # segments below the base directory.
    base_depth = len(BRONZE_MAILCHIMP_DIR_BASE.strip('/').split('/'))
    folder_depth = base_depth + 2

    # get_paths() returns a pager that can only be consumed once, so both the
    # newest timestamp and the file names are collected in a single pass.
    folder_last_modified_map = {}
    folder_files_map = {}

    for path in container_client.get_paths(path=BRONZE_MAILCHIMP_DIR_BASE, recursive=True):
        if path.is_directory:
            continue
        # Example path: mailchimp_members/listName=XYZ/ingestion_date=2025-03-27/file.json
        parts = path.name.split('/')
        # Skip files that are not inside an ingestion folder; otherwise the
        # file's own path would be mistaken for a folder.
        if len(parts) <= folder_depth:
            continue

        folder_path = '/'.join(parts[:folder_depth])

        last_modified = path.last_modified
        # NOTE(review): treats a naive timestamp as UTC — confirm against the
        # installed azure-storage-file-datalake version.
        if last_modified.tzinfo is None:
            last_modified = last_modified.replace(tzinfo=datetime.timezone.utc)

        folder_files_map.setdefault(folder_path, []).append(path.name)
        previous = folder_last_modified_map.get(folder_path)
        if previous is None or last_modified > previous:
            folder_last_modified_map[folder_path] = last_modified

    delete_count = 0

    for folder_path, last_modified in folder_last_modified_map.items():
        if last_modified >= cutoff_time:
            continue

        logger.info(f"Deleting folder '{folder_path}' (Last modified: {last_modified})")

        # Delete all files in this folder (best effort: one failure must not
        # stop the remaining deletions).
        for file_name in folder_files_map[folder_path]:
            try:
                container_client.delete_file(file_name)
                logger.debug(f"Deleted file: {file_name}")
            except Exception as e:
                logger.warning(f"Failed to delete file: {file_name} — {e}")

        # Try deleting the directory itself (optional, since ADLS Gen2 may manage "folders" virtually)
        try:
            container_client.delete_directory(folder_path)
            logger.debug(f"Deleted directory: {folder_path}")
        except Exception as e:
            logger.debug(f"Skipped directory deletion (virtual folders): {e}")

        delete_count += 1

    logger.info(f"Cleanup completed. {delete_count} folders deleted.")


In [0]:
def upload_raw_json(folder_path: str, file_name: str, raw_json: str):
    """Write a JSON string to ``<folder_path>/<file_name>`` in the bronze container.

    Args:
        folder_path: Destination directory inside the container.
        file_name: Name of the file to create within ``folder_path``.
        raw_json: JSON text to store; it is encoded as UTF-8 before upload.

    An existing file at the same path is overwritten.
    """
    destination = container_client.get_file_client(f"{folder_path}/{file_name}")
    payload = raw_json.encode("utf-8")
    destination.upload_data(payload, overwrite=True)
    logger.debug(f"Uploaded {file_name} to ADLS at {folder_path}")

def fetch_all_lists() -> List[Dict]:
    """Retrieve every Mailchimp list available to the configured client.

    Pages through ``client.lists.get_all_lists`` requesting MAX_LISTS_PAGE_SIZE
    lists per call until the reported ``total_items`` has been reached or a
    page comes back empty.

    Returns:
        The list dicts collected. If a request fails with ApiClientError the
        error is logged and whatever was collected before the failure is
        returned (possibly an empty list).
    """
    offset = 0
    all_lists = []

    while True:
        try:
            # Fetch a page of Mailchimp lists starting at the current offset
            response = client.lists.get_all_lists(count=MAX_LISTS_PAGE_SIZE, offset=offset)
        except ApiClientError as error:
            logger.error(f"Failed to fetch lists at offset {offset}: {error.text}")
            break

        lists_page = response.get("lists", [])
        if not lists_page:
            # Nothing more to read. Without this guard, a total_items value
            # larger than what the API actually returns would loop forever.
            break

        all_lists.extend(lists_page)
        # Advance by the number of lists actually received so that none are
        # skipped if the API returns fewer than requested.
        offset += len(lists_page)

        # If total_items is missing, treat what has been read so far as everything.
        total_items = response.get("total_items", offset)
        if offset >= total_items:
            break

    logger.info(f"Retrieved {len(all_lists)} Mailchimp lists.")
    return all_lists

def safe_sanitize_list_name(raw_name: str) -> str:
    """Return ``raw_name`` with every unsafe character replaced by an underscore.

    Only ASCII letters, ASCII digits, ``_`` and ``-`` are kept; each other
    character (spaces, punctuation, non-ASCII text, newlines) becomes one ``_``.
    """
    def _keep(ch: str) -> bool:
        # ASCII alphanumerics plus the two separators allowed in folder names
        return ch in "_-" or (ch.isascii() and ch.isalnum())

    return "".join(ch if _keep(ch) else "_" for ch in raw_name)

def delete_existing_files(folder_path: str):
    """Remove the files directly inside ``folder_path`` (sub-directories are left alone).

    Best effort: any exception raised while listing or deleting is logged as a
    warning and swallowed, so the caller continues regardless.
    """
    try:
        # Non-recursive listing: only the folder's immediate children are considered
        for entry in container_client.get_paths(path=folder_path, recursive=False):
            if entry.is_directory:
                continue
            container_client.delete_file(entry.name)
            logger.info(f"Deleted existing file: {entry.name}")
    except Exception as exc:
        logger.warning(f"Could not clean folder {folder_path}: {str(exc)}")

def fetch_list_members(list_info: Dict, last_run: str = None):
    """Fetch all members of one Mailchimp list and store them page by page in ADLS.

    Each page of members is written as one JSON file under
    ``<BRONZE_MAILCHIMP_DIR_BASE>/listName=<sanitized name>/ingestion_date=<UTC date>``.
    Files already present in that folder are removed once the first page has
    been fetched successfully, so a re-run on the same day replaces the earlier
    extract, while a failing first request leaves the earlier extract in place.

    Args:
        list_info: Mailchimp list dict; its ``"name"`` and ``"id"`` keys are read.
        last_run: Accepted for interface compatibility; not used by this function.

    An ApiClientError during pagination is logged and ends the extraction for
    this list; pages uploaded before the failure remain in place.
    """
    # Clean the list name to be used in file paths
    list_name_clean = safe_sanitize_list_name(list_info["name"])
    list_id = list_info["id"]

    # Set the ingestion date to the current UTC date
    ingestion_date_str = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d")
    folder_path = f"{BRONZE_MAILCHIMP_DIR_BASE}/listName={list_name_clean}/ingestion_date={ingestion_date_str}"

    offset = 0
    page_num = 1
    partition_cleared = False

    logger.info(f"Fetching ALL members for list '{list_info['name']}' (ID: {list_id})")

    while True:
        try:
            # Fetch a page of list members using pagination
            response = client.lists.get_list_members_info(
                list_id=list_id,
                count=PAGE_SIZE,
                offset=offset
            )
        except ApiClientError as error:
            logger.error(f"Failed to fetch members for {list_name_clean} at offset {offset}: {error.text}")
            break

        if not partition_cleared:
            # Clear today's folder only after the API has answered, so that a
            # failing first request does not destroy an earlier extract.
            logger.info(f"Cleaning up existing files for list '{list_name_clean}' in {folder_path}")
            delete_existing_files(folder_path)
            partition_cleared = True

        members = response.get("members", [])
        if not members:
            logger.info(f"No members returned for {list_name_clean} at offset {offset}. Ending pagination.")
            break

        # Generate a timestamp for the file name to avoid collisions
        timestamp_str = datetime.datetime.now(datetime.timezone.utc).strftime('%Y%m%d%H%M%S')
        file_name = f"{list_name_clean}_page{page_num}_{timestamp_str}.json"
        # Upload the JSON data for the current page
        upload_raw_json(folder_path, file_name, json.dumps(members))

        logger.info(f"Uploaded page {page_num} with {len(members)} members for list '{list_name_clean}'.")

        # If the page returned fewer members than PAGE_SIZE, it's the last page
        if len(members) < PAGE_SIZE:
            logger.info(f"Reached last page for list '{list_name_clean}'.")
            break

        offset += PAGE_SIZE  # Move to the next page offset
        page_num += 1

def fetch_and_store_members(all_lists: List[Dict]):
    """Run ``fetch_list_members`` for every list on a thread pool.

    Args:
        all_lists: Mailchimp list dicts, each handed to ``fetch_list_members``.

    Up to MAX_WORKERS lists are processed concurrently. An exception raised by
    a worker is logged together with its traceback and does not stop the
    remaining lists from being processed.
    """
    logger.info(f"Fetching members for {len(all_lists)} lists in parallel (max_workers={MAX_WORKERS}).")

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        # Queue one task per list up front, then collect results in submission order
        pending = []
        for list_info in all_lists:
            pending.append(pool.submit(fetch_list_members, list_info))

        for task in pending:
            try:
                task.result()
            except Exception as exc:
                logger.error(f"Error in parallel fetch: {exc}\n{traceback.format_exc()}")

    logger.info("All lists processed in parallel.")


## Main Orchestrator

The function `run_mailchimp_bronze_etl()` ties all the steps together:
1. Fetch all Mailchimp lists.
2. Clean up ingestion folders in Azure Data Lake that are older than the retention period.
3. For each list, fetch and store all member data in parallel.

It logs the start and end of the process along with any issues.


In [0]:
def run_mailchimp_bronze_etl():
    """Run the full (non-incremental) Mailchimp bronze extraction.

    Steps, in order: fetch all lists, clean up old ingestion folders, then
    fetch and store the members of every list. When no lists are returned a
    warning is logged and neither cleanup nor member extraction is run.
    """
    logger.info("Mailchimp ETL started (FULL extraction, no incrementals).")

    mailchimp_lists = fetch_all_lists()
    if mailchimp_lists:
        # Retention cleanup runs before the new extract is written
        cleanup_old_folders()

        # Fetch and store members (no incremental filter)
        fetch_and_store_members(mailchimp_lists)

        logger.info("Mailchimp ETL Process Completed Successfully.")
    else:
        logger.warning("No lists found in Mailchimp. Exiting early.")

# Execute the ETL process: running this cell performs the extraction immediately,
# using the module-level clients and parameters defined in the cells above.
run_mailchimp_bronze_etl()
