# In [0]:
# mailchimp_bronze_etl.ipynb

# 1) Notebook Configuration & Setup

import os
import json
import datetime
import re
import logging
import sys
import traceback
from concurrent.futures import ThreadPoolExecutor
from typing import List, Dict

import mailchimp_marketing as MailchimpMarketing
from mailchimp_marketing.api_client import ApiClientError

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

from azure.identity import DefaultAzureCredential
from azure.storage.filedatalake import DataLakeServiceClient

# 1.1) Logging Setup (Databricks-friendly)

# One named logger for the whole notebook, emitting INFO and above.
logger = logging.getLogger("MailchimpBronzeETL")
logger.setLevel(logging.INFO)

# Re-running the cell would otherwise stack a second handler and duplicate
# every message; dropping this logger's own handlers first keeps exactly one.
# (Clearing an already-empty handler list is a no-op.)
logger.handlers.clear()

# Single stdout handler so messages appear in the notebook output.
formatter = logging.Formatter('%(asctime)s %(levelname)s %(name)s - %(message)s')
stream_handler = logging.StreamHandler(sys.stdout)
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)

logger.info("Logger initialized and working in Databricks notebook.")


# 2) Configurable Parameters

# Value passed as the "server" setting when the Mailchimp client is configured.
MAILCHIMP_REGION = "us4"
# The API key is read from a secret scope at run time instead of being hard-coded.
# NOTE(review): `dbutils` is not defined in this file - presumably provided by
# the Databricks notebook runtime; confirm before running elsewhere.
MAILCHIMP_API_KEY = dbutils.secrets.get(scope="MailchimpSpnetwork", key="MailChimp-API-key")

# Files under the bronze base directory last modified more than this many days
# ago are deleted by cleanup_old_files().
RETENTION_DAYS = 30
# `count` (and offset step) used per request when paging through lists.
MAX_LISTS_PAGE_SIZE = 1000
# `count` (and offset step) used per request when paging through list members.
PAGE_SIZE = 1000
# Size of the thread pool used to fetch several lists concurrently.
MAX_WORKERS = 5

# Storage account name; interpolated into the ADLS account URL.
ACCOUNT_NAME = "mailchimpspnetwork"
# File system (container) that all uploads, listings and deletes go through.
BRONZE_CONTAINER = "bronze"

# Root directory inside the container for every member file written by this job.
BRONZE_MAILCHIMP_DIR_BASE = "mailchimp_members"


# 3) Initialize Azure Data Lake & Mailchimp Clients

# Module-level clients shared by every helper below (including the worker
# threads started by fetch_and_store_members).
logger.info("Initializing ADLS credentials and client.")
# DefaultAzureCredential is constructed with no arguments, so how it
# authenticates depends on the environment this notebook runs in.
credential = DefaultAzureCredential()
service_client = DataLakeServiceClient(
    account_url=f"https://{ACCOUNT_NAME}.dfs.core.windows.net",
    credential=credential
)
# All reads/writes/deletes in this notebook go through this file-system client.
container_client = service_client.get_file_system_client(BRONZE_CONTAINER)
# NOTE(review): no request is issued in this block, so this message only
# confirms the client objects were constructed - TODO confirm whether the SDK
# validates connectivity here.
logger.info("ADLS connection established.")

logger.info("Initializing Mailchimp client.")
client = MailchimpMarketing.Client()
client.set_config({
    "api_key": MAILCHIMP_API_KEY,
    "server": MAILCHIMP_REGION
})
logger.info(f"Mailchimp client initialized for region '{MAILCHIMP_REGION}'.")


# 4) Helper Functions

def upload_raw_json(folder_path: str, file_name: str, raw_json: str):
    """Write a JSON string to ``<folder_path>/<file_name>`` in the bronze container.

    Args:
        folder_path: Directory path inside the container.
        file_name: Name of the file to create within ``folder_path``.
        raw_json: Text payload; it is UTF-8 encoded before upload.

    The upload is issued with ``overwrite=True``.
    """
    target = container_client.get_file_client(f"{folder_path}/{file_name}")
    payload = raw_json.encode("utf-8")
    target.upload_data(payload, overwrite=True)
    logger.debug(f"Uploaded {file_name} to ADLS at {folder_path}")


def cleanup_old_files():
    """Delete bronze files last modified before the retention cutoff.

    Lists every path under ``BRONZE_MAILCHIMP_DIR_BASE`` recursively and deletes
    each non-directory entry whose ``last_modified`` is more than
    ``RETENTION_DAYS`` days in the past. Directories are never deleted.

    The cutoff is a timezone-aware UTC datetime. A ``last_modified`` value
    without tzinfo is treated as UTC before comparing, so the comparison works
    whether the storage SDK reports naive or aware timestamps (comparing a
    naive with an aware datetime raises ``TypeError``). Entries with no
    ``last_modified`` are skipped rather than deleted.

    Exceptions raised by the listing or delete calls propagate to the caller.
    """
    logger.info(f"Cleaning up files older than {RETENTION_DAYS} days.")
    utc = datetime.timezone.utc
    cutoff_time = datetime.datetime.now(utc) - datetime.timedelta(days=RETENTION_DAYS)

    paths = container_client.get_paths(path=BRONZE_MAILCHIMP_DIR_BASE, recursive=True)
    delete_count = 0

    for path in paths:
        if path.is_directory:
            continue

        last_modified = path.last_modified
        if last_modified is None:
            # Without a timestamp the age is unknown; keep the file.
            continue
        if last_modified.tzinfo is None:
            # NOTE(review): naive timestamps are assumed to be UTC - confirm
            # against the installed azure-storage-file-datalake version.
            last_modified = last_modified.replace(tzinfo=utc)

        if last_modified < cutoff_time:
            logger.debug(f"Deleting {path.name} (Last modified: {path.last_modified})")
            container_client.delete_file(path.name)
            delete_count += 1

    logger.info(f"Cleanup completed. {delete_count} files deleted.")


def fetch_all_lists() -> List[Dict]:
    """Page through the Mailchimp lists endpoint and return every list found.

    Requests ``MAX_LISTS_PAGE_SIZE`` lists per call, advancing the offset by
    the same amount until it reaches the ``total_items`` value reported by the
    response (falling back to the size of the current page when that key is
    absent).

    Returns:
        The accumulated list dictionaries. If a request raises
        ``ApiClientError`` the error is logged, paging stops, and whatever was
        collected so far is returned.
    """
    collected = []
    next_offset = 0
    has_more = True

    while has_more:
        try:
            page = client.lists.get_all_lists(count=MAX_LISTS_PAGE_SIZE, offset=next_offset)
        except ApiClientError as error:
            logger.error(f"Failed to fetch lists at offset {next_offset}: {error.text}")
            break

        batch = page.get("lists", [])
        reported_total = page.get("total_items", len(batch))

        collected.extend(batch)
        next_offset += MAX_LISTS_PAGE_SIZE
        has_more = next_offset < reported_total

    logger.info(f"Retrieved {len(collected)} Mailchimp lists.")
    return collected


def safe_sanitize_list_name(raw_name: str) -> str:
    """Return ``raw_name`` with every character outside ``[a-zA-Z0-9_-]`` replaced by ``_``.

    Each disallowed character is replaced individually, so the result has the
    same length as the input.
    """
    disallowed = re.compile(r"[^a-zA-Z0-9_-]")
    return disallowed.sub("_", raw_name)

def delete_existing_files(folder_path: str):
    """Best-effort removal of the files directly inside ``folder_path``.

    Lists the folder non-recursively and deletes each non-directory entry,
    logging every deletion. Any exception raised while listing or deleting is
    logged as a warning and swallowed, so the caller always continues.
    """
    try:
        for entry in container_client.get_paths(path=folder_path, recursive=False):
            if entry.is_directory:
                continue
            container_client.delete_file(entry.name)
            logger.info(f"Deleted existing file: {entry.name}")
    except Exception as exc:
        logger.warning(f"Could not clean folder {folder_path}: {str(exc)}")

def fetch_list_members(list_info: Dict, last_run: str = None):
    """Download every member page of one Mailchimp list and upload each page as JSON.

    Pages are written to
    ``<BRONZE_MAILCHIMP_DIR_BASE>/listName=<sanitized name>/ingestion_date=<UTC date>``;
    files already present in that folder are removed first (best effort).

    Args:
        list_info: Mailchimp list dictionary; its ``"name"`` and ``"id"`` keys are read.
        last_run: Accepted but not used by this function.

    Paging ends when a request raises ``ApiClientError`` (logged, not
    re-raised), when a page comes back empty, or when a page holds fewer than
    ``PAGE_SIZE`` members. Nothing is returned.
    """
    safe_name = safe_sanitize_list_name(list_info["name"])
    mailchimp_list_id = list_info["id"]

    today = datetime.datetime.utcnow().strftime("%Y-%m-%d")
    target_folder = f"{BRONZE_MAILCHIMP_DIR_BASE}/listName={safe_name}/ingestion_date={today}"

    # Clear this folder first so a same-day re-run does not leave files from
    # the previous run next to the new ones (file names carry a timestamp).
    logger.info(f"Cleaning up existing files for list '{safe_name}' in {target_folder}")
    delete_existing_files(target_folder)

    logger.info(f"Fetching ALL members for list '{list_info['name']}' (ID: {mailchimp_list_id})")

    page_index = 0
    while True:
        # Offsets advance in whole pages; page numbers in file names are 1-based.
        current_offset = page_index * PAGE_SIZE
        page_number = page_index + 1

        try:
            page = client.lists.get_list_members_info(
                list_id=mailchimp_list_id,
                count=PAGE_SIZE,
                offset=current_offset
            )
        except ApiClientError as error:
            logger.error(f"Failed to fetch members for {safe_name} at offset {current_offset}: {error.text}")
            return

        batch = page.get("members", [])
        if not batch:
            logger.info(f"No members returned for {safe_name} at offset {current_offset}. Ending pagination.")
            return

        stamp = datetime.datetime.utcnow().strftime('%Y%m%d%H%M%S')
        upload_raw_json(
            target_folder,
            f"{safe_name}_page{page_number}_{stamp}.json",
            json.dumps(batch),
        )

        logger.info(f"Uploaded page {page_number} with {len(batch)} members for list '{safe_name}'.")

        # A short page means there is nothing left to request.
        if len(batch) < PAGE_SIZE:
            logger.info(f"Reached last page for list '{safe_name}'.")
            return

        page_index += 1



def fetch_and_store_members(all_lists: List[Dict]):
    """Run ``fetch_list_members`` for every list on a thread pool.

    Args:
        all_lists: Mailchimp list dictionaries, one task is submitted per entry.

    Up to ``MAX_WORKERS`` lists are processed concurrently. Results are
    collected in submission order; an exception from one task is logged with
    its traceback and does not prevent the remaining tasks from being awaited.
    """
    logger.info(f"Fetching members for {len(all_lists)} lists in parallel (max_workers={MAX_WORKERS}).")
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        pending = []
        for mailchimp_list in all_lists:
            pending.append(pool.submit(fetch_list_members, mailchimp_list))

        for task in pending:
            try:
                task.result()
            except Exception as exc:
                logger.error(f"Error in parallel fetch: {exc}\n{traceback.format_exc()}")
    logger.info("All lists processed in parallel.")


# 5) Main Orchestrator

def run_mailchimp_bronze_etl():
    """Run the full extraction: fetch lists, store their members, prune old files.

    If no lists are returned, a warning is logged and neither the member fetch
    nor the retention cleanup runs.
    """
    logger.info("Mailchimp ETL started (FULL extraction, no incrementals).")

    mailchimp_lists = fetch_all_lists()
    if mailchimp_lists:
        # Full member dump for every list (no incremental filter).
        fetch_and_store_members(mailchimp_lists)

        # Retention pass runs only after the member fetch has finished.
        cleanup_old_files()

        logger.info("Mailchimp ETL Process Completed Successfully.")
    else:
        logger.warning("No lists found in Mailchimp. Exiting early.")


# 6) Final Cell: Execute the ETL (Databricks doesn't use __main__)
# The pipeline runs as a side effect of executing this cell; there is no
# `if __name__ == "__main__"` guard.
run_mailchimp_bronze_etl()
