In [None]:
# User Modification Settings
# Edit the values in this cell to retarget the generated jobs; the remaining
# cells read these names and should not need changes.

# YAML file describing the export groups and their tables (opened as-is, so a
# relative path is resolved against the current working directory).
config_file_path = "configurations/outbound-s3-export.yaml"
# Each generated job is named "<workflow_name_prefix>_<group_name>".
workflow_name_prefix = "outbound_s3_export"
# Cluster policy and AWS instance profile applied to the job cluster.
policy_id = "001014E447543471"
aws_profile_arn = "arn:aws:iam::951193945349:instance-profile/abacus-compute-pipeline-dev"
# Workspace folder holding the pipeline notebooks. Keep the trailing "/": the
# task notebook path is appended to this value without a separator.
# notebook_base_path = "/Repos/chenxi.gao@abacusinsights.com/ng-pipelines-example/" #prod
notebook_base_path = "/Workspace/Users/chenxi.gao@abacusinsights.com/xport-1540-v2/" #dev
# Timeout used for every task and for the job itself, plus retry behaviour.
timeout_seconds = 1800  # 30 minutes
min_retry_interval_millis = 5000  # 5 seconds
retry_on_timeout_default = False  # tasks with max_retries > 0 retry on timeout regardless
# Job cluster sizing (the same node type is used for driver and workers).
spark_version = "14.3.x-scala2.12"
node_type_id = "i4i.xlarge"
num_workers = 1


In [None]:
import os
import json
import yaml
import logging
from typing import Any, Dict

# Load the YAML configuration and extract the export settings used by the
# job-building cell below.
with open(config_file_path, 'r', encoding='utf-8') as f:
    # safe_load returns None for an empty document; treat that as "no settings"
    # so the validation below reports the problem instead of an AttributeError.
    raw_cfg: Dict[str, Any] = yaml.safe_load(f) or {}

if not isinstance(raw_cfg, dict):
    raise ValueError(f"Top level of {config_file_path} must be a mapping")

# `or {}` / `or []` also cover keys that are present but null in the YAML.
settings = raw_cfg.get('settings') or {}
if not isinstance(settings, dict):
    raise ValueError("'settings' must be a mapping")

catalog = settings.get('catalog')
schema = settings.get('schema')
destination_type = settings.get('destination_type', 's3')
groups = settings.get('groups') or []

if not isinstance(groups, list):
    raise ValueError("'groups' must be a list in settings")

if not catalog or not schema:
    raise ValueError("'catalog' and 'schema' must be defined in settings")


In [None]:
import requests
from databricks_cli.sdk.api_client import ApiClient
from databricks_cli.jobs.api import JobsApi

# Read the workspace API endpoint and an API token from the current notebook
# context, so no credentials have to be written into the notebook.
ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
api_url, api_token = ctx.apiUrl().get(), ctx.apiToken().get()

# databricks-cli client built from those values; used by the job helpers.
api_client = ApiClient(token=api_token, host=api_url)


def _jobs_api_error_message(response) -> str:
    """Return the most useful error text available in a failed Jobs API response."""
    try:
        body = response.json()
    except ValueError:
        # Body is not JSON (for example an HTML error page); fall back to raw text.
        body = None
    if isinstance(body, dict) and body.get('message'):
        return str(body['message'])
    return f"HTTP {response.status_code}: {response.text}"


def _post_jobs_api(endpoint: str, payload: Dict[str, Any], request_timeout: float):
    """POST ``payload`` to ``/api/2.1/jobs/<endpoint>`` using the notebook's token."""
    return requests.post(
        f"{api_url}/api/2.1/jobs/{endpoint}",
        json=payload,
        headers={"Authorization": f"Bearer {api_token}"},
        # Without a timeout a stalled connection would block the notebook forever.
        timeout=request_timeout,
    )


def create_or_reset_job(job_settings: Dict[str, Any], jobs_reset: bool = True,
                        request_timeout: float = 60.0):
    """Create the job described by ``job_settings``, or update it if it exists.

    An existing job is located by listing jobs filtered on the name and then
    requiring an exact name match; the first match is the one modified.

    Args:
        job_settings: Job settings payload; must contain a ``'name'`` key.
        jobs_reset: When True an existing job is overwritten through the
            ``reset`` endpoint; when False the ``update`` endpoint is used.
        request_timeout: Seconds to wait for each HTTP call to the Jobs API.

    Raises:
        KeyError: If ``job_settings`` has no ``'name'``.
        ValueError: If the Jobs API answers with a non-200 status.
        requests.exceptions.RequestException: On connection errors or timeouts.
    """
    job_name = job_settings['name']
    jobs_api = JobsApi(api_client)
    listing = jobs_api.list_jobs(name=job_name) or {}
    # Re-check the name locally so only an exact match is ever overwritten.
    existing_job = next(
        (j for j in listing.get("jobs") or []
         if (j.get("settings") or {}).get("name") == job_name),
        None,
    )

    if existing_job is None:
        response = _post_jobs_api("create", job_settings, request_timeout)
        if response.status_code != 200:
            raise ValueError(f"Job creation failed: {_jobs_api_error_message(response)}")
        logging.info("Job successfully created: %s; id=%s",
                     job_name, response.json().get('job_id'))
        return

    logging.info("%s already exists. Updating/Resetting...", job_name)
    payload = {"job_id": existing_job["job_id"], "new_settings": job_settings}
    response = _post_jobs_api("reset" if jobs_reset else "update", payload, request_timeout)
    if response.status_code != 200:
        raise ValueError(f"Job update failed: {_jobs_api_error_message(response)}")
    logging.info("Job successfully updated: %s", job_name)


In [None]:
# Build a single job per group with one task per table


def _as_param(value: Any, default: str = '') -> str:
    """Render a YAML value as a notebook parameter string.

    ``base_parameters`` is sent to the Jobs API as a string-to-string map, but
    YAML may yield booleans, numbers or null. Null falls back to ``default``,
    booleans become lowercase ``'true'``/``'false'`` (matching the string
    defaults used below), and anything else is converted with ``str``.
    """
    if value is None:
        return default
    if isinstance(value, bool):
        return 'true' if value else 'false'
    return str(value)


# One shared job-cluster definition, referenced by every task through its key.
job_cluster_single = [{
    "job_cluster_key": "s3_export_cluster",
    "new_cluster": {
        "spark_version": spark_version,
        "aws_attributes": {"instance_profile_arn": aws_profile_arn},
        "node_type_id": node_type_id,
        "driver_node_type_id": node_type_id,
        "policy_id": policy_id,
        "num_workers": num_workers,
    },
}]

jobs_payloads = []

for group in groups:
    if not isinstance(group, dict):
        raise ValueError("Each entry in 'groups' must be a mapping")

    group_name = group.get('group_name', 'default')
    # Accepts both YAML booleans and the strings 'true'/'false'.
    alert_on_failure = str(group.get('alert_on_failure', 'false')).lower() == 'true'
    # Tolerate a null value or a single address given as a bare string.
    configured_emails = group.get('alert_emails') or []
    if isinstance(configured_emails, str):
        configured_emails = [configured_emails]
    alert_emails = list(configured_emails) if alert_on_failure else []
    max_retries = int(group.get('max_retries', 0) or 0)
    schedule = group.get('schedule')

    tables = group.get('tables', [])
    if not isinstance(tables, list) or len(tables) == 0:
        logging.warning(f"Group {group_name} has no tables. Skipping.")
        continue

    tasks = []
    for idx, tbl in enumerate(tables, start=1):
        if not isinstance(tbl, dict):
            raise ValueError("Each table entry must include 'source_table'")

        base_parameters = {
            "catalog": _as_param(catalog),
            "schema": _as_param(schema),
            "table_name": _as_param(tbl.get('source_table')),
            "s3_bucket_keyword": _as_param(tbl.get('s3_bucket_keyword'), 'data-lake'),
            "s3_prefix": _as_param(tbl.get('s3_prefix'), ''),
            "s3_filename_generator": _as_param(tbl.get('s3_filename_generator'), 'default'),
            "s3_output_format": _as_param(tbl.get('s3_output_format'), 'parquet'),
            "auto_find_s3_by_keyword": _as_param(tbl.get('auto_find_s3_by_keyword'), 'true'),
            "s3_bucket_full_url": _as_param(tbl.get('s3_bucket_full_url'), ''),
        }
        if not base_parameters["table_name"]:
            raise ValueError("Each table entry must include 'source_table'")

        task = {
            # The position in the list keeps task keys unique within the job.
            "task_key": f"run_pipeline_{idx}",
            "source": "WORKSPACE",
            "notebook_task": {
                "notebook_path": f"{notebook_base_path}notebooks/outbounds/s3/run_pipeline",
                "base_parameters": base_parameters,
            },
            "job_cluster_key": "s3_export_cluster",
            # Groups that configure retries also retry when a task times out.
            "retry_on_timeout": (max_retries > 0) or retry_on_timeout_default,
            "timeout_seconds": timeout_seconds,
            "max_retries": max_retries,
            "min_retry_interval_millis": min_retry_interval_millis,
        }
        tasks.append(task)

    scheduler_dict = None
    if schedule:
        scheduler_dict = {"quartz_cron_expression": f"{schedule}", "timezone_id": "US/Eastern"}

    job_payload = {
        "settings": {
            "name": f"{workflow_name_prefix}_{group_name}",
            "max_concurrent_runs": 1,
            "timeout_seconds": timeout_seconds,
            "email_notifications": {
                "no_alert_for_skipped_runs": True,
                "on_failure": alert_emails,
            },
            "notification_settings": {
                "no_alert_for_skipped_runs": True,
                "no_alert_for_canceled_runs": True,
            },
            "webhook_notifications": {},
            "tasks": tasks,
            "job_clusters": job_cluster_single,
            "format": "MULTI_TASK",
            # The schedule key is omitted entirely for groups without a schedule.
            **({"schedule": scheduler_dict} if scheduler_dict else {}),
        }
    }
    jobs_payloads.append(job_payload)

# Submit only after every payload has been built, so a configuration error in
# any group stops the run before a job is created or modified.
for job in jobs_payloads:
    create_or_reset_job(job["settings"], jobs_reset=True)
    logging.info("Job created/updated successfully")
