In [0]:
import re

# A definition must be the first token on its (possibly indented) line, so
# commented-out code such as "# def old(" or text like "undef foo(" is ignored.
_FUNCTION_DEF_PATTERN = re.compile(
    r"^[ \t]*(?:async[ \t]+)?def\s+(\w+)\s*\(",
    re.MULTILINE,
)


def extract_all_function_names(code):
    """
    Extract all function names defined in a Python file.

    The scan is regex based (no parsing), so source that is not valid
    Python is still accepted. Every ``def`` / ``async def`` statement that
    starts a line is reported, at any indentation level, which means nested
    functions and methods are included.

    Args:
        code: Python source text to scan.

    Returns:
        List of function names in the order they appear in ``code``;
        an empty list when nothing matches.
    """
    return _FUNCTION_DEF_PATTERN.findall(code)

In [0]:
import yaml
import os
from collections import defaultdict, deque

# -----------------------------
# CONFIG
# -----------------------------
# Directory scanned by the driver loop at the bottom of this file; every
# sub-directory that contains METADATA_FILE_NAME is treated as one pipeline.
PIPELINES_ROOT = "/Workspace/Users/divakar.c@diggibyte.com/Demo_pipelines"
# Directory that generate_notebook() writes "<pipeline_name>.py" files into.
OUTPUT_DIR = "/Workspace/Users/aakanksha.shrivas@diggibyte.com/DealShare-Project/Dealshare_Standard_Pipeline/Standard_Pipelines"

# Runs as soon as this cell/module executes: creates OUTPUT_DIR (and any
# missing parents) and is a no-op when the directory already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# File name looked up inside each pipeline directory; it is parsed with
# yaml.safe_load and read for its "blocks" list and optional "name".
METADATA_FILE_NAME = "metadata.yaml"

# =========================
# DEFAULT CELL (MANDATORY)
# =========================
# Source text of the first cell of every generated notebook:
# generate_notebook() emits DEFAULT_CODE.strip() before any pipeline block.
# The generated cell puts a mage-ai checkout on sys.path and imports mage_ai.
#
# NOTE(review): inside the generated code, '__file__' is a quoted string, so
# os.path.abspath() resolves the literal name "__file__" rather than the
# notebook's own path. Also, the second argument to os.path.join() is an
# absolute path, so join() returns it unchanged and notebook_dir has no
# effect on mage_ai_path. Confirm whether a path relative to the notebook
# was intended.
DEFAULT_CODE = """# Databricks notebook source
import sys
import os

notebook_dir = os.path.dirname(os.path.abspath('__file__'))
mage_ai_path = os.path.join(
    notebook_dir,
    '/Workspace/Users/aakanksha.shrivas@diggibyte.com/DealShare-Project/mage-ai'
)

print(f"Adding to sys.path: {mage_ai_path}")
if mage_ai_path not in sys.path:
    sys.path.insert(0, mage_ai_path)

import mage_ai
print(f"Successfully imported mage_ai from: {mage_ai.__file__}")
"""

# -----------------------------
# UTIL FUNCTIONS
# -----------------------------
def read_block_code(pipeline_dir, uuid):
    """
    Read the source code of one pipeline block.

    Args:
        pipeline_dir: Directory that holds the pipeline's block files.
        uuid: Block identifier; the block is read from "<uuid>.py" inside
            ``pipeline_dir``.

    Returns:
        The file content with leading and trailing whitespace removed.

    Raises:
        FileNotFoundError: If "<uuid>.py" does not exist in ``pipeline_dir``.
    """
    file_path = os.path.join(pipeline_dir, f"{uuid}.py")
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Missing block file: {file_path}")

    # Decode explicitly as UTF-8 (Python's default source encoding) so the
    # result does not depend on the platform/locale default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read().strip()


def _topological_order(blocks, pipeline_name):
    """Return block uuids ordered so every block follows its upstream blocks."""
    in_degree = {b["uuid"]: 0 for b in blocks}
    graph = defaultdict(list)

    for b in blocks:
        # "upstream_blocks:" with no value loads from YAML as None.
        for upstream in b.get("upstream_blocks") or []:
            if upstream not in in_degree:
                # Without this check the block could never reach in-degree 0
                # and the problem would be misreported as a cycle.
                raise ValueError(
                    f"Block {b['uuid']} in pipeline {pipeline_name} references "
                    f"unknown upstream block: {upstream}"
                )
            graph[upstream].append(b["uuid"])
            in_degree[b["uuid"]] += 1

    # Kahn's algorithm: start from blocks without upstream dependencies and
    # release a block once all of its upstream blocks have been emitted.
    queue = deque(uuid for uuid, degree in in_degree.items() if degree == 0)
    execution_order = []

    while queue:
        node = queue.popleft()
        execution_order.append(node)
        for downstream in graph.get(node, ()):
            in_degree[downstream] -= 1
            if in_degree[downstream] == 0:
                queue.append(downstream)

    if len(execution_order) != len(in_degree):
        raise ValueError(f"Cycle detected in pipeline: {pipeline_name}")

    return execution_order


def _render_call_cell(functions, inputs, output_assignment):
    """Return the source of the cell that invokes a block's functions."""
    if not functions:
        # No function found in the block: keep the historical output, which
        # still defines "<uuid>_out" for downstream blocks.
        return f"{output_assignment}({inputs})"

    # One well-formed call per function. Previously only the last name got
    # "(inputs)" appended and the output variable was bound to the first
    # function object instead of its result.
    # NOTE(review): every function is called with the same upstream inputs;
    # helpers or tests with a different signature need manual adjustment.
    calls = [f"{fn}({inputs})" for fn in functions]
    calls[0] = f"{output_assignment}{calls[0]}"
    return "\n".join(calls)


def generate_notebook(pipeline_dir):
    """
    Generate a Databricks source-format notebook for one pipeline.

    Reads METADATA_FILE_NAME from ``pipeline_dir``, orders the blocks listed
    under "blocks" so that each one comes after its "upstream_blocks", and
    writes "<pipeline_name>.py" into OUTPUT_DIR. For each block the notebook
    contains three cells: a markdown description, the block's source code
    and a cell calling the functions found in that code. The result of a
    block is stored in "<uuid>_out" (except for "data_exporter" blocks) and
    passed to downstream blocks as positional arguments.

    Args:
        pipeline_dir: Directory containing the metadata file and one
            "<uuid>.py" file per block.

    Raises:
        FileNotFoundError: If the metadata file or a block file is missing.
        KeyError: If the metadata has no "blocks" key, or a block has no
            "uuid" / "type".
        ValueError: If a block references an unknown upstream block or the
            dependencies contain a cycle.
    """
    metadata_path = os.path.join(pipeline_dir, METADATA_FILE_NAME)

    with open(metadata_path, "r", encoding="utf-8") as f:
        metadata = yaml.safe_load(f)

    # "blocks:" with no value loads as None; treat it as an empty pipeline.
    blocks = metadata["blocks"] or []
    pipeline_name = metadata.get("name", os.path.basename(pipeline_dir))

    block_map = {b["uuid"]: b for b in blocks}
    execution_order = _topological_order(blocks, pipeline_name)

    # -----------------------------
    # NOTEBOOK BUILD
    # -----------------------------
    lines = [
        "# Databricks notebook source",
        "# AUTO-GENERATED FROM MAGE",
        f"# Pipeline: {pipeline_name}",
        "",
    ]

    def add_cell(content):
        # Each cell is introduced by the Databricks cell separator.
        lines.append("# COMMAND ----------")
        lines.append(content)
        lines.append("")

    # CELL 1: DEFAULT CODE
    add_cell(DEFAULT_CODE.strip())

    # The second line is deliberately not an f-string here: the braces must
    # reach the notebook verbatim so they are formatted when the cell runs.
    add_cell(
        f'PIPELINE_NAME = "{pipeline_name}"\n'
        'print(f"-----------Starting pipeline: {PIPELINE_NAME}-----------")'
    )

    # Blocks
    for block_id in execution_order:
        block = block_map[block_id]
        block_type = block["type"]
        upstream = block.get("upstream_blocks") or []

        code = read_block_code(pipeline_dir, block_id)

        inputs = ", ".join(f"{u}_out" for u in upstream)
        output_assignment = f"{block_id}_out = " if block_type != "data_exporter" else ""

        # Built line by line so every "# MAGIC" marker starts at column 0;
        # the former indented template left 8 leading spaces on all but the
        # first line of the cell.
        markdown_cell = "\n".join([
            "# MAGIC %md",
            f"# MAGIC **Mage block:** {block_id}",
            f"# MAGIC **Type:** {block_type}",
        ])

        functions = extract_all_function_names(code)
        call_cell = _render_call_cell(functions, inputs, output_assignment)

        add_cell(markdown_cell)
        add_cell(code.strip())
        add_cell(call_cell)

    # Footer
    add_cell(
        'print(f"-----------Pipeline {PIPELINE_NAME} completed successfully-----------")'
    )

    # Write notebook
    notebook_path = os.path.join(OUTPUT_DIR, f"{pipeline_name}.py")
    with open(notebook_path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))

    print(f"-----------Generated notebook: {notebook_path}-----------")


# -----------------------------
# PROCESS ALL PIPELINES
# -----------------------------
# Visit every entry under PIPELINES_ROOT. Only directories are considered,
# and only those that contain a metadata file are converted; a failure in
# one pipeline is reported and does not stop the remaining ones.
for pipeline_folder in os.listdir(PIPELINES_ROOT):
    pipeline_dir = os.path.join(PIPELINES_ROOT, pipeline_folder)

    if os.path.isdir(pipeline_dir):
        metadata_path = os.path.join(pipeline_dir, METADATA_FILE_NAME)

        if os.path.exists(metadata_path):
            try:
                generate_notebook(pipeline_dir)
            except Exception as e:
                print(f"**************Failed pipeline {pipeline_folder}: {e}**************")
        else:
            print(f"**************Skipping (no metadata.yaml): {pipeline_folder}**************")