<h3>Amendments Log</h3>
<table style="width:100%">
  <thead>
    <tr>
      <th style="text-align:left">Version</th>
      <th style="text-align:left">Amended By</th>
      <th style="text-align:left">Date</th>
      <th style="text-align:left">Description</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>1.0</td>
      <td>Gary Manley</td>
      <td>2025-11-30</td>
      <td>Initial Version</td>
    </tr>
  </tbody>
</table>

# Orchestrator

**Objective:** Master controller for the ETL pipeline.

**Function:** 
1. Connects to MotherDuck and queries the `MovieReleases.main.pipeline_control` table.
2. Reads the active steps (`is_active = TRUE`) in ascending `step_id` order.
3. Uses `papermill` to execute the child notebooks (`ingest`, `process`) in sequence.

In [None]:
# 1. SETUP & IMPORTS
import duckdb
import pandas as pd
import papermill as pm
import os
import sys
import time
from datetime import datetime
from dotenv import load_dotenv

# Developer-machine location of the .env file (forward slashes are accepted on Windows and Linux)
vLocalEnvPath = r"C:/Users/garym/Documents/GitHub/MovieReleases/.env"

if not os.path.exists(vLocalEnvPath):
    # CI/CD run (GitHub Actions): secrets are expected to be present as environment variables.
    # load_dotenv() is intentionally skipped here to avoid an AssertionError when the
    # code is executed via a pipe (| python).
    print("Local .env not found. Assuming CI/CD environment (Secrets already loaded).")
else:
    # Local run: read the variables from the explicit .env file
    load_dotenv(dotenv_path=vLocalEnvPath)
    print(f"Loaded local environment from {vLocalEnvPath}")

# The MotherDuck token is mandatory: abort immediately when it is unset or empty
vMdToken = os.environ.get("MOTHERDUCK_TOKEN")
if not vMdToken:
    raise RuntimeError("MOTHERDUCK_TOKEN missing")

print(f"--- STARTING PIPELINE AT {datetime.now()} ---")

In [None]:
# PARAMETERS / CONSTANTS
# Presumably the file name of this orchestrator notebook itself.
# NOTE(review): no other cell visible here reads this constant — confirm it is still needed.
cNotebookName = "orchestrate_pipeline.ipynb"

## 2. Fetch Schedule

In [None]:
# Fetch the ordered list of active pipeline steps from the MotherDuck control table.
#
# Produces:
#   dfSchedule - DataFrame with columns step_id, notebook_path, description;
#                empty when the control table is missing or has no active rows.
# Raises:
#   RuntimeError - when the connection or the query fails for any other reason
#                  (the original exception is chained as the cause).
try:
    print("Connecting to MotherDuck to fetch schedule...")
    vCon = duckdb.connect(f"md:?motherduck_token={vMdToken}")

    try:
        # Read the control table: active steps only, in execution order
        vSql = """
            SELECT step_id, notebook_path, description
            FROM MovieReleases.main.pipeline_control
            WHERE is_active = TRUE
            ORDER BY step_id ASC
        """
        dfSchedule = vCon.sql(vSql).df()
    except (duckdb.CatalogException, duckdb.BinderException) as vSetupError:
        # Only "object could not be resolved" errors are tolerated, so that a fresh
        # environment without the control table does not crash. Every other failure
        # (network, authentication, ...) propagates to the outer handler instead of
        # being misreported as a missing table.
        print("Pipeline Control table not found. Please run setup SQL.")
        print(f"    Details: {vSetupError}")
        dfSchedule = pd.DataFrame()
    finally:
        # Always release the connection, whether the query succeeded or not
        vCon.close()

    if dfSchedule.empty:
        print("No active steps found. Exiting.")
    else:
        print(f"Found {len(dfSchedule)} steps to execute.")

except Exception as e:
    # Chain the cause so the original traceback is preserved
    raise RuntimeError(f"Failed to fetch pipeline schedule: {e}") from e

## 3. Execute Pipeline Loop

In [None]:
# Run every scheduled notebook in order with papermill, stopping at the first failure.
#
# Reads:    dfSchedule (columns step_id, notebook_path, description)
# Writes:   one executed copy per step to logs/out_<notebook file name>
# Raises:   RuntimeError("Pipeline Failed") when a step fails; the step's own
#           exception is chained as the cause.
vHasErrors = False
vFailure = None  # exception raised by the failing step, if any

# Directory that receives the executed notebooks (loop-invariant, so set up once)
vLogDir = "logs"

if not dfSchedule.empty:
    os.makedirs(vLogDir, exist_ok=True)

    for vRow in dfSchedule.itertuples(index=False):
        vStepId = vRow.step_id
        vNotebook = vRow.notebook_path
        vDesc = vRow.description

        print(f"\n>>> EXECUTION STEP {vStepId}: {vNotebook}")
        print(f"    Description: {vDesc}")

        # Use only the file name for the output: a notebook_path containing a
        # directory part would otherwise point into a non-existent logs/out_<dir>/ folder.
        vOutputNotebook = os.path.join(vLogDir, f"out_{os.path.basename(str(vNotebook))}")

        try:
            # perf_counter is monotonic, so the duration is immune to wall-clock adjustments
            vStart = time.perf_counter()

            # PAPERMILL: runs the notebook and saves the executed copy
            pm.execute_notebook(
                input_path=vNotebook,
                output_path=vOutputNotebook,
                parameters=dict(vResetTable=False),
                kernel_name='python3',
                progress_bar=False,
                stdout_file=sys.stdout
            )

            vEnd = time.perf_counter()
            print(f"    [SUCCESS] Step {vStepId} completed in {round(vEnd - vStart, 2)}s")

        except Exception as e:
            print(f"    [FAILURE] Step {vStepId} failed: {e}")
            print(f"    Check output notebook: {vOutputNotebook}")
            vHasErrors = True
            vFailure = e
            # Later steps are not attempted once one step has failed
            break

if vHasErrors:
    # Chain the step's exception so its traceback is not lost
    raise RuntimeError("Pipeline Failed") from vFailure
else:
    print("\n--- PIPELINE SUCCESS ---")