# Orchestrator

**Objective:** Master controller for the ETL pipeline.

**Function:** 
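1. Connects to MotherDuck and queries the `MovieReleases.main.pipeline_control` table.
2. Reads the active steps (`is_active = TRUE`) in execution order (ascending `step_id`).
3. Uses `papermill` to execute the child notebooks (`ingest`, `process`) in sequence, stopping at the first failure.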

In [None]:
# 1. IMPORTS
import duckdb
import pandas as pd
import papermill as pm
import os
import sys
import time
from datetime import datetime
from dotenv import load_dotenv

In [None]:
# 2. SETUP
# Path of the developer's local .env file. Forward slashes keep the literal
# usable on both Windows and Linux.
vLocalEnvPath = "C:/Users/garym/Documents/GitHub/MovieReleases/.env"

if not os.path.exists(vLocalEnvPath):
    # CI/CD mode (GitHub Actions): secrets are expected to be present in the
    # process environment already. load_dotenv() is deliberately not called
    # here; the original author notes it raises AssertionError when the
    # script is run via a pipe (| python).
    print("Local .env not found. Assuming CI/CD environment (Secrets already loaded).")
else:
    # Local mode: populate the environment from the specific .env file.
    load_dotenv(dotenv_path=vLocalEnvPath)
    print(f"Loaded local environment from {vLocalEnvPath}")

# The MotherDuck token is mandatory in either mode; abort early without it.
vMdToken = os.environ.get("MOTHERDUCK_TOKEN")
if not vMdToken:
    raise RuntimeError("MOTHERDUCK_TOKEN missing")

print(f"--- STARTING PIPELINE AT {datetime.now()} ---")

## 3. Fetch Schedule

In [None]:
# Load the ordered list of active pipeline steps from the MotherDuck control
# table into dfSchedule. If the query fails, dfSchedule is left as an empty
# DataFrame so that a fresh environment without the control table does not
# crash the orchestrator.
try:
    print("Connecting to MotherDuck to fetch schedule...")
    con = duckdb.connect(f"md:?motherduck_token={vMdToken}")

    try:
        # Read the control table: active steps only, in execution order.
        vSql = """
            SELECT step_id, notebook_path, description 
            FROM MovieReleases.main.pipeline_control 
            WHERE is_active = TRUE 
            ORDER BY step_id ASC
        """
        dfSchedule = con.sql(vSql).df()
    except Exception as e:
        # Best-effort: a failed query is treated as "nothing to run".
        # The underlying error is printed as well, because this handler also
        # catches failures that are not a missing table.
        print("Pipeline Control table not found. Please run setup SQL.")
        print(f"    Underlying error: {e}")
        dfSchedule = pd.DataFrame()
    finally:
        # Always release the connection, whatever happened to the query.
        con.close()

    if dfSchedule.empty:
        print("No active steps found. Exiting.")
    else:
        print(f"Found {len(dfSchedule)} steps to execute.")

except Exception as e:
    # Chain the original exception so the full traceback is preserved.
    raise RuntimeError(f"Failed to fetch pipeline schedule: {e}") from e

## 4. Execute Pipeline Loop

In [None]:
# Execute every scheduled notebook in order with papermill. Execution stops at
# the first failing step, and the cell then raises RuntimeError so the run is
# reported as failed.
vHasErrors = False
vFailure = None  # exception raised by the failing step, kept for chaining

# Directory that receives the executed copy of each notebook.
vLogDir = "logs"

if not dfSchedule.empty:
    # Loop-invariant: create the log directory once, not once per step.
    os.makedirs(vLogDir, exist_ok=True)

    for _, row in dfSchedule.iterrows():
        vStepId = row['step_id']
        vNotebook = row['notebook_path']
        vDesc = row['description']

        print(f"\n>>> EXECUTION STEP {vStepId}: {vNotebook}")
        print(f"    Description: {vDesc}")

        # Use only the file name for the output notebook. notebook_path may
        # contain directory components (or be absolute); joining it verbatim
        # would point into a sub-directory of logs/ that does not exist.
        vOutputNotebook = os.path.join(
            vLogDir, f"out_{os.path.basename(vNotebook)}"
        )

        try:
            t_start = time.perf_counter()

            # PAPERMILL: runs the notebook and streams its stdout to ours.
            pm.execute_notebook(
                input_path=vNotebook,
                output_path=vOutputNotebook,
                parameters=dict(vResetTable=False),
                kernel_name='python3',
                progress_bar=False,
                stdout_file=sys.stdout,
            )

            t_end = time.perf_counter()
            print(f"    [SUCCESS] Step {vStepId} completed in {round(t_end - t_start, 2)}s")

        except Exception as e:
            print(f"    [FAILURE] Step {vStepId} failed: {e}")
            print(f"    Check output notebook: {vOutputNotebook}")
            vHasErrors = True
            vFailure = e
            # Later steps depend on earlier ones, so do not continue.
            break

if vHasErrors:
    # Chain the step's exception so the root cause appears in the traceback.
    raise RuntimeError("Pipeline Failed") from vFailure
else:
    print("\n--- PIPELINE SUCCESS ---")