<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/CLAUDE4DOT6_ORCHESTRATION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Claude models overview: https://platform.claude.com/docs/en/about-claude/models/overview

Claude platform dashboard: https://platform.claude.com/dashboard

In [None]:
!pip install anthropic -q
!pip install colab-env -q

In [None]:
import colab_env
colab_env.RELOAD()

## CASE1

In [None]:
import os
import json
import time
import anthropic
from concurrent.futures import ThreadPoolExecutor

# 1. SETUP CLIENT
# The key is read from the environment; an unset or empty value aborts the
# cell immediately instead of failing later inside the first API call.
api_key = os.getenv("ANTHROPIC_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("ANTHROPIC_API_KEY environment variable not set.")

# Shared client and model identifier used by every Orchestrator call below.
client = anthropic.Anthropic(api_key=api_key)
MODEL_NAME = "claude-opus-4-6"

class Orchestrator:
    def __init__(self):
        self.task_store = {}
        self.results = {}
        self.start_time = None
        self.total_tasks = 0
        self.token_usage = {'input': 0, 'output': 0}
        self.completion_history = []

    def get_plan(self, goal):
        """Phase 1: The Brain - ask the model for a task DAG and load it.

        Sends ``goal`` to ``MODEL_NAME`` with a system prompt requesting a
        JSON object of the form ``{"tasks": [...]}``, extracts the JSON from
        the reply, validates it and adds the tasks to ``self.task_store`` with
        status ``'PENDING'``.

        The plan is validated in full before it is kept: if any check fails
        (including the cycle check), ``self.task_store`` and
        ``self.total_tasks`` are restored to the values they had on entry.

        Args:
            goal: Natural-language description of the project to plan.

        Returns:
            The parsed plan dict; every task dict gains a ``'status'`` key.

        Raises:
            ValueError: if the reply has no text, is not valid JSON, lacks a
                ``'tasks'`` list, has a task without the required fields,
                repeats a task id, has non-list dependencies, references an
                unknown dependency or contains a cycle.
            Exception: any error raised by the API client is logged and
                re-raised unchanged.
        """
        import re  # local import: `re` is not imported at the top of this cell

        print(f"üß† Brain: Planning via {MODEL_NAME}...")

        text = None  # reply text, kept for the diagnostic print in the handler
        try:
            # NOTE(review): the JSON example inside the system prompt carries a
            # `#` comment, which is not valid JSON. If the model ever echoes
            # it, json.loads below will fail - consider removing it.
            message = client.messages.create(
                model=MODEL_NAME,
                max_tokens=4000,
                system="""You are an AI Architect. Plan the project into a logical Directed Acyclic Graph (DAG).

                Return ONLY valid JSON with this exact structure:
                {
                    "tasks": [
                        {
                            "id": "task1",
                            "description": "Detailed task description",
                            "dependencies": [],  # Array of task IDs this depends on
                            "agent_role": "Role name for this task"
                        }
                    ]
                }

                Rules:
                1. Use simple task IDs like task1, task2, task3
                2. Make dependencies clear and logical
                3. Assign appropriate agent roles
                4. Ensure it's a valid DAG (no circular dependencies)
                5. Include 3-6 tasks for complex projects""",
                messages=[{"role": "user", "content": f"{goal}\n\nReturn ONLY the JSON, no other text."}]
            )

            # Track token usage
            if hasattr(message, 'usage'):
                self.token_usage['input'] += message.usage.input_tokens
                self.token_usage['output'] += message.usage.output_tokens

            # Take the first text block rather than assuming content[0] is one.
            text = next(
                (block.text for block in message.content
                 if getattr(block, 'type', 'text') == 'text' and hasattr(block, 'text')),
                None,
            )
            if text is None:
                raise ValueError("Response contained no text content")

            # Grab the outermost {...} span so surrounding prose or code
            # fences do not break parsing; fall back to the whole reply.
            json_match = re.search(r'\{[\s\S]*\}', text)
            plan = json.loads(json_match.group() if json_match else text)

            # Validate the plan structure into a staging dict first.
            if not isinstance(plan, dict) or 'tasks' not in plan:
                raise ValueError("Response missing 'tasks' key")
            if not isinstance(plan['tasks'], list):
                raise ValueError("'tasks' must be a list")

            required = ('id', 'description', 'dependencies', 'agent_role')
            staged = {}
            for task in plan['tasks']:
                label = task.get('id', 'unknown') if isinstance(task, dict) else 'unknown'
                if not isinstance(task, dict) or not all(key in task for key in required):
                    raise ValueError(f"Task {label} missing required fields")
                if not isinstance(task['dependencies'], list):
                    # A bare string would otherwise be iterated per character.
                    raise ValueError(f"Task {label} has non-list dependencies")
                if task['id'] in staged:
                    raise ValueError(f"Duplicate task id {label}")

                task['status'] = 'PENDING'
                staged[task['id']] = task

            # Commit in place, but keep a snapshot so a failed DAG check
            # leaves the orchestrator exactly as it was.
            snapshot, previous_total = dict(self.task_store), self.total_tasks
            self.task_store.update(staged)
            self.total_tasks = len(self.task_store)
            print(f"‚úì Planned {self.total_tasks} tasks")

            try:
                # Validate DAG has no cycles or unknown dependencies
                self.validate_dag()
            except Exception:
                self.task_store.clear()
                self.task_store.update(snapshot)
                self.total_tasks = previous_total
                raise

            return plan

        except Exception as e:
            print(f"‚ùå Planning Error: {e}")
            if text is not None:
                print("Content received:", text[:500])
            raise

    def validate_dag(self):
        """Validate that the DAG has no circular dependencies."""
        visited = set()
        recursion_stack = set()

        def has_cycle(task_id):
            visited.add(task_id)
            recursion_stack.add(task_id)

            task = self.task_store[task_id]
            for dep in task['dependencies']:
                if dep not in self.task_store:
                    raise ValueError(f"Dependency {dep} not found in tasks")
                if dep not in visited:
                    if has_cycle(dep):
                        return True
                elif dep in recursion_stack:
                    return True

            recursion_stack.remove(task_id)
            return False

        for task_id in self.task_store:
            if task_id not in visited:
                if has_cycle(task_id):
                    raise ValueError(f"Circular dependency detected involving task {task_id}")

        print("‚úì DAG validation passed (no circular dependencies)")

    def execute_task(self, task_id, retries=2):
        """Phase 2: The Worker - With retry logic and completion checks."""
        task = self.task_store[task_id]
        print(f"üöÄ Dispatching [{task['agent_role']}]: {task_id}")

        # Build context from dependencies
        context_parts = []
        for dep in task['dependencies']:
            if dep in self.results:
                # Truncate long dependency results to save tokens
                dep_content = self.results[dep]
                if len(dep_content) > 2000:
                    dep_content = dep_content[:2000] + "...\n[Content truncated for token efficiency]"
                context_parts.append(f"=== Result from {dep} ===\n{dep_content}")

        context_text = "\n\n".join(context_parts) if context_parts else "No dependencies"

        # Adjust tokens based on role
        if task['agent_role'] in ['Technical Writer', 'Content Writer', 'Blog Writer']:
            max_tokens = 4000  # More tokens for writing tasks
        else:
            max_tokens = 3000

        for attempt in range(retries):
            try:
                response = client.messages.create(
                    model=MODEL_NAME,
                    max_tokens=max_tokens,
                    temperature=0.7,
                    system=f"""You are a specialized {task['agent_role']}.

                    IMPORTANT INSTRUCTIONS:
                    1. Provide COMPLETE, self-contained output
                    2. Do NOT cut off mid-sentence or mid-thought
                    3. Structure your response to be complete within token limits
                    4. If discussing multiple sections, ensure ALL are completed
                    5. End with proper conclusion/closure""",
                    messages=[{
                        "role": "user",
                        "content": f"""TASK: {task['description']}

DEPENDENCY CONTEXT:
{context_text}

IMPORTANT: Provide a COMPLETE, FINISHED response. Ensure:
- No unfinished sentences
- No "[section continues]" placeholders
- All promised content is delivered
- Proper ending/closure

Your complete response:"""
                    }]
                )

                # Track token usage
                if hasattr(response, 'usage'):
                    self.token_usage['input'] += response.usage.input_tokens
                    self.token_usage['output'] += response.usage.output_tokens

                result = response.content[0].text

                # Quality check
                quality_ok = self.quality_check(task_id, result)
                if not quality_ok and attempt < retries - 1:
                    print(f"‚ö†Ô∏è Quality check failed for {task_id}, retrying...")
                    continue

                self.results[task_id] = result
                self.task_store[task_id]['status'] = 'COMPLETED'
                self.completion_history.append({
                    'task_id': task_id,
                    'role': task['agent_role'],
                    'timestamp': time.time(),
                    'quality_check': quality_ok
                })

                print(f"‚úÖ Completed: {task_id}")
                return

            except Exception as e:
                if attempt < retries - 1:
                    wait_time = 2 ** attempt  # Exponential backoff
                    print(f"‚ö†Ô∏è Task {task_id} failed (attempt {attempt + 1}/{retries}): {e}. Retrying in {wait_time}s...")
                    time.sleep(wait_time)
                else:
                    print(f"‚ùå Task {task_id} permanently failed after {retries} attempts: {e}")
                    self.task_store[task_id]['status'] = 'FAILED'
                    self.task_store[task_id]['error'] = str(e)
                    # Store error result for dependencies
                    self.results[task_id] = f"[Task failed: {e}]"

    def quality_check(self, task_id, content):
        """Basic quality checks on generated content."""
        task = self.task_store[task_id]

        issues = []

        # Check for minimum length
        if len(content.strip()) < 100:
            issues.append("Content too short (<100 chars)")

        # Check for cut-off sentences (common issue)
        trimmed_content = content.strip()
        if trimmed_content:
            # Check last 100 characters for proper ending
            last_part = trimmed_content[-100:]
            if (not any(punct in last_part[-10:] for punct in '.!?') and
                not any(ending in last_part.lower() for ending in ['conclusion', 'summary', 'thank you', 'end', 'finish'])):
                issues.append("Content may be incomplete (no proper ending)")

        # Check for obvious placeholders
        if any(placeholder in content.lower() for placeholder in
               ['todo:', 'placeholder', '[insert', '...', 'etc.', 'and more']):
            if content.lower().count('...') > 3:  # Multiple ellipses
                issues.append("Content contains many placeholders/ellipses")

        # Role-specific checks
        if task['agent_role'] in ['Technical Writer', 'Blog Writer']:
            if len(content) < 500:
                issues.append("Writing task too short (<500 chars)")

        if issues:
            print(f"‚ö†Ô∏è  Quality issues for {task_id}: {', '.join(issues)}")
            return False
        return True

    def show_progress_bar(self):
        """Visual progress bar."""
        completed = sum(1 for t in self.task_store.values() if t['status'] == 'COMPLETED')
        total = self.total_tasks
        if total == 0:
            return

        bar_length = 40
        filled_length = int(bar_length * completed // total)
        bar = '‚ñà' * filled_length + '‚ñë' * (bar_length - filled_length)
        percentage = (completed / total) * 100

        elapsed = time.time() - self.start_time if self.start_time else 0
        print(f"\rProgress: |{bar}| {completed}/{total} ({percentage:.1f}%) | ‚è±Ô∏è {elapsed:.1f}s", end='', flush=True)

    def print_progress(self):
        """Display execution progress."""
        completed = sum(1 for t in self.task_store.values() if t['status'] == 'COMPLETED')
        total = self.total_tasks
        if total == 0:
            return

        elapsed = time.time() - self.start_time if self.start_time else 0
        percentage = (completed / total) * 100

        # Show visual progress bar
        self.show_progress_bar()

        # Show detailed status (less frequently)
        if int(elapsed) % 6 == 0:  # Every 6 seconds
            print()  # New line after progress bar

            status_counts = {}
            running_tasks = []
            for task_id, task in self.task_store.items():
                status = task['status']
                status_counts[status] = status_counts.get(status, 0) + 1
                if status == 'RUNNING':
                    running_tasks.append(task_id)

            if running_tasks:
                print(f"   Active: {', '.join(running_tasks)}")

    def estimate_cost(self):
        """Estimate API costs."""
        # Claude 4.6 pricing (approximate, check current rates)
        input_cost_per_million = 75.00  # $75 per million input tokens
        output_cost_per_million = 375.00  # $375 per million output tokens

        input_cost = (self.token_usage['input'] / 1_000_000) * input_cost_per_million
        output_cost = (self.token_usage['output'] / 1_000_000) * output_cost_per_million
        total_cost = input_cost + output_cost

        print(f"\nüí∞ Cost Estimation:")
        print(f"   Input tokens: {self.token_usage['input']:,} ‚âà ${input_cost:.4f}")
        print(f"   Output tokens: {self.token_usage['output']:,} ‚âà ${output_cost:.4f}")
        print(f"   Total estimated: ${total_cost:.4f}")

        return total_cost

    def run_orchestration(self, timeout_seconds=600):
        """Phase 3: The Manager loop with timeout."""
        print("‚öôÔ∏è Orchestrator: Starting execution loop...")
        self.start_time = time.time()

        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = {}

            while True:
                current_time = time.time()

                # Check timeout
                if current_time - self.start_time > timeout_seconds:
                    print(f"\n‚è∞ Timeout after {timeout_seconds} seconds")
                    break

                # Show progress
                self.print_progress()

                # Find ready tasks (PENDING with all dependencies COMPLETED)
                ready_tasks = []
                for task_id, task in self.task_store.items():
                    if task['status'] == 'PENDING':
                        # Check all dependencies are COMPLETED
                        deps_ready = True
                        for dep in task['dependencies']:
                            if self.task_store.get(dep, {}).get('status') != 'COMPLETED':
                                deps_ready = False
                                break

                        if deps_ready:
                            ready_tasks.append(task_id)

                # Submit ready tasks
                for task_id in ready_tasks:
                    if task_id not in futures or futures[task_id].done():
                        self.task_store[task_id]['status'] = 'RUNNING'
                        futures[task_id] = executor.submit(self.execute_task, task_id)

                # Check if all tasks are done
                all_done = all(
                    t['status'] in ['COMPLETED', 'FAILED']
                    for t in self.task_store.values()
                )

                if all_done:
                    print("\n\nüéØ All tasks completed!")
                    break

                time.sleep(0.5)  # Small delay to prevent busy waiting

        # Final progress update
        print()  # Clear progress bar line
        self.show_progress_bar()
        print("\n" + "="*50)
        return self.results

# Script entry point: plan a goal, execute the DAG, then report and save results.
if __name__ == "__main__":
    import re  # local import: only needed to sanitise output filenames

    orch = Orchestrator()

    # Example goal
    user_goal = "Create a 3-step technical blog post about Kubernetes security: 1. Research, 2. Write, 3. Review."

    try:
        print("üöÄ Starting DAG Orchestrator")
        print(f"Goal: {user_goal}")
        print("-" * 50)

        plan = orch.get_plan(user_goal)

        # Show the plan
        print("\nüìã Task Plan:")
        for i, task in enumerate(plan['tasks'], 1):
            deps = ', '.join(task['dependencies']) if task['dependencies'] else 'None'
            print(f"  {i}. {task['id']}: {task['description'][:80]}...")
            print(f"     Role: {task['agent_role']}, Depends on: {deps}")

        print("\n" + "="*50)
        print("Starting execution...")
        results = orch.run_orchestration()

        print("\n" + "="*50 + "\nüèÅ BUILD COMPLETE\n" + "="*50)

        # Show results summary
        successful = sum(1 for t in orch.task_store.values() if t['status'] == 'COMPLETED')
        failed = sum(1 for t in orch.task_store.values() if t['status'] == 'FAILED')
        print(f"\nüìà Summary: {successful} successful, {failed} failed")

        # Estimate costs
        orch.estimate_cost()

        # Save results to files
        os.makedirs("output", exist_ok=True)

        # Display results
        print("\nüìÇ Output Files:")
        for tid, content in results.items():
            status = orch.task_store[tid]['status']
            role = orch.task_store[tid]['agent_role']

            # Task ids and roles come from model output, so every character
            # that is not a word character, '.' or '-' becomes '_'. This keeps
            # path separators and other unsafe characters out of the filename.
            safe_stem = re.sub(r'[^\w.-]', '_', f"{tid}_{role}")
            filename = f"output/{safe_stem}.txt"

            # Save to file: short header, separator line, then the text
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(f"Task: {tid}\n")
                f.write(f"Role: {role}\n")
                f.write(f"Status: {status}\n")
                f.write(f"Generated: {time.ctime()}\n")
                f.write("="*60 + "\n\n")
                f.write(content)

            print(f"  {tid}: {filename}")

        # Show completion timeline (seconds since the run started)
        print("\n‚è±Ô∏è Completion Timeline:")
        for entry in orch.completion_history:
            elapsed = entry['timestamp'] - orch.start_time
            quality = "‚úì" if entry['quality_check'] else "‚ö†"
            print(f"  {elapsed:6.1f}s | {quality} {entry['task_id']} ({entry['role']})")

    except Exception as e:
        # Top-level boundary for the notebook cell: report and show the trace.
        print(f"\n‚ùå FATAL ERROR: {e}")
        import traceback
        traceback.print_exc()

In [17]:
# Re-run of the reporting step only. It uses `orch` and `results` left in the
# notebook namespace by the orchestration cell above, without calling the API.
import re  # local to this cell: only needed to sanitise output filenames

print("\n" + "="*50 + "\nüèÅ BUILD COMPLETE\n" + "="*50)

# Show results summary
successful = sum(1 for t in orch.task_store.values() if t['status'] == 'COMPLETED')
failed = sum(1 for t in orch.task_store.values() if t['status'] == 'FAILED')
print(f"\nüìà Summary: {successful} successful, {failed} failed")

# Estimate costs
orch.estimate_cost()

# Save results to files
os.makedirs("output", exist_ok=True)

# Display results
print("\nüìÇ Output Files:")
for tid, content in results.items():
    status = orch.task_store[tid]['status']
    role = orch.task_store[tid]['agent_role']

    # Task ids and roles come from model output, so every character that is
    # not a word character, '.' or '-' becomes '_'. This keeps path separators
    # and other unsafe characters out of the filename.
    safe_stem = re.sub(r'[^\w.-]', '_', f"{tid}_{role}")
    filename = f"output/{safe_stem}.txt"

    # Save to file: short header, separator line, then the text
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(f"Task: {tid}\n")
        f.write(f"Role: {role}\n")
        f.write(f"Status: {status}\n")
        f.write(f"Generated: {time.ctime()}\n")
        f.write("="*60 + "\n\n")
        f.write(content)

    print(f"  {tid}: {filename}")

# Show completion timeline (seconds since the run started)
print("\n‚è±Ô∏è Completion Timeline:")
for entry in orch.completion_history:
    elapsed = entry['timestamp'] - orch.start_time
    quality = "‚úì" if entry['quality_check'] else "‚ö†"
    print(f"  {elapsed:6.1f}s | {quality} {entry['task_id']} ({entry['role']})")


üèÅ BUILD COMPLETE

üìà Summary: 3 successful, 0 failed

üí∞ Cost Estimation:
   Input tokens: 3,570 ‚âà $0.2677
   Output tokens: 20,334 ‚âà $7.6253
   Total estimated: $7.8930

üìÇ Output Files:
  task1: output/task1_Technical_Researcher.txt
  task2: output/task2_Technical_Writer.txt
  task3: output/task3_Technical_Editor.txt

‚è±Ô∏è Completion Timeline:
   121.2s | ‚ö† task1 (Technical Researcher)
   251.6s | ‚ö† task2 (Technical Writer)
   374.9s | ‚ö† task3 (Technical Editor)


## CASE2

In [34]:
import os
import json
import time
import numpy as np
import hashlib
from typing import Dict, List, Optional, Tuple, Any
import anthropic
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, field
from datetime import datetime

# 1. SETUP CLIENT
# The key is read from the environment; an unset or empty value aborts the
# cell immediately instead of failing later inside the first API call.
api_key = os.getenv("ANTHROPIC_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("ANTHROPIC_API_KEY environment variable not set.")

# Shared client and model identifier for the H2E-enabled orchestrator below.
client = anthropic.Anthropic(api_key=api_key)
MODEL_NAME = "claude-opus-4-6"

# H2E Framework Components
@dataclass
class ExpertIntentVector:
    """NEZ (Normalized Expert Zone): the encoded intent for one task.

    Bundles the vector produced for a task's role/description/examples with
    the metadata needed to report on it later.
    """
    task_id: str
    role: str
    vector: np.ndarray
    # Creation time; defaults to the moment the instance is built.
    timestamp: datetime = field(default_factory=datetime.now)
    description: str = ""
    # Reference texts the vector was derived from.
    gold_standard_examples: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict:
        """Return a JSON-serializable dict (vector as list, timestamp as ISO string)."""
        payload = dict(task_id=self.task_id, role=self.role)
        payload["vector"] = self.vector.tolist()
        payload["timestamp"] = self.timestamp.isoformat()
        payload["description"] = self.description
        payload["gold_standard_examples"] = self.gold_standard_examples
        return payload

@dataclass
class AlignmentScore:
    """SROI (Semantic ROI): one alignment measurement for a task output."""
    task_id: str
    score: float               # alignment score of the output
    threshold_required: float  # threshold the score was compared against
    passed: bool
    vector_distance: float     # distance between expert and output vectors
    explanation: str = ""

    def to_dict(self) -> Dict:
        """Return a JSON-serializable dict, coercing numeric fields to builtin types."""
        payload = {"task_id": self.task_id}
        # float()/bool() turn NumPy scalars into plain Python values.
        for name in ("score", "threshold_required"):
            payload[name] = float(getattr(self, name))
        payload["passed"] = bool(self.passed)
        payload["vector_distance"] = float(self.vector_distance)
        payload["explanation"] = self.explanation
        return payload

class H2EAccountabilityEngine:
    """H2E-inspired accountability framework"""

    def __init__(self, calibration_mode: bool = False):
      self.expert_vectors: Dict[str, ExpertIntentVector] = {}  # NEZ: Stores expert intent vectors
      self.alignment_history: List[AlignmentScore] = []  # SROI: Tracks all alignment scores
      self.calibration_mode = calibration_mode  # Controls if we're in calibration or production mode
      self.calibration_data = []  # Collects data for threshold calibration

    def capture_expert_intent(self, task_id: str, role: str,
                             description: str, examples: List[str]) -> ExpertIntentVector:
        """NEZ: encode a task's expert intent and register it under ``task_id``.

        The encoded text combines the role, the description and at most the
        first three examples. Any vector previously stored for ``task_id`` is
        replaced.

        Args:
            task_id: Key under which the vector is stored.
            role: Agent role the intent belongs to.
            description: What good output for this task looks like.
            examples: Reference texts; all are kept on the record, only the
                first three are encoded.

        Returns:
            The newly stored ``ExpertIntentVector``.
        """
        sample_text = ' '.join(examples[:3])
        encoded = self._text_to_vector_enhanced(
            f"{role}: {description}. Examples: {sample_text}"
        )

        record = ExpertIntentVector(
            task_id=task_id,
            role=role,
            vector=encoded,
            description=description,
            gold_standard_examples=examples,
        )
        self.expert_vectors[task_id] = record

        print(f"üìä H2E/NEZ: Captured expert intent for {task_id} ({role})")
        return record

    def _text_to_vector_enhanced(self, text: str) -> np.ndarray:
        """Enhanced vector encoding with better semantic features"""
        text = text.lower().strip()

        # Split into words and remove very short words
        words = [word for word in text.split() if len(word) > 2]

        # Create 768-dimensional vector (like common embedding sizes)
        vector = np.zeros(768)

        # Use multiple hash functions for better distribution
        for i, word in enumerate(words[:100]):  # Limit to first 100 words
            # Use different hash seeds for better coverage
            for seed in [0, 1, 2]:
                hash_val = hash(f"{word}_{seed}") % 10000
                pos = (hash_val + seed * 100) % 768
                val = (hash_val % 100) / 100.0
                vector[pos] += val

        # Add position weighting (earlier words more important)
        for i, word in enumerate(words[:50]):
            pos_weight = 1.0 - (i / 100.0)  # Decrease weight with position
            hash_val = hash(word) % 10000
            pos = hash_val % 768
            vector[pos] += pos_weight * 0.5

        # Normalize with L2 norm
        norm = np.linalg.norm(vector)
        if norm > 0:
            vector = vector / norm
        else:
            # Fallback to uniform distribution
            vector = np.ones(768) / np.sqrt(768)

        return vector

    def calculate_sroi(self, task_id: str, generated_output: str,
                      task_description: str = "") -> AlignmentScore:
        """SROI: score how closely an output matches the task's expert intent.

        The dot product of the expert vector and the output vector is passed
        through a logistic curve centred at 0.5 and clamped to [0.1, 0.9].
        In calibration mode the score is recorded in ``calibration_data`` and
        always passes; otherwise it passes when it reaches the threshold from
        ``_get_required_threshold``.

        If no expert vector exists for ``task_id``, one is captured on the fly
        with role ``"Unknown"``, using the first 500 characters of the output
        itself as the only example.

        Args:
            task_id: Task whose expert vector is the reference.
            generated_output: Text to evaluate.
            task_description: Used for threshold selection and as the
                description of an on-the-fly expert vector.

        Returns:
            The ``AlignmentScore``, which is also appended to
            ``alignment_history``.
        """
        if task_id not in self.expert_vectors:
            self.capture_expert_intent(
                task_id=task_id,
                role="Unknown",
                description=task_description or f"Task {task_id}",
                examples=[generated_output[:500]],
            )

        reference = self.expert_vectors[task_id]
        candidate = self._text_to_vector_enhanced(generated_output)

        # Both vectors are unit length, so the dot product is the cosine.
        raw = np.dot(reference.vector, candidate)
        squashed = 1 / (1 + np.exp(-10 * (raw - 0.5)))
        similarity = max(0.1, min(0.9, squashed))

        threshold = self._get_required_threshold(reference.role, task_description, similarity)

        if not self.calibration_mode:
            passed = similarity >= threshold
        else:
            # Calibration: collect the sample and never fail the output.
            passed = True
            self.calibration_data.append({
                "task_id": task_id,
                "role": reference.role,
                "score": similarity,
                "description": task_description
            })

        distance = np.linalg.norm(reference.vector - candidate)

        # Highest band whose floor the score reaches; otherwise the last label.
        bands = (
            (0.7, "Excellent alignment with expert intent"),
            (0.55, "Good alignment, minor deviations"),
            (0.4, "Moderate alignment, some semantic drift"),
            (0.25, "Poor alignment - significant semantic drift"),
        )
        explanation = next(
            (label for floor, label in bands if similarity >= floor),
            "Very poor alignment - major semantic drift",
        )

        result = AlignmentScore(
            task_id=task_id,
            score=similarity,
            threshold_required=threshold,
            passed=passed,
            vector_distance=distance,
            explanation=explanation,
        )
        self.alignment_history.append(result)
        return result

    def _get_required_threshold(self, role: str, description: str, current_score: float = None) -> float:
        """Get adaptive threshold based on role, risk, and calibration data"""



        # Base thresholds calibrated from previous runs
        base_thresholds = {
            "Research Analyst": 0.532,
            "Technical Writer": 0.350,
            "Technical Reviewer": 0.656,
            "Technical Researcher": 0.45,
            "Writer": 0.50,
            "Editor": 0.55,
            "Reviewer": 0.55
        }

        # Default if role not found
        threshold = base_thresholds.get(role, 0.50)

        # Apply risk multiplier for security content
        description_lower = description.lower()
        security_keywords = ["security", "vulnerability", "risk", "attack", "breach", "hack"]
        if any(keyword in description_lower for keyword in security_keywords):
            threshold *= 1.15  # 15% higher for security
            threshold = min(threshold, 0.75)

        # If we have calibration data, adjust based on historical performance
        if self.calibration_data and current_score is not None:
            # Find similar tasks in calibration data
            similar_scores = [
                d["score"] for d in self.calibration_data
                if d["role"] == role and abs(d["score"] - current_score) < 0.2
            ]
            if similar_scores:
                avg_similar = np.mean(similar_scores)
                # Adjust threshold towards historical average
                threshold = threshold * 0.7 + avg_similar * 0.3

        return round(threshold, 3)

    def analyze_calibration_data(self):
        """Analyze calibration data to recommend thresholds"""
        if not self.calibration_data:
            print("‚ö†Ô∏è No calibration data available")
            return

        # Group by role
        role_data = {}
        for entry in self.calibration_data:
            role = entry["role"]
            if role not in role_data:
                role_data[role] = []
            role_data[role].append(entry["score"])

        print("\nüîß H2E Calibration Analysis:")
        print("="*40)

        recommendations = {}
        for role, scores in role_data.items():
            avg_score = np.mean(scores)
            std_score = np.std(scores) if len(scores) > 1 else 0
            min_score = min(scores)
            max_score = max(scores)

            # Recommended threshold: 1 standard deviation below mean, but not too low
            recommended = max(0.35, avg_score - std_score/2)

            recommendations[role] = recommended

            print(f"\n  {role}:")
            print(f"    Samples: {len(scores)}")
            print(f"    Average: {avg_score:.3f}")
            print(f"    Range: {min_score:.3f} - {max_score:.3f}")
            print(f"    Recommended threshold: {recommended:.3f}")

        return recommendations

    def generate_accountability_report(self) -> Dict[str, Any]:
        """Generate H2E accountability report"""
        if not self.alignment_history:
            return {"status": "No alignment data available"}

        scores = [s.score for s in self.alignment_history]
        avg_score = np.mean(scores) if scores else 0
        std_score = np.std(scores) if len(scores) > 1 else 0

        # Get unique final scores (not retries)
        final_scores = {}
        for score in reversed(self.alignment_history):
            if score.task_id not in final_scores:
                final_scores[score.task_id] = score

        pass_rate = np.mean([1 if s.passed else 0 for s in final_scores.values()])

        report = {
            "h2e_framework_report": {
                "timestamp": datetime.now().isoformat(),
                "calibration_mode": self.calibration_mode,
                "total_tasks_evaluated": len(self.alignment_history),
                "unique_tasks": len(final_scores),
                "average_sroi_score": float(round(avg_score, 4)),
                "sroi_std_dev": float(round(std_score, 4)),
                "alignment_pass_rate": f"{pass_rate*100:.1f}%",
                "expert_vectors_captured": len(self.expert_vectors),
                "recommended_threshold": float(round(max(0.35, avg_score - std_score/2), 3)),
                "task_breakdown": []
            }
        }

        # Add task breakdown
        for task_id, score in final_scores.items():
            report["h2e_framework_report"]["task_breakdown"].append({
                "task_id": task_id,
                "role": self.expert_vectors.get(task_id, ExpertIntentVector("", "", np.zeros(1))).role,
                "sroi_score": float(round(score.score, 3)),
                "required_threshold": float(score.threshold_required),
                "passed": bool(score.passed),
                "explanation": score.explanation
            })

        return report

    def save_report(self, filename: str = "h2e_accountability_report.json"):
        """Save H2E report to file"""
        report = self.generate_accountability_report()

        class H2EJSONEncoder(json.JSONEncoder):
            def default(self, obj):
                if isinstance(obj, np.ndarray):
                    return obj.tolist()
                if isinstance(obj, datetime):
                    return obj.isoformat()
                if isinstance(obj, (ExpertIntentVector, AlignmentScore)):
                    return obj.to_dict()
                return super().default(obj)

        os.makedirs(os.path.dirname(filename) if os.path.dirname(filename) else ".", exist_ok=True)

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2, cls=H2EJSONEncoder)

        print(f"‚úì H2E report saved to {filename}")
        return report

class Orchestrator:
    def __init__(self, calibration_mode: bool = False):
        """Set up empty run state, the H2E engine, and the per-role expert templates.

        Args:
            calibration_mode: forwarded to the H2E engine and stored on the
                orchestrator.
        """
        # Run bookkeeping.
        self.task_store = {}
        self.results = {}
        self.start_time = None
        self.total_tasks = 0
        self.token_usage = {'input': 0, 'output': 0}
        self.completion_history = []

        # H2E Accountability Engine
        self.h2e = H2EAccountabilityEngine(calibration_mode=calibration_mode)
        self.calibration_mode = calibration_mode

        # Three families of gold-standard examples; each is reused by two roles.
        research_examples = (
            "Research Summary: This analysis examines [topic] using [methods]. Key findings include: 1) [finding] 2) [finding] 3) [finding]",
            "Methodology: Data was collected from [sources] and analyzed using [techniques]",
            "Conclusion: Based on the research, we recommend [recommendations]",
        )
        writing_examples = (
            "Introduction: Overview of [topic] and its importance",
            "Implementation: Step-by-step instructions with code examples: ```python\n# Example code\nprint('Hello World')\n```",
            "Best Practices: Key recommendations for successful implementation",
        )
        review_examples = (
            "Overall Assessment: The document is [assessment] with strengths in [areas] and areas for improvement in [areas]",
            "Technical Accuracy: Verified [claims] and validated [examples]",
            "Suggestions: Recommend [specific improvements] for better clarity and accuracy",
        )

        # Order matters: get_plan picks the first key that is a substring of the
        # planner's role, so "Technical Writer" must come before "Writer".
        template_rows = (
            ("Research Analyst", "Comprehensive research with structured findings", research_examples),
            ("Technical Writer", "Clear technical documentation with examples", writing_examples),
            ("Technical Reviewer", "Thorough technical review with specific feedback", review_examples),
            ("Technical Researcher", "In-depth technical research", research_examples),
            ("Writer", "Professional writing", writing_examples),
            ("Editor", "Editorial review", review_examples),
        )

        # Enhanced expert templates based on analysis (one fresh list per role).
        self.expert_templates = {
            role_name: {
                "description": summary,
                "gold_standard_examples": list(examples),
            }
            for role_name, summary, examples in template_rows
        }

    def get_plan(self, goal):
        """Phase 1: ask the planner model for a task DAG and register it.

        Each returned task is stored in `self.task_store` with status
        'PENDING', an expert intent is captured for it in the H2E engine, and
        the resulting graph is validated with `validate_dag`.

        Args:
            goal: natural-language description of the project to plan.

        Returns:
            The parsed plan dict (it contains a 'tasks' list).

        Raises:
            ValueError: if the response lacks 'tasks', a task lacks a required
                field, a task's 'dependencies' is not a list, or the graph is
                invalid.
            Exception: any API or JSON decoding error is printed and re-raised.
        """
        print(f"üß† Brain: Planning via {MODEL_NAME}...")

        # Bound before the try so the error handler can tell whether the API
        # call returned anything, without inspecting locals().
        message = None
        try:
            message = client.messages.create(
                model=MODEL_NAME,
                max_tokens=4000,
                system="""You are an AI Architect. Plan the project into a logical Directed Acyclic Graph (DAG).

                Return ONLY valid JSON with this exact structure:
                {
                    "tasks": [
                        {
                            "id": "task1",
                            "description": "Detailed task description",
                            "dependencies": [],
                            "agent_role": "Role name"
                        }
                    ]
                }

                Rules:
                1. Use simple task IDs like task1, task2, task3
                2. Make dependencies clear and logical
                3. Assign agent roles that match the task (Research Analyst, Technical Writer, Technical Reviewer, etc.)
                4. Ensure it's a valid DAG (no circular dependencies)
                5. Include 3-5 tasks total""",
                messages=[{"role": "user", "content": f"{goal}\n\nReturn ONLY the JSON, no other text."}]
            )

            if hasattr(message, 'usage'):
                self.token_usage['input'] += message.usage.input_tokens
                self.token_usage['output'] += message.usage.output_tokens

            import re
            text = message.content[0].text

            # Greedy match from the first '{' to the last '}' strips any prose or
            # code fence the model wrapped around the JSON object.
            json_match = re.search(r'\{[\s\S]*\}', text)
            plan = json.loads(json_match.group() if json_match else text)

            if 'tasks' not in plan:
                raise ValueError("Response missing 'tasks' key")

            required_fields = ('id', 'description', 'dependencies', 'agent_role')
            for task in plan['tasks']:
                if not all(key in task for key in required_fields):
                    raise ValueError(f"Task {task.get('id', 'unknown')} missing required fields")

                # A string here would later be iterated character by character
                # and reported as a set of bogus missing dependencies.
                if not isinstance(task['dependencies'], list):
                    raise ValueError(f"Task {task['id']} has non-list dependencies")

                task['status'] = 'PENDING'
                self.task_store[task['id']] = task

                role = task['agent_role']

                # First template whose name occurs in the role (case-insensitive).
                role_lower = role.lower()
                template_role = next(
                    (key for key in self.expert_templates if key.lower() in role_lower),
                    None,
                )

                if template_role is not None:
                    examples = self.expert_templates[template_role]['gold_standard_examples']
                else:
                    # Use default template
                    examples = [f"Professional output for {role} focusing on {task['description'][:50]}..."]

                self.h2e.capture_expert_intent(
                    task_id=task['id'],
                    role=role,
                    description=task['description'],
                    examples=examples
                )

            self.total_tasks = len(self.task_store)
            print(f"‚úì Planned {self.total_tasks} tasks")

            self.validate_dag()

            return plan

        except Exception as e:
            print(f"‚ùå Planning Error: {e}")
            if message is not None:
                # Best-effort diagnostic only: a malformed response must not
                # replace the original exception with a new one.
                try:
                    print("Content received:", message.content[0].text[:500])
                except (AttributeError, IndexError, TypeError):
                    print("Content received: <unavailable>")
            raise

    def validate_dag(self):
        """Validate that the DAG has no circular dependencies."""
        visited = set()
        recursion_stack = set()

        def has_cycle(task_id):
            visited.add(task_id)
            recursion_stack.add(task_id)

            task = self.task_store[task_id]
            for dep in task['dependencies']:
                if dep not in self.task_store:
                    raise ValueError(f"Dependency {dep} not found in tasks")
                if dep not in visited:
                    if has_cycle(dep):
                        return True
                elif dep in recursion_stack:
                    return True

            recursion_stack.remove(task_id)
            return False

        for task_id in self.task_store:
            if task_id not in visited:
                if has_cycle(task_id):
                    raise ValueError(f"Circular dependency detected involving task {task_id}")

        print("‚úì DAG validation passed (no circular dependencies)")

    def execute_task(self, task_id, retries=2):
        """Phase 2: Task execution with H2E accountability"""
        task = self.task_store[task_id]
        print(f"üöÄ Dispatching [{task['agent_role']}]: {task_id}")

        # Build context from dependencies
        context_parts = []
        for dep in task['dependencies']:
            if dep in self.results:
                dep_content = self.results[dep]
                if len(dep_content) > 1200:
                    dep_content = dep_content[:1200] + "...\n[Content truncated for efficiency]"
                context_parts.append(f"=== Result from {dep} ===\n{dep_content}")

        context_text = "\n\n".join(context_parts) if context_parts else "No dependencies"

        # Adjust tokens based on role
        if 'Writer' in task['agent_role']:
            max_tokens = 4000
        else:
            max_tokens = 3000

        for attempt in range(retries):
            try:
                # Enhanced prompt for better alignment
                system_prompt = f"""You are a {task['agent_role']}. Your task is: {task['description']}

H2E ACCOUNTABILITY REQUIREMENTS:
1. Provide a COMPLETE, self-contained response
2. Use clear structure with headings/sections
3. Include specific examples or evidence
4. End with a proper conclusion or summary
5. Maintain professional tone and technical accuracy

OUTPUT FORMAT GUIDELINES:
- Start with a clear title or heading
- Use sections like Introduction, Main Content, Conclusion
- Include bullet points or numbered lists where helpful
- Add code examples in markdown format if relevant
- Ensure the response is comprehensive and addresses all aspects of the task"""

                user_prompt = f"""TASK DESCRIPTION:
{task['description']}

CONTEXT FROM DEPENDENCIES:
{context_text}

INSTRUCTIONS:
1. Provide a complete, well-structured response
2. Use appropriate formatting (headings, lists, etc.)
3. Include specific details and examples
4. Ensure technical accuracy
5. Conclude with a summary or key takeaways

Your professional response:"""

                response = client.messages.create(
                    model=MODEL_NAME,
                    max_tokens=max_tokens,
                    temperature=0.7,
                    system=system_prompt,
                    messages=[{
                        "role": "user",
                        "content": user_prompt
                    }]
                )

                if hasattr(response, 'usage'):
                    self.token_usage['input'] += response.usage.input_tokens
                    self.token_usage['output'] += response.usage.output_tokens

                result = response.content[0].text

                # H2E: Calculate alignment score
                alignment_score = self.h2e.calculate_sroi(
                    task_id=task_id,
                    generated_output=result,
                    task_description=task['description']
                )

                print(f"   H2E/SROI: Alignment score: {alignment_score.score:.3f} " +
                      f"(Required: {alignment_score.threshold_required}) - {alignment_score.explanation}")

                # Quality check
                quality_ok = self.quality_check(task_id, result, alignment_score)

                # In calibration mode, don't retry for alignment failures
                if not self.calibration_mode and not alignment_score.passed and attempt < retries - 1:
                    print(f"‚ö†Ô∏è H2E alignment failed for {task_id} (score: {alignment_score.score:.3f}), retrying...")
                    continue

                if not quality_ok and attempt < retries - 1:
                    print(f"‚ö†Ô∏è Quality check failed for {task_id}, retrying...")
                    continue

                self.results[task_id] = result
                self.task_store[task_id]['status'] = 'COMPLETED'
                self.task_store[task_id]['h2e_score'] = alignment_score.score
                self.task_store[task_id]['h2e_passed'] = alignment_score.passed

                self.completion_history.append({
                    'task_id': task_id,
                    'role': task['agent_role'],
                    'timestamp': time.time(),
                    'quality_check': quality_ok,
                    'h2e_score': alignment_score.score,
                    'h2e_passed': alignment_score.passed,
                    'attempt': attempt + 1
                })

                status_icon = "üìä" if self.calibration_mode else "‚úÖ"
                print(f"{status_icon} Completed: {task_id} (H2E: {alignment_score.score:.3f}, Attempt: {attempt + 1})")
                return

            except Exception as e:
                if attempt < retries - 1:
                    wait_time = 2 ** attempt
                    print(f"‚ö†Ô∏è Task {task_id} failed (attempt {attempt + 1}/{retries}): {e}. Retrying in {wait_time}s...")
                    time.sleep(wait_time)
                else:
                    print(f"‚ùå Task {task_id} permanently failed after {retries} attempts: {e}")
                    self.task_store[task_id]['status'] = 'FAILED'
                    self.task_store[task_id]['error'] = str(e)
                    self.results[task_id] = f"[Task failed: {e}]"

    def quality_check(self, task_id, content, alignment_score=None):
        """Enhanced quality checks with H2E alignment"""
        task = self.task_store[task_id]

        issues = []

        # Basic length check (role-specific)
        if 'Writer' in task['agent_role']:
            if len(content.strip()) < 600:
                issues.append("Writing task too short (<600 chars)")
        else:
            if len(content.strip()) < 300:
                issues.append("Content too short (<300 chars)")

        # H2E alignment check (skip in calibration mode)
        if not self.calibration_mode and alignment_score and not alignment_score.passed:
            issues.append(f"H2E alignment failed (score: {alignment_score.score:.3f}, required: {alignment_score.threshold_required})")

        # Check for structure
        has_structure = any(marker in content for marker in ['#', '##', '###', '1.', '2.', '3.', '- ', '* ', '‚Ä¢ '])
        if not has_structure and len(content) > 400:
            issues.append("Content lacks clear structure (no headings/lists)")

        # Check for completeness (proper ending)
        trimmed_content = content.strip()
        if trimmed_content:
            # Check if ends with proper punctuation
            if not any(trimmed_content.endswith(punct) for punct in ['.', '!', '?', '```']):
                # Check last 50 chars for conclusion indicators
                last_part = trimmed_content[-50:].lower()
                if not any(indicator in last_part for indicator in ['conclusion', 'summary', 'finally', 'in summary']):
                    issues.append("Content may lack proper conclusion")

        if issues:
            print(f"‚ö†Ô∏è  Quality issues for {task_id}: {', '.join(issues)}")
            return False
        return True

    def show_progress_bar(self):
        """Visual progress bar with H2E indicators"""
        completed = sum(1 for t in self.task_store.values() if t['status'] == 'COMPLETED')
        total = self.total_tasks
        if total == 0:
            return

        h2e_scores = [t.get('h2e_score', 0) for t in self.task_store.values()
                     if t['status'] == 'COMPLETED' and 'h2e_score' in t]
        avg_h2e = np.mean(h2e_scores) if h2e_scores else 0

        bar_length = 40
        filled_length = int(bar_length * completed // total)
        bar = '‚ñà' * filled_length + '‚ñë' * (bar_length - filled_length)
        percentage = (completed / total) * 100

        elapsed = time.time() - self.start_time if self.start_time else 0

        mode_indicator = " [CAL]" if self.calibration_mode else ""
        h2e_indicator = f" | H2E: {avg_h2e:.3f}" if h2e_scores else ""
        print(f"\rProgress{mode_indicator}: |{bar}| {completed}/{total} ({percentage:.1f}%){h2e_indicator} | ‚è±Ô∏è {elapsed:.1f}s", end='', flush=True)

    def print_progress(self):
        """Display execution progress"""
        completed = sum(1 for t in self.task_store.values() if t['status'] == 'COMPLETED')
        total = self.total_tasks
        if total == 0:
            return

        elapsed = time.time() - self.start_time if self.start_time else 0

        self.show_progress_bar()

        # Show running tasks occasionally
        if int(elapsed) % 10 == 0:
            print()

            running_tasks = []
            for task_id, task in self.task_store.items():
                if task['status'] == 'RUNNING':
                    h2e_info = ""
                    if 'h2e_score' in task:
                        h2e_info = f" (H2E: {task['h2e_score']:.3f})"
                    running_tasks.append(f"{task_id}{h2e_info}")

            if running_tasks:
                print(f"   Active: {', '.join(running_tasks)}")

    def estimate_cost(self):
        """Estimate API costs"""
        input_cost_per_million = 75.00
        output_cost_per_million = 375.00

        input_cost = (self.token_usage['input'] / 1_000_000) * input_cost_per_million
        output_cost = (self.token_usage['output'] / 1_000_000) * output_cost_per_million
        total_cost = input_cost + output_cost

        print(f"\nüí∞ Cost Estimation:")
        print(f"   Input tokens: {self.token_usage['input']:,} ‚âà ${input_cost:.4f}")
        print(f"   Output tokens: {self.token_usage['output']:,} ‚âà ${output_cost:.4f}")
        print(f"   Total estimated: ${total_cost:.4f}")

        return total_cost

    def run_orchestration(self, timeout_seconds=600):
        """Phase 3: Execution loop with H2E accountability"""
        mode_label = "CALIBRATION" if self.calibration_mode else "ACCOUNTABLE"
        print(f"‚öôÔ∏è Orchestrator: Starting execution loop...")
        print(f"üîí H2E Accountability Framework: {mode_label}")
        self.start_time = time.time()

        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = {}

            while True:
                current_time = time.time()

                if current_time - self.start_time > timeout_seconds:
                    print(f"\n‚è∞ Timeout after {timeout_seconds} seconds")
                    break

                self.print_progress()

                ready_tasks = []
                for task_id, task in self.task_store.items():
                    if task['status'] == 'PENDING':
                        deps_ready = True
                        for dep in task['dependencies']:
                            if self.task_store.get(dep, {}).get('status') != 'COMPLETED':
                                deps_ready = False
                                break

                        if deps_ready:
                            ready_tasks.append(task_id)

                for task_id in ready_tasks:
                    if task_id not in futures or futures[task_id].done():
                        self.task_store[task_id]['status'] = 'RUNNING'
                        futures[task_id] = executor.submit(self.execute_task, task_id)

                all_done = all(
                    t['status'] in ['COMPLETED', 'FAILED']
                    for t in self.task_store.values()
                )

                if all_done:
                    print("\n\nüéØ All tasks completed!")
                    break

                time.sleep(0.5)

        print()
        self.show_progress_bar()
        print("\n" + "="*50)
        return self.results

if __name__ == "__main__":
    # Demo driver: plan a goal, execute the DAG, then write the H2E report,
    # one text file per task, and the completion history under ./output.

    # CALIBRATION MODE: Set to True for first run to establish baselines
    # Set to False for production runs with accountability
    calibration_mode = True  # First run should be calibration

    orch = Orchestrator(calibration_mode=calibration_mode)

    user_goal = "Create a 3-step technical blog post about Kubernetes security: 1. Research, 2. Write, 3. Review."

    try:
        mode_str = "CALIBRATION" if calibration_mode else "ACCOUNTABLE"
        print(f"üöÄ Starting DAG Orchestrator with H2E {mode_str} Mode")
        print(f"Goal: {user_goal}")
        print("-" * 50)

        plan = orch.get_plan(user_goal)

        print("\nüìã Task Plan:")
        for i, task in enumerate(plan['tasks'], 1):
            deps = ', '.join(task['dependencies']) if task['dependencies'] else 'None'
            print(f"  {i}. {task['id']}: {task['description'][:80]}...")
            print(f"     Role: {task['agent_role']}, Depends on: {deps}")

        print("\n" + "="*50)
        print("Starting execution...")
        results = orch.run_orchestration()

        print("\n" + "="*50 + "\nüèÅ BUILD COMPLETE\n" + "="*50)

        successful = sum(1 for t in orch.task_store.values() if t['status'] == 'COMPLETED')
        failed = sum(1 for t in orch.task_store.values() if t['status'] == 'FAILED')

        h2e_scores = [t.get('h2e_score', 0) for t in orch.task_store.values()
                     if t.get('status') == 'COMPLETED' and 'h2e_score' in t]
        avg_h2e = np.mean(h2e_scores) if h2e_scores else 0
        h2e_passed = sum(1 for t in orch.task_store.values()
                        if t.get('h2e_passed', False))

        print(f"\nüìà Summary:")
        print(f"   Tasks: {successful} successful, {failed} failed")

        if calibration_mode:
            print(f"   H2E Calibration Scores: Avg {avg_h2e:.3f}")
            # Analyze calibration data
            recommendations = orch.h2e.analyze_calibration_data()
            print(f"\nüí° Recommendations for production thresholds:")
            for role, threshold in recommendations.items():
                print(f"   {role}: {threshold:.3f}")
        else:
            print(f"   H2E Alignment: {h2e_passed}/{successful} passed, Avg score: {avg_h2e:.3f}")

        # Generate and display H2E report
        print("\nüìä H2E Accountability Report:")
        h2e_report = orch.h2e.save_report("output/h2e_accountability_report.json")

        if 'h2e_framework_report' in h2e_report:
            report_data = h2e_report['h2e_framework_report']
            print(f"   Timestamp: {report_data.get('timestamp', 'N/A')}")
            print(f"   Tasks Evaluated: {report_data.get('total_tasks_evaluated', 0)}")
            print(f"   Average SROI: {report_data.get('average_sroi_score', 0):.3f}")
            print(f"   Alignment Pass Rate: {report_data.get('alignment_pass_rate', '0%')}")
            print(f"   Recommended Threshold: {report_data.get('recommended_threshold', 0):.3f}")

            if 'task_breakdown' in report_data:
                print(f"\n   Task Breakdown:")
                for task in report_data['task_breakdown']:
                    status = "‚úÖ" if task['passed'] else "‚ùå"
                    print(f"     {status} {task['task_id']} ({task['role']}): {task['sroi_score']:.3f} (required: {task['required_threshold']}) - {task['explanation']}")

        orch.estimate_cost()

        os.makedirs("output", exist_ok=True)

        # Task IDs and roles come from the planner model, so they are untrusted
        # when used in a file name: collapse every run of characters other than
        # word characters, '.', and '-' into '_' so a value cannot introduce
        # path separators. Ordinary roles such as "Research Analyst" produce
        # the same file names as before.
        import re
        unsafe_path_chars = re.compile(r'[^\w.-]+')

        print("\nüìÇ Output Files:")
        for tid, content in results.items():
            status = orch.task_store[tid]['status']
            role = orch.task_store[tid]['agent_role']
            h2e_score = orch.task_store[tid].get('h2e_score', 'N/A')

            clean_tid = unsafe_path_chars.sub('_', str(tid))
            clean_role = unsafe_path_chars.sub('_', str(role))
            filename = f"output/{clean_tid}_{clean_role}.txt"

            with open(filename, 'w', encoding='utf-8') as f:
                f.write(f"Task: {tid}\n")
                f.write(f"Role: {role}\n")
                f.write(f"Status: {status}\n")
                f.write(f"H2E Alignment Score: {h2e_score}\n")
                f.write(f"Generated: {time.ctime()}\n")
                f.write("="*60 + "\n\n")
                f.write(content)

            print(f"  {tid}: {filename} (H2E: {h2e_score})")

        # Save completion history (explicit encoding, matching the other writes)
        with open("output/completion_history.json", "w", encoding="utf-8") as f:
            json.dump(orch.completion_history, f, indent=2, default=str)
        print("  Completion History: output/completion_history.json")

    except Exception as e:
        # Top-level boundary of the demo: report the failure with a traceback.
        print(f"\n‚ùå FATAL ERROR: {e}")
        import traceback
        traceback.print_exc()

🚀 Starting DAG Orchestrator with H2E CALIBRATION Mode
Goal: Create a 3-step technical blog post about Kubernetes security: 1. Research, 2. Write, 3. Review.
--------------------------------------------------
🧠 Brain: Planning via claude-opus-4-6...
📊 H2E/NEZ: Captured expert intent for task1 (Research Analyst)
📊 H2E/NEZ: Captured expert intent for task2 (Technical Writer)
📊 H2E/NEZ: Captured expert intent for task3 (Technical Reviewer)
✓ Planned 3 tasks
✓ DAG validation passed (no circular dependencies)

📋 Task Plan:
  1. task1: Research Kubernetes security best practices, common vulnerabilities (such as mis...
     Role: Research Analyst, Depends on: None
  2. task2: Using the research brief from task1, write a comprehensive technical blog post a...
     Role: Technical Writer, Depends on: task1
  3. task3: Review the blog post from task2 for technical accuracy, completeness, clarity, a...
     Role: Technical Reviewer, Depends on: task2

Starting execution...
‚ö

## DS-H2E

In [38]:
import os
import json
import time
import numpy as np
import hashlib
from typing import Dict, List, Optional, Tuple, Any, Set
import anthropic
from concurrent.futures import ThreadPoolExecutor, Future
from dataclasses import dataclass, field
from datetime import datetime
import re
import threading
from queue import PriorityQueue
from enum import Enum

# 1. SETUP CLIENT
# Fail fast, before any request is attempted, when the credential is absent
# or empty.
api_key = os.getenv("ANTHROPIC_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("ANTHROPIC_API_KEY environment variable not set.")

# Module-level client and model name used by the definitions that follow.
client = anthropic.Anthropic(api_key=api_key)
MODEL_NAME = "claude-opus-4-6"  # Using the original model name

class TaskStatus(Enum):
    """Closed set of task lifecycle state labels.

    Each member's value is the same upper-case string as its name.
    """
    PENDING = "PENDING"
    RUNNING = "RUNNING"
    COMPLETED = "COMPLETED"
    FAILED = "FAILED"

# H2E Framework Components
@dataclass
class ExpertIntentVector:
    """NEZ (Normalized Expert Zone): the encoded expert intent for one task.

    Holds the intent vector with the task ID, role, description and
    gold-standard examples stored alongside it, stamped with its creation time.
    """
    task_id: str
    role: str
    vector: np.ndarray
    timestamp: datetime = field(default_factory=datetime.now)
    description: str = ""
    gold_standard_examples: List[str] = field(default_factory=list)
    complexity_score: float = 0.5  # Default medium complexity

    def to_dict(self) -> Dict:
        """Return a JSON-serializable view of this record.

        The vector becomes a plain list, the timestamp an ISO-8601 string, and
        the complexity score a built-in float.
        """
        payload = {"task_id": self.task_id, "role": self.role}
        payload["vector"] = self.vector.tolist()
        payload["timestamp"] = self.timestamp.isoformat()
        payload["description"] = self.description
        payload["gold_standard_examples"] = self.gold_standard_examples
        payload["complexity_score"] = float(self.complexity_score)
        return payload

@dataclass
class AlignmentScore:
    """SROI (Semantic ROI): one alignment measurement for a task.

    Records the score, the threshold it was compared against, the pass/fail
    outcome, and supporting values.
    """
    task_id: str
    score: float
    threshold_required: float
    passed: bool
    vector_distance: float
    explanation: str = ""
    confidence: float = 0.5
    raw_similarity: float = 0.0  # Raw cosine similarity before scaling

    def to_dict(self) -> Dict:
        """Return a JSON-serializable view with numeric fields coerced to built-ins.

        Every numeric field goes through float() and `passed` through bool(),
        so NumPy scalars do not leak into the output.
        """
        return dict(
            task_id=self.task_id,
            score=float(self.score),
            threshold_required=float(self.threshold_required),
            passed=bool(self.passed),
            vector_distance=float(self.vector_distance),
            explanation=self.explanation,
            confidence=float(self.confidence),
            raw_similarity=float(self.raw_similarity),
        )

class H2EAccountabilityEngine:
    """H2E-inspired accountability framework with enhanced vector encoding"""

    def __init__(self, calibration_mode: bool = False):
        """Create an engine whose stores all start empty.

        Args:
            calibration_mode: True for a calibration run, False for a
                production run.
        """
        self.calibration_mode = calibration_mode

        # NEZ: expert intent vectors, keyed by task id
        self.expert_vectors: Dict[str, ExpertIntentVector] = dict()
        # SROI: every alignment score ever produced, in order
        self.alignment_history: List[AlignmentScore] = list()

        # Samples gathered for threshold calibration
        self.calibration_data: List[Dict] = list()
        # Past scores grouped by role
        self.role_performance_history: Dict[str, List[float]] = dict()
        # Memoised text -> vector encodings
        self._vector_cache: Dict[str, np.ndarray] = dict()

    def _compute_text_complexity(self, text: str) -> float:
        """Estimate how complex *text* is on a 0-1 scale.

        The score blends three signals:
          * length (weight 0.4): word count, saturating at 200 words;
          * vocabulary richness (weight 0.3): unique words / total words;
          * technical density (weight 0.3): share of words that are known
            technical terms, multiplied by 5 and capped at 1.

        Args:
            text: Text to score.

        Returns:
            A float in [0, 1]; 0.5 when *text* is empty or None.
        """
        if not text:
            return 0.5

        words = text.split()
        word_count = len(words)
        denominator = max(word_count, 1)  # whitespace-only text yields no words

        # Factors: length, vocabulary diversity, technical terms
        vocab_richness = len(set(words)) / denominator

        technical_indicators = frozenset((
            'algorithm', 'architecture', 'security', 'implementation',
            'optimization', 'performance', 'scalability', 'vulnerability',
            'kubernetes', 'container', 'microservice', 'orchestration',
        ))
        # Strip punctuation glued to a word so "security," or "(kubernetes)"
        # still counts as a technical term.
        edge_punctuation = ".,;:!?\"'`()[]{}<>*_"
        technical_hits = sum(
            1 for word in words
            if word.strip(edge_punctuation).lower() in technical_indicators
        )
        technical_density = technical_hits / denominator

        complexity = (
            0.4 * min(1.0, word_count / 200) +      # Length factor
            0.3 * vocab_richness +                  # Vocabulary richness
            0.3 * min(1.0, technical_density * 5)   # Technical density
        )

        return min(1.0, max(0.0, complexity))

    def capture_expert_intent(self, task_id: str, role: str,
                             description: str, examples: List[str]) -> ExpertIntentVector:
        """NEZ: encode and register the expert intent for one task.

        The intent text combines the role, the description and the first
        three examples; the complexity score is derived from the description
        alone. Any vector already stored under *task_id* is replaced.

        Args:
            task_id: Key under which the vector is stored.
            role: Agent role responsible for the task.
            description: Task description.
            examples: Gold-standard examples; all are kept on the record.

        Returns:
            The newly stored ExpertIntentVector.
        """
        sample_text = ' '.join(examples[:3])
        encoded = self._text_to_vector_enhanced(
            f"{role}: {description}. Examples: {sample_text}"
        )
        complexity = self._compute_text_complexity(description)

        record = ExpertIntentVector(
            task_id=task_id,
            role=role,
            vector=encoded,
            description=description,
            gold_standard_examples=examples,
            complexity_score=complexity,
        )
        self.expert_vectors[task_id] = record

        # Ensure the role has a history bucket without disturbing existing scores
        self.role_performance_history.setdefault(role, [])

        print(f"üìä H2E/NEZ: Captured expert intent for {task_id} ({role}, complexity: {complexity:.2f})")
        return record

    def _text_to_vector_enhanced(self, text: str) -> np.ndarray:
        """Enhanced vector encoding with semantic features - FIXED VERSION"""
        # Cache vectors to avoid recomputation
        cache_key = hashlib.md5(text.encode()).hexdigest()
        if cache_key in self._vector_cache:
            return self._vector_cache[cache_key]

        text = text.lower().strip()

        # Enhanced preprocessing
        words = [re.sub(r'[^\w\s]', '', word) for word in text.split()]
        words = [word for word in words if len(word) > 2 and word not in ['the', 'and', 'for', 'with', 'this', 'that']]

        # Create 512-dimensional vector for better discrimination
        vector = np.zeros(512)

        if not words:
            # Return uniform vector for empty text
            vector = np.ones(512) / np.sqrt(512)
            self._vector_cache[cache_key] = vector
            return vector

        # Enhanced feature extraction with better discrimination
        for i, word in enumerate(words[:100]):  # Limit to 100 words
            # Create multiple hash positions for each word
            word_bytes = word.encode()
            md5_hash = hashlib.md5(word_bytes).hexdigest()

            # Use different parts of hash for different positions
            for j in range(4):
                # Take 2 hex chars (1 byte) for position
                pos_hash = int(md5_hash[j*2:(j+1)*2], 16)
                pos = (pos_hash + j * 128) % 512

                # Weight decreases with word position
                position_weight = 1.0 - (i / len(words))
                # Base weight with some randomness
                base_weight = 0.2 + (hash(word) % 100) / 500

                vector[pos] += base_weight * position_weight

        # Add n-gram features for better semantic capture
        for i in range(len(words) - 1):
            bigram = f"{words[i]}_{words[i+1]}"
            bigram_hash = hash(bigram) % 512
            vector[bigram_hash] += 0.15

        # Normalize to unit length
        norm = np.linalg.norm(vector)
        if norm > 0:
            vector = vector / norm
        else:
            vector = np.ones(512) / np.sqrt(512)

        # Add small amount of noise to ensure uniqueness
        noise = np.random.normal(0, 0.01, 512)
        vector = vector + noise

        # Renormalize
        norm = np.linalg.norm(vector)
        if norm > 0:
            vector = vector / norm

        self._vector_cache[cache_key] = vector
        return vector

    def calculate_sroi(self, task_id: str, generated_output: str,
                      task_description: str = "") -> AlignmentScore:
        """SROI: score how well *generated_output* aligns with the task's expert intent.

        Steps, in order:
          1. Fetch the expert vector for *task_id*, creating one on the fly
             (role "Unknown") when none was captured.
          2. Encode the output and take the cosine similarity to the expert
             vector, rescale it to 0-1, apply a power curve and clamp it to
             [0.1, 0.95].
          3. Derive a confidence value in [0.3, 0.95] from vector spread,
             output length, vocabulary richness and markdown-like structure.
          4. Obtain the adaptive threshold and decide pass/fail.
          5. Record the score in the role history and the alignment history.

        Args:
            task_id: Task whose expert vector is the reference.
            generated_output: Text produced for the task.
            task_description: Description used for the threshold's keyword
                check and for the fallback expert vector.

        Returns:
            The AlignmentScore that was appended to ``alignment_history``.
            In calibration mode ``passed`` is always True and a sample is
            added to ``calibration_data``.
        """

        if task_id not in self.expert_vectors:
            # No intent was captured for this task: build one now.
            # NOTE(review): the fallback uses the output itself as its only
            # example, so the output is partly compared against itself.
            self.capture_expert_intent(
                task_id=task_id,
                role="Unknown",
                description=task_description or f"Task {task_id}",
                examples=[generated_output[:500]]
            )

        expert_vector = self.expert_vectors[task_id]
        output_vector = self._text_to_vector_enhanced(generated_output)

        # Calculate cosine similarity (normalized dot product)
        expert_norm = np.linalg.norm(expert_vector.vector)
        output_norm = np.linalg.norm(output_vector)

        if expert_norm == 0 or output_norm == 0:
            raw_similarity = 0.0
            similarity = 0.0
        else:
            # Cosine similarity range: -1 to 1
            raw_similarity = np.dot(expert_vector.vector, output_vector) / (expert_norm * output_norm)

            # Linear rescale to the 0-1 range:
            # cosine similarity of 0 maps to 0.5, 1 maps to 1.0, -1 maps to 0.0
            similarity = (raw_similarity + 1) / 2

        # Power curve with exponent 1/1.5 (about 0.667), not a cubic mapping.
        # For inputs in [0, 1] it raises mid-range values (0.5 -> about 0.63).
        similarity = similarity ** (1/1.5)

        # Add small variability for calibration mode
        if self.calibration_mode:
            # Gaussian noise with mean 0 and standard deviation 0.03
            noise = np.random.normal(0, 0.03)
            similarity += noise

        # Clamp so the score never reaches the extremes
        similarity = max(0.1, min(0.95, similarity))

        # Calculate confidence based on multiple factors
        # 1. Vector consistency (low std dev = high confidence)
        vector_std = np.std(output_vector)
        consistency_score = 1.0 - min(1.0, vector_std / 0.3)

        # 2. Content length factor (saturates at 1500 characters)
        content_length = len(generated_output.strip())
        length_score = min(1.0, content_length / 1500)

        # 3. Vocabulary richness (fixed 0.3 for outputs of 20 words or fewer)
        words = generated_output.lower().split()
        if len(words) > 20:
            unique_words = len(set(words))
            vocab_score = min(1.0, unique_words / max(50, len(words) * 0.7))
        else:
            vocab_score = 0.3

        # 4. Structure score (presence of heading or list markers anywhere in the text)
        has_structure = any(marker in generated_output for marker in ['# ', '## ', '### ', '- ', '* ', '1. ', '2. '])
        structure_score = 0.7 if has_structure else 0.3

        # Combined confidence (weights sum to 1.0)
        confidence = (
            consistency_score * 0.3 +
            length_score * 0.25 +
            vocab_score * 0.25 +
            structure_score * 0.2
        )
        confidence = max(0.3, min(0.95, confidence))

        # Get adaptive threshold. This runs before the current score is added
        # to the role history below, so the score does not influence its own threshold.
        threshold = self._get_required_threshold(
            expert_vector.role,
            task_description,
            similarity,
            expert_vector.complexity_score
        )

        # In calibration mode, collect data without failing
        if self.calibration_mode:
            passed = True
            self.calibration_data.append({
                "task_id": task_id,
                "role": expert_vector.role,
                "score": similarity,
                "raw_similarity": float(raw_similarity),
                "description": task_description,
                "complexity": expert_vector.complexity_score,
                "confidence": confidence,
                "content_length": content_length
            })
        else:
            passed = similarity >= threshold

        # Calculate vector distance (for debugging)
        distance = np.linalg.norm(expert_vector.vector - output_vector)

        # Update role performance history
        self.role_performance_history.setdefault(expert_vector.role, []).append(similarity)

        # Generate detailed explanation
        explanation = self._generate_alignment_explanation(similarity, threshold, confidence, raw_similarity)

        score = AlignmentScore(
            task_id=task_id,
            score=similarity,
            threshold_required=threshold,
            passed=passed,
            vector_distance=distance,
            explanation=explanation,
            confidence=confidence,
            raw_similarity=raw_similarity
        )

        self.alignment_history.append(score)
        return score

    def _generate_alignment_explanation(self, similarity: float, threshold: float,
                                       confidence: float, raw_similarity: float) -> str:
        """Build a one-sentence, human-readable verdict for an alignment score.

        The first clause grades the raw cosine similarity; the second
        compares the scaled similarity with the threshold, qualified by
        confidence when the threshold is met.
        """
        # Label for the raw cosine similarity: first floor it strictly exceeds
        similarity_tiers = (
            (0.8, "Very high semantic similarity"),
            (0.6, "High semantic similarity"),
            (0.4, "Moderate semantic similarity"),
            (0.2, "Low semantic similarity"),
            (0, "Very low semantic similarity"),
        )
        sim_type = next(
            (label for floor, label in similarity_tiers if raw_similarity > floor),
            "Negative semantic similarity",
        )

        # Comfortably above the requirement
        if similarity >= threshold * 1.3:
            if confidence > 0.8:
                surplus = ((similarity/threshold)-1)*100
                return f"{sim_type}. Excellent alignment exceeding requirements by {surplus:.0f}%"
            return f"{sim_type}. Good alignment but confidence is moderate ({confidence:.1%})"

        # Meets the requirement
        if similarity >= threshold:
            if confidence > 0.7:
                return f"{sim_type}. Adequate alignment meeting requirements"
            return f"{sim_type}. Borderline alignment - meets threshold but confidence is low ({confidence:.1%})"

        # Below the requirement: grade by how far short it falls
        gap = threshold - similarity
        if similarity >= threshold * 0.8:
            grade = "Moderate"
        elif similarity >= threshold * 0.6:
            grade = "Poor"
        else:
            grade = "Very poor"
        return f"{sim_type}. {grade} alignment, {gap:.3f} below threshold"

    def _get_required_threshold(self, role: str, description: str,
                               current_score: float = None, complexity: float = 0.5) -> float:
        """Compute the adaptive pass threshold for a task.

        Starts from a per-role baseline (exact role-name match, otherwise
        "Default"), scales it up with complexity, raises it for
        security-sensitive descriptions, then blends in historical and
        calibration scores when they exist.

        Args:
            role: Agent role; selects the baseline and the history to use.
            description: Task description scanned for security keywords.
            current_score: Calibration data is consulted only when this is
                not None; the value itself is not used.
            complexity: Task complexity in [0, 1].

        Returns:
            The threshold, clamped to [0.25, 0.9] and rounded to 3 decimals.
        """
        # Base thresholds calibrated from analysis
        role_baselines = {
            "Research Analyst": 0.532,
            "Technical Writer": 0.350,
            "Technical Reviewer": 0.656,
            "Technical Researcher": 0.45,
            "Writer": 0.50,
            "Editor": 0.55,
            "Reviewer": 0.55,
            "Default": 0.50
        }
        required = role_baselines.get(role, role_baselines["Default"])

        # Complex tasks demand more: up to +40% at complexity 1.0
        required *= 1.0 + (complexity * 0.4)

        # Security/critical content: +25%, capped at 0.85
        lowered = description.lower()
        sensitive_terms = ("security", "vulnerability", "risk", "attack", "breach",
                           "hack", "critical", "sensitive", "zero-trust", "authentication")
        if any(term in lowered for term in sensitive_terms):
            required = min(required * 1.25, 0.85)

        # Blend with the role's track record once there are at least two scores
        past_scores = self.role_performance_history.get(role) or []
        if len(past_scores) >= 2:
            # More data = more trust in historical performance (at most 60%)
            trust = min(0.6, len(past_scores) / 8)
            # Aim slightly below the historical average
            anchor = np.mean(past_scores) - (np.std(past_scores) * 0.3)
            required = (required * (1 - trust)) + (anchor * trust)

        # Blend with calibration samples of the same role and similar complexity
        if self.calibration_data and current_score is not None:
            peer_scores = [
                sample["score"] for sample in self.calibration_data
                if sample["role"] == role
                and abs(sample.get("complexity", 0.5) - complexity) < 0.3
            ]
            if peer_scores:
                peer_mean = np.mean(peer_scores)
                peer_spread = np.std(peer_scores) if len(peer_scores) > 1 else 0.1
                weight = min(0.5, len(peer_scores) / 15)
                calibrated = max(0.3, peer_mean - (peer_spread * 0.4))
                required = (required * (1 - weight)) + (calibrated * weight)

        # Ensure reasonable bounds
        required = max(0.25, min(0.9, required))

        return round(required, 3)

    def analyze_calibration_data(self) -> Dict[str, float]:
        """Print per-role calibration statistics and recommend thresholds.

        For each role the recommendation is the mean score minus 0.8 standard
        deviations (floored at 0.3), scaled up for higher average complexity
        and for lower average confidence, then capped at 0.85.

        Returns:
            Mapping of role to recommended threshold, in first-seen role
            order; empty when no calibration data has been collected.
        """
        if not self.calibration_data:
            print("‚ö†Ô∏è No calibration data available")
            return {}

        # Bucket the samples by role, preserving first-seen order
        by_role: Dict[str, List[Dict]] = {}
        for sample in self.calibration_data:
            by_role.setdefault(sample["role"], []).append(sample)

        print("\nüîß H2E Calibration Analysis:")
        print("=" * 50)

        recommendations = {}
        for role, samples in by_role.items():
            scores = [sample["score"] for sample in samples]
            sample_count = len(scores)

            avg_score = np.mean(scores)
            std_score = np.std(scores) if sample_count > 1 else 0
            avg_raw_sim = np.mean([sample.get("raw_similarity", 0) for sample in samples])
            avg_complexity = np.mean([sample.get("complexity", 0.5) for sample in samples])
            avg_confidence = np.mean([sample.get("confidence", 0.5) for sample in samples])

            # Start conservatively below the mean, never under 0.3
            recommended = max(0.3, avg_score - (std_score * 0.8))
            # Harder roles get a higher bar
            recommended = recommended * (1.0 + (avg_complexity * 0.3))
            # Lower confidence = higher threshold
            recommended = recommended * (1.0 + ((1 - avg_confidence) * 0.2))

            recommendations[role] = min(0.85, recommended)

            print(f"\n  {role}:")
            print(f"    Samples: {sample_count}")
            print(f"    Average Score: {avg_score:.3f}")
            print(f"    Raw Cosine Sim: {avg_raw_sim:.3f}")
            print(f"    Std Dev: {std_score:.3f}")
            print(f"    Avg Complexity: {avg_complexity:.3f}")
            print(f"    Avg Confidence: {avg_confidence:.3f}")
            print(f"    Recommended Threshold: {recommendations[role]:.3f}")

            # The distribution is only shown with three or more samples
            if sample_count >= 3:
                p10, p25, p50, p75, p90 = np.percentile(scores, [10, 25, 50, 75, 90])
                print(f"    Score Distribution:")
                print(f"      10%: {p10:.3f}, 25%: {p25:.3f}, 50%: {p50:.3f}")
                print(f"      75%: {p75:.3f}, 90%: {p90:.3f}")

        return recommendations

    def generate_accountability_report(self) -> Dict[str, Any]:
        """Summarise every recorded alignment score as a JSON-ready dict.

        Averages cover all attempts, retries included. The pass rate and the
        task breakdown use only the most recent score of each task.

        Returns:
            ``{"status": ...}`` when nothing has been scored yet, otherwise a
            dict with the single key ``"h2e_framework_report"``.
        """
        history = self.alignment_history
        if not history:
            return {"status": "No alignment data available"}

        all_scores = [entry.score for entry in history]
        all_raw = [entry.raw_similarity for entry in history]
        mean_score = np.mean(all_scores) if all_scores else 0
        mean_raw = np.mean(all_raw) if all_raw else 0
        score_spread = np.std(all_scores) if len(all_scores) > 1 else 0

        # Walk backwards so the first entry seen per task is its latest attempt
        latest_by_task = {}
        for entry in reversed(history):
            latest_by_task.setdefault(entry.task_id, entry)

        pass_rate = np.mean([1 if entry.passed else 0 for entry in latest_by_task.values()])

        # Per-role statistics over all attempts of tasks with a known expert vector
        role_stats = {}
        for role in set(vec.role for vec in self.expert_vectors.values()):
            matching = [
                entry.score for entry in history
                if entry.task_id in self.expert_vectors
                and self.expert_vectors[entry.task_id].role == role
            ]
            if not matching:
                continue
            role_stats[role] = {
                "avg_score": float(np.mean(matching)),
                "std_score": float(np.std(matching)) if len(matching) > 1 else 0,
                "count": len(matching)
            }

        # One row per task, based on its latest attempt
        breakdown = []
        for task_id, final in latest_by_task.items():
            intent = self.expert_vectors.get(task_id)
            breakdown.append({
                "task_id": task_id,
                "role": intent.role if intent else "Unknown",
                "complexity": float(intent.complexity_score) if intent else 0.5,
                "sroi_score": float(round(final.score, 3)),
                "raw_similarity": float(round(final.raw_similarity, 3)),
                "required_threshold": float(final.threshold_required),
                "passed": bool(final.passed),
                "confidence": float(final.confidence),
                "explanation": final.explanation
            })

        summary = {
            "timestamp": datetime.now().isoformat(),
            "calibration_mode": self.calibration_mode,
            "total_tasks_evaluated": len(history),
            "unique_tasks": len(latest_by_task),
            "average_sroi_score": float(round(mean_score, 4)),
            "average_raw_similarity": float(round(mean_raw, 4)),
            "sroi_std_dev": float(round(score_spread, 4)),
            "alignment_pass_rate": f"{pass_rate*100:.1f}%",
            "expert_vectors_captured": len(self.expert_vectors),
            "recommended_threshold": float(round(max(0.35, mean_score - score_spread/2), 3)),
            "role_statistics": role_stats,
            "task_breakdown": breakdown
        }
        return {"h2e_framework_report": summary}

    def save_report(self, filename: str = "h2e_accountability_report.json"):
        """Save H2E report to file"""
        report = self.generate_accountability_report()

        class H2EJSONEncoder(json.JSONEncoder):
            def default(self, obj):
                if isinstance(obj, np.ndarray):
                    return obj.tolist()
                if isinstance(obj, datetime):
                    return obj.isoformat()
                if isinstance(obj, (ExpertIntentVector, AlignmentScore)):
                    return obj.to_dict()
                return super().default(obj)

        os.makedirs(os.path.dirname(filename) if os.path.dirname(filename) else ".", exist_ok=True)

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2, cls=H2EJSONEncoder)

        print(f"‚úì H2E report saved to {filename}")
        return report

class Orchestrator:
    def __init__(self, calibration_mode: bool = False, max_workers: int = 3):
        """Set up empty orchestration state, the H2E engine and the role templates.

        Args:
            calibration_mode: Passed to the H2E engine and also stored on the
                orchestrator.
            max_workers: Stored as ``self.max_workers``.
        """
        self.task_store = {}  # task id -> task dict produced by planning
        self.results = {}  # task id -> output; read when building dependency context
        self.start_time = None
        self.total_tasks = 0
        self.token_usage = {'input': 0, 'output': 0}  # running token totals from model calls
        self.completion_history = []
        self.max_workers = max_workers

        # H2E Accountability Engine
        self.h2e = H2EAccountabilityEngine(calibration_mode=calibration_mode)
        self.calibration_mode = calibration_mode

        # Enhanced expert templates based on analysis.
        # Key order matters: lookups take the FIRST key that is a
        # case-insensitive substring of the agent role, so specific names
        # ("Technical Writer") must stay ahead of generic ones ("Writer").
        self.expert_templates = {
            "Research Analyst": {
                "description": "Comprehensive research with structured findings",
                "gold_standard_examples": [
                    "Research Summary: This analysis examines [topic] using [methods]. Key findings include: 1) [finding] 2) [finding] 3) [finding]",
                    "Methodology: Data was collected from [sources] and analyzed using [techniques] with attention to [specific considerations]",
                    "Conclusion: Based on the research, we recommend [recommendations] with justification [reasoning]"
                ],
                "priority": 2  # Medium priority
            },
            "Technical Writer": {
                "description": "Clear technical documentation with examples",
                "gold_standard_examples": [
                    "Introduction: Overview of [topic] and its importance in [context]",
                    "Implementation: Step-by-step instructions with code examples:\n```python\n# Comprehensive example\nimport module\nresult = module.function(param)\nprint(f'Result: {result}')\n```",
                    "Best Practices: Key recommendations including [do's] and [don'ts] for successful implementation",
                    "Troubleshooting: Common issues and their solutions"
                ],
                "priority": 1  # Lower priority (can run in parallel)
            },
            "Technical Reviewer": {
                "description": "Thorough technical review with specific feedback",
                "gold_standard_examples": [
                    "Overall Assessment: The document is [assessment] with strengths in [areas] and areas for improvement in [areas]",
                    "Technical Accuracy: Verified [claims] and validated [examples]. Found [issues] that need addressing",
                    "Suggestions: Recommend [specific improvements] for better clarity, accuracy, and completeness",
                    "Critical Issues: [Number] critical issues identified requiring immediate attention"
                ],
                "priority": 3  # High priority (often depends on other tasks)
            },
            "Technical Researcher": {
                "description": "In-depth technical research and analysis",
                "gold_standard_examples": [
                    "Research Summary: This analysis examines [topic] using [methods]. Key findings include: 1) [finding] 2) [finding] 3) [finding]",
                    "Methodology: Data was collected from [sources] and analyzed using [techniques] with validation via [validation method]",
                    "Technical Deep Dive: Detailed analysis of [specific aspect] showing [insights]",
                    "Conclusion: Based on the research, we recommend [recommendations] with supporting evidence [evidence]"
                ],
                "priority": 2
            },
            "Writer": {
                "description": "Professional writing with clear structure",
                "gold_standard_examples": [
                    "Introduction: Engaging overview of [topic] establishing context and importance",
                    "Main Content: Well-structured sections with clear transitions:\n## Section 1\nContent with examples\n## Section 2\nMore detailed exploration",
                    "Conclusion: Summary of key points and final thoughts or call to action"
                ],
                "priority": 1
            },
            "Editor": {
                "description": "Editorial review focusing on clarity and consistency",
                "gold_standard_examples": [
                    "Overall Assessment: The document achieves [purpose] with [strengths] and needs improvement in [areas]",
                    "Clarity Issues: Identified [number] unclear passages suggesting [improvements]",
                    "Consistency Check: Found inconsistencies in [aspects] recommending [standardization]",
                    "Grammar and Style: [Number] issues corrected, overall style is [assessment]"
                ],
                "priority": 2
            }
        }

        self.task_lock = threading.Lock()  # held while planning populates task_store
        self.execution_order = []

    def _get_task_priority(self, task_id: str) -> float:
        """Return the scheduling priority of a task, negated for a min-first queue.

        The magnitude is the product of three factors: the priority of the
        first expert template whose name occurs (case-insensitively) in the
        task's role (1.0 when none matches), 1 + 0.2 per dependency, and
        1 + 0.3 x complexity (complexity 0.5 when the H2E engine holds no
        vector for the task).
        """
        task = self.task_store[task_id]
        role = task['agent_role']

        # First matching template decides the base priority
        template_priority = next(
            (
                template.get("priority", 1.0)
                for name, template in self.expert_templates.items()
                if name.lower() in role.lower()
            ),
            1.0,
        )

        # Every dependency adds 20%
        dependency_factor = 1.0 + (len(task['dependencies']) * 0.2)

        # Complexity as captured by the H2E engine, medium when unknown
        intent = self.h2e.expert_vectors.get(task_id)
        complexity = 0.5 if intent is None else intent.complexity_score
        complexity_factor = 1.0 + complexity * 0.3

        final_priority = template_priority * dependency_factor * complexity_factor

        # Invert for PriorityQueue (lower number = higher priority)
        return -final_priority

    def get_plan(self, goal: str) -> Dict:
        """Phase 1: ask the model for a task DAG and register every task.

        Sends *goal* to the model with a system prompt that requests JSON
        only, adds the reported token usage to ``self.token_usage``, and
        parses the reply. Each task gets a PENDING status and a numeric
        complexity, is stored in ``self.task_store``, and has its expert
        intent captured by the H2E engine. The gold-standard examples come
        from the first template whose name is a case-insensitive substring of
        the task's role, or a generic one-line example otherwise. Finally the
        dependency graph is validated.

        Args:
            goal: Natural-language description of what to plan.

        Returns:
            The parsed plan dict; its task dicts are the same objects stored
            in ``self.task_store``.

        Raises:
            ValueError: When the reply lacks a ``tasks`` key or a task lacks
                a required field. DAG validation may also raise it.
            Exception: Any other failure (API, JSON parsing) is printed and
                re-raised unchanged.
        """
        print(f"üß† Brain: Planning via {MODEL_NAME}...")

        try:
            message = client.messages.create(
                model=MODEL_NAME,
                max_tokens=4000,
                system="""You are an AI Architect. Plan the project into a logical Directed Acyclic Graph (DAG).

                Return ONLY valid JSON with this exact structure:
                {
                    "tasks": [
                        {
                            "id": "task1",
                            "description": "Detailed task description",
                            "dependencies": [],
                            "agent_role": "Role name",
                            "estimated_complexity": "low|medium|high"
                        }
                    ]
                }

                Rules:
                1. Use simple task IDs like task1, task2, task3
                2. Make dependencies clear and logical (array of task IDs)
                3. Assign agent roles that match the task (Research Analyst, Technical Writer, Technical Reviewer, etc.)
                4. Ensure it's a valid DAG (no circular dependencies)
                5. Include 3-8 tasks total depending on goal complexity
                6. Add estimated_complexity based on task scope
                7. Create a logical workflow where later tasks depend on earlier ones""",
                messages=[{"role": "user", "content": f"{goal}\n\nReturn ONLY the JSON, no other text."}]
            )

            # Accumulate token usage when the response reports it
            if hasattr(message, 'usage'):
                self.token_usage['input'] += message.usage.input_tokens
                self.token_usage['output'] += message.usage.output_tokens

            text = message.content[0].text

            # Extract JSON: the greedy pattern spans from the first "{" to the
            # last "}", which drops any text wrapped around the JSON object
            json_match = re.search(r'\{[\s\S]*\}', text)
            if json_match:
                json_str = json_match.group()
                plan = json.loads(json_str)
            else:
                plan = json.loads(text)

            if 'tasks' not in plan:
                raise ValueError("Response missing 'tasks' key")

            with self.task_lock:
                # NOTE(review): tasks are stored one by one while validating, so
                # a malformed later task raises after earlier ones were registered.
                for task in plan['tasks']:
                    if not all(key in task for key in ['id', 'description', 'dependencies', 'agent_role']):
                        raise ValueError(f"Task {task.get('id', 'unknown')} missing required fields")

                    task['status'] = TaskStatus.PENDING.value
                    task['estimated_complexity'] = task.get('estimated_complexity', 'medium')

                    # Calculate numeric complexity (unrecognised labels fall back to 0.5)
                    complexity_map = {'low': 0.3, 'medium': 0.5, 'high': 0.8}
                    task['complexity_score'] = complexity_map.get(task['estimated_complexity'], 0.5)

                    self.task_store[task['id']] = task

                    role = task['agent_role']

                    # Find matching template: first key contained in the role name
                    template_role = None
                    for template_key in self.expert_templates.keys():
                        if template_key.lower() in role.lower():
                            template_role = template_key
                            break

                    if template_role and template_role in self.expert_templates:
                        template = self.expert_templates[template_role]
                        self.h2e.capture_expert_intent(
                            task_id=task['id'],
                            role=role,
                            description=task['description'],
                            examples=template['gold_standard_examples']
                        )
                    else:
                        # No template matched: use a generic single example
                        self.h2e.capture_expert_intent(
                            task_id=task['id'],
                            role=role,
                            description=task['description'],
                            examples=[f"Professional output for {role} focusing on {task['description'][:50]}..."]
                        )

            self.total_tasks = len(self.task_store)
            print(f"‚úì Planned {self.total_tasks} tasks")

            self.validate_dag()

            return plan

        except Exception as e:
            print(f"‚ùå Planning Error: {e}")
            # Show the start of the raw reply when the API call itself succeeded
            if 'message' in locals():
                print("Content received:", message.content[0].text[:500])
            raise

    def validate_dag(self):
        """Check that the planned task graph can be resolved without cycles.

        Walks ``self.task_store`` depth-first along each task's
        ``dependencies`` list and prints a confirmation line on success.

        Raises:
            ValueError: If a task lists a dependency that is not a key of
                ``self.task_store``, or if a dependency cycle is reachable.
        """
        IN_PROGRESS, FINISHED = "in_progress", "finished"
        # task id -> IN_PROGRESS while on the active DFS path, FINISHED afterwards
        marks = {}

        def reaches_cycle(node):
            """Return True when a cycle is reachable from ``node``."""
            marks[node] = IN_PROGRESS
            for parent in self.task_store[node]['dependencies']:
                if parent not in self.task_store:
                    raise ValueError(f"Dependency {parent} not found in tasks")
                state = marks.get(parent)
                if state is None:
                    if reaches_cycle(parent):
                        return True
                elif state == IN_PROGRESS:
                    # Back edge: the dependency is still on the active path.
                    return True
            marks[node] = FINISHED
            return False

        for root in self.task_store:
            if root in marks:
                continue
            if reaches_cycle(root):
                raise ValueError(f"Circular dependency detected involving task {root}")

        print("‚úì DAG validation passed (no circular dependencies)")

    def execute_task(self, task_id: str, retries: int = 2):
        """Phase 2: Task execution with enhanced H2E accountability"""
        task = self.task_store[task_id]
        print(f"üöÄ Dispatching [{task['agent_role']}]: {task_id}")

        # Build context from dependencies
        context_parts = []
        for dep in task['dependencies']:
            if dep in self.results:
                dep_content = self.results[dep]
                if len(dep_content) > 1500:
                    summary = self._summarize_content(dep_content[:2000])
                    context_parts.append(f"=== Summary from {dep} ===\n{summary}")
                else:
                    context_parts.append(f"=== Result from {dep} ===\n{dep_content}")

        context_text = "\n\n".join(context_parts) if context_parts else "No dependencies"

        # Adjust tokens based on complexity
        complexity = task.get('complexity_score', 0.5)
        base_tokens = 3000
        if complexity > 0.7:
            max_tokens = 4000  # More tokens for complex tasks
        elif complexity > 0.4:
            max_tokens = 3500
        else:
            max_tokens = 3000

        for attempt in range(retries):
            try:
                # Enhanced prompt with better guidance
                system_prompt = f"""You are a {task['agent_role']}. Your task is: {task['description']}

H2E ACCOUNTABILITY REQUIREMENTS:
1. Provide a COMPLETE, self-contained response that addresses all aspects of the task
2. Use clear hierarchical structure with appropriate headings/sections
3. Include specific examples, evidence, or data where relevant
4. End with a proper conclusion or summary that synthesizes key points
5. Maintain professional tone and ensure technical accuracy
6. If applicable, include actionable recommendations or next steps
7. Consider the context from dependencies but provide original analysis

OUTPUT FORMAT GUIDELINES:
- Start with a clear title or heading for the entire response
- Use markdown formatting (## for sections, ### for subsections)
- Include bullet points or numbered lists for clarity
- Add code examples in appropriate markdown format if relevant
- Use tables for comparative information if helpful
- Ensure the response is comprehensive yet focused"""

                user_prompt = f"""TASK DESCRIPTION:
{task['description']}

TASK COMPLEXITY: {task.get('estimated_complexity', 'medium').upper()}

CONTEXT FROM DEPENDENCIES:
{context_text}

INSTRUCTIONS:
1. Provide a complete, well-structured response addressing the task description
2. Use appropriate formatting (headings, lists, code blocks, etc.)
3. Include specific details, examples, and evidence
4. Ensure technical accuracy and completeness
5. Conclude with a summary, key takeaways, or next steps
6. Aim for depth and insight appropriate to the task complexity

Your professional response:"""

                response = client.messages.create(
                    model=MODEL_NAME,
                    max_tokens=max_tokens,
                    temperature=0.7,
                    system=system_prompt,
                    messages=[{
                        "role": "user",
                        "content": user_prompt
                    }]
                )

                if hasattr(response, 'usage'):
                    self.token_usage['input'] += response.usage.input_tokens
                    self.token_usage['output'] += response.usage.output_tokens

                result = response.content[0].text

                # Enhanced quality checks before H2E scoring
                quality_issues = self._pre_h2e_quality_check(task_id, result)
                if quality_issues and attempt < retries - 1:
                    print(f"‚ö†Ô∏è Pre-H2E quality issues for {task_id}: {', '.join(quality_issues)}, retrying...")
                    continue

                # H2E: Calculate alignment score
                alignment_score = self.h2e.calculate_sroi(
                    task_id=task_id,
                    generated_output=result,
                    task_description=task['description']
                )

                print(f"   H2E/SROI: Alignment score: {alignment_score.score:.3f} (Raw: {alignment_score.raw_similarity:.3f})")
                print(f"   Required: {alignment_score.threshold_required}, Confidence: {alignment_score.confidence:.2f}")
                print(f"   Explanation: {alignment_score.explanation}")

                # Post-H2E quality check
                quality_ok = self._post_h2e_quality_check(task_id, result, alignment_score)

                # In calibration mode, don't retry for alignment failures
                if not self.calibration_mode and not alignment_score.passed and attempt < retries - 1:
                    print(f"‚ö†Ô∏è H2E alignment failed for {task_id} (score: {alignment_score.score:.3f}), retrying...")
                    continue

                if not quality_ok and attempt < retries - 1:
                    print(f"‚ö†Ô∏è Post-H2E quality check failed for {task_id}, retrying...")
                    continue

                with self.task_lock:
                    self.results[task_id] = result
                    self.task_store[task_id]['status'] = TaskStatus.COMPLETED.value
                    self.task_store[task_id]['h2e_score'] = alignment_score.score
                    self.task_store[task_id]['h2e_passed'] = alignment_score.passed
                    self.task_store[task_id]['h2e_raw_similarity'] = alignment_score.raw_similarity
                    self.task_store[task_id]['h2e_confidence'] = alignment_score.confidence
                    self.task_store[task_id]['completion_time'] = time.time()

                self.completion_history.append({
                    'task_id': task_id,
                    'role': task['agent_role'],
                    'timestamp': time.time(),
                    'quality_check': quality_ok,
                    'h2e_score': alignment_score.score,
                    'h2e_raw_similarity': alignment_score.raw_similarity,
                    'h2e_passed': alignment_score.passed,
                    'h2e_confidence': alignment_score.confidence,
                    'attempt': attempt + 1,
                    'complexity': task.get('complexity_score', 0.5)
                })

                status_icon = "üìä" if self.calibration_mode else "‚úÖ"
                print(f"{status_icon} Completed: {task_id} (H2E: {alignment_score.score:.3f}, Raw: {alignment_score.raw_similarity:.3f}, Attempt: {attempt + 1})")
                return

            except anthropic.APIError as e:
                if attempt < retries - 1:
                    wait_time = 2 ** attempt
                    print(f"‚ö†Ô∏è API Error for {task_id} (attempt {attempt + 1}/{retries}): {e}. Retrying in {wait_time}s...")
                    time.sleep(wait_time)
                else:
                    print(f"‚ùå Task {task_id} permanently failed after {retries} attempts: {e}")
                    with self.task_lock:
                        self.task_store[task_id]['status'] = TaskStatus.FAILED.value
                        self.task_store[task_id]['error'] = str(e)
                        self.results[task_id] = f"[Task failed: {e}]"
            except Exception as e:
                print(f"‚ùå Unexpected error for {task_id}: {e}")
                with self.task_lock:
                    self.task_store[task_id]['status'] = TaskStatus.FAILED.value
                    self.task_store[task_id]['error'] = str(e)
                    self.results[task_id] = f"[Task failed: {e}]"
                break

    def _summarize_content(self, content: str, max_length: int = 500) -> str:
        """Shorten ``content`` for use as dependency context.

        Content no longer than ``max_length`` is returned unchanged. Longer
        content is reduced to its first and last ``max_length // 2``
        characters joined by a truncation marker; the marker itself is not
        counted against ``max_length``.

        Args:
            content: Text to shorten.
            max_length: Number of characters of ``content`` to keep at most.
                Negative values are treated as 0.

        Returns:
            The original text, or the head/marker/tail excerpt.
        """
        if len(content) <= max_length:
            return content

        # Simple summarization: take beginning and end
        half = max(max_length, 0) // 2
        start = content[:half]
        # content[-0:] is the WHOLE string, so the tail must be empty
        # explicitly when there is no room for it.
        end = content[-half:] if half else ""
        return f"{start}\n\n[...content truncated...]\n\n{end}"

    def _pre_h2e_quality_check(self, task_id: str, content: str) -> List[str]:
        """Run cheap sanity checks on a draft before it is H2E-scored.

        Args:
            task_id: Key of the task in ``self.task_store``.
            content: Generated text to inspect.

        Returns:
            Human-readable descriptions of the problems found; empty when
            none of the checks trips.

        Raises:
            KeyError: If ``task_id`` is not in ``self.task_store``.
        """
        # The task record itself is not used; the lookup is kept so that an
        # unknown task id still raises KeyError.
        _ = self.task_store[task_id]

        problems = []
        stripped_len = len(content.strip())

        # Length check
        if stripped_len < 200:
            problems.append(f"Too short ({stripped_len} chars)")

        # Extreme repetition: share of distinct words in longer texts
        tokens = content.lower().split()
        if len(tokens) > 100:
            diversity = len(set(tokens)) / len(tokens)
            if diversity < 0.3:
                problems.append(f"Low vocabulary diversity ({diversity:.2f})")

        # Longer texts need at least one heading or list marker
        if stripped_len > 500:
            heading_markers = ('# ', '## ', '### ')
            list_markers = ('- ', '* ', '‚Ä¢ ', '1. ', '2. ')
            structured = False
            for marker in heading_markers + list_markers:
                if marker in content:
                    structured = True
                    break
            if not structured:
                problems.append("Lacks structure (no headings/lists)")

        return problems

    def _post_h2e_quality_check(self, task_id: str, content: str, alignment_score: AlignmentScore) -> bool:
        """Decide whether a scored draft is acceptable.

        Applies a role-dependent minimum length, the H2E alignment verdict
        (ignored in calibration mode), a structure check, an ending check
        and a placeholder scan. All problems found are printed on one line.

        Args:
            task_id: Key of the task in ``self.task_store``.
            content: Generated text to inspect.
            alignment_score: Scoring result; ``passed``, ``score`` and
                ``threshold_required`` are read from it.

        Returns:
            True when no problem was found, False otherwise.
        """
        role = self.task_store[task_id]['agent_role']
        body = content.strip()
        body_len = len(body)
        problems = []

        # Minimum length by role keyword; the first matching rule wins.
        length_rules = (
            (('Writer', 'Editor'), 800),
            (('Researcher', 'Analyst'), 1000),
        )
        required_len = 500
        for keywords, floor in length_rules:
            if any(word in role for word in keywords):
                required_len = floor
                break

        if body_len < required_len:
            problems.append(f"Content too short ({body_len} < {required_len} chars)")

        # Alignment failures only count outside calibration mode.
        if not self.calibration_mode:
            if alignment_score and not alignment_score.passed:
                problems.append(f"H2E alignment failed (score: {alignment_score.score:.3f}, required: {alignment_score.threshold_required})")

        # Substantial content must contain at least one structural marker.
        if body_len > 400:
            structure_markers = ['#', '##', '###', '1.', '2.', '3.', '- ', '* ', '‚Ä¢ ']
            found_marker = False
            for marker in structure_markers:
                if marker in content:
                    found_marker = True
                    break
            if not found_marker:
                problems.append("Content lacks clear structure (no headings/lists)")

        # Completeness: either closing punctuation / a closed code fence, or
        # a conclusion keyword within the last 100 characters.
        if body and not body.endswith(('.', '!', '?', '```')):
            tail = body[-100:].lower()
            closing_words = ['conclusion', 'summary', 'finally', 'in summary', 'key takeaways', 'recommendations']
            if not any(word in tail for word in closing_words):
                problems.append("Content may lack proper conclusion")

        # Placeholder scan is case-insensitive via a single upper-cased copy.
        shouted = content.upper()
        for pattern in ['TODO:', 'FIXME:', 'INSERT', 'ADD HERE', 'TO BE DETERMINED']:
            if pattern in shouted:
                problems.append(f"Contains placeholder text ({pattern})")

        if not problems:
            return True

        print(f"‚ö†Ô∏è  Quality issues for {task_id}: {', '.join(problems)}")
        return False

    def _get_ready_tasks(self) -> List[Tuple[float, str]]:
        """Collect PENDING tasks whose dependencies have all COMPLETED.

        A dependency that is missing from ``self.task_store`` or has any
        status other than COMPLETED keeps the task out of the result.

        Returns:
            ``(priority, task_id)`` tuples in ascending tuple order.

        NOTE(review): the sort is ascending, so the smallest priority value
        comes first, while the previous comment here said "higher priority
        first". Confirm against ``_get_task_priority`` which one is meant.
        """
        runnable = []

        with self.task_lock:
            for tid, spec in self.task_store.items():
                if spec['status'] != TaskStatus.PENDING.value:
                    continue

                dep_records = (self.task_store.get(dep_id) for dep_id in spec['dependencies'])
                satisfied = all(
                    record and record['status'] == TaskStatus.COMPLETED.value
                    for record in dep_records
                )
                if not satisfied:
                    continue

                runnable.append((self._get_task_priority(tid), tid))

        return sorted(runnable)

    def show_progress_bar(self):
        """Redraw the single-line progress bar on the current terminal line.

        Shows the number of COMPLETED tasks out of ``self.total_tasks``, the
        mean ``h2e_score`` of completed tasks when any exist, an ETA once
        more than 10 seconds have elapsed and a task has finished, and the
        elapsed time. Does nothing when ``self.total_tasks`` is 0.
        """
        with self.task_lock:
            finished = sum(
                1 for record in self.task_store.values()
                if record['status'] == TaskStatus.COMPLETED.value
            )
            total = self.total_tasks

        if total == 0:
            return

        with self.task_lock:
            scores = []
            for record in self.task_store.values():
                if record['status'] == TaskStatus.COMPLETED.value and 'h2e_score' in record:
                    scores.append(record['h2e_score'])

        mean_score = np.mean(scores) if scores else 0

        width = 40
        filled = int(width * finished // total)
        bar = '‚ñà' * filled + '‚ñë' * (width - filled)
        percentage = (finished / total) * 100

        elapsed = time.time() - self.start_time if self.start_time else 0

        mode_tag = " [CAL]" if self.calibration_mode else ""
        score_tag = f" | H2E: {mean_score:.3f}" if scores else ""

        # ETA extrapolates the average time per finished task so far.
        eta_tag = ""
        if finished > 0 and elapsed > 10:
            per_task = elapsed / finished
            eta_seconds = (total - finished) * per_task
            eta_tag = (
                f" | ETA: {eta_seconds:.0f}s"
                if eta_seconds < 60
                else f" | ETA: {eta_seconds/60:.1f}m"
            )

        print(f"\rProgress{mode_tag}: |{bar}| {finished}/{total} ({percentage:.1f}%){score_tag}{eta_tag} | ‚è±Ô∏è {elapsed:.1f}s",
              end='', flush=True)

    def print_progress(self):
        """Redraw the progress bar and, now and then, list the running tasks.

        Does nothing when ``self.total_tasks`` is 0. The list of RUNNING
        tasks is printed only while the whole-second elapsed time is a
        multiple of 15.
        """
        with self.task_lock:
            total = self.total_tasks

        if total == 0:
            return

        # Sampled before drawing so the 15-second gate uses the same clock
        # reading regardless of how long the redraw takes.
        elapsed = time.time() - self.start_time if self.start_time else 0

        self.show_progress_bar()

        if int(elapsed) % 15 != 0:
            return

        print()

        active = []
        with self.task_lock:
            for tid, record in self.task_store.items():
                if record['status'] != TaskStatus.RUNNING.value:
                    continue
                suffix = f" (H2E: {record['h2e_score']:.3f})" if 'h2e_score' in record else ""
                active.append(f"{tid}{suffix}")

        if active:
            print(f"   Active: {', '.join(active)}")

    def estimate_cost(self, *, input_cost_per_million: float = 75.00,
                      output_cost_per_million: float = 375.00) -> float:
        """Estimate the API cost of the tokens recorded in ``self.token_usage``.

        Prints a short breakdown and returns the total.

        Args:
            input_cost_per_million: Price in dollars per million input tokens.
            output_cost_per_million: Price in dollars per million output tokens.

        Returns:
            Estimated total cost in dollars.

        NOTE(review): the defaults are the rates that were hard-coded here;
        they have not been checked against the published price list for the
        model in use. Confirm them, or pass the current rates explicitly.
        """
        input_cost = (self.token_usage['input'] / 1_000_000) * input_cost_per_million
        output_cost = (self.token_usage['output'] / 1_000_000) * output_cost_per_million
        total_cost = input_cost + output_cost

        print(f"\nüí∞ Cost Estimation:")
        print(f"   Input tokens: {self.token_usage['input']:,} ‚âà ${input_cost:.4f}")
        print(f"   Output tokens: {self.token_usage['output']:,} ‚âà ${output_cost:.4f}")
        print(f"   Total estimated: ${total_cost:.4f}")

        return total_cost

    def run_orchestration(self, timeout_seconds: int = 1200):
        """Phase 3: Enhanced execution loop with H2E accountability.

        Repeatedly submits ready tasks to a thread pool until every task is
        COMPLETED or FAILED, the timeout expires, or no further progress is
        possible.

        Two situations that previously kept the loop spinning until the
        timeout are now resolved immediately:

        * a worker future that finished while its task is still in a
          non-terminal status (for example because an exception escaped
          ``execute_task``) has its task marked FAILED;
        * when nothing is running and nothing is ready, the remaining tasks
          can never start (a dependency failed), so they are marked FAILED
          and the loop ends.

        Args:
            timeout_seconds: Wall-clock limit for the scheduling loop.

        Returns:
            ``self.results``, mapping task ids to their output text.
        """
        mode_label = "CALIBRATION" if self.calibration_mode else "ACCOUNTABLE"
        print(f"‚öôÔ∏è Orchestrator: Starting execution loop...")
        print(f"üîí H2E Accountability Framework: {mode_label}")
        print(f"üë• Max Workers: {self.max_workers}")
        self.start_time = time.time()

        terminal_states = (TaskStatus.COMPLETED.value, TaskStatus.FAILED.value)

        def fail_if_unfinished(task_id, reason):
            """Mark a non-terminal task FAILED; caller must hold task_lock."""
            record = self.task_store[task_id]
            if record['status'] in terminal_states:
                return False
            record['status'] = TaskStatus.FAILED.value
            record['error'] = reason
            self.results[task_id] = f"[Task failed: {reason}]"
            return True

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures: Dict[str, Future] = {}

            while True:
                # Timeout check
                if time.time() - self.start_time > timeout_seconds:
                    print(f"\n‚è∞ Timeout after {timeout_seconds} seconds")
                    break

                # Progress display
                self.print_progress()

                # Submit ready tasks
                for _priority, task_id in self._get_ready_tasks():
                    if task_id not in futures or futures[task_id].done():
                        # Flip to RUNNING before submitting so the task is
                        # not reported as ready again on the next pass.
                        with self.task_lock:
                            self.task_store[task_id]['status'] = TaskStatus.RUNNING.value

                        futures[task_id] = executor.submit(self.execute_task, task_id)

                        # Limit concurrent submissions
                        if sum(1 for f in futures.values() if not f.done()) >= self.max_workers:
                            break

                # Reap finished futures. A finished future whose task is not
                # in a terminal status means the worker ended without
                # recording an outcome; record the failure here.
                for task_id in [tid for tid, fut in futures.items() if fut.done()]:
                    finished = futures.pop(task_id)
                    error = finished.exception()
                    reason = str(error) if error is not None else "Task ended without reaching a terminal status"
                    with self.task_lock:
                        newly_failed = fail_if_unfinished(task_id, reason)
                    if newly_failed:
                        print(f"\nTask {task_id} ended without recording an outcome: {reason}")

                # Check completion
                with self.task_lock:
                    all_done = all(
                        record['status'] in terminal_states
                        for record in self.task_store.values()
                    )

                if all_done:
                    print("\n\nüéØ All tasks completed!")
                    break

                # Stall detection: no work in flight and nothing ready means
                # the remaining tasks are waiting on dependencies that will
                # never complete.
                if not futures and not self._get_ready_tasks():
                    blocked = []
                    with self.task_lock:
                        for task_id in self.task_store:
                            if fail_if_unfinished(task_id, "Blocked: a dependency failed or never completed"):
                                blocked.append(task_id)
                    print(f"\n\nNo runnable tasks remain; marked {len(blocked)} blocked task(s) as failed: {', '.join(blocked)}")
                    break

                # Small sleep to prevent CPU spinning
                time.sleep(0.2)

        print()
        self.show_progress_bar()
        print("\n" + "="*60)
        return self.results

# Script entry point: plan the goal, execute the task graph, then print and
# save the run summary, the H2E report and one markdown file per task.
if __name__ == "__main__":
    # Configuration
    CALIBRATION_MODE = True  # Set to True for first run to establish baselines
    MAX_WORKERS = 3  # Adjust based on your API rate limits

    # Initialize orchestrator
    orch = Orchestrator(calibration_mode=CALIBRATION_MODE, max_workers=MAX_WORKERS)

    # User goal - you can modify this
    user_goal = """Create a comprehensive guide on implementing zero-trust security architecture in Kubernetes:
    1. Research current best practices and frameworks
    2. Write detailed implementation guide with examples
    3. Review and refine the content for technical accuracy
    4. Create a summary for executive audience"""

    try:
        mode_str = "CALIBRATION" if CALIBRATION_MODE else "ACCOUNTABLE"
        print(f"üöÄ Starting DAG Orchestrator with H2E {mode_str} Mode")
        print(f"Goal: {user_goal}")
        print("-" * 60)

        # Phase 1: Planning
        plan = orch.get_plan(user_goal)

        print("\nüìã Task Plan:")
        for i, task in enumerate(plan['tasks'], 1):
            deps = ', '.join(task['dependencies']) if task['dependencies'] else 'None'
            complexity = task.get('estimated_complexity', 'medium')
            print(f"  {i}. {task['id']}: {task['description'][:100]}...")
            print(f"     Role: {task['agent_role']}, Complexity: {complexity.upper()}, Depends on: {deps}")

        print("\n" + "="*60)
        print("Starting execution...\n")

        # Phase 2 & 3: Execution
        results = orch.run_orchestration()

        print("\n" + "="*60 + "\nüèÅ BUILD COMPLETE\n" + "="*60)

        # Statistics
        successful = sum(1 for t in orch.task_store.values() if t['status'] == TaskStatus.COMPLETED.value)
        failed = sum(1 for t in orch.task_store.values() if t['status'] == TaskStatus.FAILED.value)

        h2e_scores = [t.get('h2e_score', 0) for t in orch.task_store.values()
                     if t.get('status') == TaskStatus.COMPLETED.value and 'h2e_score' in t]
        h2e_raw_sims = [t.get('h2e_raw_similarity', 0) for t in orch.task_store.values()
                       if t.get('status') == TaskStatus.COMPLETED.value and 'h2e_raw_similarity' in t]
        avg_h2e = np.mean(h2e_scores) if h2e_scores else 0
        avg_raw_sim = np.mean(h2e_raw_sims) if h2e_raw_sims else 0
        h2e_passed = sum(1 for t in orch.task_store.values()
                        if t.get('h2e_passed', False))

        print(f"\nüìà Summary:")
        print(f"   Tasks: {successful} successful, {failed} failed")

        if CALIBRATION_MODE:
            print(f"   H2E Calibration Scores: Avg {avg_h2e:.3f} (Raw cosine: {avg_raw_sim:.3f})")
            # Analyze calibration data
            recommendations = orch.h2e.analyze_calibration_data()
            print(f"\nüí° Recommendations for production thresholds:")
            for role, threshold in recommendations.items():
                print(f"   {role}: {threshold:.3f}")
        else:
            print(f"   H2E Alignment: {h2e_passed}/{successful} passed, Avg score: {avg_h2e:.3f}")

        # The report below is handed a path under output/, and the per-task
        # files are written there too, so create the directory before any of
        # them (it used to be created only after the report call).
        os.makedirs("output", exist_ok=True)

        # Generate and display H2E report
        print("\nüìä H2E Accountability Report:")
        h2e_report = orch.h2e.save_report("output/h2e_accountability_report.json")

        if 'h2e_framework_report' in h2e_report:
            report_data = h2e_report['h2e_framework_report']
            print(f"   Timestamp: {report_data.get('timestamp', 'N/A')}")
            print(f"   Tasks Evaluated: {report_data.get('total_tasks_evaluated', 0)}")
            print(f"   Average SROI: {report_data.get('average_sroi_score', 0):.3f}")
            print(f"   Average Raw Similarity: {report_data.get('average_raw_similarity', 0):.3f}")
            print(f"   Alignment Pass Rate: {report_data.get('alignment_pass_rate', '0%')}")
            print(f"   Recommended Threshold: {report_data.get('recommended_threshold', 0):.3f}")

            if 'task_breakdown' in report_data:
                print(f"\n   Task Breakdown:")
                for task in report_data['task_breakdown']:
                    status = "‚úÖ" if task['passed'] else "‚ùå"
                    # One to three stars depending on the complexity value.
                    complexity_star = "‚≠ê" * max(1, int(task.get('complexity', 0.5) * 3))
                    print(f"     {status} {task['task_id']} ({task['role']}) {complexity_star}")
                    print(f"        Score: {task['sroi_score']:.3f} (Raw: {task['raw_similarity']:.3f})")
                    print(f"        Required: {task['required_threshold']:.3f}, Confidence: {task.get('confidence', 0.5):.2f}")
                    print(f"        {task['explanation']}")

        # Cost estimation
        orch.estimate_cost()

        # Save outputs
        print("\nüìÇ Output Files:")
        for tid, content in results.items():
            status = orch.task_store[tid]['status']
            role = orch.task_store[tid]['agent_role']
            h2e_score = orch.task_store[tid].get('h2e_score', 'N/A')
            h2e_raw = orch.task_store[tid].get('h2e_raw_similarity', 'N/A')

            # Spaces and slashes in the role would break the file name.
            clean_role = role.replace(' ', '_').replace('/', '_')
            filename = f"output/{tid}_{clean_role}.md"  # Using markdown extension

            with open(filename, 'w', encoding='utf-8') as f:
                f.write(f"# Task: {tid}\n")
                f.write(f"**Role**: {role}\n")
                f.write(f"**Status**: {status}\n")
                f.write(f"**H2E Alignment Score**: {h2e_score} (Raw: {h2e_raw})\n")
                f.write(f"**H2E Confidence**: {orch.task_store[tid].get('h2e_confidence', 'N/A')}\n")
                f.write(f"**Generated**: {time.ctime()}\n")
                f.write(f"\n---\n\n")
                f.write(content)

            print(f"  {tid}: {filename} (H2E: {h2e_score}, Raw: {h2e_raw})")

        # Save completion history
        with open("output/completion_history.json", "w") as f:
            json.dump(orch.completion_history, f, indent=2, default=str)
        print("  Completion History: output/completion_history.json")

        # Save task plan
        with open("output/task_plan.json", "w") as f:
            json.dump(plan, f, indent=2)
        print("  Task Plan: output/task_plan.json")

        print("\n" + "="*60)
        print("‚ú® Orchestration completed successfully!")

    except KeyboardInterrupt:
        print("\n\n‚ö†Ô∏è Orchestration interrupted by user")
    except Exception as e:
        print(f"\n‚ùå FATAL ERROR: {e}")
        import traceback
        traceback.print_exc()

🚀 Starting DAG Orchestrator with H2E CALIBRATION Mode
Goal: Create a comprehensive guide on implementing zero-trust security architecture in Kubernetes:
    1. Research current best practices and frameworks
    2. Write detailed implementation guide with examples
    3. Review and refine the content for technical accuracy
    4. Create a summary for executive audience
------------------------------------------------------------
🧠 Brain: Planning via claude-opus-4-6...
📊 H2E/NEZ: Captured expert intent for task1 (Research Analyst, complexity: 0.54)
📊 H2E/NEZ: Captured expert intent for task2 (Technical Writer, complexity: 0.55)
📊 H2E/NEZ: Captured expert intent for task3 (Technical Reviewer, complexity: 0.57)
📊 H2E/NEZ: Captured expert intent for task4 (Technical Writer, complexity: 0.43)
📊 H2E/NEZ: Captured expert intent for task5 (Technical Writer, complexity: 0.52)
✓ Planned 5 tasks
✓ DAG validation passed (no circular dependencies)

📋 Task Plan:
  1. task

In [40]:
print("\nüìã Task Plan:")
for i, task in enumerate(plan['tasks'], 1):
    deps = ', '.join(task['dependencies']) if task['dependencies'] else 'None'
    complexity = task.get('estimated_complexity', 'medium')
    print(f"  {i}. {task['id']}: {task['description'][:100]}...")
    print(f"     Role: {task['agent_role']}, Complexity: {complexity.upper()}, Depends on: {deps}")

print("\n" + "="*60)


print("\n" + "="*60 + "\nüèÅ BUILD COMPLETE\n" + "="*60)

# Statistics
successful = sum(1 for t in orch.task_store.values() if t['status'] == TaskStatus.COMPLETED.value)
failed = sum(1 for t in orch.task_store.values() if t['status'] == TaskStatus.FAILED.value)

h2e_scores = [t.get('h2e_score', 0) for t in orch.task_store.values()
              if t.get('status') == TaskStatus.COMPLETED.value and 'h2e_score' in t]
h2e_raw_sims = [t.get('h2e_raw_similarity', 0) for t in orch.task_store.values()
                if t.get('status') == TaskStatus.COMPLETED.value and 'h2e_raw_similarity' in t]
avg_h2e = np.mean(h2e_scores) if h2e_scores else 0
avg_raw_sim = np.mean(h2e_raw_sims) if h2e_raw_sims else 0
h2e_passed = sum(1 for t in orch.task_store.values()
                if t.get('h2e_passed', False))

print(f"\nüìà Summary:")
print(f"   Tasks: {successful} successful, {failed} failed")

if CALIBRATION_MODE:
    print(f"   H2E Calibration Scores: Avg {avg_h2e:.3f} (Raw cosine: {avg_raw_sim:.3f})")
    # Analyze calibration data
    recommendations = orch.h2e.analyze_calibration_data()
    print(f"\nüí° Recommendations for production thresholds:")
    for role, threshold in recommendations.items():
        print(f"   {role}: {threshold:.3f}")
else:
    print(f"   H2E Alignment: {h2e_passed}/{successful} passed, Avg score: {avg_h2e:.3f}")

# Generate and display H2E report
print("\nüìä H2E Accountability Report:")
h2e_report = orch.h2e.save_report("output/h2e_accountability_report.json")

if 'h2e_framework_report' in h2e_report:
    report_data = h2e_report['h2e_framework_report']
    print(f"   Timestamp: {report_data.get('timestamp', 'N/A')}")
    print(f"   Tasks Evaluated: {report_data.get('total_tasks_evaluated', 0)}")
    print(f"   Average SROI: {report_data.get('average_sroi_score', 0):.3f}")
    print(f"   Average Raw Similarity: {report_data.get('average_raw_similarity', 0):.3f}")
    print(f"   Alignment Pass Rate: {report_data.get('alignment_pass_rate', '0%')}")
    print(f"   Recommended Threshold: {report_data.get('recommended_threshold', 0):.3f}")

    if 'task_breakdown' in report_data:
        print(f"\n   Task Breakdown:")
        for task in report_data['task_breakdown']:
            status = "‚úÖ" if task['passed'] else "‚ùå"
            complexity_star = "‚≠ê" * max(1, int(task.get('complexity', 0.5) * 3))
            print(f"     {status} {task['task_id']} ({task['role']}) {complexity_star}")
            print(f"        Score: {task['sroi_score']:.3f} (Raw: {task['raw_similarity']:.3f})")
            print(f"        Required: {task['required_threshold']:.3f}, Confidence: {task.get('confidence', 0.5):.2f}")
            print(f"        {task['explanation']}")

# Cost estimation
orch.estimate_cost()

# Save outputs
os.makedirs("output", exist_ok=True)

print("\nüìÇ Output Files:")
for tid, content in results.items():
    status = orch.task_store[tid]['status']
    role = orch.task_store[tid]['agent_role']
    h2e_score = orch.task_store[tid].get('h2e_score', 'N/A')
    h2e_raw = orch.task_store[tid].get('h2e_raw_similarity', 'N/A')

    clean_role = role.replace(' ', '_').replace('/', '_')
    filename = f"output/{tid}_{clean_role}.md"  # Using markdown extension

    with open(filename, 'w', encoding='utf-8') as f:
        f.write(f"# Task: {tid}\n")
        f.write(f"**Role**: {role}\n")
        f.write(f"**Status**: {status}\n")
        f.write(f"**H2E Alignment Score**: {h2e_score} (Raw: {h2e_raw})\n")
        f.write(f"**H2E Confidence**: {orch.task_store[tid].get('h2e_confidence', 'N/A')}\n")
        f.write(f"**Generated**: {time.ctime()}\n")
        f.write(f"\n---\n\n")
        f.write(content)

    print(f"  {tid}: {filename} (H2E: {h2e_score}, Raw: {h2e_raw})")

# Save completion history
with open("output/completion_history.json", "w") as f:
    json.dump(orch.completion_history, f, indent=2, default=str)
print("  Completion History: output/completion_history.json")

# Save task plan
with open("output/task_plan.json", "w") as f:
    json.dump(plan, f, indent=2)
print("  Task Plan: output/task_plan.json")

print("\n" + "="*60)
print("‚ú® Orchestration completed successfully!")


📋 Task Plan:
  1. task1: Research current best practices, frameworks, and industry standards for zero-trust security architec...
     Role: Research Analyst, Complexity: HIGH, Depends on: None
  2. task2: Based on the research findings, write a detailed implementation guide for zero-trust security archit...
     Role: Technical Writer, Complexity: HIGH, Depends on: task1
  3. task3: Perform a thorough technical review of the implementation guide for accuracy, completeness, and secu...
     Role: Technical Reviewer, Complexity: HIGH, Depends on: task2
  4. task4: Incorporate all feedback and corrections from the technical review into the implementation guide. Re...
     Role: Technical Writer, Complexity: MEDIUM, Depends on: task3
  5. task5: Create a concise executive summary (2-3 pages) of the zero-trust Kubernetes security guide tailored ...
     Role: Technical Writer, Complexity: MEDIUM, Depends on: task4


🏁 BUILD COMPLETE

📈 Summary:
   Tasks: 5 successful, 0 failed
   