# Module 5: AutoGen

## Applied AI Scientist Field Notes - Expanded Edition

---


## Module 5: AutoGen - Conversational Multi-Agent Systems

### Topics
1. Agent roles and personas
2. Conversational patterns
3. Code generation and execution
4. Error correction loops
5. Multi-agent collaboration
6. Human-in-the-loop

---

In [None]:
# `%pip` is an IPython/Jupyter magic, so this line only works inside a notebook
# kernel (it is not valid plain Python). `-q` keeps pip's output quiet.
# NOTE(review): none of the cells visible here import the installed package;
# the demos below use hand-written mock agents instead.
%pip install -q pyautogen

print('AutoGen installed!')

### Section 1: Multi-Agent Conversations

AutoGen pattern:
- **UserProxyAgent**: Stands in for the human user and can execute code on their behalf
- **AssistantAgent**: LLM-powered agent
- **Conversation loop**: Agents alternate until termination
- **Code execution**: Sandboxed Python execution
- **Error feedback**: Failed code triggers refinement

In [None]:
class Agent:
    '''Minimal mock participant in a multi-agent conversation.

    Replies are canned per role (no LLM is called); every exchange is
    recorded in ``history`` as user/assistant message dicts.
    '''

    def __init__(self, name: str, role: str, system_message: str):
        self.name = name
        self.role = role
        self.system_message = system_message
        self.history = []

    def respond(self, message: str) -> str:
        '''Record ``message`` and return the canned reply for this agent's role.'''
        # Canned replies keyed by role; unknown roles just acknowledge.
        canned_replies = {
            'coder': 'def fibonacci(n):\n    return n if n <= 1 else fibonacci(n-1) + fibonacci(n-2)',
            'executor': 'Execution result: Success, output=55',
            'reviewer': 'Code quality: Good. Tests passed.',
        }
        reply = canned_replies.get(self.role, f'{self.name}: Acknowledged')

        self.history.extend([
            {'role': 'user', 'content': message},
            {'role': 'assistant', 'content': reply},
        ])
        return reply

class MultiAgentSystem:
    '''Registry of named agents that relays a message through them in rounds.'''

    def __init__(self):
        self.agents = {}
        self.log = []

    def add_agent(self, agent: Agent):
        '''Register ``agent`` under its name (a same-named agent is replaced).'''
        self.agents[agent.name] = agent

    def run(self, task: str, max_turns=5):
        '''Pass ``task`` through every agent, in insertion order, for up to ``max_turns`` rounds.

        Each agent receives the previous agent's reply. The run stops early as
        soon as a reply contains 'TERMINATE' or (case-insensitively) 'done'.
        Returns the log of {'turn', 'agent', 'message'} entries, where the
        message is truncated to 80 characters plus '...' when longer.
        '''
        self.log = []
        message = task

        for turn_number in range(1, max_turns + 1):
            for agent_name, member in self.agents.items():
                message = member.respond(message)

                # Keep log entries short; the full reply is still forwarded.
                if len(message) > 80:
                    preview = message[:80] + '...'
                else:
                    preview = message
                self.log.append({
                    'turn': turn_number,
                    'agent': agent_name,
                    'message': preview,
                })

                finished = 'TERMINATE' in message or 'done' in message.lower()
                if finished:
                    return self.log

        return self.log

# Example: Coder-Executor-Reviewer workflow
print('Multi-Agent System Demo')
print('=' * 80)

# One mock agent per role; each returns a canned reply for its role.
coder = Agent('Coder', 'coder', 'Write code to solve problems')
executor = Agent('Executor', 'executor', 'Execute code and return results')
reviewer = Agent('Reviewer', 'reviewer', 'Review code quality')

# Registration order is the speaking order within each turn.
mas = MultiAgentSystem()
for _member in (coder, executor, reviewer):
    mas.add_agent(_member)

task = 'Calculate fibonacci(10)'
conversation = mas.run(task, max_turns=2)

print(f'\nTask: {task}\n')
for entry in conversation:
    print('Turn {turn} | {agent}: {message}'.format(**entry))

### Section 2: Code Execution with Safety

AutoGen's power comes from code execution, but this requires careful sandboxing:
- **Docker containers**: Isolated execution environment
- **Resource limits**: CPU, memory, time constraints
- **Network isolation**: Prevent data exfiltration
- **Filesystem restrictions**: Read-only mounts
- **Allowlist**: Only approved libraries

In [None]:
import json
import os
import signal
import subprocess
import tempfile
from typing import Any, Callable, Dict, List, Tuple

class SafeCodeExecutor:
    '''Best-effort sandbox that validates and then runs a Python snippet.

    Validation combines a substring blocklist with an import allowlist.
    Execution happens in a child interpreter with a wall-clock timeout and,
    on POSIX, rlimits applied before exec. This is NOT real isolation; in
    production use Docker or similar container isolation.
    '''

    def __init__(self, timeout_seconds=5, max_memory_mb=512):
        self.timeout = timeout_seconds
        self.max_memory = max_memory_mb * 1024 * 1024  # Convert to bytes

        # Allowed imports (top-level module names)
        self.allowed_imports = {
            'math', 'random', 'datetime', 'json', 'collections',
            'itertools', 'functools', 're', 'string', 'typing'
        }

        # Blocked operations (matched as lowercase substrings of the code)
        self.blocked_patterns = [
            'import os', 'import sys', 'import subprocess',
            'exec(', 'eval(', 'compile(',
            '__import__', 'open(', 'file(',
            'socket', 'urllib', 'requests',
        ]

    def validate_code(self, code: str) -> Tuple[bool, str]:
        '''Pre-execution code validation.

        Returns ``(True, 'Valid')`` when the code passes, otherwise
        ``(False, reason)`` naming the first blocked pattern or the first
        import that is not on the allowlist.
        '''
        code_lower = code.lower()

        # Check for blocked patterns
        for pattern in self.blocked_patterns:
            if pattern in code_lower:
                return False, f'Blocked operation: {pattern}'

        # Check imports
        for module in self._imported_modules(code):
            if module not in self.allowed_imports:
                return False, f'Disallowed import: {module}'

        return True, 'Valid'

    @staticmethod
    def _imported_modules(code: str) -> list:
        '''Return the top-level module names imported by ``code``, in source order.'''
        import ast
        import re

        try:
            tree = ast.parse(code)
        except (SyntaxError, ValueError, RecursionError):
            # Unparseable source: fall back to the coarse textual scan so that
            # validation still fails closed instead of skipping the check.
            found = re.findall(r'import (\w+)', code)
            found.extend(re.findall(r'from (\w+) import', code))
            return found

        import_nodes = [
            node for node in ast.walk(tree)
            if isinstance(node, (ast.Import, ast.ImportFrom))
        ]
        # ast.walk is breadth-first; sort so the first offender reported is
        # the first one in the source.
        import_nodes.sort(key=lambda node: (node.lineno, node.col_offset))

        modules = []
        for node in import_nodes:
            if isinstance(node, ast.Import):
                # `import a.b, c` -> ['a', 'c']; every alias is checked, so a
                # disallowed module cannot hide behind an allowed one.
                modules.extend(alias.name.split('.')[0] for alias in node.names)
            elif node.level:
                # Relative import: never on the allowlist, so it is rejected.
                modules.append('.' * node.level + (node.module or ''))
            else:
                # `from a.b import c` imports module `a`, not the name `c`.
                modules.append(node.module.split('.')[0])
        return modules

    def execute(self, code: str) -> Dict[str, Any]:
        '''Execute code in sandbox.

        Returns a dict with 'status' ('success', 'error', 'blocked' or
        'timeout'), 'output' and 'error'; 'returncode' is present only when
        the child process actually ran to completion.
        '''
        import sys

        # Validate first
        is_valid, message = self.validate_code(code)
        if not is_valid:
            return {
                'status': 'blocked',
                'error': message,
                'output': None
            }

        temp_file = None
        try:
            # Python reads source files as UTF-8 by default, so write UTF-8
            # rather than whatever the locale encoding happens to be.
            with tempfile.NamedTemporaryFile(
                mode='w', suffix='.py', delete=False, encoding='utf-8'
            ) as f:
                temp_file = f.name
                f.write(code)

            # Execute with resource limits
            # In production, use Docker or similar container isolation
            result = subprocess.run(
                # Same interpreter that validated the code; 'python3' is only
                # a fallback for when sys.executable is unavailable.
                [sys.executable or 'python3', temp_file],
                capture_output=True,
                text=True,
                timeout=self.timeout,
                # Resource limits (applied in the child before exec; POSIX only)
                preexec_fn=self._set_resource_limits if os.name == 'posix' else None
            )

            return {
                'status': 'success' if result.returncode == 0 else 'error',
                'output': result.stdout,
                'error': result.stderr if result.returncode != 0 else None,
                'returncode': result.returncode
            }

        except subprocess.TimeoutExpired:
            return {
                'status': 'timeout',
                'error': f'Execution exceeded {self.timeout}s timeout',
                'output': None
            }

        except Exception as e:
            # Boundary handler: report any launch/IO failure as a result dict
            # so callers always get the same shape back.
            return {
                'status': 'error',
                'error': str(e),
                'output': None
            }

        finally:
            # Cleanup is best-effort: a missing or locked temp file must not
            # mask the execution result.
            if temp_file is not None:
                try:
                    os.unlink(temp_file)
                except OSError:
                    pass

    def _set_resource_limits(self):
        '''Set resource limits for subprocess (Linux).

        Used as ``preexec_fn``, so it runs in the child process before the
        interpreter is exec'd.
        '''
        try:
            import resource

            # Memory limit (address space, in bytes)
            resource.setrlimit(resource.RLIMIT_AS, (self.max_memory, self.max_memory))

            # CPU time limit (seconds)
            resource.setrlimit(resource.RLIMIT_CPU, (self.timeout, self.timeout))

            # No file creation (maximum file size of zero bytes)
            resource.setrlimit(resource.RLIMIT_FSIZE, (0, 0))

        except ImportError:
            pass  # `resource` module not available on this platform

# Test safe executor
print('SAFE CODE EXECUTION DEMONSTRATION')
print('=' * 90)

executor = SafeCodeExecutor(timeout_seconds=2, max_memory_mb=128)

test_cases = [
    # Safe code
    '''
import math
result = math.sqrt(16)
print(f"Square root of 16: {result}")
    ''',
    
    # Blocked import
    '''
import os
print(os.listdir('/'))
    ''',
    
    # Blocked operation
    '''
eval("print('hello')")
    ''',
    
    # Timeout: a busy loop that needs no imports. (`import time` is not on the
    # executor's allowlist, so a sleep-based case would be rejected during
    # validation and the timeout path would never run.)
    '''
while True:
    pass
    ''',
]

for i, code in enumerate(test_cases, 1):
    print(f'\nTest {i}:')
    print(f'Code: {code.strip()[:60]}...')
    
    result = executor.execute(code)
    
    print(f'Status: {result["status"]}')
    if result['output']:
        print(f'Output: {result["output"].strip()}')
    if result['error']:
        print(f'Error: {result["error"][:80]}...')
    print('-' * 90)

print('\n' + '=' * 90)
print('KEY SECURITY PRINCIPLES:')
print('  - Never execute arbitrary code without validation')
print('  - Use containerization (Docker) for true isolation')
print('  - Set strict resource limits (memory, CPU, time)')
print('  - Allowlist imports, block dangerous operations')
print('  - Network isolation to prevent exfiltration')
print('  - Audit log all code execution attempts')

### Section 3: Error Correction Loop

AutoGen excels at iterative refinement:
1. **Coder** generates code
2. **Executor** runs code, captures errors
3. **Coder** sees error, fixes code
4. Repeat until success or max attempts

In [None]:
class CodeGenerationAgent:
    '''Wraps an LLM callable to write code and to revise it from error feedback.'''

    def __init__(self, llm_func: Callable):
        self.llm = llm_func
        self.generation_history = []

    def generate(self, task: str, error_feedback: str = None) -> str:
        '''Return code for ``task``; every result is appended to ``generation_history``.

        With ``error_feedback=None`` the LLM gets a fresh-start prompt;
        otherwise it gets a repair prompt that quotes the error and the task.
        '''
        if error_feedback is not None:
            prompt = self._refinement_prompt(task, error_feedback)
        else:
            prompt = self._initial_prompt(task)

        generated = self.llm(prompt)
        self.generation_history.append(generated)
        return generated

    @staticmethod
    def _initial_prompt(task: str) -> str:
        '''Prompt used for the first attempt at a task.'''
        return (
            'Write Python code to accomplish this task:\n'
            f'{task}\n'
            '\n'
            'Provide working code with proper error handling.'
        )

    @staticmethod
    def _refinement_prompt(task: str, error_feedback: str) -> str:
        '''Prompt used when the previous attempt failed with ``error_feedback``.'''
        return (
            'The previous code failed with this error:\n'
            f'{error_feedback}\n'
            '\n'
            f'Original task: {task}\n'
            '\n'
            'Fix the code to handle this error. Provide the complete corrected code.'
        )

class ErrorCorrectionLoop:
    '''Iterative code generation with error correction.

    Alternates between ``coder.generate`` and ``executor.execute``, feeding
    each failure back to the coder, until an execution succeeds or
    ``max_iterations`` attempts have been made.
    '''

    # Forward-reference annotations keep this class definable on its own,
    # regardless of which notebook cells have already been run.
    def __init__(self, coder: 'CodeGenerationAgent', executor: 'SafeCodeExecutor', max_iterations=5):
        self.coder = coder
        self.executor = executor
        self.max_iterations = max_iterations
        self.execution_log = []

    def run(self, task: str) -> Dict[str, Any]:
        '''Execute error correction loop.

        Returns a dict with 'status' ('success' or 'max_iterations'), the last
        generated 'code' (None if no attempt was made), 'iterations', the
        'execution_log' of this run, plus 'output' on success or 'error' on
        failure.
        '''
        # Local import: the module never imports datetime at file level.
        from datetime import datetime, timezone

        # Start each run with a fresh log, like the other run methods in this
        # file, so the returned log describes this task only.
        self.execution_log = []
        error_feedback = None
        code = None

        for iteration in range(1, self.max_iterations + 1):
            print(f'\nIteration {iteration}/{self.max_iterations}')
            print('-' * 70)

            # Generate code
            code = self.coder.generate(task, error_feedback)
            print(f'Generated code ({len(code)} chars)')

            # Execute
            exec_result = self.executor.execute(code)

            self.execution_log.append({
                'iteration': iteration,
                'code': code,
                'exec_result': exec_result,
                # Timezone-aware UTC timestamp (utcnow() is deprecated).
                'timestamp': datetime.now(timezone.utc).isoformat()
            })

            status = exec_result['status']
            if status == 'success':
                print(f'✓ Execution successful!')
                print(f'Output: {exec_result["output"]}')
                return {
                    'status': 'success',
                    'code': code,
                    'output': exec_result['output'],
                    'iterations': iteration,
                    'execution_log': self.execution_log
                }

            # Never feed None/empty back: None would make the coder start
            # from scratch instead of repairing its previous attempt.
            error_feedback = exec_result.get('error') or f'Execution ended with status: {status}'

            if status == 'blocked':
                print(f'✗ Code blocked: {error_feedback}')
            else:
                print(f'✗ Execution failed: {error_feedback[:100]}')

        # Max iterations reached
        return {
            'status': 'max_iterations',
            'code': code,
            'error': 'Failed to generate working code after maximum iterations',
            'iterations': self.max_iterations,
            'execution_log': self.execution_log
        }

# Mock LLM for code generation
def mock_code_llm(prompt: str) -> str:
    '''Mock LLM that generates code (buggy first, fixed when asked to repair).

    For prompts mentioning "fibonacci" it returns a version without a base
    case on the first request and the corrected version when the prompt is a
    repair request. Any other prompt gets a hello-world snippet.
    '''
    lowered = prompt.lower()

    if 'fibonacci' not in lowered:
        return 'print("Hello, World!")'

    # Detect a repair request by phrases that only the refinement prompt
    # contains. Matching on the bare word "error" does not work: the initial
    # prompt also says "proper error handling", so the buggy version would
    # never be produced.
    is_repair_request = 'previous code failed' in lowered or 'fix the code' in lowered

    if is_repair_request:
        # Fixed version
        return '''def fibonacci(n):
    if n <= 1:
        return n
    return fibonacci(n-1) + fibonacci(n-2)

result = fibonacci(10)
print(f"Fibonacci(10) = {result}")'''

    # Buggy version
    return '''def fibonacci(n):
    return fibonacci(n-1) + fibonacci(n-2)  # Missing base case!

result = fibonacci(10)
print(f"Result: {result}")'''

# Demo error correction loop
print('\nERROR CORRECTION LOOP DEMONSTRATION')
print('=' * 90)

# Wire the mock LLM coder to a sandboxed executor, allowing up to 3 attempts.
coder = CodeGenerationAgent(mock_code_llm)
executor = SafeCodeExecutor(timeout_seconds=2)
loop = ErrorCorrectionLoop(coder, executor, max_iterations=3)

task = 'Write a function to calculate the 10th Fibonacci number and print the result'
result = loop.run(task)

# Summary of the run
print('\n' + '=' * 90)
print(f'Final status: {result["status"]}')
print(f'Iterations required: {result["iterations"]}')
if result['status'] == 'success':
    print(f'Final output: {result["output"]}')

print('\n' + '=' * 90)
for _line in (
    'KEY BENEFITS OF ERROR CORRECTION:',
    '  - Automatically fixes common bugs (syntax, logic, runtime)',
    '  - No human intervention needed for ~80% of errors',
    '  - Learns from failures within the conversation',
    '  - Typical success rate: 70-90% within 3 iterations',
):
    print(_line)

### Section 4: Advanced Multi-Agent Orchestration

Complex multi-agent patterns:
- **Sequential**: Agents execute in order
- **Parallel**: Multiple agents work simultaneously
- **Hierarchical**: Manager delegates to specialists
- **Group chat**: All agents can contribute
- **Debate**: Agents argue until consensus

In [None]:
from typing import List, Dict, Callable
from collections import defaultdict
import time

class AdvancedAgent:
    '''Agent with a role, a list of expertise keywords, and a (mock) vote.'''

    def __init__(self, name: str, role: str, expertise: List[str], llm_func: Callable):
        self.name = name
        self.role = role
        self.expertise = expertise
        self.llm = llm_func
        self.conversation_history = []
        self.contributions = 0

    def should_respond(self, message: str) -> bool:
        '''Return True when any expertise keyword occurs in ``message`` (case-insensitive).'''
        for topic in self.expertise:
            if topic.lower() in message.lower():
                return True
        return False

    def respond(self, message: str, conversation_context: List[dict]) -> str:
        '''Ask the LLM for a reply and count it as one contribution.

        Only the last five entries of ``conversation_context`` are included in
        the prompt; each entry must provide 'agent' and 'message' keys.
        '''
        transcript_lines = []
        for entry in conversation_context[-5:]:
            transcript_lines.append(f"{entry['agent']}: {entry['message']}")
        context_str = '\n'.join(transcript_lines)
        skills = ', '.join(self.expertise)

        prompt = (
            f'You are {self.name}, a {self.role}.\n'
            f'Your expertise: {skills}\n'
            '\n'
            'Conversation so far:\n'
            f'{context_str}\n'
            '\n'
            f'New message: {message}\n'
            '\n'
            'Your response:'
        )

        reply = self.llm(prompt)
        self.contributions += 1
        return reply

    def vote(self, proposal: str) -> Tuple[bool, str]:
        '''Return an (approve, reason) pair for ``proposal``.

        Demo behaviour only: the proposal text is ignored and the vote is
        random, approving whenever the draw exceeds 0.3.
        '''
        import random

        if random.random() > 0.3:
            return True, 'Looks good'
        return False, 'Needs improvement'

class GroupChatOrchestrator:
    '''Orchestrate group chat between multiple agents.

    ``selection_mode`` controls who speaks each turn:
      - 'round_robin': agents take turns in list order
      - 'auto': an agent whose expertise matches the current message replies
      - 'all': every agent replies and the replies are summarized
    '''

    # Forward-reference annotations keep this class definable on its own.
    def __init__(self, agents: List['AdvancedAgent'], selection_mode='auto'):
        self.agents = agents
        self.selection_mode = selection_mode
        self.conversation_log = []

    def select_next_speaker(self, current_message: str) -> 'AdvancedAgent':
        '''Select which agent should respond next.

        Returns the chosen agent, or None to signal that every agent should
        respond ('all' mode; unrecognized modes also fall through to None).
        '''
        if self.selection_mode == 'round_robin':
            # Each agent speaks in turn
            return self.agents[len(self.conversation_log) % len(self.agents)]

        if self.selection_mode == 'auto':
            # Agent with relevant expertise responds
            candidates = [a for a in self.agents if a.should_respond(current_message)]

            if candidates:
                # Pick the candidate with the FEWEST contributions so far, to
                # balance participation (ties go to the earliest in the list).
                return min(candidates, key=lambda a: a.contributions)

            # Default to first agent
            return self.agents[0]

        # 'all': special case, handled by run_discussion
        return None

    def run_discussion(self, initial_task: str, max_turns=10) -> Dict[str, Any]:
        '''Run multi-agent discussion.

        Stops after ``max_turns`` turns or as soon as the current message
        contains 'terminate', 'complete', 'done' or 'finished'
        (case-insensitive). Returns the conversation log, its length under
        'turns' (one entry per message, so 'all' mode adds one per agent per
        turn), and the final message.
        '''
        current_message = initial_task
        self.conversation_log = []

        for turn in range(max_turns):
            # Select speaker
            speaker = self.select_next_speaker(current_message)

            if speaker is None:  # All agents respond
                responses = []
                for agent in self.agents:
                    response = agent.respond(current_message, self.conversation_log)
                    # Same entry shape as the single-speaker branch, including
                    # 'turn', so consumers can treat all log entries alike.
                    entry = {
                        'turn': turn + 1,
                        'agent': agent.name,
                        'role': agent.role,
                        'message': response
                    }
                    responses.append(entry)
                    self.conversation_log.append(entry)

                # Synthesize responses
                current_message = self._synthesize_responses(responses)

            else:  # Single agent responds
                response = speaker.respond(current_message, self.conversation_log)

                self.conversation_log.append({
                    'turn': turn + 1,
                    'agent': speaker.name,
                    'role': speaker.role,
                    'message': response
                })

                current_message = response

            # Check for termination
            lowered = current_message.lower()
            if any(keyword in lowered for keyword in ('terminate', 'complete', 'done', 'finished')):
                break

        return {
            'conversation': self.conversation_log,
            'turns': len(self.conversation_log),
            'final_message': current_message
        }

    def _synthesize_responses(self, responses: List[dict]) -> str:
        '''Combine multiple agent responses into one bulleted summary message.'''
        bullets = [f"- {r['agent']}: {r['message'][:60]}...\n" for r in responses]
        return 'Multiple perspectives:\n' + ''.join(bullets)

class ConsensusOrchestrator:
    '''Reach consensus through voting.'''

    # Forward-reference annotation keeps this class definable on its own.
    def __init__(self, agents: List['AdvancedAgent']):
        self.agents = agents

    def reach_consensus(self, proposal: str, required_majority: float = 0.6) -> Dict[str, Any]:
        '''Get agent votes and determine consensus.

        Consensus is reached when the fraction of approving agents is at
        least ``required_majority``. With no agents there is nothing to agree
        on: the approval rate is reported as 0.0 and consensus as False.

        Returns a dict with 'consensus', 'approval_rate' and the individual
        'votes' (agent name, approve flag, reason).
        '''
        votes = []

        print(f'\nProposal: {proposal[:80]}...')
        print('-' * 70)

        for agent in self.agents:
            approve, reason = agent.vote(proposal)
            votes.append({
                'agent': agent.name,
                'approve': approve,
                'reason': reason
            })

            vote_str = '✓ Approve' if approve else '✗ Reject'
            print(f'{agent.name}: {vote_str} - {reason}')

        # Calculate results
        approval_count = sum(1 for v in votes if v['approve'])
        # Guard the division: an empty agent list would otherwise raise
        # ZeroDivisionError.
        approval_rate = approval_count / len(votes) if votes else 0.0

        consensus_reached = bool(votes) and approval_rate >= required_majority

        print(f'\nVotes: {approval_count}/{len(votes)} ({approval_rate:.0%})')
        print(f'Consensus: {"REACHED" if consensus_reached else "NOT REACHED"}')

        return {
            'consensus': consensus_reached,
            'approval_rate': approval_rate,
            'votes': votes
        }

# Mock LLM for agents
def mock_agent_llm(prompt: str) -> str:
    '''Mock LLM for agent responses.

    Returns the canned reply of the first keyword found in the prompt
    (case-insensitive), checked in the order listed; otherwise a generic
    agreement.
    '''
    lowered = prompt.lower()
    keyword_replies = (
        ('architect', 'We should use microservices architecture for scalability.'),
        ('security', 'Add authentication and encryption at all layers.'),
        ('devops', 'Deploy using Kubernetes with auto-scaling.'),
    )
    for keyword, reply in keyword_replies:
        if keyword in lowered:
            return reply
    return 'I agree with the approach.'

# Demo: Group chat
print('\nMULTI-AGENT GROUP CHAT DEMONSTRATION')
print('=' * 90)

# Three specialists sharing the same mock LLM: (name, role, expertise keywords)
agents = [
    AdvancedAgent(name, role, topics, mock_agent_llm)
    for name, role, topics in (
        ('Alice', 'System Architect', ['architecture', 'design', 'scalability']),
        ('Bob', 'Security Engineer', ['security', 'authentication', 'encryption']),
        ('Carol', 'DevOps Engineer', ['deployment', 'infrastructure', 'kubernetes']),
    )
]

orchestrator = GroupChatOrchestrator(agents, selection_mode='auto')

task = 'Design a secure and scalable microservices architecture'
result = orchestrator.run_discussion(task, max_turns=6)

print(f'\nTask: {task}')
print(f'Turns: {result["turns"]}\n')
for entry in result['conversation'][:5]:  # Show first 5
    turn_label = entry.get('turn', '?')
    print(f"Turn {turn_label}: {entry['agent']} ({entry['role']})")
    print(f"  {entry['message'][:80]}...\n")

print('\n' + '=' * 90)

# Demo: Consensus voting
print('\nCONSENSUS VOTING DEMONSTRATION')
print('=' * 90)

# The same agents now vote on a proposal; `result` is reused for the outcome.
consensus_orch = ConsensusOrchestrator(agents)
proposal = 'Deploy the system using serverless functions instead of containers'
result = consensus_orch.reach_consensus(proposal, required_majority=0.67)

print('\n→ Proceeding with proposal' if result['consensus'] else '\n→ Proposal rejected, needs revision')

## Interview Questions: AutoGen Production Systems

### For Senior/Staff Engineers

In [None]:
autogen_interview_questions = [
    {
        'level': 'Senior',
        'question': 'Your AutoGen code generation system has a 30% success rate on the first attempt, 60% after 2 retries, and 80% after 3 retries. However, each retry costs $0.05 and takes 3 seconds. A competitor offers 90% success but costs $0.30 per request. Which should you use and why?',
        'answer': '''
**Cost-Benefit Analysis:**

**Option A: AutoGen with retries**
- Attempt 1: 30% success @ $0.05 = $0.015 average
- Attempt 2: 30% more succeed @ $0.10 = $0.030 average (of the 70% that try)
- Attempt 3: 20% more succeed @ $0.15 = $0.030 average (of the 40% that try)

**Expected cost calculation:**
```python
def calculate_expected_cost():
    cost_per_attempt = 0.05
    
    # Expected cost = sum(P(need N attempts) * cost_of_N_attempts)
    expected_cost = (
        0.30 * (1 * cost_per_attempt) +  # 30% succeed on attempt 1
        0.30 * (2 * cost_per_attempt) +  # 30% succeed on attempt 2
        0.20 * (3 * cost_per_attempt) +  # 20% succeed on attempt 3
        0.20 * (3 * cost_per_attempt)    # 20% fail after 3 attempts
    )
    
    return expected_cost

expected_cost = calculate_expected_cost()
print(f"Expected cost per request: ${expected_cost:.3f}")
# Result: $0.105 per request

expected_latency = (
    0.30 * 3 +    # 30% done in 3s
    0.30 * 6 +    # 30% done in 6s
    0.40 * 9      # 40% done in 9s (includes 20% failures)
)
print(f"Expected latency: {expected_latency:.1f}s")
# Result: 6.3s average
```

**Option B: Competitor**
- Success rate: 90%
- Cost: $0.30
- Latency: ~3s (assumed)

**Decision Matrix:**

| Metric | AutoGen (3 retries) | Competitor | Winner |
|--------|---------------------|------------|--------|
| Success Rate | 80% | 90% | Competitor (+10%) |
| Cost per success | $0.131 | $0.30 | AutoGen (58% cheaper) |
| Latency (avg) | 6.3s | 3.0s | Competitor (52% faster) |
| Latency (P95) | 9.0s | 3.0s | Competitor (67% faster) |

**Recommendation depends on use case:**

**Choose AutoGen IF:**
- Cost is primary concern (high volume, tight budget)
- Latency is acceptable (batch processing, async workflows)
- 80% success rate meets requirements
- Can implement result caching (improves effective success rate)

**Choose Competitor IF:**
- Latency critical (real-time applications)
- 80% success rate too low (high-stakes domain)
- User-facing (poor UX with retries)
- Simpler ops (no retry logic needed)

**Hybrid Solution (Best of Both):**
```python
class HybridCodeGen:
    '''Use AutoGen with fallback to competitor'''
    
    def __init__(self):
        self.autogen = AutoGenSystem(max_retries=2)  # Try twice
        self.competitor = CompetitorAPI()
        self.cache = ResponseCache()
    
    async def generate(self, task: str) -> dict:
        # Check cache first
        cached = self.cache.get(task)
        if cached:
            return {'source': 'cache', 'result': cached, 'cost': 0, 'latency_ms': 5}
        
        # Try AutoGen first (cheaper)
        start = time.time()
        result = await self.autogen.generate(task)
        
        if result['status'] == 'success':
            latency = (time.time() - start) * 1000
            self.cache.set(task, result['code'])
            return {
                'source': 'autogen',
                'result': result['code'],
                'cost': result['cost'],
                'latency_ms': latency
            }
        
        # Fallback to competitor (more reliable but expensive)
        print('AutoGen failed, falling back to competitor')
        result = await self.competitor.generate(task)
        
        latency = (time.time() - start) * 1000
        self.cache.set(task, result)
        
        return {
            'source': 'competitor',
            'result': result,
            'cost': 0.30,
            'latency_ms': latency
        }

# Expected metrics with hybrid:
hybrid_metrics = {
    'success_rate': 0.80 + (0.20 * 0.90),  # 80% from AutoGen + 90% of remaining 20%
    # = 98% total
    
    'cost_per_request': 0.80 * 0.105 + 0.20 * 0.30,  # Weighted average
    # = $0.144 (52% cheaper than competitor, slightly more than pure AutoGen)
    
    'latency_avg': 0.80 * 6.3 + 0.20 * 9.0,  # AutoGen latency + fallback
    # = 6.84s
}

print('\nCOST-BENEFIT ANALYSIS SUMMARY:')
print('=' * 80)
print(f"Pure AutoGen: 80% success, $0.105 avg, 6.3s latency")
print(f"Competitor:   90% success, $0.300 avg, 3.0s latency")
print(f"Hybrid:       98% success, $0.144 avg, 6.8s latency")
print('=' * 80)
print('\nRECOMMENDATION: Hybrid approach')
print('  - Best success rate (98%)')
print('  - Reasonable cost ($0.144)')
print('  - Acceptable latency for most use cases')
print('  - Can add caching for further improvement')
```
        ''',
    },
    {
        'level': 'Staff',
        'question': 'Design a multi-agent system for automated code review that checks style, security, performance, and tests. Include consensus mechanisms, escalation logic, and integration with CI/CD.',
        'answer': '''
**Automated Code Review Multi-Agent System:**

**1. Agent Team Structure:**
```python
@dataclass
class ReviewAgent:
    name: str
    role: str
    focus_areas: List[str]
    severity_thresholds: Dict[str, int]  # What issues require human escalation

# Define specialist agents
agents = [
    ReviewAgent(
        name='StyleBot',
        role='Code Style Reviewer',
        focus_areas=['formatting', 'naming', 'documentation'],
        severity_thresholds={'critical': 0, 'high': 5, 'medium': 10}
    ),
    ReviewAgent(
        name='SecurityBot',
        role='Security Analyst',
        focus_areas=['vulnerabilities', 'injection', 'authentication', 'secrets'],
        severity_thresholds={'critical': 0, 'high': 1, 'medium': 3}
    ),
    ReviewAgent(
        name='PerformanceBot',
        role='Performance Specialist',
        focus_areas=['complexity', 'memory', 'database', 'caching'],
        severity_thresholds={'critical': 0, 'high': 2, 'medium': 5}
    ),
    ReviewAgent(
        name='TestBot',
        role='Test Coverage Analyst',
        focus_areas=['coverage', 'edge_cases', 'integration_tests'],
        severity_thresholds={'critical': 0, 'high': 3, 'medium': 7}
    ),
]
```

**2. Review Orchestration:**
```python
from typing import List, Dict
from enum import Enum

class ReviewSeverity(Enum):
    CRITICAL = 'critical'
    HIGH = 'high'
    MEDIUM = 'medium'
    LOW = 'low'
    INFO = 'info'

class ReviewFinding:
    def __init__(self, agent: str, severity: ReviewSeverity, message: str, 
                 file: str, line: int, suggestion: str = None):
        self.agent = agent
        self.severity = severity
        self.message = message
        self.file = file
        self.line = line
        self.suggestion = suggestion

class CodeReviewOrchestrator:
    '''Orchestrate multi-agent code review'''
    
    def __init__(self, agents: List[ReviewAgent]):
        self.agents = agents
        self.executor = CodeExecutor()
    
    async def review_pr(self, pr_data: dict) -> dict:
        '''Review pull request with all agents in parallel'''
        
        # Step 1: Parallel review by all agents
        review_tasks = [
            self._agent_review(agent, pr_data)
            for agent in self.agents
        ]
        
        agent_findings = await asyncio.gather(*review_tasks)
        
        # Step 2: Aggregate findings
        all_findings = [f for findings in agent_findings for f in findings]
        
        # Step 3: Detect conflicts between agents
        conflicts = self._detect_conflicts(agent_findings)
        
        # Step 4: Consensus on approval
        decision = await self._make_decision(all_findings, conflicts)
        
        # Step 5: Check escalation
        needs_human = self._check_escalation(all_findings, conflicts)
        
        return {
            'pr_id': pr_data['pr_id'],
            'decision': decision,
            'findings': all_findings,
            'conflicts': conflicts,
            'needs_human_review': needs_human,
            'review_time_ms': 0  # Track actual time
        }
    
    async def _agent_review(self, agent: ReviewAgent, pr_data: dict) -> List[ReviewFinding]:
        '''Single agent performs review'''
        findings = []
        
        # Run agent-specific checks
        if agent.role == 'Security Analyst':
            # Security checks
            security_findings = await self._run_security_analysis(pr_data)
            findings.extend(security_findings)
        
        elif agent.role == 'Performance Specialist':
            # Performance checks
            perf_findings = await self._run_performance_analysis(pr_data)
            findings.extend(perf_findings)
        
        # ... other agents
        
        return findings
    
    async def _run_security_analysis(self, pr_data: dict) -> List[ReviewFinding]:
        '''Run security analysis'''
        findings = []
        
        # Static analysis
        code = pr_data['diff']
        
        # Check for common vulnerabilities
        if 'eval(' in code:
            findings.append(ReviewFinding(
                agent='SecurityBot',
                severity=ReviewSeverity.CRITICAL,
                message='Use of eval() detected - code injection risk',
                file=pr_data['file'],
                line=0,
                suggestion='Use ast.literal_eval() or json.loads() instead'
            ))
        
        if 'password' in code.lower() and '=' in code:
            findings.append(ReviewFinding(
                agent='SecurityBot',
                severity=ReviewSeverity.HIGH,
                message='Possible hardcoded password detected',
                file=pr_data['file'],
                line=0,
                suggestion='Use environment variables or secrets manager'
            ))
        
        return findings
    
    def _detect_conflicts(self, agent_findings: List[List[ReviewFinding]]) -> List[dict]:
        '''Detect conflicting recommendations'''
        # E.g., Performance says "cache everything", Security says "don't cache PII"
        conflicts = []
        
        # Simplified conflict detection
        for i, findings_a in enumerate(agent_findings):
            for j, findings_b in enumerate(agent_findings[i+1:], i+1):
                # Check if recommendations contradict
                # (In production, use NLP to detect semantic conflicts)
                pass
        
        return conflicts
    
    async def _make_decision(self, findings: List[ReviewFinding], conflicts: List[dict]) -> str:
        '''Decide: approve, reject, or request changes'''
        
        # Count by severity
        severity_counts = defaultdict(int)
        for f in findings:
            severity_counts[f.severity] += 1
        
        # Decision logic
        if severity_counts[ReviewSeverity.CRITICAL] > 0:
            return 'reject'  # Any critical issue = reject
        
        if severity_counts[ReviewSeverity.HIGH] > 3:
            return 'reject'  # Too many high-severity issues
        
        if len(conflicts) > 0:
            return 'request_changes'  # Conflicts need resolution
        
        if severity_counts[ReviewSeverity.HIGH] > 0 or severity_counts[ReviewSeverity.MEDIUM] > 5:
            return 'request_changes'
        
        return 'approve'
    
    def _check_escalation(self, findings: List[ReviewFinding], conflicts: List[dict]) -> bool:
        '''Determine if human review required'''
        
        # Escalate if:
        # 1. Any critical security issue
        if any(f.severity == ReviewSeverity.CRITICAL and 'Security' in f.agent for f in findings):
            return True
        
        # 2. Conflicting recommendations from agents
        if len(conflicts) > 2:
            return True
        
        # 3. Unusual patterns (ML-based anomaly detection)
        # if self.anomaly_detector.is_anomalous(pr_data):
        #     return True
        
        return False
```

**3. Integration with CI/CD:**
```python
class CICDIntegration:
    '''Integrate with GitHub/GitLab CI/CD pipeline'''
    
    def __init__(self, code_review_system: CodeReviewOrchestrator):
        self.reviewer = code_review_system
    
    async def on_pr_created(self, pr_webhook: dict):
        '''Handle PR creation webhook'''
        pr_id = pr_webhook['pr_id']
        
        # Start review (async)
        review_task = asyncio.create_task(
            self.reviewer.review_pr(pr_webhook)
        )
        
        # Post initial comment
        await self.post_comment(pr_id, 'Code review in progress...')
        
        # Wait for review
        review_result = await review_task
        
        # Post findings
        await self.post_review_results(pr_id, review_result)
        
        # Update PR status
        if review_result['needs_human_review']:
            await self.request_human_review(pr_id)
        elif review_result['decision'] == 'approve':
            await self.approve_pr(pr_id)
        else:
            await self.request_changes(pr_id, review_result['findings'])
    
    async def post_review_results(self, pr_id: str, review: dict):
        '''Format and post review as PR comment'''
        
        # Group findings by severity
        by_severity = defaultdict(list)
        for finding in review['findings']:
            by_severity[finding.severity.value].append(finding)
        
        # Build markdown comment
        comment = f'''## Automated Code Review Results

**Decision: {review['decision'].upper()}**

'''        
        for severity in ['critical', 'high', 'medium', 'low']:
            if severity in by_severity:
                icon = {'critical': '🚨', 'high': '⚠️', 'medium': '💡', 'low': 'ℹ️'}[severity]
                comment += f'''\n### {icon} {severity.capitalize()} Issues ({len(by_severity[severity])})

'''
                
                for finding in by_severity[severity][:10]:  # Show top 10
                    comment += f'''**{finding.agent}**: {finding.message}
'''
                    if finding.suggestion:
                        comment += f'''  *Suggestion:* {finding.suggestion}
'''
                    comment += '\n'
        
        if review['needs_human_review']:
            comment += '''\n---
**⚠️ Human review required** due to critical issues or conflicts.
'''
        
        # Post to GitHub API
        await github_api.post_comment(pr_id, comment)
```

**4. Caching and Performance:**
```python
class IntelligentCache:
    '''Cache code review results'''
    
    def __init__(self):
        self.cache = {}
        self.hit_count = 0
        self.miss_count = 0
    
    def get_cache_key(self, pr_data: dict) -> str:
        '''Generate cache key from code hash'''
        import hashlib
        
        # Hash the diff
        diff_hash = hashlib.sha256(pr_data['diff'].encode()).hexdigest()
        
        # Include relevant metadata
        key = f"{diff_hash}_{pr_data['language']}_{pr_data['framework']}"
        
        return key
    
    def get(self, pr_data: dict) -> Optional[dict]:
        '''Get cached review if available'''
        key = self.get_cache_key(pr_data)
        
        if key in self.cache:
            self.hit_count += 1
            return self.cache[key]
        
        self.miss_count += 1
        return None
    
    def set(self, pr_data: dict, review_result: dict):
        '''Cache review result'''
        key = self.get_cache_key(pr_data)
        self.cache[key] = review_result
    
    def get_hit_rate(self) -> float:
        total = self.hit_count + self.miss_count
        return self.hit_count / total if total > 0 else 0
```

**5. Production Metrics:**
```python
class ReviewMetrics:
    '''Track review system performance'''
    
    def __init__(self):
        self.metrics = {
            'reviews_completed': 0,
            'auto_approved': 0,
            'requested_changes': 0,
            'rejected': 0,
            'escalated_to_human': 0,
            'avg_review_time_ms': [],
            'findings_by_severity': defaultdict(int),
        }
    
    def record_review(self, review_result: dict):
        self.metrics['reviews_completed'] += 1
        
        if review_result['decision'] == 'approve':
            self.metrics['auto_approved'] += 1
        elif review_result['decision'] == 'reject':
            self.metrics['rejected'] += 1
        else:
            self.metrics['requested_changes'] += 1
        
        if review_result['needs_human_review']:
            self.metrics['escalated_to_human'] += 1
        
        # Track findings
        for finding in review_result['findings']:
            self.metrics['findings_by_severity'][finding.severity.value] += 1
    
    def get_summary(self) -> dict:
        return {
            'total_reviews': self.metrics['reviews_completed'],
            'auto_approval_rate': self.metrics['auto_approved'] / self.metrics['reviews_completed'] if self.metrics['reviews_completed'] > 0 else 0,
            'escalation_rate': self.metrics['escalated_to_human'] / self.metrics['reviews_completed'] if self.metrics['reviews_completed'] > 0 else 0,
            'avg_review_time_sec': np.mean(self.metrics['avg_review_time_ms']) / 1000 if self.metrics['avg_review_time_ms'] else 0,
            'findings_by_severity': dict(self.metrics['findings_by_severity']),
        }
```

**6. Complete System Flow:**
```
1. PR Created → Webhook
2. Extract diff, files changed, language
3. Parallel agent reviews (4 agents × 10s = 10s total)
4. Aggregate findings
5. Detect conflicts (2s)
6. Make decision (1s)
7. Check escalation criteria
8. Post results to PR (1s)
9. If approved → merge, else → block

Total time: ~15s for complete review
```

**7. Consensus Mechanisms:**
```python
class ReviewConsensus:
    '''Handle conflicts between agents'''
    
    def resolve_conflict(self, conflict: dict) -> dict:
        '''Resolve conflicting recommendations'''
        
        agent_a = conflict['agent_a']
        agent_b = conflict['agent_b']
        issue = conflict['issue']
        
        # Priority rules
        priority = {
            'SecurityBot': 10,  # Security has highest priority
            'PerformanceBot': 7,
            'TestBot': 5,
            'StyleBot': 3,
        }
        
        # Higher priority agent wins
        if priority[agent_a['name']] > priority[agent_b['name']]:
            return {'winner': agent_a, 'reason': 'higher_priority'}
        else:
            return {'winner': agent_b, 'reason': 'higher_priority'}
```

**Expected Results:**
- **Review time**: 10-20 seconds (vs. hours for human)
- **Accuracy**: 85-90% (catches most common issues)
- **False positive rate**: 10-15% (acceptable with human escalation)
- **Cost**: $0.10-0.20 per review
- **Developer satisfaction**: High (fast feedback, fewer bike-shedding comments)

**Production Deployment:**
```yaml
# .github/workflows/ai-code-review.yml
name: AI Code Review

on:
  pull_request:
    types: [opened, synchronize]

jobs:
  ai-review:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v2
      
      - name: Run AI Code Review
        run: |
          python ai_review_system.py \\
            --pr-id ${{ github.event.pull_request.number }} \\
            --base ${{ github.event.pull_request.base.sha }} \\
            --head ${{ github.event.pull_request.head.sha }}
      
      - name: Post Results
        uses: actions/github-script@v6
        with:
          script: |
            const fs = require('fs');
            const review = JSON.parse(fs.readFileSync('review_result.json'));
            
            // Post review comment
            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              body: review.formatted_comment
            });
            
            // Update PR status
            if (review.needs_human_review) {
              await github.rest.pulls.requestReviewers({
                pull_number: context.issue.number,
                reviewers: ['senior-engineer']
              });
            }
```

**Key Benefits:**
1. **Fast feedback**: 15s vs. hours/days
2. **Consistent**: Same criteria every time
3. **Comprehensive**: Checks style, security, performance, tests in parallel
4. **Cost-effective**: $0.15 per review vs. $50 human-hour
5. **Scalable**: Handle 1000s of PRs/day
6. **Learning**: Improves over time with feedback
        ''',
    },
]

# Print every interview question together with its level and model answer.
for i, qa in enumerate(autogen_interview_questions, 1):
    # Build the rule by concatenation: nesting single quotes inside a
    # single-quoted f-string is a SyntaxError on Python versions before 3.12.
    print('\n' + '=' * 100)
    print(f'Q{i} [{qa["level"]} Level]')
    print('=' * 100)
    print(f'\n{qa["question"]}\n')
    print('ANSWER:')
    print(qa['answer'])
    print()

## Module 5 Summary and Key Takeaways

### AutoGen for Production

AutoGen is powerful for conversational, iterative workflows, especially code generation.

In [None]:
# Module recap: a banner, then one bulleted section per topic.
print('MODULE 5: AUTOGEN - KEY TAKEAWAYS', '=' * 100, sep='\n')

# Section title -> bullet points, filled in display order.
summary = {}
summary['1. Core Strengths'] = [
    'Conversational multi-agent coordination',
    'Code generation with automatic error correction',
    'Iterative refinement through feedback loops',
    '70-80% success rate with 3 retries',
    'Best for: coding tasks, data analysis, test generation',
]
summary['2. Code Execution Safety'] = [
    'Never execute arbitrary code without validation',
    'Use containerization (Docker) for true isolation',
    'Set resource limits: memory, CPU, time, network',
    'Allowlist imports, block dangerous operations (eval, exec, os, subprocess)',
    'Audit log all execution attempts',
    'Implement idempotency checks for retries',
]
summary['3. Error Correction'] = [
    'Success rates: 30% → 60% → 80% with 3 attempts',
    'Cost: $0.05 per attempt (manage budget with max_retries)',
    'Latency: 3s per attempt (use async for better UX)',
    'Hybrid approach: AutoGen + fallback = 98% success',
    'Cache successful code for common patterns',
]
summary['4. Multi-Agent Patterns'] = [
    'Sequential: Coder → Executor → Reviewer (single workflow)',
    'Parallel: Multiple specialists analyze simultaneously',
    'Group Chat: Dynamic selection based on expertise',
    'Consensus: Voting mechanism for decisions',
    'Hierarchical: Manager delegates to specialists',
]
summary['5. Production Considerations'] = [
    'Termination conditions: max turns, success signal, timeout',
    'Agent selection: round-robin, expertise-based, dynamic',
    'Conversation memory: Limited to recent turns (token budget)',
    'Cost control: Monitor token usage per conversation',
    'Human escalation: For conflicts, critical issues, or max iterations',
]
summary['Production Checklist'] = [
    '[x] Secure code execution sandbox',
    '[x] Resource limits (CPU, memory, time)',
    '[x] Error correction loop with max iterations',
    '[x] Audit logging for all executions',
    '[x] Consensus mechanisms for multi-agent decisions',
    '[x] Human escalation for edge cases',
    '[x] Performance monitoring (latency, success rate, cost)',
    '[x] CI/CD integration for automated workflows',
]

# Heading line (preceded by a blank line), then one '  - ' bullet per point.
for section in summary:
    points = summary[section]
    print('\n', section, ':', sep='')
    for point in points:
        print('  -', point)

# Framework comparison: banner, then a short profile for each framework.
print('\n' + '=' * 100, '\nCOMPARISON: AutoGen vs. Other Frameworks', '=' * 100, sep='\n')

# Framework name -> {aspect: description}, filled in display order.
comparison = {}
comparison['AutoGen'] = {
    'Best for': 'Code generation, iterative refinement',
    'Strength': 'Error correction loop, execution feedback',
    'Weakness': 'Limited to conversational pattern',
    'When to use': 'Coding tasks, data analysis, test generation',
}
comparison['LangGraph'] = {
    'Best for': 'Complex workflows with state',
    'Strength': 'Flexible routing, checkpointing, HITL',
    'Weakness': 'More complex to set up',
    'When to use': 'Business processes, multi-step reasoning',
}
comparison['CrewAI'] = {
    'Best for': 'Role-based task delegation',
    'Strength': 'Clear role separation, sequential/hierarchical',
    'Weakness': 'Less flexible than LangGraph',
    'When to use': 'Content creation, research workflows',
}

# One heading per framework followed by indented 'aspect: description' lines.
for framework in comparison:
    details = comparison[framework]
    print('\n', framework, ':', sep='')
    for key in details:
        value = details[key]
        print(f'  {key}:', value)

# Closing banner listing the suggested follow-up work for this module.
# A single print call with sep='\n' emits the same lines as separate calls.
print(
    '\n' + '=' * 100,
    '\nNEXT STEPS:',
    '  1. Implement safe code execution sandbox',
    '  2. Build multi-agent system for your use case',
    '  3. Set up monitoring and cost tracking',
    '  4. Test error correction loop on real tasks',
    '  5. Move to Module 6: CrewAI (role-based orchestration)',
    '\n' + '=' * 100,
    sep='\n',
)

### Section 5: Production AutoGen Patterns

Key patterns for production:
- **Termination conditions**: Clear success/failure criteria
- **Cost control**: Token budgets and limits
- **Conversation pruning**: Manage context length
- **Agent specialization**: Each agent has clear role
- **Output validation**: Ensure quality before returning

In [None]:
import tiktoken
from typing import List, Dict, Optional

class ProductionAutoGenSystem:
    '''
    Production-ready AutoGen with cost and quality controls.

    Runs a multi-turn conversation for a task while enforcing a dollar
    budget, a token budget and a turn limit. Registered agents must expose a
    ``name`` attribute and a ``respond(message, history)`` method that
    returns the reply text.
    '''

    # Title shown by get_execution_report(). Kept on the class so the
    # attribute always exists; pass ``workflow_name`` to __init__ to override.
    workflow_name = 'ProductionAutoGenSystem'

    # Case-insensitive substrings in an agent reply that stop the conversation.
    TERMINATION_SIGNALS = (
        'terminate',
        'task complete',
        'finished',
        'done',
        'no further action needed',
    )

    # Case-insensitive substrings in an agent reply that mark the task solved.
    # In production, use more sophisticated completion detection.
    COMPLETION_SIGNALS = ('complete', 'finished', 'done', 'success')

    def __init__(self, max_tokens_per_conversation=4000, max_cost_per_task=1.0,
                 workflow_name: Optional[str] = None):
        '''
        Args:
            max_tokens_per_conversation: Token budget for one run_task() call.
            max_cost_per_task: Dollar budget for one run_task() call.
            workflow_name: Optional title for the execution report; the
                class-level default is used when omitted.
        '''
        self.max_tokens = max_tokens_per_conversation
        self.max_cost = max_cost_per_task
        self.encoding = tiktoken.encoding_for_model('gpt-4')
        self.agents = {}
        self.conversation_history = []
        self.total_cost = 0.0
        self.total_tokens = 0
        if workflow_name is not None:
            self.workflow_name = workflow_name

    def add_agent(self, agent):
        '''Add agent to system, keyed by ``agent.name``.'''
        self.agents[agent.name] = agent

    def run_task(self, task: str, max_turns=10) -> dict:
        '''
        Execute task with cost and token controls.

        Resets the history and running totals, then asks the selected agent
        for a reply each turn until a stop condition is met.

        Returns:
            The dict built by _build_result(); ``status`` is one of
            'success', 'terminated', 'budget_exceeded',
            'token_limit_exceeded' or 'max_turns_reached'.

        Raises:
            IndexError: if no agent has been registered.
        '''
        self.conversation_history = [{'role': 'user', 'content': task}]
        self.total_cost = 0.0
        self.total_tokens = 0

        for turn in range(max_turns):
            # Stop conditions are evaluated before spending on another reply.
            if self._should_terminate():
                return self._build_result('terminated', turn)

            if self.total_cost >= self.max_cost:
                return self._build_result('budget_exceeded', turn)

            if self.total_tokens >= self.max_tokens:
                return self._build_result('token_limit_exceeded', turn)

            agent = self._select_agent()

            # The agent sees the latest message plus the full (pruned) history.
            response = agent.respond(self.conversation_history[-1]['content'], self.conversation_history)

            # Only the reply text is counted towards the token and cost totals.
            tokens = self._count_tokens(response)
            cost = self._calculate_cost(tokens)
            self.total_tokens += tokens
            self.total_cost += cost

            self.conversation_history.append({
                'role': 'assistant',
                'agent': agent.name,
                'content': response,
                'tokens': tokens,
                'cost': cost
            })

            if self._task_complete(response):
                return self._build_result('success', turn + 1)

            self._prune_conversation()

        return self._build_result('max_turns_reached', max_turns)

    def _count_tokens(self, text: str) -> int:
        '''Count tokens in text using the configured encoding.'''
        return len(self.encoding.encode(text))

    def _calculate_cost(self, tokens: int, model='gpt-4') -> float:
        '''
        Calculate cost for tokens.

        Simplified: applies a flat $0.03 per 1K tokens. ``model`` is accepted
        for interface compatibility but is not used.
        '''
        # GPT-4: $0.03 input + $0.06 output per 1K tokens
        return tokens * 0.00003  # Simplified

    def _should_terminate(self) -> bool:
        '''
        Check if the latest agent reply asks to end the conversation.

        Only assistant messages are inspected: the user's task text must not
        end the run before any agent has responded (a task such as
        "tell me when you are done" contains a termination keyword).
        '''
        if not self.conversation_history:
            return False

        last_entry = self.conversation_history[-1]
        if last_entry.get('role') != 'assistant':
            return False

        last_message = last_entry['content'].lower()
        return any(signal in last_message for signal in self.TERMINATION_SIGNALS)

    def _task_complete(self, response: str) -> bool:
        '''Check if the reply contains one of the completion keywords.'''
        lowered = response.lower()
        return any(signal in lowered for signal in self.COMPLETION_SIGNALS)

    def _prune_conversation(self):
        '''Prune old messages to stay within token budget.'''
        total_tokens = sum(self._count_tokens(m['content']) for m in self.conversation_history)

        while total_tokens > self.max_tokens and len(self.conversation_history) > 3:
            # Keep the first message (the task) and the most recent ones by
            # always dropping the second-oldest entry.
            removed = self.conversation_history.pop(1)
            total_tokens -= self._count_tokens(removed['content'])

    def _select_agent(self):
        '''
        Select next agent (simplified: always the first registered one).

        Raises:
            IndexError: if no agent has been registered.
        '''
        # In production, use sophisticated selection
        if not self.agents:
            raise IndexError('no agents registered; call add_agent() first')
        return next(iter(self.agents.values()))

    def _build_result(self, status: str, turns: int) -> dict:
        '''Build the final result dict returned by run_task().'''
        return {
            'status': status,
            'turns': turns,
            'total_cost': self.total_cost,
            'total_tokens': self.total_tokens,
            'conversation': self.conversation_history,
            'final_response': self.conversation_history[-1]['content'] if self.conversation_history else None
        }

    def get_execution_report(self) -> str:
        '''
        Generate a plain-text execution report.

        Lists the history length, token and cost totals, and how many
        messages each agent contributed. "Total turns" is the number of
        entries currently in the history, which includes the initial task.
        '''
        history = self.conversation_history
        avg_tokens = self.total_tokens / len(history) if history else 0

        # Plain dict counting keeps the method free of extra imports.
        agent_turns = {}
        for msg in history:
            if 'agent' in msg:
                agent_turns[msg['agent']] = agent_turns.get(msg['agent'], 0) + 1

        lines = [
            '',
            f'Execution Report: {self.workflow_name}',
            '=' * 80,
            '',
            'Conversation Statistics:',
            f'  Total turns: {len(history)}',
            f'  Total tokens: {self.total_tokens}',
            f'  Total cost: ${self.total_cost:.4f}',
            f'  Avg tokens/turn: {avg_tokens:.0f}',
            '',
            'Agent Participation:',
        ]
        lines.extend(f'  {agent}: {turns} turns' for agent, turns in agent_turns.items())
        lines.append('=' * 80)

        return '\n'.join(lines)

# Demo: banner for the cost-controlled run below.
print('PRODUCTION AUTOGEN WITH COST CONTROLS', '=' * 90, sep='\n')

class MockAgent:
    '''Scripted stand-in agent: reports progress until the history holds five messages.'''

    def __init__(self, name):
        self.name = name

    def respond(self, message, history):
        '''Return a canned reply prefixed with the agent name; ``message`` is ignored.'''
        seen = len(history)
        if seen < 5:
            status = f'Working on it... (turn {seen})'
        else:
            status = 'Task complete!'
        return f'{self.name}: {status}'

# Run one task through a single mock agent under tight budgets.
task = 'Analyze this dataset and generate insights'
system = ProductionAutoGenSystem(max_cost_per_task=0.50, max_tokens_per_conversation=2000)
system.add_agent(MockAgent('Assistant'))
result = system.run_task(task, max_turns=6)

# Headline numbers from the result dict, then the full report.
print(
    f'\nStatus: {result["status"]}',
    f'Turns: {result["turns"]}',
    f'Total cost: ${result["total_cost"]:.4f}',
    f'Total tokens: {result["total_tokens"]}',
    sep='\n',
)

print(system.get_execution_report())

print(
    '\n' + '=' * 90,
    'KEY PRODUCTION CONTROLS:',
    '  - Token budgets prevent runaway costs',
    '  - Cost tracking per conversation',
    '  - Automatic conversation pruning',
    '  - Clear termination conditions',
    '  - Execution reports for debugging',
    sep='\n',
)

## AutoGen Framework - Core Components

### Based on 8-Hour Curriculum

This section covers all core AutoGen components with production-ready implementations:
- AssistantAgent
- UserProxyAgent
- CodeExecutionAgent
- Role definition & specialization
- LLM integration
- Tools and functions
- Multi-agent orchestration
- Memory management
- Output parsing

### Component 1: AssistantAgent

The AssistantAgent is an LLM-powered agent that generates responses, plans actions, and uses tools.

In [None]:
from typing import List, Dict, Any, Optional, Callable
import json

class AssistantAgent:
    '''
    AssistantAgent: LLM-powered agent for reasoning and response generation.

    Key capabilities:
    - Generate responses using LLM (simulated in this notebook)
    - Use tools and functions registered in ``function_map``
    - Support role specialization through ``system_message``

    ``conversation_history`` is seeded with the system message;
    generate_reply() works from its ``messages`` argument and does not
    append to it.
    '''

    def __init__(self,
                 name: str,
                 system_message: str,
                 llm_config: dict,
                 function_map: Optional[Dict[str, Callable]] = None):
        '''
        Args:
            name: Display name, also used in the mock reply text.
            system_message: Role/persona text placed first in every prompt.
            llm_config: LLM settings (e.g. 'model', 'temperature'); stored
                but not read by the simulated _call_llm().
            function_map: Function name -> zero-argument callable.
        '''
        self.name = name
        self.system_message = system_message
        self.llm_config = llm_config
        self.function_map = function_map or {}
        self.conversation_history = []

        # Add system message to history
        self.conversation_history.append({
            'role': 'system',
            'content': system_message
        })

    def generate_reply(self, messages: List[Dict], context: Optional[Dict] = None) -> str:
        '''
        Generate reply using LLM.

        In production, this would call actual LLM API.
        For demonstration, we simulate responses.

        Args:
            messages: Conversation so far; each item needs 'role' and 'content'.
            context: Accepted for interface compatibility; currently unused.

        Returns:
            The reply text, with a 'Function result: ...' line appended when
            the reply was recognised as a function call.
        '''
        prompt = self._build_prompt(messages)

        # Call LLM (simulated)
        response = self._call_llm(prompt)

        if self._requires_function_call(response):
            function_result = self._execute_function(response)
            response = self._incorporate_function_result(response, function_result)

        return response

    def _build_prompt(self, messages: List[Dict]) -> str:
        '''Build prompt from the system message and the last 5 messages.'''
        prompt_parts = [self.system_message]
        prompt_parts.extend(f"{msg['role']}: {msg['content']}" for msg in messages[-5:])
        return '\n'.join(prompt_parts)

    def _call_llm(self, prompt: str) -> str:
        '''
        Call LLM API.

        In production:
        import openai
        response = openai.ChatCompletion.create(
            model=self.llm_config['model'],
            messages=[{'role': 'user', 'content': prompt}],
            temperature=self.llm_config.get('temperature', 0.7)
        )
        return response.choices[0].message.content
        '''
        # Mock response.
        # NOTE: this sentence contains none of the indicators checked by
        # _requires_function_call(), so the mock path never triggers a call.
        if 'code' in prompt.lower():
            return 'I will write code to solve this. Let me use the code_executor function.'
        return f'{self.name} responding to the request.'

    def _requires_function_call(self, response: str) -> bool:
        '''Check if response contains one of the function-call indicator phrases.'''
        function_indicators = ['function:', 'call function', 'use function', 'execute:']
        lowered = response.lower()
        return any(indicator in lowered for indicator in function_indicators)

    def _execute_function(self, response: str) -> Any:
        '''
        Execute the first registered function whose name appears in response.

        Matching is case-insensitive on both sides, so names registered with
        uppercase letters (e.g. 'RunTests') are found too. Returns None when
        no registered name is mentioned.
        '''
        # In production, use structured function calling
        lowered = response.lower()
        for func_name, func in self.function_map.items():
            if func_name.lower() in lowered:
                return func()
        return None

    def _incorporate_function_result(self, response: str, function_result: Any) -> str:
        '''Append the function result to the response on a new line.'''
        return f"{response}\nFunction result: {function_result}"

# Example AssistantAgent: one user request answered by a mock coding assistant.
print('ASSISTANTAGENT DEMONSTRATION', '=' * 90, sep='\n')

assistant = AssistantAgent(
    name='CodingAssistant',
    system_message='You are a helpful coding assistant. You write clean, well-documented code.',
    llm_config={'model': 'gpt-4', 'temperature': 0.3},
    function_map={
        'execute_code': lambda: 'Code executed successfully',
        'search_docs': lambda: 'Found relevant documentation',
    }
)

test_messages = [
    {'role': 'user', 'content': 'Write a function to calculate fibonacci numbers'}
]
response = assistant.generate_reply(test_messages)

# Show the exchange as a two-line transcript.
print('User:', test_messages[0]['content'])
print(f'{assistant.name}:', response)

print('\n' + '=' * 90)

### Component 2: UserProxyAgent

UserProxyAgent represents the user and can execute code on their behalf.

In [None]:
class UserProxyAgent:
    '''
    UserProxyAgent: Represents user and executes code.

    Key capabilities:
    - Proxy for human user
    - Execute code found in fenced blocks through the configured executor
    - Collect user input (simulated)
    - Provide execution feedback
    - Terminate conversations
    '''

    def __init__(self,
                 name: str,
                 code_execution_config: dict,
                 human_input_mode: str = 'NEVER',
                 max_consecutive_auto_reply: int = 10):
        '''
        Args:
            name: Display name of the proxy.
            code_execution_config: Executor settings; 'timeout' and
                'max_memory_mb' are read by _init_executor().
            human_input_mode: 'ALWAYS', 'NEVER' or 'TERMINATE'. Only 'ALWAYS'
                is acted on by generate_reply().
            max_consecutive_auto_reply: Automatic replies allowed before
                human input is requested.
        '''
        self.name = name
        self.code_execution_config = code_execution_config
        self.human_input_mode = human_input_mode  # 'ALWAYS', 'NEVER', 'TERMINATE'
        self.max_consecutive_auto_reply = max_consecutive_auto_reply
        self.auto_reply_count = 0
        self.executor = self._init_executor()

    def _init_executor(self):
        '''Initialize code executor from ``code_execution_config``.'''
        # Use SafeCodeExecutor from earlier
        return SafeCodeExecutor(
            timeout_seconds=self.code_execution_config.get('timeout', 60),
            max_memory_mb=self.code_execution_config.get('max_memory_mb', 512)
        )

    def generate_reply(self, messages: List[Dict], sender) -> Optional[str]:
        '''
        Generate reply (execute code or get user input).

        Checks, in order: auto-reply limit, fenced code to execute,
        'ALWAYS' human-input mode, termination keywords.

        Args:
            messages: Conversation so far; only the last item's 'content' is read.
            sender: Accepted for interface compatibility; currently unused.

        Returns:
            Execution feedback, simulated human input, 'TERMINATE', or None
            when no reply is needed (including an empty ``messages`` list).
        '''
        # Check if max auto-replies reached
        if self.auto_reply_count >= self.max_consecutive_auto_reply:
            return self._get_human_input('Max auto-replies reached. Your input:')

        # Nothing to react to; avoids an IndexError on messages[-1].
        if not messages:
            return None

        last_message = messages[-1]['content']

        # Check if code execution requested
        if self._contains_code(last_message):
            code = self._extract_code(last_message)
            execution_result = self.executor.execute(code)

            self.auto_reply_count += 1

            if execution_result['status'] == 'success':
                return f"Execution successful:\n{execution_result['output']}"
            return f"Execution failed:\n{execution_result['error']}"

        # Check if human input needed
        if self.human_input_mode == 'ALWAYS':
            return self._get_human_input('Your input:')

        # Check for termination
        if self._should_terminate(last_message):
            return 'TERMINATE'

        self.auto_reply_count += 1
        return None  # No reply needed

    def _contains_code(self, message: str) -> bool:
        '''Check if message contains a fenced code block to execute.'''
        # Any '```python' fence also contains '```', so one test suffices.
        return '```' in message

    def _extract_code(self, message: str) -> str:
        '''
        Extract the code of the first relevant fenced block in message.

        A python-tagged fence is preferred (matched case-insensitively);
        otherwise the first fence is used. When the opening fence line holds
        only a language tag ('python', 'py', 'Python', 'python3', ...), that
        tag is dropped so it is not executed as code. An unclosed fence runs
        to the end of the message; a message without fences is returned
        stripped.
        '''
        import re  # function-scope import keeps the method self-contained

        fence = '```'
        tagged = re.search(r'```python', message, re.IGNORECASE)
        start = tagged.start() if tagged else message.find(fence)
        if start == -1:
            return message.strip()

        # Text between the opening fence and the next fence (or end of message).
        body = message[start + len(fence):].split(fence)[0]

        first_line, newline, remainder = body.partition('\n')
        if newline and re.fullmatch(r'[A-Za-z][\w+#-]*', first_line.strip()):
            # The whole opening line is a language tag.
            body = remainder
        elif first_line[:6].lower() == 'python':
            # Code starts on the fence line itself: '```python print(1)```'.
            body = body[6:]

        return body.strip()

    def _should_terminate(self, message: str) -> bool:
        '''Check if message contains a termination keyword (case-insensitive).'''
        termination_keywords = ['TERMINATE', 'task complete', 'finished', 'done with task']
        lowered = message.lower()
        return any(kw.lower() in lowered for kw in termination_keywords)

    def _get_human_input(self, prompt: str) -> str:
        '''Get input from human (simulated: returns a placeholder string).'''
        # In production: input() or UI integration
        return f'[HUMAN INPUT: {prompt}]'

    def reset_auto_reply_count(self):
        '''Reset auto-reply counter'''
        self.auto_reply_count = 0

# Example UserProxyAgent: send one fenced code block and show the feedback.
print('USERPROXYAGENT DEMONSTRATION', '=' * 90, sep='\n')

user_proxy = UserProxyAgent(
    name='UserProxy',
    code_execution_config={
        'timeout': 60,
        'work_dir': './workspace',
        'use_docker': False,  # Set True in production
    },
    human_input_mode='NEVER',
    max_consecutive_auto_reply=5
)

# Assistant message carrying a python-tagged fenced block to execute.
code_message = '\n'.join([
    '```python',
    'def fibonacci(n):',
    '    if n <= 1:',
    '        return n',
    '    return fibonacci(n-1) + fibonacci(n-2)',
    '',
    'print(fibonacci(10))',
    '```',
])

test_messages = [{'role': 'assistant', 'content': code_message}]
response = user_proxy.generate_reply(test_messages, sender='assistant')

# Truncated previews of what was sent and what came back.
print('Assistant sent code:', code_message[:80] + '...', sep='\n')
print(f'\n{user_proxy.name} response:', response[:150] + '...', sep='\n')

print(
    '\n' + '=' * 90,
    'KEY USERPROXYAGENT FEATURES:',
    '  - Executes code automatically',
    '  - Can require human input at decision points',
    '  - Terminates conversations based on keywords',
    '  - Limits auto-replies to prevent infinite loops',
    '  - Provides execution feedback to other agents',
    sep='\n',
)

### Component 3: CodeExecutionAgent

Specialized agent for code execution with advanced sandboxing and analysis.

In [None]:
class CodeExecutionAgent:
    '''
    CodeExecutionAgent: Specialized for code execution and analysis.

    Features:
    - Screen code for dangerous patterns and block it before it runs
    - Flag simple performance smells (unbounded loops, nested loops)
    - Delegate execution to a SafeCodeExecutor and report the outcome
    - Run a snippet against a list of test cases
    - Keep a history of executed runs and summarize it
    '''

    def __init__(self, name: str, work_dir: str = './workspace', use_docker: bool = True):
        '''
        Args:
            name: Label shown in the execution report.
            work_dir: Working directory setting; stored, not read by this class.
            use_docker: Docker flag; stored, not read by this class.
        '''
        self.name = name
        self.work_dir = work_dir
        self.use_docker = use_docker
        self.executor = SafeCodeExecutor(timeout_seconds=60, max_memory_mb=512)
        self.execution_history = []

    def execute_code(self, code: str, language: str = 'python') -> dict:
        '''
        Execute code with comprehensive analysis.

        Args:
            code: Source text to analyze and run.
            language: Accepted for interface compatibility; currently unused.

        Returns:
            When the pre-check finds security issues, a dict with
            ``status='blocked'``, ``reason``, ``issues`` and ``code``; the
            executor is not called and nothing is added to the history.
            Otherwise a dict with ``status``, ``output``, ``error``, ``code``,
            ``analysis`` and ``execution_time_ms``, which is also appended to
            ``execution_history``.
        '''

        # Pre-execution analysis
        analysis = self._analyze_code(code)

        if analysis['security_issues']:
            return {
                'status': 'blocked',
                'reason': 'security_issues',
                'issues': analysis['security_issues'],
                'code': code
            }

        # Execute
        exec_result = self.executor.execute(code)

        # Post-execution analysis
        result = {
            'status': exec_result['status'],
            'output': exec_result.get('output'),
            'error': exec_result.get('error'),
            'code': code,
            'analysis': analysis,
            'execution_time_ms': exec_result.get('execution_time_ms', 0)
        }

        # Store in history
        self.execution_history.append(result)

        return result

    def _analyze_code(self, code: str) -> dict:
        '''
        Analyze code for issues before execution.

        Returns:
            Dict with the lists ``security_issues``, ``performance_warnings``
            and ``style_suggestions`` (the last is never populated here).
        '''
        issues = {
            'security_issues': [],
            'performance_warnings': [],
            'style_suggestions': [],
        }

        # Security checks: plain substring search, so a match inside a
        # comment or string literal is reported as well.
        dangerous_patterns = ['os.system', 'subprocess', 'eval(', 'exec(', '__import__']
        issues['security_issues'].extend(
            f'Dangerous operation: {pattern}'
            for pattern in dangerous_patterns
            if pattern in code
        )

        # Performance checks
        if 'while True:' in code and 'break' not in code:
            issues['performance_warnings'].append('Potential infinite loop detected')

        # Only a loop inside another loop is O(n²) or worse; two loops that
        # merely follow each other are not reported.
        if self._has_nested_loops(code):
            issues['performance_warnings'].append('Nested loops detected (possible O(n²) or worse)')

        return issues

    @staticmethod
    def _has_nested_loops(code: str) -> bool:
        '''Return True when some for/while statement contains another one.'''
        import ast  # local import keeps this block self-contained

        try:
            tree = ast.parse(code)
        except (SyntaxError, ValueError):
            # Source that does not parse as Python: fall back to counting
            # loop keywords, which is all that can be said about it.
            return code.count('for ') + code.count('while ') >= 2

        loop_types = (ast.For, ast.AsyncFor, ast.While)
        for node in ast.walk(tree):
            if not isinstance(node, loop_types):
                continue
            # ast.walk yields the node itself first, so skip it explicitly
            for inner in ast.walk(node):
                if inner is not node and isinstance(inner, loop_types):
                    return True
        return False

    def test_code(self, code: str, test_cases: List[dict]) -> dict:
        '''
        Test code with multiple test cases.

        Each case is a dict with ``name``, ``function_call`` (an expression
        appended to the code inside ``print(...)``) and ``expected`` (compared
        as a string against the stripped output).

        Returns:
            Dict with ``total_tests``, ``passed``, ``failed``, ``pass_rate``
            (0.0 when there are no test cases) and the per-test ``results``.
        '''
        results = []

        for test_case in test_cases:
            # Append a print of the call so its value shows up in the output
            snippet = code + f"\nprint({test_case['function_call']})"

            result = self.execute_code(snippet)

            if result['status'] == 'success':
                # A successful run may still report no output at all
                output = (result.get('output') or '').strip()
                expected = str(test_case['expected'])

                results.append({
                    'test': test_case['name'],
                    'passed': output == expected,
                    'expected': expected,
                    'actual': output
                })
            else:
                # Blocked runs carry 'reason' rather than 'error'
                results.append({
                    'test': test_case['name'],
                    'passed': False,
                    'error': result.get('error') or result.get('reason')
                })

        total = len(test_cases)
        passed_count = sum(1 for r in results if r['passed'])

        return {
            'total_tests': total,
            'passed': passed_count,
            'failed': total - passed_count,
            'pass_rate': passed_count / total if total else 0.0,
            'results': results
        }

    def get_execution_report(self) -> str:
        '''
        Generate execution report.

        Returns:
            Multi-line summary of ``execution_history``; the success rate is
            reported as 0.0% when nothing has been executed yet.
        '''
        total = len(self.execution_history)
        successful = sum(1 for e in self.execution_history if e['status'] == 'success')
        # Decide the rate outside the template so an empty history cannot
        # trigger a division by zero.
        success_rate = successful / total * 100 if total > 0 else 0.0
        separator = '=' * 70

        report = f'''\nCode Execution Report - {self.name}
{separator}
Total executions: {total}
Successful: {successful}
Failed: {total - successful}
Success rate: {success_rate:.1f}%
{separator}
'''
        return report

# Demo CodeExecutionAgent
print('CODEEXECUTIONAGENT DEMONSTRATION')
print('=' * 90)

code_agent = CodeExecutionAgent(name='CodeExecutor', work_dir='./workspace', use_docker=False)

# Example code: recursive Fibonacci, spelled out line by line
fibonacci_code = (
    'def fibonacci(n):\n'
    '    if n <= 1:\n'
    '        return n\n'
    '    return fibonacci(n-1) + fibonacci(n-2)\n'
)

# Execute code once and show the status plus how many findings the pre-check raised
result = code_agent.execute_code(fibonacci_code)

print('Execution status: {}'.format(result['status']))
print('Security issues: {}'.format(len(result['analysis']['security_issues'])))
print('Performance warnings: {}'.format(len(result['analysis']['performance_warnings'])))

# Test code: each case names a call expression and the text it should print
test_cases = [
    dict(name='test_base_case', function_call='fibonacci(0)', expected='0'),
    dict(name='test_small', function_call='fibonacci(5)', expected='5'),
    dict(name='test_medium', function_call='fibonacci(10)', expected='55'),
]

test_result = code_agent.test_code(fibonacci_code, test_cases)

print('\nTest Results: {passed}/{total_tests} passed ({pass_rate:.0%})'.format(**test_result))

print(code_agent.get_execution_report())

# Closing summary, emitted as one newline-joined print
print('\n'.join([
    '=' * 90,
    'CODEEXECUTIONAGENT BEST PRACTICES:',
    '  - Always analyze code before execution',
    '  - Use Docker for true isolation in production',
    '  - Set resource limits (CPU, memory, time)',
    '  - Test code with multiple test cases',
    '  - Maintain execution history for debugging',
]))

### Component 4: Role Definition & Specialization

Defining specialized roles for agents to create expert teams.

In [None]:
class RoleDefinition:
    '''
    Describe one agent role: its name, expertise, duties and limits,
    plus a template for the closing instruction of its system message.
    '''

    def __init__(self,
                 role_name: str,
                 expertise_areas: List[str],
                 responsibilities: List[str],
                 constraints: List[str],
                 system_message_template: str):
        # All arguments are stored unchanged under attributes of the same name.
        self.role_name = role_name
        self.expertise_areas = expertise_areas
        self.responsibilities = responsibilities
        self.constraints = constraints
        self.system_message_template = system_message_template

    def build_system_message(self, **kwargs) -> str:
        '''
        Render the full system message for an agent in this role.

        The keyword arguments fill the placeholders of
        ``system_message_template`` via ``str.format``. The rendered template
        is placed after the role header and the three bulleted sections.
        '''
        def bullets(items):
            # One '- item' line per entry; an empty list gives an empty string
            return '\n'.join(f'- {item}' for item in items)

        closing = self.system_message_template.format(**kwargs)

        lines = [
            f'Role: {self.role_name}',
            '',
            'Expertise:',
            bullets(self.expertise_areas),
            '',
            'Responsibilities:',
            bullets(self.responsibilities),
            '',
            'Constraints:',
            bullets(self.constraints),
            '',
            closing,
            '',  # yields the trailing newline after the closing text
        ]
        return '\n'.join(lines)

# Define specialized roles
print('ROLE DEFINITION & SPECIALIZATION')
print('=' * 90)

# Define role templates: one RoleDefinition per team member, keyed by the
# name the corresponding agent will carry.
roles = {
    'Senior_Developer': RoleDefinition(
        role_name='Senior Software Developer',
        expertise_areas=['Python', 'System Design', 'Best Practices', 'Testing'],
        responsibilities=[
            'Write clean, maintainable code',
            'Follow design patterns',
            'Include error handling',
            'Add documentation and tests'
        ],
        constraints=[
            'Never use deprecated libraries',
            'Always handle exceptions',
            'Code must be PEP8 compliant',
            'Include type hints'
        ],
        system_message_template='You are a senior developer. {task}'
    ),
    
    'Code_Reviewer': RoleDefinition(
        role_name='Code Reviewer',
        expertise_areas=['Code Quality', 'Security', 'Performance', 'Testing'],
        responsibilities=[
            'Review code for bugs',
            'Check security vulnerabilities',
            'Assess performance',
            'Verify test coverage'
        ],
        constraints=[
            'Be objective and constructive',
            'Cite specific issues',
            'Provide improvement suggestions',
            'Check against style guide'
        ],
        system_message_template='You are a code reviewer. {task}'
    ),
    
    'QA_Engineer': RoleDefinition(
        role_name='QA Engineer',
        expertise_areas=['Testing', 'Edge Cases', 'Test Design', 'Quality Assurance'],
        responsibilities=[
            'Design comprehensive test cases',
            'Identify edge cases',
            'Verify requirements',
            'Report defects'
        ],
        constraints=[
            'Cover happy path and edge cases',
            'Include boundary testing',
            'Document test scenarios',
            'Verify expected behavior'
        ],
        system_message_template='You are a QA engineer. {task}'
    ),
}

# Create specialized agents: one AssistantAgent per role, named after the role key
specialized_agents = {}

for role_key, role_def in roles.items():
    # Every template above has a single {task} placeholder
    system_msg = role_def.build_system_message(task='Focus on your area of expertise.')
    
    agent = AssistantAgent(
        name=role_key,
        system_message=system_msg,
        llm_config={'model': 'gpt-4', 'temperature': 0.3}
    )
    
    specialized_agents[role_key] = agent
    
    print(f'\nAgent: {role_def.role_name}')
    # Double-quoted outer string: reusing the enclosing quote character inside
    # the replacement field is a SyntaxError before Python 3.12.
    print(f"Expertise: {', '.join(role_def.expertise_areas)}")
    print(f'Responsibilities: {len(role_def.responsibilities)}')

print('\n' + '=' * 90)
print('ROLE SPECIALIZATION BENEFITS:')
print('  - Clear separation of concerns')
print('  - Domain expertise per agent')
print('  - Better output quality')
print('  - Easier debugging (know which agent did what)')
print('  - Scalable team structure')

### Component 5: Multi-Agent Orchestration Patterns

Different orchestration patterns for different use cases.

In [None]:
from enum import Enum

class OrchestrationMode(Enum):
    '''Speaker-selection strategies understood by AutoGenOrchestrator.'''
    SEQUENTIAL = 'sequential'  # Each agent speaks once, in insertion order
    ROUND_ROBIN = 'round_robin'  # Agents take turns, cycling back to the first
    GROUP_CHAT = 'group_chat'  # Speaker chosen by keyword match on the last message
    HIERARCHICAL = 'hierarchical'  # Manager delegates (stubbed: always the first agent)

class AutoGenOrchestrator:
    '''
    Orchestrate multiple AutoGen agents with different patterns.

    Agents are used through two members only: a ``name`` attribute and a
    ``generate_reply(messages)`` method whose result is a string or None.
    '''

    def __init__(self, agents: List, mode: OrchestrationMode = OrchestrationMode.SEQUENTIAL):
        '''
        Args:
            agents: Agents taking part, in speaking order.
            mode: Strategy used to pick the speaker on each turn.
        '''
        # Keyed by name: a later agent with the same name replaces the earlier one
        self.agents = {agent.name: agent for agent in agents}
        self.mode = mode
        self.conversation_log = []
        self.current_speaker_index = 0

    def run(self, initial_message: str, max_turns: int = 10) -> dict:
        '''
        Run orchestrated conversation.

        Each call starts a fresh conversation: the log is replaced by the
        initial user message and the speaker rotation restarts at the first
        agent, so the same orchestrator can be run repeatedly.

        Args:
            initial_message: Task text placed in the log as the user message.
            max_turns: Upper bound on the number of speaker turns.

        Returns:
            Dict with ``conversation`` (the message log), ``total_turns``
            (turns on which an agent was actually asked to reply; 0 when no
            turn ran) and ``mode`` (the mode's string value).
        '''

        self.conversation_log = [{'role': 'user', 'content': initial_message}]
        self.current_speaker_index = 0
        turns_taken = 0

        for turn in range(max_turns):
            # Select next speaker
            speaker = self._select_next_speaker()

            if speaker is None:
                # Nobody left to speak; this iteration is not a turn
                break

            turns_taken = turn + 1

            # Generate response
            response = speaker.generate_reply(self.conversation_log)

            if response is None:
                continue

            if response == 'TERMINATE':
                print(f'\nConversation terminated by {speaker.name}')
                break

            # Add to log
            self.conversation_log.append({
                'role': 'assistant',
                'name': speaker.name,
                'content': response
            })

            print(f'\nTurn {turn + 1} - {speaker.name}:')
            print(f'  {response[:100]}...' if len(response) > 100 else f'  {response}')

            # Check termination
            if self._should_terminate():
                break

        return {
            'conversation': self.conversation_log,
            'total_turns': turns_taken,
            'mode': self.mode.value
        }

    def _select_next_speaker(self):
        '''
        Select next agent based on orchestration mode.

        Returns:
            The chosen agent, or None when there are no agents, when every
            agent has already spoken in SEQUENTIAL mode, or when the mode is
            not one of the known values.
        '''
        agent_list = list(self.agents.values())
        if not agent_list:
            return None

        if self.mode == OrchestrationMode.SEQUENTIAL:
            # Each agent speaks once in order
            if self.current_speaker_index < len(agent_list):
                speaker = agent_list[self.current_speaker_index]
                self.current_speaker_index += 1
                return speaker
            return None

        if self.mode == OrchestrationMode.ROUND_ROBIN:
            # Agents take turns, wrapping around after the last one
            speaker = agent_list[self.current_speaker_index % len(agent_list)]
            self.current_speaker_index += 1
            return speaker

        if self.mode == OrchestrationMode.GROUP_CHAT:
            # Most relevant agent speaks (based on last message)
            return self._select_by_expertise()

        if self.mode == OrchestrationMode.HIERARCHICAL:
            # Manager delegates
            return self._manager_selects()

        return None

    def _select_by_expertise(self):
        '''
        Select agent with relevant expertise (group chat).

        The last logged message is scanned for the keywords below in
        declaration order; the first keyword whose agent is registered wins.
        Falls back to the first agent, or None when no agents exist.
        '''
        first_agent = next(iter(self.agents.values()), None)

        if not self.conversation_log:
            return first_agent

        last_message = self.conversation_log[-1]['content'].lower()

        # Match keywords to agent expertise (simplified)
        keyword_to_agent = {
            'code': 'Senior_Developer',
            'test': 'QA_Engineer',
            'review': 'Code_Reviewer',
            'security': 'Code_Reviewer',
            'bug': 'Senior_Developer',
        }

        for keyword, agent_name in keyword_to_agent.items():
            if keyword in last_message and agent_name in self.agents:
                return self.agents[agent_name]

        # Default to first agent
        return first_agent

    def _manager_selects(self):
        '''Manager decides which agent to use (stub: first agent, or None if empty).'''
        # In production, manager agent uses LLM to decide
        return next(iter(self.agents.values()), None)

    def _should_terminate(self) -> bool:
        '''Check if the last logged message asks to end the conversation.'''
        if not self.conversation_log:
            return False

        # Case-insensitive substring match on the most recent message
        last_content = self.conversation_log[-1]['content'].lower()
        return 'terminate' in last_content or 'task complete' in last_content

# Demo different orchestration modes
print('MULTI-AGENT ORCHESTRATION PATTERNS')
print('=' * 90)

# Create agent team; every member receives its own settings dict
developer = AssistantAgent('Developer', 'You write code.', dict(model='gpt-4', temperature=0.3))
reviewer = AssistantAgent('Reviewer', 'You review code for quality.', dict(model='gpt-4', temperature=0.3))
tester = AssistantAgent('Tester', 'You create test cases.', dict(model='gpt-4', temperature=0.3))

# Test sequential mode: three agents, three turns, so each speaks at most once
print('\n1. SEQUENTIAL MODE:')
print('-' * 80)

task = 'Create a function to validate email addresses'
orchestrator_seq = AutoGenOrchestrator([developer, reviewer, tester], mode=OrchestrationMode.SEQUENTIAL)
result = orchestrator_seq.run(task, max_turns=3)
print('Completed in {} turns'.format(result['total_turns']))

# Closing summary, emitted as one newline-joined print
print('\n'.join([
    '\n' + '=' * 90,
    'ORCHESTRATION MODE GUIDE:',
    '  - SEQUENTIAL: Fixed order (best for pipelines)',
    '  - ROUND_ROBIN: Fair participation (best for brainstorming)',
    '  - GROUP_CHAT: Dynamic selection (best for complex tasks)',
    '  - HIERARCHICAL: Manager delegates (best for large teams)',
]))