# Volume 1, Chapter 11: Testing and Validation

**Test Your AI Systems Systematically**

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/eduardd76/AI_for_networking_and_security_engineers/blob/main/Volume-1-Foundations/Colab-Notebooks/Vol1_Ch11_Testing.ipynb)

---

**What you'll learn:**
- ✅ Unit test prompts and outputs
- 📊 Measure accuracy and quality
- 🔄 Detect regressions
- 🧪 Generate test data

In [None]:
!pip install -q anthropic

import os
from getpass import getpass

# Load the API key: Colab secret store first, then an existing environment
# variable, and only as a last resort an interactive prompt.
try:
    from google.colab import userdata
    os.environ['ANTHROPIC_API_KEY'] = userdata.get('ANTHROPIC_API_KEY')
except Exception:
    # Best-effort fallback: reached when google.colab cannot be imported or
    # the secret lookup/assignment fails. Catching Exception instead of using
    # a bare `except` lets KeyboardInterrupt and SystemExit propagate.
    if 'ANTHROPIC_API_KEY' not in os.environ:
        os.environ['ANTHROPIC_API_KEY'] = getpass('Anthropic API key: ')

from anthropic import Anthropic
# Constructed without arguments; presumably the SDK reads ANTHROPIC_API_KEY
# from the environment populated above - confirm against the SDK docs.
client = Anthropic()
print("‚úì Ready!")

---
## ✅ Example 1: Unit Test for Prompts

In [None]:
class PromptTester:
    """Run prompts against the model and check replies for expected substrings.

    Every call to ``test`` appends one result dict (``name``, ``passed``,
    ``output``) to ``self.results``; ``report`` prints a summary of them.
    Relies on a module-level ``client`` object exposing ``messages.create``.
    """

    def __init__(self):
        # One dict per executed test, in execution order.
        self.results = []

    def test(self, name, prompt, expected_contains,
             model="claude-3-5-haiku-20241022", match="all"):
        """Run a single test and record its outcome.

        Args:
            name: Label shown for this test in the report.
            prompt: User message sent to the model.
            expected_contains: Substrings to look for in the reply; the
                comparison is case-insensitive.
            model: Model identifier passed to the API.
            match: ``"all"`` (default) requires every substring to appear;
                ``"any"`` requires at least one. With an empty
                ``expected_contains``, ``"all"`` passes and ``"any"`` fails.

        Returns:
            True if the reply satisfied the expectation, otherwise False.

        Raises:
            ValueError: If ``match`` is neither ``"all"`` nor ``"any"``.
        """
        # Validate before calling the API so a typo does not cost a request.
        if match not in ("all", "any"):
            raise ValueError(f"match must be 'all' or 'any', got {match!r}")

        response = client.messages.create(
            model=model,
            max_tokens=200,
            temperature=0,
            messages=[{"role": "user", "content": prompt}]
        )

        reply = response.content[0].text
        haystack = reply.lower()
        combine = all if match == "all" else any
        passed = combine(exp.lower() in haystack for exp in expected_contains)

        self.results.append({
            "name": name,
            "passed": passed,
            # Only the first 100 characters are kept for the failure report.
            "output": reply[:100]
        })

        return passed

    def report(self):
        """Print the pass/fail summary followed by one line per test.

        Failed tests additionally show the stored (truncated) model output.
        When no tests have been run the summary shows 0/0 (0%) instead of
        raising ZeroDivisionError.
        """
        passed = sum(1 for r in self.results if r["passed"])
        total = len(self.results)
        # Guard the percentage against an empty result list.
        pct = passed / total * 100 if total else 0

        print(f"\n{'='*60}")
        print(f"TEST RESULTS: {passed}/{total} passed ({pct:.0f}%)")
        print(f"{'='*60}\n")

        for r in self.results:
            status = "‚úÖ" if r["passed"] else "‚ùå"
            print(f"{status} {r['name']}")
            if not r["passed"]:
                print(f"   Output: {r['output']}")

# Create the tester and print the section banner.
tester = PromptTester()

print("‚úÖ PROMPT UNIT TESTS")
print("=" * 60)

# Each entry is (test name, prompt, expected substrings); run in this order.
# NOTE(review): PromptTester.test() passes only when EVERY listed substring
# appears in the reply, so the first check needs both "error" and "critical"
# to be present - confirm that this is the intended expectation.
prompt_checks = [
    # Log classification
    (
        "Log classification - OSPF down",
        "Classify severity (INFO/WARNING/ERROR/CRITICAL): %OSPF-5-ADJCHG: Neighbor DOWN",
        ["error", "critical"],
    ),
    # IP extraction
    (
        "IP extraction",
        "Extract IP from: ip address 192.168.1.1 255.255.255.0. Return only IP.",
        ["192.168.1.1"],
    ),
    # Subnet calculation
    (
        "Subnet hosts",
        "Usable hosts in /24? Return only the number.",
        ["254"],
    ),
    # Protocol identification
    (
        "Protocol ID",
        "What protocol uses port 179? One word answer.",
        ["bgp"],
    ),
]

for prompt_check in prompt_checks:
    tester.test(*prompt_check)

tester.report()

---
## 📊 Example 2: Accuracy Measurement

In [None]:
def measure_accuracy(test_cases):
    """Measure accuracy on labeled test data.

    Each case's prompt is sent to the model through the module-level
    ``client``; a case counts as correct when the expected label occurs
    (case-insensitively, as a substring) in the stripped reply.

    Args:
        test_cases: Iterable of dicts with a ``"prompt"`` string and an
            ``"expected"`` label string.

    Returns:
        Dict with ``accuracy`` (fraction between 0 and 1; 0.0 when there are
        no test cases), ``correct`` (count), ``total`` (count) and
        ``details`` (one dict per case with ``input``, ``expected``,
        ``predicted`` and ``correct``).
    """
    details = []

    for case in test_cases:
        response = client.messages.create(
            model="claude-3-5-haiku-20241022",
            max_tokens=50,
            temperature=0,
            messages=[{"role": "user", "content": case["prompt"]}]
        )

        predicted = response.content[0].text.strip().upper()
        expected = case["expected"].upper()

        details.append({
            # Truncated copies keep the printed report compact.
            "input": case["prompt"][:50],
            "expected": expected,
            "predicted": predicted[:20],
            "correct": expected in predicted
        })

    correct = sum(1 for d in details if d["correct"])
    total = len(details)

    return {
        # An empty test set scores 0.0 rather than raising ZeroDivisionError.
        "accuracy": correct / total if total else 0.0,
        "correct": correct,
        "total": total,
        "details": details
    }

# Labeled syslog lines for severity classification: (prompt, expected label).
test_cases = [
    {"prompt": case_prompt, "expected": case_label}
    for case_prompt, case_label in [
        ("Classify (INFO/ERROR): %SYS-5-CONFIG_I: Configured from console", "INFO"),
        ("Classify (INFO/ERROR): %LINK-3-UPDOWN: Interface down", "ERROR"),
        ("Classify (INFO/ERROR): %OSPF-5-ADJCHG: Neighbor FULL", "INFO"),
        ("Classify (INFO/ERROR): %SYS-2-MALLOCFAIL: Memory allocation failed", "ERROR"),
    ]
]

# Score the model on the labeled cases.
results = measure_accuracy(test_cases)

# Summary line first, then one line per case.
print("üìä ACCURACY MEASUREMENT")
print("=" * 60)
print(f"Accuracy: {results['accuracy']*100:.0f}% ({results['correct']}/{results['total']})")
print("\nDetails:")
for detail in results['details']:
    if detail['correct']:
        status = "‚úÖ"
    else:
        status = "‚ùå"
    print(f"  {status} Expected: {detail['expected']}, Got: {detail['predicted']}")

---
## 🔄 Example 3: Regression Detection

In [None]:
import json
import hashlib

class RegressionTracker:
    """Track output changes across runs.

    Baselines are held in memory in ``self.baseline``, keyed by test name.
    Each entry stores the full baseline text (``output``) and a short MD5
    fingerprint of it (``hash``).
    """

    def __init__(self):
        # test name -> {"output": baseline text, "hash": 8-hex-digit fingerprint}
        self.baseline = {}

    @staticmethod
    def _fingerprint(text):
        """Return the first 8 hex digits of the MD5 digest of ``text``."""
        return hashlib.md5(text.encode()).hexdigest()[:8]

    def set_baseline(self, test_name, output):
        """Save ``output`` as the baseline for ``test_name``.

        Overwrites any baseline previously stored under the same name.
        """
        self.baseline[test_name] = {
            "output": output,
            "hash": self._fingerprint(output)
        }

    def check(self, test_name, current_output):
        """Compare ``current_output`` with the stored baseline.

        Returns:
            A dict whose ``status`` is ``"NEW"`` (no baseline stored),
            ``"SAME"`` (text identical to the baseline) or ``"CHANGED"``.
            ``"CHANGED"`` results also carry the first 100 characters of the
            baseline and current outputs.
        """
        if test_name not in self.baseline:
            return {"status": "NEW", "message": "No baseline"}

        baseline = self.baseline[test_name]

        # Compare the full text rather than the 8-digit fingerprint: the
        # fingerprint has only 32 bits, so two different outputs can share
        # it and a real change would be reported as "SAME".
        if current_output == baseline["output"]:
            return {"status": "SAME", "message": "No change"}

        return {
            "status": "CHANGED",
            "message": "Output differs from baseline",
            "baseline": baseline["output"][:100],
            "current": current_output[:100]
        }

# Demo: record a baseline answer, ask the same question again, and compare.
tracker = RegressionTracker()

prompt = "What does OSPF stand for? One line answer."

# Both requests use exactly the same parameters.
request_kwargs = {
    "model": "claude-3-5-haiku-20241022",
    "max_tokens": 50,
    "temperature": 0,
    "messages": [{"role": "user", "content": prompt}],
}

# First reply becomes the baseline.
response1 = client.messages.create(**request_kwargs)
tracker.set_baseline("ospf_definition", response1.content[0].text)

# Second reply is checked against it; with temperature=0 the two are
# expected to match, though the code does not depend on that.
response2 = client.messages.create(**request_kwargs)
result = tracker.check("ospf_definition", response2.content[0].text)

print("üîÑ REGRESSION CHECK")
print("=" * 60)
print(f"Status: {result['status']}")
print(f"Message: {result['message']}")

---
## 🧪 Example 4: Generate Test Data

In [None]:
def generate_test_data(category, count=5):
    """Use AI to generate test cases.

    Asks the model (through the module-level ``client``) for a JSON array of
    test cases and parses the first-``[``-to-last-``]`` span of the reply.

    Args:
        category: Description of the kind of test cases to generate; it is
            interpolated into the prompt.
        count: Number of test cases requested in the prompt.

    Returns:
        The parsed JSON array, or an empty list when the reply contains no
        bracketed span or that span is not valid JSON. The prompt asks for
        dicts with ``input``, ``expected_output`` and ``description`` keys,
        but the element shape is not validated here.
    """
    # Imported locally so this function does not depend on another notebook
    # cell having imported `json` first.
    import json
    import re

    response = client.messages.create(
        model="claude-3-5-sonnet-20241022",
        max_tokens=1000,
        temperature=0.7,  # Some creativity for variety
        messages=[{
            "role": "user",
            "content": f"""Generate {count} realistic {category} test cases.

Return JSON array:
[
  {{"input": "...", "expected_output": "...", "description": "..."}}
]

Make them diverse and realistic. ONLY JSON."""
        }]
    )

    text = response.content[0].text
    # Greedy match with DOTALL: spans from the first '[' to the last ']',
    # which tolerates prose or code fences around the array.
    json_match = re.search(r'\[.*\]', text, re.DOTALL)
    if json_match is None:
        return []

    try:
        return json.loads(json_match.group())
    except json.JSONDecodeError:
        # Malformed model output is handled like "no JSON found" instead of
        # crashing the caller.
        return []

# Ask the model for five syslog test cases.
test_data = generate_test_data("Cisco syslog messages with severity levels", 5)

print("üß™ GENERATED TEST DATA")
print("=" * 60)
print(f"Generated {len(test_data)} test cases:\n")

# Print each case numbered from 1; missing keys fall back to defaults and
# the input is cut to 60 characters.
for number, case in enumerate(test_data, start=1):
    description = case.get('description', 'Test case')
    input_preview = case.get('input', '')[:60]
    expected_output = case.get('expected_output', '')
    print(f"{number}. {description}")
    print(f"   Input: {input_preview}...")
    print(f"   Expected: {expected_output}\n")

---
## 📈 Example 5: Quality Metrics Dashboard

In [None]:
def quality_dashboard(test_suite):
    """Run every test in the suite and tally pass/fail counts.

    Each test's ``prompt`` is sent to the model via the module-level
    ``client``; the test passes when its ``expected`` string occurs
    (case-insensitively) in the reply. Tests without a ``category`` key are
    counted under ``"other"``.

    Returns:
        Dict with ``total``, ``passed``, ``failed`` and ``by_category``
        (category name -> ``{"passed": n, "failed": m}``).
    """
    total = len(test_suite)
    tally = {"passed": 0, "failed": 0}
    per_category = {}

    for case in test_suite:
        reply = client.messages.create(
            model="claude-3-5-haiku-20241022",
            max_tokens=100,
            temperature=0,
            messages=[{"role": "user", "content": case["prompt"]}]
        )
        answer = reply.content[0].text.lower()

        # Name of the counter to bump, both overall and for the category.
        if case["expected"].lower() in answer:
            outcome = "passed"
        else:
            outcome = "failed"
        tally[outcome] += 1

        bucket = per_category.setdefault(
            case.get("category", "other"), {"passed": 0, "failed": 0}
        )
        bucket[outcome] += 1

    return {
        "total": total,
        "passed": tally["passed"],
        "failed": tally["failed"],
        "by_category": per_category
    }

# Suite of (prompt, expected substring, category) checks.
test_suite = [
    {"prompt": suite_prompt, "expected": suite_expected, "category": suite_category}
    for suite_prompt, suite_expected, suite_category in [
        ("Port for SSH?", "22", "protocols"),
        ("Port for HTTPS?", "443", "protocols"),
        ("Port for BGP?", "179", "protocols"),
        ("Hosts in /30?", "2", "subnetting"),
        ("Hosts in /24?", "254", "subnetting"),
    ]
]

results = quality_dashboard(test_suite)

# Overall pass rate, then a per-category breakdown.
print("üìà QUALITY DASHBOARD")
print("=" * 60)
overall_pct = results['passed']/results['total']*100
print(f"\nOverall: {results['passed']}/{results['total']} ({overall_pct:.0f}%)")
print(f"\nBy Category:")
for cat, stats in results["by_category"].items():
    cat_passed = stats["passed"]
    total = cat_passed + stats["failed"]
    pct = cat_passed / total * 100
    print(f"  {cat}: {cat_passed}/{total} ({pct:.0f}%)")

---
## 🎯 Key Takeaways

| Test Type | Purpose | When to Run |
|-----------|---------|-------------|
| Unit tests | Verify prompts work | Every commit |
| Accuracy tests | Measure quality | Weekly |
| Regression tests | Detect changes | After model updates |
| Generated tests | Cover edge cases | Periodically |

**Testing best practices:**
1. Use temperature=0 to minimize run-to-run variation (outputs are still not guaranteed to be identical)
2. Save baselines for regression detection
3. Test on diverse, realistic data
4. Track metrics over time

---

## üìö Next Steps

➡️ [Chapter 12: Ethics and Responsible AI](./Vol1_Ch12_Ethics.ipynb)