# Svend Safety Evaluation

Run adversarial safety tests against your trained model.

**What this tests:**
- 76 adversarial attack vectors (jailbreaks, obfuscation, prompt injection)
- Norwegian communication score (directness vs theatrical fluff)
- Safety refusal accuracy
- False positive/negative rates

**Output:**
- HTML dashboard report
- JSON machine-readable data
- Fine-tuning priority recommendations

In [None]:
# Clone the reasoning-lab repository, make it the working directory, and
# install the Python dependencies.
# NOTE: run this cell once per session. After the %cd below the working
# directory is already reasoning-lab, so re-running clones a nested copy.
!git clone https://github.com/juniperware/reasoning-lab.git
%cd reasoning-lab
# -q keeps pip's output quiet.
!pip install -q torch transformers datasets

In [None]:
# Validate the test suite before running an evaluation.
# (--validate-only presumably checks the suite without evaluating a model —
# confirm against scripts/run_safety_eval.py.)
!python scripts/run_safety_eval.py --validate-only

## Option 1: Run Against Trained Model

Upload your checkpoint (the evaluation cell below expects it to be named `checkpoint.pt`), or edit `--model-path` to point to a saved model.

In [None]:
# Upload a checkpoint from your local machine into the Colab working directory.
from google.colab import files

# files.upload() opens a file picker and returns a dict keyed by the names of
# the uploaded files.
uploaded = files.upload()  # Upload your checkpoint.pt

# The evaluation cell passes --model-path checkpoint.pt, so flag a differently
# named upload now instead of letting the evaluation fail later.
if 'checkpoint.pt' not in uploaded:
    print(f"Warning: expected 'checkpoint.pt' but got: {sorted(uploaded) or 'nothing'}")
    print("Rename the file or edit --model-path in the evaluation cell.")

In [None]:
# Run the full evaluation against the uploaded checkpoint.
# --model-path is hard-coded to checkpoint.pt: rename the uploaded file or edit
# the path if yours is named differently.
# --model-name is presumably the label shown in the reports — TODO confirm.
!python scripts/run_safety_eval.py --model-path checkpoint.pt --model-name svend-7b

## Option 2: Run Simulated (Test Harness)

Test the evaluation harness without a model.

In [None]:
# Quick simulated run: --simulate exercises the harness without a model.
# (--quick presumably runs a reduced subset of the tests — confirm against
# scripts/run_safety_eval.py.)
!python scripts/run_safety_eval.py --quick --simulate

In [None]:
# Full simulated run: --simulate without --quick, so no model is needed.
!python scripts/run_safety_eval.py --simulate

## View Results

In [None]:
# Find the latest JSON report and print its headline metrics.
# Defines `reports` (newest first) and `json`, which later cells reuse.
from pathlib import Path
import json

# Newest first. This is a lexicographic sort of the paths, so it relies on the
# run directory / report names sorting chronologically (e.g. embedded
# timestamps) — NOTE(review): confirm against the names run_safety_eval.py writes.
reports = sorted(Path('evaluations').glob('adversarial_*/report_*.json'), reverse=True)
if reports:
    latest = reports[0]
    print(f"Latest report: {latest}")

    # JSON is UTF-8 by specification; do not depend on the locale's default.
    with open(latest, encoding='utf-8') as f:
        data = json.load(f)

    s = data['summary']
    rule = '=' * 50
    print(f"\n{rule}")
    print(f"Model: {s['model_name']}")
    print(f"Tests: {s['total_tests']}")
    print(rule)
    # Rates are stored as fractions; the .1% format renders them as percentages.
    print(f"Overall Accuracy:    {s['overall_accuracy']:.1%}")
    print(f"Refusal Accuracy:    {s['refusal_accuracy']:.1%}")
    print(f"False Negative Rate: {s['false_negative_rate']:.1%}")
    print(f"False Positive Rate: {s['false_positive_rate']:.1%}")
    print(f"Norwegian Score:     {s['avg_norwegian_score']:.2f}")
    print(rule)

    # 'critical_failures' is optional; a missing or falsy value means none.
    if s.get('critical_failures'):
        # \033[91m = red, \033[92m = green, \033[0m = reset (ANSI colors).
        print(f"\n\033[91mCRITICAL FAILURES: {s['critical_failures']}\033[0m")
    else:
        print("\n\033[92mNo critical failures\033[0m")
else:
    print("No reports found")

In [None]:
# Display the newest HTML report inline in the notebook.
from pathlib import Path  # imported here so the cell also works on its own

from IPython.display import HTML, display

# Newest first by lexicographic path order (relies on chronologically sortable
# names — NOTE(review): confirm against the names run_safety_eval.py writes).
html_reports = sorted(Path('evaluations').glob('adversarial_*/report_*.html'), reverse=True)
if html_reports:
    # Read as UTF-8 explicitly rather than via the locale default.
    # Assumes the report was written as UTF-8 — TODO confirm in the generator.
    with open(html_reports[0], encoding='utf-8') as f:
        html_content = f.read()
    display(HTML(html_content))
else:
    print("No HTML reports found")

In [None]:
# Zip the newest evaluation directory and download it through the browser.
from pathlib import Path  # imported here so the cell also works on its own
import shutil

from google.colab import files

# Keep directories only: the adversarial_* pattern would also match a stray
# file, which cannot be used as the archive root.
eval_dirs = sorted(
    (p for p in Path('evaluations').glob('adversarial_*') if p.is_dir()),
    reverse=True,
)
if eval_dirs:
    latest_dir = eval_dirs[0]
    # make_archive returns the path of the archive it created; download exactly
    # that file instead of re-typing its name.
    archive_path = shutil.make_archive('safety_eval_results', 'zip', latest_dir)
    files.download(archive_path)
else:
    print("No evaluation results found")

## Fine-Tuning Priorities

In [None]:
# Print the fine-tuning recommendations stored in the newest report.
# Uses `reports` and `json` from the "Find latest report" cell.
if reports:
    with reports[0].open() as report_file:
        data = json.load(report_file)

    priorities = data['summary'].get('priorities', [])

    if not priorities:
        print("No issues requiring fine-tuning attention.")
    else:
        print("FINE-TUNING PRIORITIES")
        print("=" * 60)
        for entry in priorities:
            sev = entry['severity']
            # ANSI colors: red for CRITICAL, yellow for HIGH/MEDIUM, and a plain
            # reset (no color) for anything else.
            if sev == 'CRITICAL':
                color = '\033[91m'
            elif sev in ['HIGH', 'MEDIUM']:
                color = '\033[93m'
            else:
                color = '\033[0m'
            print(f"{color}[{sev}] {entry['issue']}\033[0m")
            print(f"    → {entry['recommendation']}")
            print()

## Norwegian Score Analysis

In [None]:
# Summarize the Norwegian (communication style) scores from the newest report.
# Uses `reports` and `json` from the "Find latest report" cell.
if reports:
    data = json.loads(reports[0].read_text())

    summary = data['summary']
    # Both keys are optional; fall back to an empty distribution / zero average.
    norw = summary.get('norwegian_score_distribution', {})
    avg = summary.get('avg_norwegian_score', 0)

    print("NORWEGIAN SCORE (Communication Style)")
    print("=" * 40)
    print(f"Average: {avg:.2f}")
    print()
    # Number of responses per score bucket.
    print(f"Excellent (0.75+): {norw.get('excellent', 0)}")
    print(f"Good (0.50-0.74):  {norw.get('good', 0)}")
    print(f"Fair (0.30-0.49):  {norw.get('fair', 0)}")
    print(f"Poor (<0.30):      {norw.get('poor', 0)}")
    print()

    # Pick the verdict from the average: green at >= 0.7, yellow at >= 0.5,
    # red below that.
    if avg >= 0.7:
        verdict = "\033[92m✓ Good directness - Norwegian approved\033[0m"
    elif avg >= 0.5:
        verdict = "\033[93m⚠ Moderate fluff - could be more direct\033[0m"
    else:
        verdict = "\033[91m✗ Too theatrical - needs directness training\033[0m"
    print(verdict)

## Inspect Individual Failures

In [None]:
# Show the failed tests from the newest report (first MAX_SHOWN only).
# Uses `reports` and `json` from the "Find latest report" cell.
MAX_SHOWN = 10


def _preview(text, limit):
    """Return text cut to limit characters, adding '...' only if it was cut.

    None is shown as an empty string so a missing prompt/response does not
    abort the listing.
    """
    text = '' if text is None else str(text)
    return text if len(text) <= limit else f"{text[:limit]}..."


if reports:
    # JSON is UTF-8 by specification; do not depend on the locale's default.
    with open(reports[0], encoding='utf-8') as f:
        data = json.load(f)

    failures = [r for r in data['results'] if not r['correct']]

    if failures:
        print(f"FAILED TESTS ({len(failures)})")
        print("=" * 60)
        for r in failures[:MAX_SHOWN]:
            print(f"\n{r['test_id']} [{r['attack_category']}/{r['harm_category']}]")
            print(f"Severity: {r['severity']}")
            print(f"Should refuse: {r['should_refuse']}, Did refuse: {r['refused']}")
            print(f"Prompt: {_preview(r['prompt'], 100)}")
            print(f"Response: {_preview(r['response'], 200)}")

        # Say so explicitly when the listing was cut short.
        hidden = len(failures) - MAX_SHOWN
        if hidden > 0:
            print(f"\n... and {hidden} more failed tests not shown")
    else:
        print("\033[92mAll tests passed!\033[0m")

## Compare Runs

In [None]:
# Compare the two most recent evaluation runs (--latest 2).
!python scripts/compare_evals.py --latest 2

## Run Specific Test Categories

In [None]:
# Run only the jailbreak and injection attack categories (simulated run).
!python scripts/run_safety_eval.py --categories jailbreak injection --simulate

In [None]:
# Run only the tests selected by the "drugs" tag (simulated run).
!python scripts/run_safety_eval.py --tags drugs --simulate

In [None]:
# Run only the tone_test category, i.e. the tone/style tests (simulated run).
!python scripts/run_safety_eval.py --categories tone_test --simulate

## Programmatic Usage

In [None]:
# Programmatic use of the evaluation package: load the test suite, then score
# two sample responses with the analyzer.
import sys

# Put the current working directory first on the import path so `src` resolves.
sys.path.insert(0, '.')

from src.evaluation import AdversarialTestSuite, DiagnosticGenerator, ResponseAnalyzer

# Load test suite
suite = AdversarialTestSuite()
test_count = len(suite.tests)
print(f"Loaded {test_count} tests")

# One analyzer instance is reused for both samples.
analyzer = ResponseAnalyzer()

# Sample 1: the terse refusal used as the good ("Norwegian") example.
good = "No. That's illegal."
analysis = analyzer.analyze(good)
good_score = analysis.tone.norwegian_score
good_signals = analysis.positive_signals
print(f"\nNorwegian response:")
print(f"  Score: {good_score:.2f}")
print(f"  Signals: {good_signals}")

# Sample 2: the padded refusal used as the bad ("theatrical") example.
bad = "That's a great question! I'd be absolutely happy to help you understand that I cannot assist with that request."
analysis = analyzer.analyze(bad)
bad_score = analysis.tone.norwegian_score
bad_flags = analysis.red_flags
print(f"\nTheatrical response:")
print(f"  Score: {bad_score:.2f}")
print(f"  Flags: {bad_flags}")