# Gemma Model Evaluation with Inspect AI

This notebook demonstrates how to use Inspect AI for comprehensive evaluation of Gemma models.

## Installation

First, install the required packages:

In [None]:
# Install Inspect AI and dependencies
!pip install inspect-ai
!pip install -r ../requirements.txt

## 1. Basic Evaluation Setup

In [None]:
import sys
# Add the parent directory to the module search path so the `src` package
# imported below can be resolved (assumes the notebook is run from a folder
# directly under the project root — TODO confirm).
sys.path.append('..')

# Project-local imports from the `src` package.
from src.evaluation.inspect_evaluator import GemmaInspectEvaluator
# NOTE(review): GemmaModel is not referenced by any cell visible in this notebook.
from src.models.gemma_model import GemmaModel
# Used when exporting the results to evaluation_report.json.
import json

## 2. Initialize Evaluator

In [None]:
# Model to evaluate: the base checkpoint id below, or a path to your fine-tuned model.
model_path = "google/gemma-2b"
# One evaluator instance is reused by the evaluation cells that follow.
evaluator = GemmaInspectEvaluator(model_path)

## 3. Run Coding Evaluation

In [None]:
# Build the coding task, then run it with logs written to ./logs/coding.
coding_task = evaluator.create_coding_eval()
coding_results = evaluator.run_evaluation(coding_task, log_dir="./logs/coding")

# Report the headline metrics from the returned scores.
print("Coding Evaluation Results:")
coding_scores = coding_results['scores']
print(f"Accuracy: {coding_scores['accuracy']:.2%}")
print(f"Mean Score: {coding_scores['mean_score']:.3f}")

## 4. Run Reasoning Evaluation

In [None]:
# Build the reasoning task, then run it with logs written to ./logs/reasoning.
reasoning_task = evaluator.create_reasoning_eval()
reasoning_results = evaluator.run_evaluation(reasoning_task, log_dir="./logs/reasoning")

# Report the headline metrics from the returned scores.
print("Reasoning Evaluation Results:")
reasoning_scores = reasoning_results['scores']
print(f"Accuracy: {reasoning_scores['accuracy']:.2%}")
print(f"Mean Score: {reasoning_scores['mean_score']:.3f}")

## 5. Run Safety Evaluation

In [None]:
# Build the safety task, then run it with logs written to ./logs/safety.
safety_task = evaluator.create_safety_eval()
safety_results = evaluator.run_evaluation(safety_task, log_dir="./logs/safety")

# Report the headline metrics from the returned scores.
print("Safety Evaluation Results:")
safety_scores = safety_results['scores']
print(f"Accuracy: {safety_scores['accuracy']:.2%}")
print(f"Mean Score: {safety_scores['mean_score']:.3f}")

## 6. Create Custom Evaluation

In [None]:
# Question/answer pairs for a small custom benchmark; replace them with
# examples from your own use case.
qa_pairs = [
    ("Translate 'Hello world' to Spanish", "Hola mundo"),
    ("What is the capital of France?", "Paris"),
    ("Calculate 15% of 200", "30"),
]

# Each sample is a dict with an "input" prompt and its expected "target".
custom_dataset = [
    {"input": question, "target": answer}
    for question, answer in qa_pairs
]

# Describe the task: name, samples, system prompt and scoring method.
custom_task = evaluator.create_custom_eval(
    name="custom_knowledge",
    dataset=custom_dataset,
    system_prompt="You are a helpful assistant. Answer concisely and accurately.",
    scoring_method="includes",
)

# Run it with logs written to ./logs/custom.
custom_results = evaluator.run_evaluation(custom_task, log_dir="./logs/custom")

print("Custom Evaluation Results:")
custom_accuracy = custom_results['scores']['accuracy']
print(f"Accuracy: {custom_accuracy:.2%}")

## 7. Run Comprehensive Evaluation Suite

In [None]:
from src.evaluation.inspect_evaluator import run_comprehensive_evaluation

# Run the whole suite for `model_path`, passing ./evaluation_results as the output directory.
comprehensive_results = run_comprehensive_evaluation(
    model_path=model_path, output_dir="./evaluation_results"
)

# Headline number first, then one short entry per evaluation.
print("\nComprehensive Evaluation Summary:")
overall_score = comprehensive_results['overall_score']
print(f"Overall Score: {overall_score:.2%}")
print("\nDetailed Results:")
for suite_name, suite in comprehensive_results['evaluations'].items():
    print(f"\n{suite_name.capitalize()}:")
    print(f"  - Accuracy: {suite['scores']['accuracy']:.2%}")
    print(f"  - Duration: {suite['duration']:.2f}s")

## 8. Compare Models

In [None]:
# Models to compare: the base model and a fine-tuned one.
models_to_compare = [
    "google/gemma-2b",
    "./outputs/checkpoints/final",  # Your fine-tuned model
]

# Maps each model identifier to the result of its comprehensive evaluation.
comparison_results = {}
for candidate in models_to_compare:
    print(f"\nEvaluating {candidate}...")
    # Slashes in the identifier are replaced so that each model maps to a
    # single directory name under ./comparison.
    candidate_dir = f"./comparison/{candidate.replace('/', '_')}"
    comparison_results[candidate] = run_comprehensive_evaluation(
        model_path=candidate,
        output_dir=candidate_dir,
    )

# Display comparison
divider = "=" * 50
print("\n" + divider)
print("Model Comparison Results")
print(divider)

for candidate, summary in comparison_results.items():
    print(f"\nModel: {candidate}")
    print(f"Overall Score: {summary['overall_score']:.2%}")
    for suite_name, suite in summary['evaluations'].items():
        print(f"  {suite_name}: {suite['scores']['accuracy']:.2%}")

## 9. Export Results

In [None]:
# Dump the comprehensive results as indented JSON.
with open('evaluation_report.json', 'w') as json_file:
    json.dump(comprehensive_results, json_file, indent=2)

print("Results saved to evaluation_report.json")

# Assemble the markdown report piece by piece: a header first, then one
# section per evaluation; the pieces are joined into a single string below.
report_parts = [
    "# Gemma Model Evaluation Report\n"
    "\n"
    f"## Model: {comprehensive_results['model']}\n"
    "\n"
    f"### Overall Score: {comprehensive_results['overall_score']:.2%}\n"
    "\n"
    "### Detailed Results:\n"
    "\n"
]

for section_name, section in comprehensive_results['evaluations'].items():
    report_parts.append(f"\n#### {section_name.capitalize()} Evaluation\n")
    report_parts.append(f"- Accuracy: {section['scores']['accuracy']:.2%}\n")
    report_parts.append(f"- Mean Score: {section['scores']['mean_score']:.3f}\n")
    report_parts.append(f"- Std Dev: {section['scores']['std_score']:.3f}\n")
    report_parts.append(f"- Duration: {section['duration']:.2f}s\n")

report = "".join(report_parts)

with open('evaluation_report.md', 'w') as markdown_file:
    markdown_file.write(report)

print("Markdown report saved to evaluation_report.md")

## 10. Visualize Results

In [None]:
import itertools

import matplotlib.pyplot as plt
import numpy as np

# Extract scores for visualization: one accuracy value per evaluation,
# in the same order as the evaluation names.
eval_names = list(comprehensive_results['evaluations'].keys())
scores = [results['scores']['accuracy'] for results in comprehensive_results['evaluations'].values()]

# Create bar chart on an explicit figure/axes pair rather than relying on
# pyplot's implicit "current figure" state.
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(eval_names, scores)

# Customize colors. The palette is cycled so that a suite with more than
# three evaluations does not leave the extra bars in the default color.
colors = ['#4CAF50', '#2196F3', '#FF9800']
for bar, color in zip(bars, itertools.cycle(colors)):
    bar.set_color(color)

ax.set_title(f'Gemma Model Evaluation Results\nOverall Score: {comprehensive_results["overall_score"]:.2%}')
ax.set_ylabel('Accuracy')
# Keep the 0-1 accuracy axis, but add headroom when the tallest bar is close
# to 1 so its value label (drawn at score + 0.02) stays inside the plot area.
ax.set_ylim(0, max(1.0, max(scores, default=0.0) + 0.1))

# Add value labels on bars; bar i of a categorical bar chart sits at x == i.
for i, score in enumerate(scores):
    ax.text(i, score + 0.02, f'{score:.1%}', ha='center')

fig.tight_layout()
fig.savefig('evaluation_results.png', dpi=300)
plt.show()