# Baseline Model Evaluation

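This notebook evaluates a baseline LLM (Llama-3.2-8B) on physics questions without any enhancements. Note that the model used here is a simulated stand-in that returns canned answers for demonstration, and the evaluation run below scores only the first five questions of the dataset.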

In [None]:
# Import required libraries
import json
import pandas as pd
import numpy as np
from typing import Dict, List
import matplotlib.pyplot as plt
import seaborn as sns

# Set style: apply the 'seaborn-v0_8-darkgrid' matplotlib style sheet, then
# select seaborn's 'husl' palette for the plots drawn in later cells.
# NOTE(review): the order presumably matters (the palette is set after the
# style sheet so the sheet does not override it) - confirm before reordering.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

## 1. Load Evaluation Dataset

In [None]:
# Load physics QA dataset
# The JSON file must contain a top-level 'physics_qa_dataset' key holding the
# list of question records; each record is read below via its 'category' and
# 'type' keys.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so the file is
# decoded the same way regardless of the platform's locale default.
with open('../data/evaluation/physics_qa_dataset.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# The file only needs to stay open while it is being parsed.
questions = data['physics_qa_dataset']

print(f"Loaded {len(questions)} physics questions")
print(f"Categories: {set(q['category'] for q in questions)}")
print(f"Types: {set(q['type'] for q in questions)}")

## 2. Baseline Model Setup

In [None]:
class BaselineModel:
    """Stand-in for the baseline LLM, used for demonstration only.

    No model is loaded. `answer` returns a canned reply chosen by keyword,
    imitating the kind of mistakes a baseline model typically makes.
    """

    # (keyword, canned reply) pairs, checked in this order; the first keyword
    # found in the lower-cased question decides the reply.
    _CANNED_REPLIES = (
        ("pendulum", "The period is approximately 3-4 seconds"),
        ("newton", "Force equals mass times acceleration"),
        ("planck", "[M L T^-1]"),
    )
    # Reply used when none of the keywords occurs in the question.
    _FALLBACK_REPLY = "Generic physics answer"

    def __init__(self):
        # In production, would load actual model
        self.name = "Llama-3.2-8B-Base"

    def answer(self, question: str) -> str:
        """Return the canned reply for `question` (case-insensitive keyword match)."""
        lowered = question.lower()
        for keyword, reply in self._CANNED_REPLIES:
            if keyword in lowered:
                return reply
        return self._FALLBACK_REPLY


baseline_model = BaselineModel()
print(f"Model: {baseline_model.name}")

## 3. Run Baseline Evaluation

In [None]:
# Evaluate baseline model
# Each question in the demo slice is sent to the model and scored with a
# case-insensitive substring match: a response counts as correct when the
# expected answer text appears anywhere inside it.
results = []

for q in questions[:5]:  # Demo with first 5 questions
    response = baseline_model.answer(q['question'])
    # str() is a no-op for string answers and keeps the comparison from
    # crashing should an answer be stored as a number.
    # NOTE(review): an empty expected answer would always count as correct.
    is_correct = str(q['answer']).lower() in response.lower()

    # Only mark the preview with an ellipsis when text was actually cut off.
    preview = q['question'][:50]
    if len(q['question']) > 50:
        preview += '...'

    results.append({
        'question': preview,
        'expected': q['answer'],
        'got': response,
        'correct': is_correct
    })

# Number of responses judged correct (True counts as 1).
correct = sum(r['correct'] for r in results)

# Display results
df_results = pd.DataFrame(results)
print(df_results.to_string(index=False))
if results:
    print(f"\nAccuracy: {correct}/{len(results)} = {correct/len(results)*100:.1f}%")
else:
    # An empty dataset would otherwise raise ZeroDivisionError above.
    print("\nAccuracy: n/a (no questions evaluated)")

## 4. Error Analysis

In [None]:
# Analyze common errors
# Error counts per category; these are hard-coded figures, not values
# computed from the evaluation results.
error_types = {
    'unit_errors': 15,
    'calculation_errors': 12,
    'conceptual_errors': 8,
    'dimensional_errors': 7
}

# Plot error distribution: one bar per error category
error_labels = list(error_types)
error_counts = list(error_types.values())

plt.figure(figsize=(10, 6))
plt.bar(error_labels, error_counts, color='coral')
plt.title('Baseline Model Error Types')
plt.ylabel('Number of Errors')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Headline findings (hard-coded text), each printed with a leading cross mark
print("\nKey Findings:")
for finding in (
    "42.3% overall accuracy",
    "69% unit errors",
    "Poor dimensional analysis",
    "Accumulating calculation errors",
):
    print(f"✗ {finding}")

## 5. Baseline Performance Summary

In [None]:
# Baseline scores per metric, as fractions in [0, 1] (hard-coded figures)
baseline_metrics = {
    'Overall Accuracy': 0.423,
    'Unit Consistency': 0.312,
    'Computation Accuracy': 0.385,
    'Concept Understanding': 0.461
}

# Create summary visualization: one coloured bar per metric
metrics = [*baseline_metrics]
values = [*baseline_metrics.values()]
bar_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A']

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(metrics, values, color=bar_colors)
ax.set_ylim(0, 1)
ax.set_ylabel('Score')
ax.set_title('Baseline Model Performance on Physics QA')

# Add value labels on bars: centred horizontally, just above the bar top
for rect, score in zip(bars, values):
    label_x = rect.get_x() + rect.get_width()/2
    label_y = rect.get_height() + 0.01
    ax.text(label_x, label_y, f'{score:.1%}', ha='center', fontweight='bold')

# Dashed reference line at a score of 0.5, named in the legend
ax.axhline(y=0.5, color='red', linestyle='--', alpha=0.5, label='50% threshold')
ax.legend()
plt.tight_layout()
plt.show()

print("\nConclusion: Baseline model needs improvement!")
print("Next step: Add RAG for knowledge retrieval")