# Debug Experiments - Detailed LLM Call Inspection

This notebook runs examples with detailed logging to inspect:
- Tool calls and intermediate steps
- LLM reasoning and responses
- Answer correctness
- Behavior validation

In [None]:
import sys

sys.path.insert(0, "..")

import json
import re
from collections.abc import Callable
from dataclasses import dataclass, field
from datetime import datetime
from itertools import islice
from pathlib import Path

from self_distill.clients.ollama_client import OllamaClient
from self_distill.datasets import DATA, load_dataset

## 1. Verbose Client Wrapper

Wrap the client to capture and display all calls.

In [None]:
@dataclass
class CallRecord:
    """Record of a single LLM call.

    One instance is created per completion request and holds both what was
    sent (model, messages) and what came back (response text, token counts).
    """

    call_id: int  # sequence number of the call within the current history
    model: str  # model name the request was issued against
    messages: list[dict]  # chat messages sent, each with "role" and "content" keys
    response: str  # text returned by the model
    prompt_tokens: int  # input-token count reported for the call
    completion_tokens: int  # output-token count reported for the call
    timestamp: str  # when the call was recorded, as an ISO-8601 string


@dataclass
class ExperimentRun:
    """A complete experiment run with all calls.

    Bundles the prompt, the optional expected answer, every recorded LLM call
    and the correctness verdict for one named experiment.
    """

    name: str  # human-readable label for the experiment
    input_text: str  # the prompt in flattened text form
    expected_output: str | None  # reference answer, or None when nothing is checked
    calls: list[CallRecord] = field(default_factory=list)  # LLM calls made during the run
    final_response: str = ""  # response text the verdict is based on
    is_correct: bool | None = None  # None means correctness was not evaluated
    notes: str = ""  # free-form annotations


class VerboseClient:
    """Wrapper around an OllamaClient that logs and records every completion call.

    Each call is echoed to stdout (when ``verbose`` is true) and stored as a
    ``CallRecord`` in ``call_history`` so it can be inspected afterwards.
    """

    def __init__(self, client: OllamaClient, verbose: bool = True):
        """Wrap *client*; pass ``verbose=False`` to record calls without printing."""
        self.client = client
        self.verbose = verbose
        self.call_history: list[CallRecord] = []
        self.call_counter = 0

    @staticmethod
    def _truncate(text: str, limit: int) -> str:
        """Return *text* cut to *limit* characters, with "..." appended if it was cut."""
        return text[:limit] + "..." if len(text) > limit else text

    def completion(self, prompt: str | list[dict], model: str | None = None) -> str:
        """Make a completion call with logging.

        Args:
            prompt: Either a plain string (sent as a single user message) or a
                list of chat messages with "role" and "content" keys.
            model: Model name to use; defaults to the wrapped client's
                ``model_name``.

        Returns:
            The response returned by the wrapped client.
        """
        self.call_counter += 1
        model = model or self.client.model_name

        # Normalize to messages format
        if isinstance(prompt, str):
            messages = [{"role": "user", "content": prompt}]
        else:
            messages = prompt

        # Snapshot what is being sent: the caller may keep appending to (or
        # editing) its own message list, and that must not rewrite history.
        sent_messages = [dict(msg) for msg in messages]

        if self.verbose:
            print(f"\n{'=' * 60}")
            print(f"CALL #{self.call_counter} | Model: {model}")
            print(f"{'=' * 60}")
            for msg in messages:
                print(f"\n[{msg['role'].upper()}]")
                print(self._truncate(msg["content"], 500))

        # Make the actual call
        response = self.client.completion(messages, model=model)
        usage = self.client.get_last_usage()

        # Extract token counts (usage is ModelUsageSummary)
        prompt_tokens = usage.total_input_tokens
        completion_tokens = usage.total_output_tokens

        # Record the call
        self.call_history.append(
            CallRecord(
                call_id=self.call_counter,
                model=model,
                messages=sent_messages,
                response=response,
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
                timestamp=datetime.now().isoformat(),
            )
        )

        if self.verbose:
            print(f"\n[RESPONSE] ({completion_tokens} tokens)")
            print(self._truncate(response, 1000))
            print(f"\n{'─' * 60}")

        return response

    def clear_history(self):
        """Clear call history and restart call numbering for a new experiment."""
        self.call_history = []
        self.call_counter = 0

    def get_history(self) -> list[CallRecord]:
        """Return a new list holding the recorded calls, oldest first."""
        return list(self.call_history)

    def list_models(self):
        """Delegate to the wrapped client's ``list_models``."""
        return self.client.list_models()

    def get_usage_summary(self):
        """Delegate to the wrapped client's ``get_usage_summary``."""
        return self.client.get_usage_summary()

## 2. Experiment Runner

In [None]:
class ExperimentRunner:
    """Run experiments through a VerboseClient and keep a record of each one.

    Every call to ``run`` appends an ``ExperimentRun`` to ``experiments``;
    ``summary`` aggregates the verdicts across all of them.
    """

    def __init__(self, client: VerboseClient):
        """Use *client* for all completions issued by this runner."""
        self.client = client
        self.experiments: list[ExperimentRun] = []

    @staticmethod
    def _default_match(response: str, expected: str) -> bool:
        """Return True when *expected* occurs in *response* as a whole token.

        Matching is case-insensitive and requires that the match is not
        glued to a letter, digit or underscore on either side, so "4" is not
        found inside "14" and "grammatical" is not found inside
        "ungrammatical". Punctuation still counts as a boundary, so "4" does
        match in "4.5"; pass a ``correctness_fn`` for stricter checks.
        """
        if not expected:
            # An empty expectation is trivially satisfied, as with substring tests.
            return True
        pattern = rf"(?<!\w){re.escape(expected)}(?!\w)"
        return re.search(pattern, response, flags=re.IGNORECASE) is not None

    def run(
        self,
        name: str,
        messages: list[dict],
        expected: str | None = None,
        model: str | None = None,
        correctness_fn: Callable[[str, str], bool] | None = None,
    ) -> ExperimentRun:
        """Run a single experiment.

        Args:
            name: Label used in the printed output and the stored record.
            messages: Chat messages (dicts with "role" and "content") to send.
            expected: Reference answer; when None, correctness is not checked.
            model: Model to use; None lets the client pick its default.
            correctness_fn: Optional ``fn(response, expected)`` returning a
                verdict. When omitted, a whole-token, case-insensitive match
                of *expected* within the response is used.

        Returns:
            The ``ExperimentRun`` that was appended to ``experiments``.
        """
        self.client.clear_history()

        print(f"\n{'#' * 60}")
        print(f"# EXPERIMENT: {name}")
        print(f"{'#' * 60}")

        # Extract input text for recording
        input_text = "\n".join(f"[{m['role']}] {m['content']}" for m in messages)

        # Run the completion
        response = self.client.completion(messages, model=model)

        # Check correctness
        is_correct = None
        if expected is not None:
            matcher = correctness_fn or self._default_match
            verdict = matcher(response, expected)
            # Coerce to a real bool: the summaries count with `is True` /
            # `is False`, which would miss truthy non-bool verdicts.
            is_correct = None if verdict is None else bool(verdict)

        # Create experiment record
        experiment = ExperimentRun(
            name=name,
            input_text=input_text,
            expected_output=expected,
            calls=self.client.get_history(),
            final_response=response,
            is_correct=is_correct,
        )
        self.experiments.append(experiment)

        # Print summary
        print(f"\n{'=' * 60}")
        print("EXPERIMENT SUMMARY")
        print(f"{'=' * 60}")
        print(f"Total calls: {len(experiment.calls)}")
        # Same condition as the correctness check above, so a verdict that
        # was computed is always reported.
        if expected is not None:
            status = "CORRECT" if is_correct else "INCORRECT"
            print(f"Expected: {expected}")
            print(f"Status: {status}")

        return experiment

    def run_batch(
        self,
        batch_name: str,
        examples: list[dict],
        system_prompt: str | None = None,
        model: str | None = None,
        correctness_fn: Callable[[str, str], bool] | None = None,
    ) -> list[ExperimentRun]:
        """Run a batch of experiments.

        Args:
            batch_name: Name for this batch
            examples: List of dicts with 'input' and optionally 'expected'
            system_prompt: Optional system prompt to prepend
            model: Model to use
            correctness_fn: Optional ``fn(response, expected)`` forwarded to
                ``run`` for every example

        Returns:
            One ``ExperimentRun`` per example, in input order.
        """
        print(f"\n{'*' * 60}")
        print(f"* BATCH: {batch_name}")
        print(f"* Examples: {len(examples)}")
        print(f"{'*' * 60}")

        results = []
        for i, ex in enumerate(examples, 1):
            messages = []
            if system_prompt:
                messages.append({"role": "system", "content": system_prompt})
            messages.append({"role": "user", "content": ex["input"]})

            results.append(
                self.run(
                    name=f"{batch_name} #{i}",
                    messages=messages,
                    expected=ex.get("expected"),
                    model=model,
                    correctness_fn=correctness_fn,
                )
            )

        # Batch summary
        correct = sum(1 for r in results if r.is_correct is True)
        incorrect = sum(1 for r in results if r.is_correct is False)
        unknown = sum(1 for r in results if r.is_correct is None)

        print(f"\n{'*' * 60}")
        print(f"* BATCH COMPLETE: {batch_name}")
        print(f"* Correct: {correct} | Incorrect: {incorrect} | Unknown: {unknown}")
        print(f"{'*' * 60}")

        return results

    def summary(self) -> dict:
        """Return verdict counts and the total number of LLM calls so far."""
        return {
            "total": len(self.experiments),
            "correct": sum(1 for e in self.experiments if e.is_correct is True),
            "incorrect": sum(1 for e in self.experiments if e.is_correct is False),
            "unknown": sum(1 for e in self.experiments if e.is_correct is None),
            "total_calls": sum(len(e.calls) for e in self.experiments),
        }

## 3. Setup

In [None]:
# Build the raw client, wrap it for logging, and hand the wrapper to the runner
base_client = OllamaClient(model_name="llama3.2:3b")
client = VerboseClient(base_client, verbose=True)
runner = ExperimentRunner(client)

# Show which models the client reports
print("Available models:")
for model_name in client.list_models():
    print(f"  - {model_name}")

## 4. Example: Simple Math

In [None]:
# Arithmetic prompts paired with the number expected in the reply
math_examples = [
    {"input": question, "expected": answer}
    for question, answer in (
        ("What is 2 + 2?", "4"),
        ("What is 15 * 3?", "45"),
        ("What is 100 / 4?", "25"),
    )
]

# Left as the cell's final expression so the notebook displays the results
runner.run_batch(
    batch_name="Simple Math",
    examples=math_examples,
    system_prompt="You are a math assistant. Answer with just the number.",
)

## 5. Example: Rule Extraction (Email)

In [None]:
# Few-shot prompt asking the model to infer the validity rule from examples
EMAIL_PROMPT = """Given these examples, what rule determines if an email is valid?

Valid:
- user@example.com
- john.doe@company.org

Invalid:
- userexample.com
- user@.com

Explain the rule briefly."""

# No expected answer is given, so this run is recorded without a verdict
result = runner.run(
    name="Email Rule Extraction",
    messages=[dict(role="user", content=EMAIL_PROMPT)],
)

## 6. Example: GSM8K Word Problems

In [None]:
# Load GSM8K
gsm8k = load_dataset(DATA.GSM8K, "train")

# Take the first 3 examples lazily; copying the whole split into a list just
# to slice off three items is wasted work.
gsm_examples = []
for item in islice(gsm8k, 3):
    # Extract the final numerical answer: the text after the last "####"
    # marker, or the last whitespace-separated token when there is no marker.
    answer_line = (
        item.answer.split("####")[-1].strip()
        if "####" in item.answer
        else item.answer.split()[-1]
    )
    gsm_examples.append({"input": item.question, "expected": answer_line})

print("GSM8K Examples:")
for i, ex in enumerate(gsm_examples, 1):
    print(f"\n{i}. {ex['input'][:100]}...")
    print(f"   Expected: {ex['expected']}")

In [None]:
# Send the GSM8K examples through the runner with a step-by-step instruction
runner.run_batch(
    batch_name="GSM8K Math",
    examples=gsm_examples,
    system_prompt="Solve this math problem step by step. End with 'The answer is X' where X is a number.",
)

## 7. Example: CoLA Grammar

In [None]:
# Load CoLA
cola = load_dataset(DATA.COLA, "train")

# Turn the first six rows into yes/no grammar questions; label 1 maps to
# "grammatical" and anything else to "ungrammatical"
cola_examples = [
    {
        "input": f'Is this sentence grammatically correct? "{row["text"]}"',
        "expected": "grammatical" if row["label"] == 1 else "ungrammatical",
    }
    for row in cola.data.head(6).iter_rows(named=True)
]

print("CoLA Examples:")
for example in cola_examples:
    preview = example["input"][:60]
    print(f"  {preview}... -> {example['expected']}")

In [None]:
# Send the CoLA examples through the runner, asking for a one-word verdict
runner.run_batch(
    batch_name="CoLA Grammar",
    examples=cola_examples,
    system_prompt="You are a grammar expert. Answer with 'grammatical' or 'ungrammatical' only.",
)

## 8. Compare Models

In [None]:
# Test same prompt with different models
TEST_PROMPT = "What is the capital of France? Answer in one word."

models_to_test = ["llama3.2:3b"]  # Add more: "qwen2.5-coder:32b", "llama3.3:70b"

# Ask for the model list once instead of once per candidate
available_models = list(client.list_models())

for model in models_to_test:
    if model not in available_models:
        # Say so explicitly; a silently skipped model looks like a run that
        # never happened.
        print(f"Skipping {model}: not reported by client.list_models()")
        continue
    runner.run(
        name=f"Capital Test ({model})",
        messages=[{"role": "user", "content": TEST_PROMPT}],
        expected="Paris",
        model=model,
    )

## 9. Summary

In [None]:
# Print the aggregate counts collected by the runner
summary = runner.summary()
print(f"\n{'=' * 60}")
print("FINAL SUMMARY")
print("=" * 60)
for label, key in (
    ("Total experiments", "total"),
    ("Correct", "correct"),
    ("Incorrect", "incorrect"),
    ("Unknown", "unknown"),
    ("Total LLM calls", "total_calls"),
):
    print(f"{label}: {summary[key]}")

# Accuracy is computed over experiments that actually received a verdict;
# a positive count here already implies at least one experiment was run.
known = summary["correct"] + summary["incorrect"]
if known > 0:
    accuracy = summary["correct"] / known * 100
    print(f"Accuracy: {accuracy:.1f}%")

## 10. Inspect Specific Experiment

In [None]:
def inspect_experiment(exp: ExperimentRun):
    """Print a detailed, per-call view of one experiment.

    Shows the experiment header (name, call count, verdict, expected answer)
    followed by each recorded call: model, token counts, the messages sent
    (clipped to 200 characters each) and the response (clipped to 500).
    """

    def clip(text, limit):
        # Cut long text and mark the cut with an ellipsis
        if len(text) > limit:
            return f"{text[:limit]}..."
        return text

    header_rule = "=" * 60
    call_rule = "─" * 40

    print(f"\n{header_rule}")
    print(f"EXPERIMENT: {exp.name}")
    print(header_rule)
    print(f"\nCalls made: {len(exp.calls)}")
    print(f"Correct: {exp.is_correct}")
    print(f"Expected: {exp.expected_output}")

    for record in exp.calls:
        print(f"\n{call_rule}")
        print(f"Call #{record.call_id} | Model: {record.model}")
        print(f"Tokens: {record.prompt_tokens} in / {record.completion_tokens} out")
        print("\nMESSAGES:")
        for message in record.messages:
            body = clip(message["content"], 200)
            print(f"  [{message['role']}]: {body}")
        print("\nRESPONSE:")
        print(f"  {clip(record.response, 500)}")


# Show the most recent experiment, provided at least one has been run
if len(runner.experiments) > 0:
    inspect_experiment(runner.experiments[-1])

In [None]:
# Print an index of every experiment so one can be picked for inspection
print("All experiments:")
for i, exp in enumerate(runner.experiments):
    if exp.is_correct is None:
        status = ""  # no verdict was computed for this run
    elif exp.is_correct:
        status = " OK"
    else:
        status = " WRONG"
    print(f"  {i}: {exp.name}{status}")

In [None]:
# Inspect a specific experiment by index
# Change the index to inspect different experiments (negative values count
# from the end, as with normal list indexing)
idx = 0
# Bound the index on both sides: checking only `idx < len(...)` lets a
# negative index through and raises IndexError when it reaches past the start.
if -len(runner.experiments) <= idx < len(runner.experiments):
    inspect_experiment(runner.experiments[idx])
else:
    print(f"No experiment at index {idx} ({len(runner.experiments)} recorded)")

## 11. Save Results

In [None]:
def export_experiments(experiments: list[ExperimentRun], filepath: str):
    """Export experiments to JSON.

    Writes one JSON object per experiment (name, input, expected, response,
    verdict, notes) with its recorded calls nested under "calls". Parent
    directories of *filepath* are created if missing.

    Args:
        experiments: Experiment records to serialize.
        filepath: Destination path of the JSON file; overwritten if present.
    """
    data = [
        {
            "name": exp.name,
            "input": exp.input_text,
            "expected": exp.expected_output,
            "response": exp.final_response,
            "is_correct": exp.is_correct,
            "notes": exp.notes,
            "calls": [
                {
                    "call_id": c.call_id,
                    "model": c.model,
                    "messages": c.messages,
                    "response": c.response,
                    "prompt_tokens": c.prompt_tokens,
                    "completion_tokens": c.completion_tokens,
                    # Keep the recorded call time so exported runs can be
                    # ordered and correlated later.
                    "timestamp": c.timestamp,
                }
                for c in exp.calls
            ],
        }
        for exp in experiments
    ]

    target = Path(filepath)
    target.parent.mkdir(parents=True, exist_ok=True)
    # Write real UTF-8 rather than \uXXXX escapes so prompts and responses
    # stay readable when the file is opened by hand.
    with target.open("w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"Exported {len(experiments)} experiments to {filepath}")


# Write this session's experiments to a timestamped JSON file under debug_runs/
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_path = f"debug_runs/experiments_{timestamp}.json"
export_experiments(runner.experiments, output_path)

## 12. Custom Experiment

Use this cell to run your own custom experiments.

In [None]:
# Custom experiment - modify as needed
# Edit the system/user contents below, then uncomment the runner.run(...) call
# at the bottom of this cell to execute it.
custom_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Your prompt here"},
]

# result = runner.run(
#     name="Custom Test",
#     messages=custom_messages,
#     expected=None,  # Set expected value if you want to check correctness
#     model="llama3.2:3b"  # Or try "qwen2.5-coder:32b", "llama3.3:70b"
# )