# EdgePrompt Runner Debug Notebook

This notebook provides an interactive environment for debugging and exploring the EdgePrompt research framework. It walks through exploring the available configurations, processing templates, running a simulated test, and loading, analyzing, and visualizing results.

In [None]:
# Setup
import os
import sys
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Add the directory two levels above the working directory to the Python path
# (presumably the repository root, so the `research.*` imports below resolve —
# this depends on the notebook being launched from its own folder).
repo_root = os.path.abspath(os.path.join(os.getcwd(), '../..'))
# Drop any earlier copy first so re-running this cell keeps a single entry,
# still at the front of the search path, instead of stacking duplicates.
if repo_root in sys.path:
    sys.path.remove(repo_root)
sys.path.insert(0, repo_root)

# Set up visualization defaults
plt.rcParams['figure.figsize'] = (12, 8)
sns.set_theme(style="whitegrid")

# Import EdgePrompt modules
from research.runner.runner_core import RunnerCore
from research.runner.config_loader import ConfigLoader
from research.runner.model_manager import ModelManager
from research.runner.template_engine import TemplateEngine

## Configuration Exploration

First, let's explore the available configurations for test suites, hardware profiles, and models.

In [None]:
# Load hardware profiles
hardware_path = '../configs/hardware_profiles.json'
# Decode explicitly as UTF-8 rather than the platform's locale default, so
# non-ASCII text in the file reads the same on every machine.
with open(hardware_path, 'r', encoding='utf-8') as f:
    hardware_profiles = json.load(f)

# Display hardware profiles.
# json_normalize flattens nested objects into dot-separated column names,
# which is why the simulation settings appear as 'simulation_config.<field>'.
# NOTE(review): assumes the file holds a list of profile objects — confirm.
profiles_df = pd.json_normalize(hardware_profiles)
profiles_df[['profile_id', 'description', 'simulation_config.max_memory_mb', 'simulation_config.max_cores']]

In [None]:
# Load model configurations
models_path = '../configs/model_configs.json'
# Decode explicitly as UTF-8 rather than the platform's locale default, so
# non-ASCII text in the file reads the same on every machine.
with open(models_path, 'r', encoding='utf-8') as f:
    model_configs = json.load(f)

# Display model configurations: one row per entry, restricted to the
# identifying columns.
# NOTE(review): assumes the file holds a list of model objects — confirm.
models_df = pd.json_normalize(model_configs)
models_df[['model_id', 'base_model', 'quantization', 'context_window']]

In [None]:
# Load a test suite configuration
test_suite_path = '../configs/test_suites/multi_stage_validation.json'
# Decode explicitly as UTF-8 rather than the platform's locale default, so
# non-ASCII text in the file reads the same on every machine.
with open(test_suite_path, 'r', encoding='utf-8') as f:
    test_suite = json.load(f)

# Display test suite details: identifying fields first, then the templates,
# models and hardware profiles it references, then the number of test cases.
print(f"Test Suite: {test_suite['test_suite_id']}")
print(f"Description: {test_suite['description']}")
print(f"Templates: {test_suite['templates']}")
print(f"Models: {test_suite['models']}")
print(f"Hardware profiles: {test_suite['hardware_profiles']}")
print(f"Number of test cases: {len(test_suite['test_cases'])}")

## Interactive Template Processing

Now, let's explore how templates are processed with variables.

In [None]:
# Build a template engine that reads from the templates config directory
template_engine = TemplateEngine(template_dir='../configs/templates')

# Fetch the template to inspect
template_name = 'validation_template'
template = template_engine.load_template(template_name)

# Show the template's identifying metadata
print("Template ID:", template['id'])
print("Template Type:", template['type'])

# List every constraint as its own bullet line under a heading
print("\nConstraints:")
for item in template['constraints']:
    print("-", item)

# Ask the engine which variables the template refers to
variables = template_engine.extract_template_variables(template)
print("\nVariables in template:", variables)

In [None]:
# Fill the loaded template with a sample question/answer pair
example_variables = dict(
    question='What are the main characteristics of rainforests?',
    answer='Rainforests have high rainfall, rich biodiversity, warm temperatures, and lush vegetation.',
)

processed_prompt = template_engine.process_template(template, example_variables)

# Blank line, heading, then the rendered prompt (one item per output line)
print("\nProcessed Prompt:", processed_prompt, sep="\n")

## Simulated Test Execution

Let's run a simulated test execution with the framework components.

In [None]:
# Create the model manager with its default constructor arguments
model_manager = ModelManager()

# Look up one model's metadata and list it field by field.
# Any failure is reported as a message so the notebook keeps running.
model_id = 'gemma-3-1b-edge'
try:
    model_info = model_manager.get_model_info(model_id)
    print("Model info for " + model_id + ":")
    for field, field_value in model_info.items():
        print(f"- {field}:", field_value)
except Exception as e:
    print("Error getting model info: " + str(e))

In [None]:
# Import remaining components
# (the executor, metrics collector and evaluation engine are imported here
# rather than in the setup cell at the top of the notebook)
from research.runner.test_executor import TestExecutor
from research.runner.metrics_collector import MetricsCollector
from research.runner.evaluation_engine import EvaluationEngine

# Initialize components, each with its default constructor arguments.
# test_executor and metrics_collector are used by the simulated test run in
# the following cell; evaluation_engine is created here but is not referenced
# anywhere else in the visible notebook.
test_executor = TestExecutor()
metrics_collector = MetricsCollector()
evaluation_engine = EvaluationEngine()

In [None]:
# Run a simulated test
def format_metric_line(key, value):
    """Return the '- key: value' report line for one collected metric.

    Integers and floats are shown with two decimals. Booleans are excluded
    from that rule (bool is a subclass of int) so flags print as True/False
    instead of 1.00/0.00. Every other value is shown with its default
    formatting.
    """
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return f"- {key}: {value:.2f}"
    return f"- {key}: {value}"


# Any failure is reported as a message so the notebook keeps running.
try:
    # Initialize model
    model = model_manager.initialize_model(model_id)

    # Start metrics collection
    metrics_collector.start_collection()
    try:
        # Execute test
        test_result = test_executor.execute_test(model, processed_prompt)
    finally:
        # Stop metrics collection even when execution raised, so a failed
        # run does not leave the collector started for the next attempt.
        metrics = metrics_collector.stop_collection()

    # Display results
    print(f"Test execution complete in {test_result['execution_time_ms']}ms")
    print(f"Output: {test_result['output']}")
    print("\nMetrics:")
    for key, value in metrics.items():
        print(format_metric_line(key, value))

except Exception as e:
    print(f"Error running test: {str(e)}")

## Result Analysis

If you have already run some experiments, the cells below load and analyze their results. If no results file is found, they report that and skip the plots.

In [None]:
# Load results from a JSONL file if it exists
def read_jsonl_records(path):
    """Parse a JSON Lines file into a list of decoded records.

    Blank (whitespace-only) lines are skipped silently. Lines that are not
    valid JSON are reported with their 1-based line number and skipped, so one
    bad record does not discard the rest of the file.
    """
    records = []
    with open(path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as exc:
                print(f"Error decoding line {line_number} in {path}: {exc}")
    return records


results_path = '../data/raw/multi_stage_validation/all_results.jsonl'
results = []
# Always bind results_df, so later cells see a fresh (possibly empty) frame
# rather than a missing name or a stale frame left over from an earlier run.
results_df = pd.DataFrame()

if os.path.exists(results_path):
    results = read_jsonl_records(results_path)
    print(f"Loaded {len(results)} results from {results_path}")

    # Convert to DataFrame (nested fields become dot-separated column names)
    results_df = pd.json_normalize(results)

    # Display a sample
    if not results_df.empty:
        sample_columns = ['model_id', 'hardware_profile', 'test_case_id',
                          'metrics.execution_time_ms', 'validation_result.isValid']
        # Show only the columns that are actually present; a record set that
        # lacks one of them would otherwise raise KeyError here.
        present_columns = [col for col in sample_columns if col in results_df.columns]
        missing_columns = [col for col in sample_columns if col not in results_df.columns]
        if missing_columns:
            print(f"Columns missing from results: {missing_columns}")
        print("Sample of results:")
        display(results_df[present_columns].head())
    else:
        print("No results found.")
else:
    print(f"Results file not found: {results_path}")

In [None]:
# Plot aggregated performance when a non-empty results frame is available
if 'results_df' in locals() and not results_df.empty:
    try:
        # Mean of each metric per (model, hardware profile) pair
        mean_of = {
            'metrics.execution_time_ms': 'mean',
            'metrics.memory_usage_mb': 'mean',
            'validation_result.isValid': 'mean',
        }
        performance = results_df.groupby(['model_id', 'hardware_profile']).agg(mean_of).reset_index()

        # Give the aggregated columns short names for plotting
        performance = performance.rename(columns={
            'metrics.execution_time_ms': 'execution_time_ms',
            'metrics.memory_usage_mb': 'memory_usage_mb',
            'validation_result.isValid': 'validation_success_rate',
        })

        # Express the validation success rate as a percentage
        performance['validation_success_rate'] = performance['validation_success_rate'].mul(100)

        # Figure 1: grouped bars of execution time, one group per hardware profile
        plt.figure(figsize=(12, 6))
        ax = sns.barplot(data=performance, x='hardware_profile', y='execution_time_ms', hue='model_id')
        ax.set_title('Execution Time by Hardware Profile and Model')
        ax.set_xlabel('Hardware Profile')
        ax.set_ylabel('Execution Time (ms)')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

        # Figure 2: memory vs. execution time, marker size = validation success rate
        plt.figure(figsize=(10, 6))
        ax = sns.scatterplot(
            data=performance,
            x='memory_usage_mb',
            y='execution_time_ms',
            hue='model_id',
            size='validation_success_rate',
            sizes=(50, 250),
            alpha=0.7,
        )
        ax.set_title('Resource-Performance Tradeoff')
        ax.set_xlabel('Memory Usage (MB)')
        ax.set_ylabel('Execution Time (ms)')

        # Label each point with its hardware profile, offset slightly from the marker
        point_labels = zip(
            performance['hardware_profile'],
            performance['memory_usage_mb'],
            performance['execution_time_ms'],
        )
        for profile_name, memory_mb, time_ms in point_labels:
            ax.annotate(profile_name, (memory_mb, time_ms),
                        xytext=(5, 5), textcoords='offset points')

        plt.tight_layout()
        plt.show()

    except Exception as e:
        print("Error creating visualization: " + str(e))

## Manual Runner Execution

Finally, we can run a test suite programmatically by calling `RunnerCore` directly. The runner lines in the cell below are commented out to prevent accidental execution.

In [None]:
# Where the suite definition lives and where this notebook's run writes results
test_suite_path = '../configs/test_suites/multi_stage_validation.json'
output_dir = '../data/raw/notebook_test'

# Make sure the output directory is present (no error if it already exists)
os.makedirs(output_dir, exist_ok=True)

# Runner construction is left commented out to prevent accidental execution
# runner = RunnerCore(config_path=test_suite_path, output_dir=output_dir, log_level="INFO")

# Uncomment (together with the runner line above) to run a test suite
# results = runner.run_test_suite()

print(
    "To run the test suite, uncomment the lines above.",
    f"Results will be saved to: {output_dir}",
    sep="\n",
)

## Conclusion

This notebook demonstrates how to use the EdgePrompt research framework to process templates, execute tests, and analyze results. You can extend it to perform more advanced analysis and visualization based on your specific research needs.