# LLM Reasoning Framework Comparison

**Objective:** Compare ReAct, Chain-of-Thought, and Tree-of-Thoughts frameworks across three distinct task types.

**Experiment Design:**
- 1 task per domain (code, planning, structuring)
- 3 frameworks × 3 tasks × 3 runs = 27 experiments total
- Automated evaluation + manual response analysis

## Setup and Configuration

In [None]:
# Essential imports
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from pathlib import Path
from dotenv import load_dotenv

# Project imports
import sys
sys.path.append('.')
from agents import AgentFactory
from tasks import TaskGenerator, TaskValidator
from utils import LLMManager, ExperimentResult

# Populate the process environment via python-dotenv before anything reads it;
# later cells look up GOOGLE_API_KEY and DEFAULT_MODEL with os.getenv.
load_dotenv()
print("‚úÖ Environment loaded")

In [None]:
# Report whether the provider API key looks usable.
_google_key = os.getenv('GOOGLE_API_KEY')
# Usable means: set, non-empty, and not a value starting with "your_"
# (presumably an unedited template placeholder).
_google_ready = bool(_google_key) and not _google_key.startswith('your_')
api_status = {'Google': '‚úÖ' if _google_ready else '‚ùå'}

print("API Configuration:")
for provider, status in api_status.items():
    print(f"  {provider}: {status}")

# Run settings
MODEL = os.getenv('DEFAULT_MODEL')
DEMO_MODE = True  # Flip to False to make real API calls

_demo_label = 'ON' if DEMO_MODE else 'OFF'
print(f"\nModel: {MODEL}")
print(f"Demo Mode: {_demo_label}")

## Task Definitions

Three carefully selected tasks representing different cognitive demands:

In [None]:
# Fetch the task catalogue and print a short summary of each entry.
task_generator = TaskGenerator()
all_tasks = task_generator.get_all_tasks()

print("üìã EXPERIMENT TASKS")
print("=" * 50)

for task_type, tasks in all_tasks.items():
    # Only the first task of each type is summarised.
    task = tasks[0]
    _heading = task_type.replace('_', ' ').title()
    print(f"\nüéØ {_heading}")
    print(f"   ID: {task.id}")
    print(f"   Title: {task.title}")
    _prompt_head = task.prompt[:100]
    print(f"   Prompt: {_prompt_head}...")
    _criteria_count = len(task.validation_criteria)
    print(f"   Criteria: {_criteria_count} validation points")

# Count every task across all types, not just the ones displayed above.
_task_total = sum(map(len, all_tasks.values()))
print(f"\n‚úÖ Total: {_task_total} tasks loaded")

## Reasoning Frameworks

**ReAct:** Combines reasoning and action in iterative cycles  
**Chain-of-Thought:** Sequential step-by-step logical reasoning  
**Tree-of-Thoughts:** Explores multiple reasoning branches

In [None]:
# Discover the available reasoning frameworks and create the LLM manager.
frameworks = AgentFactory.get_available_frameworks()
llm_manager = LLMManager()

print("üß† REASONING FRAMEWORKS")
print("=" * 50)
for framework in frameworks:
    print(f"‚úÖ {framework.upper()}")

_framework_count = len(frameworks)
# The two factors of 3 are the fixed design values: 3 task types and 3 runs.
_experiment_total = _framework_count * 3 * 3
print(f"\nüéØ Experiment Design: {_framework_count} frameworks √ó 1 task per type √ó 3 runs = {_experiment_total} total experiments")

## Experiment Execution

In [None]:
def run_experiment(demo_mode=True, runs_per_task=3):
    """Run the complete experiment and return the collected results.

    Args:
        demo_mode: When True, build mock ``ExperimentResult`` objects without
            making any API call. When False, delegate to
            ``ExperimentRunner`` from the project's ``run_experiment`` module.
        runs_per_task: Number of runs for every (task type, framework) pair.

    Returns:
        In demo mode, a list with one ``ExperimentResult`` per task type in
        the notebook-level ``all_tasks``, per entry in ``frameworks``, per
        run. In live mode, whatever
        ``ExperimentRunner.run_framework_comparison()`` returns.

    Raises:
        KeyError: In demo mode, if ``frameworks`` contains a name other than
            ``'react'``, ``'cot'`` or ``'tot'`` (no mock baseline exists).
    """
    results = []

    if demo_mode:
        # Imported here so the cell stays self-contained.
        import zlib

        print("üî∏ DEMO MODE: Generating mock results (no API calls)")

        # Per-framework baselines for the mock metrics; they do not depend on
        # the task or the run, so they are built once.
        base_scores = {'react': 80, 'cot': 75, 'tot': 85}
        base_times = {'react': 2.5, 'cot': 1.8, 'tot': 3.2}
        base_tokens = {'react': 850, 'cot': 650, 'tot': 1200}

        for task_type, tasks in all_tasks.items():
            task = tasks[0]
            # Task-specific score offset in [-5, 4]. The built-in hash() of a
            # str is salted per interpreter process, which made the mock data
            # change between kernel restarts; CRC32 is stable across runs.
            task_offset = zlib.crc32(str(task.id).encode('utf-8')) % 10 - 5

            for framework in frameworks:
                for run in range(runs_per_task):
                    score = base_scores[framework] + (run * 2) + task_offset
                    passed = score >= 70

                    result = ExperimentResult(
                        timestamp=datetime.now().isoformat(),
                        framework=framework,
                        task_id=task.id,
                        task_type=task_type,
                        run_number=run + 1,
                        success=True,
                        tokens_used=base_tokens[framework] + (run * 50),
                        execution_time=base_times[framework] + (run * 0.3),
                        memory_usage=8.5 + (run * 0.2),
                        reasoning_steps=5 + run,
                        final_answer=f"Mock {framework.upper()} solution for {task.title} (Run {run+1}): This is a comprehensive response demonstrating the framework's approach...",
                        intermediate_steps=[f"Step {i+1}: {framework} reasoning step" for i in range(3+run)],
                        # Reported score is clamped to the 60-100 band.
                        validation_score=max(60, min(100, score)),
                        validation_passed=passed,
                        validation_issues=[] if passed else ["Mock validation issue"],
                        error_message=None
                    )
                    results.append(result)
    else:
        print("üî¥ LIVE MODE: Making real API calls")
        # Imported lazily so demo mode never needs the live runner module.
        from run_experiment import ExperimentRunner
        runner = ExperimentRunner(model_name=MODEL, runs_per_task=runs_per_task)
        results = runner.run_framework_comparison()

    print(f"\n‚úÖ Experiment completed: {len(results)} results")
    return results

# Run the experiment once. DEMO_MODE selects mock data versus real API calls;
# runs_per_task is left at run_experiment's default.
experiment_results = run_experiment(demo_mode=DEMO_MODE)

## Results Analysis

In [None]:
# Flatten every result object into one DataFrame row.
# Maps DataFrame column name -> attribute read from each result.
_COLUMN_SOURCES = {
    'framework': 'framework',
    'task_type': 'task_type',
    'task_id': 'task_id',
    'run': 'run_number',
    'success': 'success',
    'score': 'validation_score',
    'time': 'execution_time',
    'tokens': 'tokens_used',
    'steps': 'reasoning_steps',
}
_rows = [
    {column: getattr(r, attribute) for column, attribute in _COLUMN_SOURCES.items()}
    for r in experiment_results
]
df = pd.DataFrame(_rows)

print("üìä RESULTS SUMMARY")
print("=" * 50)
_experiment_count = len(df)
print(f"Total experiments: {_experiment_count}")
_success_rate = df['success'].mean()
print(f"Success rate: {_success_rate:.1%}")
_mean_score = df['score'].mean()
print(f"Average score: {_mean_score:.1f}/100")
_mean_time = df['time'].mean()
print(f"Average time: {_mean_time:.1f}s")
_mean_tokens = df['tokens'].mean()
print(f"Average tokens: {_mean_tokens:.0f}")

# Trailing expression: the notebook renders the first rows as cell output.
df.head()

In [None]:
# Per-framework aggregates: score mean/std plus mean time, tokens and success.
_framework_aggregations = {
    'score': ['mean', 'std'],
    'time': 'mean',
    'tokens': 'mean',
    'success': 'mean',
}
_by_framework = df.groupby('framework')
framework_stats = _by_framework.agg(_framework_aggregations).round(2)

print("üèÜ FRAMEWORK COMPARISON")
print("=" * 50)
# Trailing expression: the notebook renders the table as cell output.
framework_stats

In [None]:
# Per-task-type aggregates: score mean/std plus mean time and tokens.
_task_aggregations = {
    'score': ['mean', 'std'],
    'time': 'mean',
    'tokens': 'mean',
}
_by_task_type = df.groupby('task_type')
task_stats = _by_task_type.agg(_task_aggregations).round(2)

print("üìã TASK TYPE ANALYSIS")
print("=" * 50)
# Trailing expression: the notebook renders the table as cell output.
task_stats

In [None]:
# 2x2 dashboard of the experiment results.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
fig.suptitle('LLM Reasoning Framework Comparison', fontsize=16)

# Name each panel instead of indexing the axes grid repeatedly.
(_ax_scores, _ax_times), (_ax_heatmap, _ax_tokens) = axes

# Top-left: distribution of validation scores per framework.
sns.boxplot(data=df, x='framework', y='score', ax=_ax_scores)
_ax_scores.set_title('Validation Scores by Framework')
_ax_scores.set_ylabel('Score (0-100)')

# Top-right: execution time per framework.
sns.barplot(data=df, x='framework', y='time', ax=_ax_times)
_ax_times.set_title('Execution Time by Framework')
_ax_times.set_ylabel('Time (seconds)')

# Bottom-left: mean score for each framework / task-type combination.
_score_grid = df.pivot_table(
    values='score',
    index='framework',
    columns='task_type',
    aggfunc='mean',
)
sns.heatmap(_score_grid, annot=True, cmap='RdYlGn', ax=_ax_heatmap)
_ax_heatmap.set_title('Average Score by Framework & Task')

# Bottom-right: score against token usage; marker size encodes time.
sns.scatterplot(
    data=df,
    x='tokens',
    y='score',
    hue='framework',
    size='time',
    ax=_ax_tokens,
)
_ax_tokens.set_title('Score vs Token Usage')
_ax_tokens.set_xlabel('Tokens Used')
_ax_tokens.set_ylabel('Score')

plt.tight_layout()
plt.show()

## Detailed Response Analysis

Examine actual LLM outputs for qualitative insights:

In [None]:
# Show detailed responses for each framework on each task
def _preview_text(text, limit):
    """Return ``text`` cut to ``limit`` characters, adding "..." only if cut.

    ``None`` is treated as an empty string and non-string values are passed
    through ``str()`` so a missing answer or a structured step cannot crash
    the report.
    """
    text = "" if text is None else str(text)
    return text[:limit] + "..." if len(text) > limit else text


print("üîç DETAILED RESPONSE ANALYSIS")
print("=" * 60)

for task_type in df['task_type'].unique():
    print(f"\nüìã Task: {task_type.replace('_', ' ').title()}")
    print("-" * 40)

    task_results = [r for r in experiment_results if r.task_type == task_type]

    # Show the highest-scoring run of each framework
    for framework in frameworks:
        framework_results = [r for r in task_results if r.framework == framework]
        if not framework_results:
            # max() raises ValueError on an empty sequence; a framework may
            # have produced no result for this task, so report and move on.
            print(f"\n{framework.upper()}: no results recorded for this task")
            continue
        best_result = max(framework_results, key=lambda x: x.validation_score)

        print(f"\nüß† {framework.upper()} (Score: {best_result.validation_score:.0f}/100)")

        # Response preview, truncated to 200 characters
        print(f"Response: {_preview_text(best_result.final_answer, 200)}")

        # First two reasoning steps, each truncated to 80 characters
        if best_result.intermediate_steps:
            print(f"Reasoning Steps: {len(best_result.intermediate_steps)}")
            for number, step in enumerate(best_result.intermediate_steps[:2], start=1):
                print(f"  {number}. {_preview_text(step, 80)}")

        # Validation issues, if any were recorded
        if best_result.validation_issues:
            print(f"Issues: {', '.join(best_result.validation_issues)}")

    print("\n" + "=" * 60)

## Key Insights & Conclusions

In [None]:
# Generate headline insights from the results DataFrame
print("üí° KEY INSIGHTS")
print("=" * 50)

# Group once and reuse: each per-framework metric below comes from this.
framework_groups = df.groupby('framework')
mean_score_by_framework = framework_groups['score'].mean()

# Best performing framework overall (highest mean score)
best_framework = mean_score_by_framework.idxmax()
best_score = mean_score_by_framework.max()
print(f"üèÜ Best Overall Framework: {best_framework.upper()} (avg score: {best_score:.1f})")

# Most challenging task (lowest mean score)
mean_score_by_task = df.groupby('task_type')['score'].mean()
hardest_task = mean_score_by_task.idxmin()
hardest_score = mean_score_by_task.min()
print(f"üéØ Most Challenging Task: {hardest_task.replace('_', ' ').title()} (avg score: {hardest_score:.1f})")

# Efficiency: mean score divided by mean time, per framework. Dividing the
# two per-group means gives the same numbers as a groupby-apply over the
# whole frame while avoiding the deprecated "apply on grouping columns" path.
efficiency = (mean_score_by_framework / framework_groups['time'].mean()).round(2)
most_efficient = efficiency.idxmax()
print(f"‚ö° Most Efficient Framework: {most_efficient.upper()} (score/time ratio: {efficiency.max():.1f})")

# Consistency: smallest standard deviation of score
consistency = framework_groups['score'].std()
most_consistent = consistency.idxmin()
print(f"üìä Most Consistent Framework: {most_consistent.upper()} (std dev: {consistency.min():.1f})")

print("\nüìà PERFORMANCE MATRIX:")
performance_matrix = df.pivot_table(values='score', index='framework', columns='task_type', aggfunc='mean').round(1)
print(performance_matrix)

print("\nüî¨ STATISTICAL SUMMARY:")
print(f"‚Ä¢ Score range: {df['score'].min():.1f} - {df['score'].max():.1f}")
print(f"‚Ä¢ Time range: {df['time'].min():.1f}s - {df['time'].max():.1f}s")
print(f"‚Ä¢ Token range: {df['tokens'].min():.0f} - {df['tokens'].max():.0f}")
print(f"‚Ä¢ Overall success rate: {df['success'].mean():.1%}")

## Save Results

In [None]:
# Persist the run: a summary CSV plus a detailed JSON, both timestamped.
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
results_dir = Path("results")
results_dir.mkdir(exist_ok=True)

# Summary CSV: the analysis DataFrame without its index column.
csv_file = results_dir / f"experiment_summary_{timestamp}.csv"
df.to_csv(csv_file, index=False)

# Detailed JSON: one object per result, keyed by attribute name.
json_file = results_dir / f"detailed_results_{timestamp}.json"
_DETAIL_FIELDS = (
    'timestamp',
    'framework',
    'task_id',
    'task_type',
    'run_number',
    'success',
    'validation_score',
    'execution_time',
    'tokens_used',
    'reasoning_steps',
    'final_answer',
    'intermediate_steps',
    'validation_issues',
    'error_message',
)
detailed_data = [
    {field: getattr(r, field) for field in _DETAIL_FIELDS}
    for r in experiment_results
]

with json_file.open('w') as f:
    json.dump(detailed_data, f, indent=2)

print("‚úÖ Results saved:")
print(f"   Summary: {csv_file}")
print(f"   Detailed: {json_file}")
print("\nüéØ Experiment complete! Check the results directory for full data.")