# StarCoder Multi-Agent System - Result Analysis

This notebook focuses on analyzing and interpreting results from the StarCoder Multi-Agent System. You'll learn how to:

1. Parse and analyze generation results
2. Extract insights from code reviews
3. Identify patterns in quality metrics
4. Create comprehensive reports
5. Compare different approaches

## Prerequisites

- Complete the Basic and Advanced tutorials first
- Results from previous runs
- Analysis libraries: `pandas`, `matplotlib`, `seaborn`, `plotly`


In [None]:
# Import analysis libraries
import asyncio
import sys
import json
import re
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime
from typing import List, Dict, Any, Optional

# Add project root to Python path
sys.path.insert(0, str(Path.cwd().parent))

from orchestrator import process_simple_task
from communication.message_schema import OrchestratorRequest

# Set up plotting
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*";
# earlier releases only ship the un-prefixed "seaborn" name. Pick whichever
# one this installation provides so the cell does not fail with OSError.
_SEABORN_STYLE = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_SEABORN_STYLE)
sns.set_palette("husl")

# Echo the environment so saved notebook output records where it was run.
print("✅ Analysis libraries imported successfully!")
print(f"📁 Working directory: {Path.cwd()}")
print(f"🐍 Python version: {sys.version}")


## 1. Generate Sample Data for Analysis

Let's create a diverse set of tasks to analyze different aspects of code generation.


In [None]:
# Define diverse tasks for comprehensive analysis
# Each row is (category, description, requirements, expected_complexity);
# the rows are expanded into dicts below so downstream cells can use keys.
_TASK_FIELDS = ("category", "description", "requirements", "expected_complexity")
_TASK_ROWS = [
    (
        "Algorithms",
        "Implement a quicksort algorithm",
        ["Use recursive approach", "Include type hints", "Handle edge cases"],
        "medium",
    ),
    (
        "Data Structures",
        "Create a binary search tree implementation",
        ["Include insert, delete, search operations", "Add tree traversal methods", "Handle balancing"],
        "high",
    ),
    (
        "Web Development",
        "Build a REST API endpoint for user authentication",
        ["Use FastAPI", "Include JWT tokens", "Add password hashing", "Include validation"],
        "medium",
    ),
    (
        "Data Processing",
        "Create a CSV data processor with filtering and aggregation",
        ["Use pandas", "Handle missing data", "Add data validation", "Include error handling"],
        "medium",
    ),
    (
        "Machine Learning",
        "Implement a simple linear regression model",
        ["Use numpy", "Include gradient descent", "Add model evaluation", "Include visualization"],
        "high",
    ),
    (
        "Utilities",
        "Create a file organizer that sorts files by type",
        ["Use pathlib", "Handle different file types", "Add logging", "Include progress tracking"],
        "low",
    ),
]
analysis_tasks = [dict(zip(_TASK_FIELDS, row)) for row in _TASK_ROWS]

# Show a numbered overview of what is about to be generated.
print(f"📝 Analyzing {len(analysis_tasks)} diverse tasks:")
print("\n".join(
    f"   {number}. [{entry['category']}] {entry['description']}"
    for number, entry in enumerate(analysis_tasks, start=1)
))

# Process all tasks
print("\n⏳ Processing tasks for analysis...")
results = []

for i, task_data in enumerate(analysis_tasks):
    print(f"   Processing task {i+1}/{len(analysis_tasks)}: {task_data['description'][:50]}...")
    
    try:
        result = await process_simple_task(
            task_description=task_data["description"],
            language="python",
            requirements=task_data["requirements"]
        )
        
        # Add metadata for analysis
        if result.success:
            result.category = task_data["category"]
            result.expected_complexity = task_data["expected_complexity"]
            result.task_description = task_data["description"]
        
        results.append(result)
        
    except Exception as e:
        print(f"   ❌ Task {i+1} failed: {e}")
        results.append(None)

print(f"✅ Processing completed!")


In [None]:
# Extract data for analysis

# Patterns are anchored to the start of a line so that prose inside comments
# or docstrings ("this class handles ...", "raises an exception") is not
# counted as a definition or as error handling.
_FUNCTION_RE = re.compile(r'^[ \t]*(?:async[ \t]+)?def[ \t]+\w+', re.MULTILINE)
_CLASS_RE = re.compile(r'^[ \t]*class[ \t]+\w+', re.MULTILINE)
_IMPORT_RE = re.compile(r'^(?:import|from)\s+', re.MULTILINE)
_ERROR_HANDLING_RE = re.compile(r'^[ \t]*(?:try[ \t]*:|except\b)', re.MULTILINE)
# Either an annotated parameter inside a def signature or a return annotation.
# A bare ': ' test would also fire on every dict literal and slice.
_TYPE_HINT_RE = re.compile(r'\bdef[ \t]+\w+[ \t]*\([^)]*\w[ \t]*:[ \t]*\w|\)\s*->')


def _code_metrics(code, tests):
    """Return size and heuristic quality indicators for generated source.

    Args:
        code: Generated implementation source; None is treated as empty.
        tests: Generated test source; None is treated as empty.

    Returns:
        Dict with line counts, definition/import counts and three boolean
        flags. The flags are text heuristics, not a parse of the code.
    """
    code = code or ""
    tests = tests or ""

    # splitlines() reports 0 for empty text and ignores a trailing newline,
    # unlike split('\n'), which over-counts by one in both cases.
    code_lines = len(code.splitlines())
    test_lines = len(tests.splitlines())

    return {
        'code_lines': code_lines,
        'test_lines': test_lines,
        'total_lines': code_lines + test_lines,
        'function_count': len(_FUNCTION_RE.findall(code)),
        'class_count': len(_CLASS_RE.findall(code)),
        'import_count': len(_IMPORT_RE.findall(code)),
        'has_docstrings': '"""' in code or "'''" in code,
        'has_type_hints': _TYPE_HINT_RE.search(code) is not None,
        'has_error_handling': _ERROR_HANDLING_RE.search(code) is not None,
    }


analysis_data = []

for result in results:
    # Entries are None when the task raised, and may be unsuccessful results.
    if not (result and result.success):
        continue

    generation = result.generation_result
    review = result.review_result

    # Key order here determines the DataFrame column order.
    analysis_data.append({
        'category': result.category,
        'task_description': result.task_description,
        'expected_complexity': result.expected_complexity,
        'actual_complexity': generation.metadata.complexity,
        'quality_score': review.code_quality_score,
        'workflow_time': result.workflow_time,
        'tokens_used': generation.tokens_used,
        **_code_metrics(generation.generated_code, generation.tests),
        'pep8_score': review.metrics.get('pep8_score', 0),
        'test_coverage': review.metrics.get('test_coverage', 'unknown'),
        'issues_count': len(review.issues),
        'recommendations_count': len(review.recommendations)
    })

print(f"📊 Extracted data for {len(analysis_data)} successful tasks")
print(f"❌ {len(results) - len(analysis_data)} tasks failed")

# Create DataFrame for analysis
df = pd.DataFrame(analysis_data)

if not df.empty:
    print("\n📋 Analysis Dataset Overview:")
    print(f"• Categories: {df['category'].nunique()}")
    print(f"• Average quality score: {df['quality_score'].mean():.1f}/10")
    print(f"• Average workflow time: {df['workflow_time'].mean():.2f}s")
    print(f"• Average lines of code: {df['total_lines'].mean():.0f}")
    print(f"• Total tokens used: {df['tokens_used'].sum()}")
else:
    print("❌ No data available for analysis")


In [None]:
# Create comprehensive analysis visualizations
# Builds a 3x3 Plotly dashboard from `df` (one row per successful task) and
# then prints summary statistics. Nothing is drawn when `df` is empty.
if not df.empty:
    # Create subplots; each spec names the trace type hosted by that cell.
    fig = make_subplots(
        rows=3, cols=3,
        subplot_titles=[
            'Quality Score by Category', 'Workflow Time by Category',
            'Code Lines by Category', 'Complexity Distribution',
            'Quality vs Time', 'Code Metrics',
            # This panel plots a count of successful tasks, not a rate.
            'PEP8 Scores', 'Token Usage', 'Successful Tasks by Category'
        ],
        specs=[[{"type": "bar"}, {"type": "bar"}, {"type": "bar"}],
               [{"type": "pie"}, {"type": "scatter"}, {"type": "bar"}],
               [{"type": "bar"}, {"type": "bar"}, {"type": "bar"}]]
    )
    
    # Quality score by category
    quality_by_category = df.groupby('category')['quality_score'].mean().reset_index()
    fig.add_trace(
        go.Bar(x=quality_by_category['category'], y=quality_by_category['quality_score'],
               name='Quality Score', marker_color='lightblue'),
        row=1, col=1
    )
    
    # Workflow time by category
    time_by_category = df.groupby('category')['workflow_time'].mean().reset_index()
    fig.add_trace(
        go.Bar(x=time_by_category['category'], y=time_by_category['workflow_time'],
               name='Workflow Time', marker_color='lightcoral'),
        row=1, col=2
    )
    
    # Code lines by category
    lines_by_category = df.groupby('category')['total_lines'].mean().reset_index()
    fig.add_trace(
        go.Bar(x=lines_by_category['category'], y=lines_by_category['total_lines'],
               name='Total Lines', marker_color='lightgreen'),
        row=1, col=3
    )
    
    # Complexity distribution
    # The legend is disabled for the whole figure, so slice labels must be
    # drawn on the pie itself or the slices cannot be told apart.
    complexity_counts = df['actual_complexity'].value_counts()
    fig.add_trace(
        go.Pie(labels=complexity_counts.index, values=complexity_counts.values,
               name='Complexity', textinfo='label+percent'),
        row=2, col=1
    )
    
    # Quality vs Time scatter (marker colour encodes tokens used)
    fig.add_trace(
        go.Scatter(x=df['workflow_time'], y=df['quality_score'],
                   mode='markers', name='Quality vs Time',
                   text=df['category'], marker=dict(size=10, color=df['tokens_used'],
                   colorscale='Viridis', showscale=True)),
        row=2, col=2
    )
    
    # Code metrics: number of tasks whose code exhibits each feature
    metrics_data = [
        df['has_docstrings'].sum(),
        df['has_type_hints'].sum(),
        df['has_error_handling'].sum()
    ]
    fig.add_trace(
        go.Bar(x=['Docstrings', 'Type Hints', 'Error Handling'], y=metrics_data,
               name='Code Features', marker_color='gold'),
        row=2, col=3
    )
    
    # PEP8 scores
    fig.add_trace(
        go.Bar(x=df['category'], y=df['pep8_score'],
               name='PEP8 Score', marker_color='purple'),
        row=3, col=1
    )
    
    # Token usage
    fig.add_trace(
        go.Bar(x=df['category'], y=df['tokens_used'],
               name='Token Usage', marker_color='orange'),
        row=3, col=2
    )
    
    # Successful task count by category (df only holds tasks that succeeded)
    success_by_category = df.groupby('category').size().reset_index(name='count')
    fig.add_trace(
        go.Bar(x=success_by_category['category'], y=success_by_category['count'],
               name='Task Count', marker_color='pink'),
        row=3, col=3
    )
    
    # Update layout
    fig.update_layout(
        title_text="Comprehensive Code Generation Analysis",
        title_x=0.5,
        height=1200,
        showlegend=False
    )
    
    fig.show()

    def _spread(column):
        """Return the sample std of df[column], or 0.0 when it is undefined.

        With a single successful task the sample standard deviation is NaN,
        which would otherwise be printed as "± nan".
        """
        value = df[column].std()
        return 0.0 if pd.isna(value) else value

    # Display summary statistics
    print("\n📊 Summary Statistics:")
    print("=" * 50)
    print(f"• Average Quality Score: {df['quality_score'].mean():.2f} ± {_spread('quality_score'):.2f}")
    print(f"• Average Workflow Time: {df['workflow_time'].mean():.2f}s ± {_spread('workflow_time'):.2f}s")
    print(f"• Average Code Lines: {df['total_lines'].mean():.0f} ± {_spread('total_lines'):.0f}")
    print(f"• Average Functions: {df['function_count'].mean():.1f} ± {_spread('function_count'):.1f}")
    print(f"• Average Classes: {df['class_count'].mean():.1f} ± {_spread('class_count'):.1f}")
    print(f"• Docstring Coverage: {df['has_docstrings'].mean()*100:.1f}%")
    print(f"• Type Hint Coverage: {df['has_type_hints'].mean()*100:.1f}%")
    print(f"• Error Handling Coverage: {df['has_error_handling'].mean()*100:.1f}%")
else:
    print("❌ No data available for visualization")
