# Experiment 1: Cross-Agent Comparison Analysis

This notebook analyzes MCP performance across different coding agents (goose-cli, claude-code, gemini-cli) and, for goose-cli, across different models (gpt-4o-mini, gpt-4o, gpt-5).

**Objective:** Determine whether agent choice affects MCP retrieval performance.

**See:** `notes/experiment_1_cross_agent_comparison.md` for detailed experimental design.

In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import yaml
from matplotlib.lines import Line2D
from matplotlib.patches import Patch

# Plot defaults for the figures below: apply seaborn's "whitegrid" style
# first, then set the default figure size on top of it.
sns.set_style("whitegrid")
plt.rcParams.update({"figure.figsize": (12, 6)})

## Load Results from Different Agents

In [2]:
# Define result files for each agent (November 6, 2025 run)
result_files = {
    "claude": "../results/compare_agents/claude_20251106.yaml",  # Claude Code results
    "gemini": "../results/compare_agents/gemini_20251106.yaml",  # Gemini CLI results
    "goose": "../results/compare_agents/goose_20251106.yaml",  # Goose CLI results
}

# Load every result file that exists. A missing file is reported and skipped
# so the rest of the notebook can still run on a partial set of agents.
agent_results = {}
for agent, filepath in result_files.items():
    if not Path(filepath).exists():
        print(f"✗ Results not found for {agent}: {filepath}")
        continue
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which differs between operating systems.
    with open(filepath, "r", encoding="utf-8") as f:
        agent_results[agent] = yaml.safe_load(f)
    print(f"✓ Loaded results for {agent}: {filepath}")

print(f"\n{len(agent_results)} agent(s) loaded: {list(agent_results.keys())}")

✓ Loaded results for claude: ../results/compare_agents/claude_20251106.yaml


✓ Loaded results for gemini: ../results/compare_agents/gemini_20251106.yaml


✓ Loaded results for goose: ../results/compare_agents/goose_20251106.yaml

3 agent(s) loaded: ['claude', 'gemini', 'goose']


## Convert to DataFrames

In [3]:
# Build one DataFrame per agent from the "results" list of its loaded file.
dfs = {}
for agent, results in agent_results.items():
    dfs[agent] = (
        pd.DataFrame(results["results"])
        .explode("servers")  # one row per entry of the "servers" list
        .assign(
            MCP=lambda frame: frame["servers"],  # server name under the column the plots use
            agent=agent,  # tag every row with the agent it came from
        )
    )

# Stack the per-agent frames into a single DataFrame and report what it holds.
if not dfs:
    print("No results loaded yet. Run experiments first.")
else:
    df_combined = pd.concat(dfs.values(), ignore_index=True)
    print(f"Combined dataset shape: {df_combined.shape}")
    print(f"\nAgents: {df_combined['agent'].unique()}")
    print(f"MCPs: {df_combined['MCP'].unique()}")
    print(f"Case groups: {df_combined['case_group'].unique()}")

Combined dataset shape: (300, 16)

Agents: ['claude' 'gemini' 'goose']
MCPs: ['artl' 'simple-pubmed' 'biomcp' 'pubmed-mcp']
Case groups: ['Text extraction' 'Metadata' 'Summarization'
 'Table / Figure / Figure Legend extraction' 'Supplementary material'
 'Publication status']


## Figure 1: MCP Performance Across Coding Agents

4 MCPs × 3 agents (goose-cli, claude-code, gemini-cli) - Distribution of semantic similarity scores for each combination, with the 0.9 pass threshold marked

In [None]:
if "df_combined" in locals():
    # Figure 1: one violin (plus jittered raw points) of the "score" column
    # for every (MCP, agent) pair, grouped along the x-axis by MCP.
    PASS_THRESHOLD = 0.9  # scores at or above this value count as a pass
    JITTER_SEED = 42  # fixed seed so the strip-plot jitter is identical on every run
    fig1_path = Path("../results/figures/fig1_mcp_performance_by_agent.png")

    mcps = sorted(df_combined["MCP"].unique())
    agents = sorted(df_combined["agent"].unique())

    colors = {'claude': '#1f77b4', 'gemini': '#ff7f0e', 'goose': '#2ca02c'}
    fallback_color = '#333333'  # used for any agent without an entry in `colors`

    # Select the scores of each (MCP, agent) pair once; the violins, the
    # strip overlay and the printed summary all read from this mapping.
    pair_scores = {
        (mcp, agent): df_combined.loc[
            (df_combined["agent"] == agent) & (df_combined["MCP"] == mcp), "score"
        ]
        for mcp in mcps
        for agent in agents
    }

    # Lay the violins out left to right with a one-slot gap between MCP
    # groups. Pairs without any non-null score get no slot, so the tick for
    # each MCP is centred on the slots that were actually used.
    violins = []  # (x position, agent, non-null scores as an array)
    tick_positions = []
    tick_labels = []
    pos = 0
    for mcp in mcps:
        group_positions = []
        for agent in agents:
            valid_scores = pair_scores[(mcp, agent)].dropna().to_numpy()
            if len(valid_scores) > 0:
                violins.append((pos, agent, valid_scores))
                group_positions.append(pos)
                pos += 1
        if group_positions:
            tick_positions.append(sum(group_positions) / len(group_positions))
            tick_labels.append(mcp)
        pos += 1  # Add gap between MCPs

    fig, ax = plt.subplots(figsize=(16, 7))

    if violins:
        parts = ax.violinplot(
            [values for _, _, values in violins],
            positions=[x for x, _, _ in violins],
            widths=0.6,
            showmeans=True,
            showmedians=True,
            showextrema=True,
        )

        # Color each violin body by its agent
        for body, (_, agent, _) in zip(parts['bodies'], violins):
            body.set_facecolor(colors.get(agent, fallback_color))
            body.set_alpha(0.6)

        # Draw the bar / extrema / median / mean lines in thin black
        for partname in ('cbars', 'cmins', 'cmaxes', 'cmedians', 'cmeans'):
            if partname in parts:
                parts[partname].set_edgecolor('black')
                parts[partname].set_linewidth(1)

        # Overlay the individual scores, jittered horizontally for visibility
        rng = np.random.default_rng(JITTER_SEED)
        for x, agent, values in violins:
            x_jitter = rng.normal(x, 0.08, size=len(values))
            ax.scatter(x_jitter, values,
                       color=colors.get(agent, fallback_color),
                       alpha=0.4, s=20, zorder=3, edgecolors='white', linewidths=0.5)

    # Add horizontal line for pass threshold
    ax.axhline(y=PASS_THRESHOLD, color='red', linestyle='--', linewidth=2, alpha=0.6,
               label=f'Pass threshold ({PASS_THRESHOLD})')

    ax.set_xticks(tick_positions)
    ax.set_xticklabels(tick_labels, fontsize=11)
    ax.set_ylabel("Semantic Similarity Score", fontsize=12, fontweight='bold')
    ax.set_xlabel("MCP Server", fontsize=12, fontweight='bold')
    ax.set_title("Figure 1: MCP Performance Across Coding Agents", fontsize=14, fontweight="bold", pad=20)
    ax.set_ylim(-0.05, 1.05)
    ax.grid(axis="y", alpha=0.3)

    # Custom legend: one patch per agent plus the threshold line
    legend_elements = [Patch(facecolor=colors.get(agent, fallback_color), alpha=0.6, label=agent)
                       for agent in agents]
    legend_elements.append(Line2D([0], [0], color='red', linestyle='--', linewidth=2,
                                  label='Pass threshold'))
    ax.legend(handles=legend_elements, title="Agent", title_fontsize=11, fontsize=10, loc='lower right')

    plt.tight_layout()
    # Create the output directory on first use so savefig cannot fail on it
    fig1_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(fig1_path, dpi=300, bbox_inches="tight")
    plt.show()

    # Print summary statistics. The pass rate is taken over all rows of the
    # pair; a missing score compares False against the threshold and so
    # counts as not passed.
    print("\nFigure 1 Summary - Score Statistics by MCP and Agent:")
    for mcp in mcps:
        print(f"\n{mcp}:")
        for agent in agents:
            agent_mcp_scores = pair_scores[(mcp, agent)]
            if len(agent_mcp_scores) == 0:
                continue  # no rows for this pair; avoids dividing by zero
            median = agent_mcp_scores.median()
            pass_rate = (agent_mcp_scores >= PASS_THRESHOLD).sum() / len(agent_mcp_scores) * 100
            print(f"  {agent:10s}: median={median:.3f}, pass_rate={pass_rate:.1f}%")

## Figure 2: Performance by Evaluation Type

Breakdown by test categories comparing all 4 MCPs within each category

In [None]:
if "df_combined" in locals():
    # Figure 2: heatmap of the percentage of passed evaluations for every
    # (case_group, MCP) pair, pooled across all agents.
    fig2_path = Path("../results/figures/fig2_performance_by_category.png")

    # mean / sum / count of the "passed" column per (case_group, MCP)
    category_pass_rates = (
        df_combined.groupby(["case_group", "MCP"])["passed"]
        .agg(['mean', 'sum', 'count'])
        .reset_index()
    )
    category_pass_rates["percent_passed"] = category_pass_rates["mean"] * 100

    # Pivot for heatmap: rows are case groups, columns are MCP servers
    heatmap_data = category_pass_rates.pivot(
        index="case_group", columns="MCP", values="percent_passed"
    )

    # Build the cell labels in long form and pivot them afterwards. Writing
    # strings into a copy of the float-valued heatmap frame would rely on
    # pandas silently changing the column dtype, which is deprecated.
    count_data = category_pass_rates.copy()
    count_data["count_str"] = [
        f"{int(n_passed)}/{int(n_total)}"
        for n_passed, n_total in zip(count_data["sum"], count_data["count"])
    ]
    count_data["annotation"] = [
        f"{pct:.1f}%\n{count_str}"
        for pct, count_str in zip(count_data["percent_passed"], count_data["count_str"])
    ]
    count_pivot = count_data.pivot(
        index="case_group", columns="MCP", values="count_str"
    )
    # Pairs that never occur come out of the pivot as NaN; give them an
    # empty label rather than the text "nan".
    annot_data = count_data.pivot(
        index="case_group", columns="MCP", values="annotation"
    ).fillna("")

    # Create heatmap with annotations
    fig, ax = plt.subplots(figsize=(12, 8))

    sns.heatmap(
        heatmap_data,
        annot=annot_data,
        fmt="",  # annotations are already formatted strings
        cmap="RdYlGn",
        vmin=0,
        vmax=100,
        cbar_kws={"label": "% Passed"},
        linewidths=0.5,
        ax=ax,
        annot_kws={"fontsize": 9}
    )

    ax.set_title("Figure 2: Performance by Evaluation Type", fontsize=14, fontweight="bold", pad=20)
    ax.set_xlabel("MCP Server", fontsize=12, fontweight='bold')
    ax.set_ylabel("Evaluation Category", fontsize=12, fontweight='bold')
    ax.set_xticklabels(ax.get_xticklabels(), rotation=0)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0)

    plt.tight_layout()
    # Create the output directory on first use so savefig cannot fail on it
    fig2_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(fig2_path, dpi=300, bbox_inches="tight")
    plt.show()

    print("\nFigure 2 Summary - Pass Rates by Category and MCP:")
    print("\nPercentages:")
    print(heatmap_data.round(1))
    print("\nCounts (passed/total):")
    print(count_pivot)

## Figure 3: Model Comparison

Same agent (goose-cli) tested with different models (gpt-4o, gpt-4o-mini, gpt-5) - How model choice affects MCP performance

In [None]:
# Load model comparison data (goose-cli with different models).
# Entries are listed in order of capability; that order is reused for the
# plot and the printed summaries.
model_files = {
    "gpt-4o-mini": "../results/compare_models/goose_gpt4o_mini_20251105.yaml",
    "gpt-4o": "../results/compare_models/goose_gpt4o_20251104.yaml",
    "gpt-5": "../results/compare_models/goose_gpt5_20251104.yaml",
}

# Load every model result file that exists; missing files are reported and skipped
model_results = {}
for model, filepath in model_files.items():
    if not Path(filepath).exists():
        print(f"✗ {model} results not found")
        continue
    # Decode explicitly as UTF-8 instead of relying on the platform default
    with open(filepath, "r", encoding="utf-8") as f:
        model_results[model] = yaml.safe_load(f)
    print(f"✓ Loaded {model} results")

# Convert to DataFrames: one row per entry of the "servers" list, tagged with its model
model_dfs = {}
for model, results in model_results.items():
    df = pd.DataFrame(results["results"])
    df = df.explode("servers")
    df["MCP"] = df["servers"]
    df["model"] = model
    model_dfs[model] = df

# Combine model comparison data
if model_dfs:
    PASS_THRESHOLD = 0.9  # scores at or above this value count as a pass
    JITTER_SEED = 42  # fixed seed so the strip-plot jitter is identical on every run
    fig3_path = Path("../results/figures/fig3_model_comparison.png")

    df_models = pd.concat(model_dfs.values(), ignore_index=True)

    mcps = sorted(df_models["MCP"].unique())
    # Only the models that actually loaded, kept in the order of `model_files`,
    # so neither the legend nor the summaries mention a model with no data.
    models = [model for model in model_files if model in model_dfs]

    # Blue gradient: light → medium → dark
    model_colors = {
        'gpt-4o-mini': '#a6cee3',  # light blue
        'gpt-4o': '#2b8cbe',        # medium blue
        'gpt-5': '#08519c'          # dark blue
    }
    fallback_color = '#34495e'  # used for any model without an entry above

    # Select the scores of each (MCP, model) pair once; the violins, the
    # strip overlay and the printed summary all read from this mapping.
    pair_scores = {
        (mcp, model): df_models.loc[
            (df_models["model"] == model) & (df_models["MCP"] == mcp), "score"
        ]
        for mcp in mcps
        for model in models
    }

    # Lay the violins out left to right with a one-slot gap between MCP
    # groups. Pairs without any non-null score get no slot, so the tick for
    # each MCP is centred on the slots that were actually used.
    violins = []  # (x position, model, non-null scores as an array)
    tick_positions = []
    tick_labels = []
    pos = 0
    for mcp in mcps:
        group_positions = []
        for model in models:
            valid_scores = pair_scores[(mcp, model)].dropna().to_numpy()
            if len(valid_scores) > 0:
                violins.append((pos, model, valid_scores))
                group_positions.append(pos)
                pos += 1
        if group_positions:
            tick_positions.append(sum(group_positions) / len(group_positions))
            tick_labels.append(mcp)
        pos += 1  # Add gap between MCPs

    # Create violin plot with strip overlay for score distributions by model and MCP
    fig, ax = plt.subplots(figsize=(16, 7))

    if violins:
        parts = ax.violinplot(
            [values for _, _, values in violins],
            positions=[x for x, _, _ in violins],
            widths=0.6,
            showmeans=True,
            showmedians=True,
            showextrema=True,
        )

        # Color each violin body by its model
        for body, (_, model, _) in zip(parts['bodies'], violins):
            body.set_facecolor(model_colors.get(model, fallback_color))
            body.set_alpha(0.6)

        # Draw the bar / extrema / median / mean lines in thin black
        for partname in ('cbars', 'cmins', 'cmaxes', 'cmedians', 'cmeans'):
            if partname in parts:
                parts[partname].set_edgecolor('black')
                parts[partname].set_linewidth(1)

        # Overlay the individual scores, jittered horizontally for visibility
        rng = np.random.default_rng(JITTER_SEED)
        for x, model, values in violins:
            x_jitter = rng.normal(x, 0.08, size=len(values))
            ax.scatter(x_jitter, values,
                       color=model_colors.get(model, fallback_color),
                       alpha=0.4, s=20, zorder=3, edgecolors='white', linewidths=0.5)

    # Add horizontal line for pass threshold
    ax.axhline(y=PASS_THRESHOLD, color='red', linestyle='--', linewidth=2, alpha=0.6,
               label=f'Pass threshold ({PASS_THRESHOLD})')

    ax.set_xticks(tick_positions)
    ax.set_xticklabels(tick_labels, fontsize=11)
    ax.set_ylabel("Semantic Similarity Score", fontsize=12, fontweight='bold')
    ax.set_xlabel("MCP Server", fontsize=12, fontweight='bold')
    ax.set_title("Figure 3: Model Comparison (goose-cli)", fontsize=14, fontweight="bold", pad=20)
    ax.set_ylim(-0.05, 1.05)
    ax.grid(axis="y", alpha=0.3)

    # Custom legend: one patch per loaded model plus the threshold line
    legend_elements = [Patch(facecolor=model_colors.get(model, fallback_color), alpha=0.6, label=model)
                       for model in models]
    legend_elements.append(Line2D([0], [0], color='red', linestyle='--', linewidth=2,
                                  label='Pass threshold'))
    ax.legend(handles=legend_elements, title="Model", title_fontsize=11, fontsize=10, loc='lower right')

    plt.tight_layout()
    # Create the output directory on first use so savefig cannot fail on it
    fig3_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(fig3_path, dpi=300, bbox_inches="tight")
    plt.show()

    # Print summary statistics. Pass rates are taken over all rows of a
    # group; a missing score compares False against the threshold and so
    # counts as not passed.
    print("\nFigure 3 Summary - Score Statistics by Model and MCP:")
    for mcp in mcps:
        print(f"\n{mcp}:")
        for model in models:
            model_mcp_scores = pair_scores[(mcp, model)]
            if len(model_mcp_scores) > 0:
                median = model_mcp_scores.median()
                pass_rate = (model_mcp_scores >= PASS_THRESHOLD).sum() / len(model_mcp_scores) * 100
                print(f"  {model:15s}: median={median:.3f}, pass_rate={pass_rate:.1f}%")

    # Overall performance by model
    print("\nOverall Performance by Model:")
    for model in models:
        model_scores = df_models[df_models["model"] == model]["score"]
        if len(model_scores) == 0:
            continue  # nothing recorded for this model; avoids dividing by zero
        median = model_scores.median()
        pass_rate = (model_scores >= PASS_THRESHOLD).sum() / len(model_scores) * 100
        print(f"  {model:15s}: median={median:.3f}, pass_rate={pass_rate:.1f}%")
else:
    print("No model comparison data loaded.")

## Summary of Key Findings

Three main findings from the cross-agent and model comparison analysis

In [None]:
# Text summary of the three figures. Every statement printed here is derived
# from the loaded data, so it stays correct when the result files change.
print("=" * 80)
print("KEY FINDINGS: Experiment 1 - Cross-Agent and Model Comparison")
print("=" * 80)

if "df_combined" in locals():
    # Human-readable names for the agent keys used in the result files
    agent_display_names = {
        "claude": "Claude Code",
        "gemini": "Gemini CLI",
        "goose": "Goose CLI",
    }

    print("\n📊 FIGURE 1: MCP Performance Across Coding Agents")
    print("-" * 80)
    # Percentage of rows with passed == True per agent, best agent first
    overall_pass_rates = (
        df_combined.groupby("agent")["passed"].mean() * 100
    ).sort_values(ascending=False)
    print("\nOverall Pass Rates by Agent:")
    for agent, rate in overall_pass_rates.items():
        print(f"   • {agent:12s}: {rate:5.1f}%")

    # A comparison only makes sense with at least two agents loaded
    if len(overall_pass_rates) > 1:
        max_diff = overall_pass_rates.max() - overall_pass_rates.min()
        best_agent = overall_pass_rates.idxmax()
        other_agents = sorted(a for a in overall_pass_rates.index if a != best_agent)
        best_name = agent_display_names.get(best_agent, best_agent)
        other_names = " and ".join(agent_display_names.get(a, a) for a in other_agents)
        # No significance test is run in this notebook, so only the observed
        # difference in percentage points is reported.
        print(f"\n   → Agent choice affects performance ({max_diff:.1f} pp difference)")
        print(f"   → {best_name} outperforms {other_names}")

if "category_pass_rates" in locals():
    print("\n\n📊 FIGURE 2: Performance by Evaluation Type")
    print("-" * 80)
    # Mean of the per-MCP pass percentages for each category, best first
    category_avg = category_pass_rates.groupby("case_group")["percent_passed"].mean().sort_values(ascending=False)
    print("\nCategory Performance (averaged across all MCPs):")
    for cat, rate in category_avg.items():
        print(f"   • {cat:45s}: {rate:5.1f}%")

    if not category_avg.empty:
        print(f"\n   → Best performing: {category_avg.index[0]}")
        print(f"   → Most challenging: {category_avg.index[-1]}")

if "df_models" in locals():
    print("\n\n📊 FIGURE 3: Model Comparison")
    print("-" * 80)
    overall_model_rates = df_models.groupby("model")["passed"].mean() * 100
    print("\nOverall Pass Rates by Model (goose-cli):")
    # Known models first in a fixed order, then any model not listed here
    model_order = [m for m in ['gpt-4o', 'gpt-4o-mini', 'gpt-5'] if m in overall_model_rates.index]
    model_order += [m for m in overall_model_rates.index if m not in model_order]
    for model in model_order:
        print(f"   • {model:12s}: {overall_model_rates[model]:5.1f}%")

    if len(overall_model_rates) > 1:
        model_diff = overall_model_rates.max() - overall_model_rates.min()
        best_model = overall_model_rates.idxmax()
        print(f"\n   → Model choice affects performance ({model_diff:.1f} pp difference)")
        print(f"   → Best performing model: {best_model}")

print("\n" + "=" * 80)

## Next Steps

Based on these findings:

1. **Agent selection matters**: Claude Code shows significantly better MCP retrieval performance
2. **Category-specific analysis**: Identify why certain evaluation types are more challenging
3. **Model optimization**: Investigate if newer/larger models improve performance consistently
4. **Integration with manuscript**: Export these three figures for publication

See `notes/experiment_1_cross_agent_comparison.md` for detailed experimental design and methodology.