# Metrics Aggregation by Experiment
This notebook aggregates the per-run metrics in `all_metrics.csv` by experiment name,
reporting the mean and standard deviation of coverage, mutation score, and tokens used,
and plots the per-experiment means in a single grouped bar chart.

In [None]:
import sys

# Make the local "src" directory importable (presumably where the `utils`
# package lives -- confirm against the repo layout). The membership check keeps
# re-running this cell from piling up duplicate "src" entries on sys.path.
if "src" not in sys.path:
    sys.path.append("src")

from utils.plot_metrics import plot_metrics

# Execute plot_metrics to show individual run metrics
plot_metrics()

In [2]:
import pandas as pd
import numpy as np

In [None]:
# Read the metrics table from the working directory and report its size and
# column names as a quick sanity check before aggregating.
df = pd.read_csv("all_metrics.csv")

row_count = len(df)
column_names = df.columns.tolist()

print(f"Loaded {row_count} rows from all_metrics.csv")
print(f"Columns: {column_names}")

In [4]:
# Per-experiment summary: mean and standard deviation of each metric of
# interest, rounded to two decimals.
metric_columns = ["coverage_percent", "mutation_score_percent", "total_tokens"]
summary_stats = ["mean", "std"]

grouped = df.groupby("experiment_name")
aggregated = grouped.agg({metric: summary_stats for metric in metric_columns})
aggregated = aggregated.round(2)

# The aggregation yields two-level (metric, statistic) column labels; collapse
# them into flat "<metric>_<statistic>" names such as "coverage_percent_mean".
aggregated.columns = ["_".join(levels) for levels in aggregated.columns]

# Turn the experiment_name group key back into an ordinary column.
aggregated = aggregated.reset_index()

In [None]:
# Create a summary table for reference: one row per experiment with
# presentation-friendly column names.
#
# Token statistics are shown as whole numbers. Both columns are rounded (not
# truncated) and cast to pandas' nullable "Int64" dtype: the sample standard
# deviation is NaN for an experiment that has only a single run, and a plain
# `.astype(int)` raises on NaN. With "Int64" such cells display as <NA>.
result_df = pd.DataFrame(
    {
        "Experiment": aggregated["experiment_name"],
        "Coverage Mean": aggregated["coverage_percent_mean"],
        "Coverage Std": aggregated["coverage_percent_std"],
        "Mutation Mean": aggregated["mutation_score_percent_mean"],
        "Mutation Std": aggregated["mutation_score_percent_std"],
        "Tokens Mean": aggregated["total_tokens_mean"].round(0).astype("Int64"),
        "Tokens Std": aggregated["total_tokens_std"].round(0).astype("Int64"),
    }
)

# Bare expression on the last line so the notebook renders the table.
result_df

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Grouped bar chart: one cluster per experiment, one bar per metric.
fig, ax = plt.subplots(figsize=(12, 6))

experiments = aggregated["experiment_name"]
x = np.arange(len(experiments))
width = 0.28

# Token counts are on a different scale from the two percentage metrics, so
# express each experiment's mean as a percentage of the largest mean.
peak_tokens_mean = aggregated["total_tokens_mean"].max()
tokens_norm = (aggregated["total_tokens_mean"] / peak_tokens_mean) * 100

# (bar positions, bar heights, legend label, color) for each series, listed
# left to right within a cluster.
bar_specs = [
    (x - width, aggregated["coverage_percent_mean"], "Coverage (%)", "steelblue"),
    (x, aggregated["mutation_score_percent_mean"], "Mutation Score (%)", "coral"),
    (x + width, tokens_norm, "Tokens (normalized %)", "seagreen"),
]
bars1, bars2, bars3 = [
    ax.bar(positions, heights, width, label=series_label, color=series_color)
    for positions, heights, series_label, series_color in bar_specs
]

# Titles, axis labels and tick labels (one tick per experiment cluster).
ax.set_title("Metrics per Experiment (single chart)")
ax.set_ylabel("Value")
ax.set_xticks(x)
ax.set_xticklabels(experiments, rotation=45, ha="right")

# Legend plus faint horizontal grid lines to make bar heights easier to read.
ax.legend()
ax.grid(axis="y", alpha=0.3)

fig.tight_layout()

plt.show()