In [3]:
# # Extended Analysis: Comparing Original vs. Deduplicated References


import os
import glob
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker

sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (8, 5)

In [7]:
import json
import pandas as pd


def parse_all_experiments_results(json_path: str) -> pd.DataFrame:
    """
    Reads the single all_experiments_results.json file, which is a list of dicts:
    
    [
      {
        "method": ...,
        "align_threshold": ...,
        "claim_gen_key": ...,
        "dedup_threshold": ...,
        "dedup_strategy": ...,
        "dataset_name": ...,
        "records": [
          { "record_id": ..., "coverage": ..., "atomicity": ... },
          ...
        ]
      },
      ...
    ]
    
    Returns a DataFrame with columns:
    [
      "method", "align_threshold", "claim_gen_key",
      "dedup_threshold", "dedup_strategy", "dataset_name",
      "record_id", "coverage", "atomicity"
    ]
    """
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)  # data is a list of experiment configs

    rows = []
    for exp_conf in data:
        method = exp_conf.get("method")
        align_threshold = exp_conf.get("align_threshold")
        claim_gen_key = exp_conf.get("claim_gen_key")
        dedup_threshold = exp_conf.get("dedup_threshold")
        dedup_strategy = exp_conf.get("dedup_strategy")
        dataset_name = exp_conf.get("dataset_name")

        # exp_conf["records"] is a list of { "record_id", "coverage", "atomicity" }
        for rec in exp_conf["records"]:
            row = {
                "method": method,
                "align_threshold": align_threshold,
                "claim_gen_key": claim_gen_key,
                "dedup_threshold": dedup_threshold,
                "dedup_strategy": dedup_strategy,
                "dataset_name": dataset_name,
                "record_id": rec.get("record_id"),
                "coverage": rec.get("coverage"),
                "atomicity": rec.get("atomicity")
            }
            rows.append(row)

    df = pd.DataFrame(rows)
    return df


# Example usage:
df_all = parse_all_experiments_results("all_experiments_results.json")
df_all.head(10)

FileNotFoundError: [Errno 2] No such file or directory: 'all_experiments_results.json'

In [8]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (10, 5)

# Group to get average coverage & atomicity
group_cols = ["method", "align_threshold", "claim_gen_key", "dedup_threshold", "dedup_strategy", "dataset_name"]
df_summary = (
    df_all.groupby(group_cols, as_index=False)
    .agg({
        "coverage": "mean",
        "atomicity": "mean"
    })
)

df_summary.head(15)

KeyError: 'method'

In [9]:
df_summary["is_deduped"] = df_summary["dedup_threshold"].notna()  # True if dedup_threshold is not None

sns.barplot(
    data=df_summary,
    x="method",
    y="coverage",
    hue="is_deduped"
)

plt.title("Coverage: Original vs. Deduplicated")
plt.xlabel("Alignment Method")
plt.ylabel("Mean Coverage")
plt.ylim(0, 1)
plt.legend(title="Deduped?")
plt.show()

KeyError: 'dedup_threshold'

In [10]:
df_summary["is_deduped"] = df_summary["dedup_threshold"].notna()  # True if dedup_threshold is not None

sns.barplot(
    data=df_summary,
    x="method",
    y="coverage",
    hue="is_deduped"
)

plt.title("Coverage: Original vs. Deduplicated")
plt.xlabel("Alignment Method")
plt.ylabel("Mean Coverage")
plt.ylim(0, 1)
plt.legend(title="Deduped?")
plt.show()


KeyError: 'dedup_threshold'

In [11]:
import matplotlib.ticker as mticker

plt.figure(figsize=(10, 6))

sns.lineplot(
    data=df_summary,
    x="align_threshold",
    y="coverage",
    hue="method",
    style="claim_gen_key",
    markers=True,
    dashes=False
)
plt.title("Coverage vs. Alignment Threshold")
plt.xlabel("Threshold")
plt.ylabel("Coverage")
plt.gca().xaxis.set_major_locator(mticker.MaxNLocator(integer=True))
plt.ylim(0, 1.0)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()


ValueError: Could not interpret value `align_threshold` for `x`. An entry with this name does not appear in `data`.

<Figure size 1000x600 with 0 Axes>

In [12]:
df_dedup_only = df_summary[df_summary["dedup_threshold"].notna()]

sns.lineplot(
    data=df_dedup_only,
    x="dedup_threshold",
    y="atomicity",
    hue="method",
    style="claim_gen_key",
    markers=True
)
plt.title("Atomicity vs. BFS Dedup Threshold")
plt.xlabel("Dedup Threshold")
plt.ylabel("Mean Atomicity")
plt.show()


KeyError: 'dedup_threshold'