In [8]:
from pathlib import Path
import pandas as pd
import numpy as np

# Base directory containing the inference output folders that get scanned
INFER_BASE = Path("/home/hep/an1522/dark_tridents_wspace/outputs/inference")

# Destination for the occlusion selection CSVs; created if it does not exist.
# The leading underscore keeps it out of the folder scan (names starting with
# "_" are skipped by list_inference_folders).
OUT_BASE = INFER_BASE / "_occlusion_selections"
OUT_BASE.mkdir(parents=True, exist_ok=True)

# KEYS: columns used throughout to identify / de-duplicate an event.
# NEEDED: columns a *_scores.csv must contain to be loaded at all.
KEYS = ["run_number", "subrun_number", "event_number"]
NEEDED = KEYS + ["signal_score", "entry_number", "n_pixels"]

# Sentinel values marking invalid rows in the score tables
BAD_SCORE = -999999.9
BAD_PIXELS = -1

# n_pixels minimum cut (from your distribution reasoning)
# NPX_MIN = {
#     "samples": 0,
#     "signal": 0,
# }

In [9]:
def list_inference_folders(base: Path):
    """Return the sorted inference sub-directories found directly under ``base``.

    Entries are dropped when they are not directories, when their name starts
    with ``_``, or when their name ends with one of the helper / excluded
    suffixes: ``_pdf``, ``_png``, ``_resnet18_bn``, ``_resnet18_gn``,
    ``_resnet34_bn``.
    """
    excluded_suffixes = (
        "_pdf",
        "_png",
        "_resnet18_bn",
        "_resnet18_gn",
        "_resnet34_bn",
    )
    return sorted(
        entry
        for entry in base.iterdir()
        if entry.is_dir()
        and not entry.name.startswith("_")
        and not entry.name.endswith(excluded_suffixes)
    )

def read_scores_in_folder(folder: Path):
    """Load and stack every usable ``*_scores.csv`` located directly in ``folder``.

    A CSV is used only if it provides all NEEDED columns; other files are
    skipped without a message. Each loaded table is reduced to NEEDED and
    tagged with ``__file`` (CSV file name) and ``__folder`` (folder name).

    Returns the concatenated DataFrame, or None when the folder has no
    matching CSV or none of them is usable.
    """
    frames = []
    for csv_path in sorted(folder.glob("*_scores.csv")):
        table = pd.read_csv(csv_path)
        if any(col not in table.columns for col in NEEDED):
            # not in the expected format
            continue
        table = table[NEEDED].copy()
        table["__file"] = csv_path.name
        table["__folder"] = folder.name
        frames.append(table)

    return pd.concat(frames, ignore_index=True) if frames else None

# Discover the inference folders to load (helper/excluded folders are filtered
# out by list_inference_folders).
folders = list_inference_folders(INFER_BASE)
print("Found folders:", len(folders))
# print("Folder names:", folders)

# Load each folder's score tables; folders without a usable CSV return None
# and are left out.
all_dfs = []
for folder in folders:
    df = read_scores_in_folder(folder)
    if df is not None:
        all_dfs.append(df)

# One long table for every folder; rows stay traceable via __file / __folder.
# NOTE(review): pd.concat raises if all_dfs is empty (no folder yielded data).
all_scores = pd.concat(all_dfs, ignore_index=True)
print("Total rows loaded:", len(all_scores))
print("Folders loaded:", all_scores["__folder"].nunique())

Found folders: 8
Total rows loaded: 1050708
Folders loaded: 8


In [10]:
def dataset_type_from_folder(folder_name: str):
    """Split an inference folder name into ``(run, kind)``.

    The name is tokenised on ``_``. ``run`` is the first token (e.g. "run1").
    ``kind`` is "samples" if that token is present, otherwise "signal" if that
    token is present, otherwise None.
    Example: "run1_signal_resnet34_gn" -> ("run1", "signal").
    """
    tokens = folder_name.split("_")
    if "samples" in tokens:
        kind = "samples"
    elif "signal" in tokens:
        kind = "signal"
    else:
        kind = None
    return tokens[0], kind

# Derive (run, kind) once per distinct folder name and broadcast it onto the
# rows. (Building a pd.Series per row via apply is extremely slow on ~1M rows.)
folder_types = {
    name: dataset_type_from_folder(name)
    for name in all_scores["__folder"].unique()
}
all_scores["__run"] = all_scores["__folder"].map(
    {name: run_kind[0] for name, run_kind in folder_types.items()}
)
all_scores["__kind"] = all_scores["__folder"].map(
    {name: run_kind[1] for name, run_kind in folder_types.items()}
)

# Drop rows whose folder could not be classified as samples/signal
all_scores = all_scores.dropna(subset=["__run", "__kind"]).copy()

# Clean invalid sentinel rows
mask_good = (
    (all_scores["signal_score"] != BAD_SCORE) &
    (all_scores["n_pixels"] != BAD_PIXELS) &
    (all_scores["n_pixels"] >= 0)
)
clean = all_scores.loc[mask_good].copy()

SIGNAL_KEEP = "dt_ratio_0.6_ma_0.05_pi0"

# Keep:
# - ALL samples rows
# - ONLY signal rows whose __file contains SIGNAL_KEEP
# regex=False: SIGNAL_KEEP is a literal substring. With the default regex
# matching, each "." would match any character and could let other files in.
clean = clean[
    (clean["__kind"] == "samples") |
    (
        (clean["__kind"] == "signal")
        & clean["__file"].str.contains(SIGNAL_KEEP, na=False, regex=False)
    )
].copy()

print("After signal-file filter:", len(clean))
print(clean.groupby(["__run","__kind"])["__folder"].nunique())
print("Samples files now present (unique):", clean.loc[clean["__kind"]=="samples","__file"].nunique())
print(clean.loc[clean["__kind"]=="samples", "__file"].drop_duplicates().head(10))
print("Signal files now present (unique):", clean.loc[clean["__kind"]=="signal","__file"].nunique())
print(clean.loc[clean["__kind"]=="signal", "__file"].drop_duplicates().head(10))

After signal-file filter: 133264
__run  __kind 
run1   samples    2
       signal     2
run3   samples    2
       signal     2
Name: __folder, dtype: int64
Samples files now present (unique): 6
0                run1_NuMI_dirt_larcv_cropped_scores.csv
3331       run1_NuMI_nu_overlay_larcv_cropped_scores.csv
17101     run1_offbeam_larcv_cropped_full_set_scores.csv
459464                run3_dirt_larcv_cropped_scores.csv
461877          run3_nu_overlay_larcv_cropped_scores.csv
473403             run3_offbeam_larcv_cropped_scores.csv
Name: __file, dtype: object
Signal files now present (unique): 2
81802     run1_dt_ratio_0.6_ma_0.05_pi0_larcv_cropped_sc...
576111    run3_dt_ratio_0.6_ma_0.05_pi0_larcv_cropped_sc...
Name: __file, dtype: object


In [11]:
def scores_to_root(scores_file: str, root_base_dir: str):
    """
    Map a scores CSV name to the path of the matching ROOT file.

    Only the file-name part of ``scores_file`` is used; "_scores.csv" is
    removed from it, ".root" is appended, and the result is placed under
    ``root_base_dir``.

    e.g. 'run1_dt_ratio_0.6_ma_0.05_pi0_larcv_cropped_scores.csv'
      -> '<root_base_dir>/run1_dt_ratio_0.6_ma_0.05_pi0_larcv_cropped.root'
    """
    csv_name = Path(scores_file).name
    root_name = csv_name.replace("_scores.csv", "") + ".root"
    return str(Path(root_base_dir).joinpath(root_name))
    
def select_occlusion_set(df_folder, kind,
                         A=10, B=10, C=10, D=5,
                         # tail selection fractions
                         samples_high_q=0.995,     # top 0.5% for samples (outliers)
                         signal_low_q=0.005,       # bottom 0.5% for signal (failures)
                         # borderline target score (usually 0.5 for sigmoid)
                         borderline_target=0.5,
                         border_frac=0.01,         # closest 1% to target
                         # weird: within chosen tail, pick lowest 10% n_pixels
                         weird_lowpix_q=0.10):
    """
    Pick a de-duplicated set of events to occlude from one table of scores.

    Parameters
    ----------
    df_folder : DataFrame with at least KEYS, "signal_score" and "n_pixels".
        All of its columns are carried through to the result.
    kind : "samples" selects the high-score tail; anything else is treated
        as signal and selects the low-score tail.
    A, B, C, D : maximum number of rows taken by each selection:
        A) tail: score >= quantile(samples_high_q) for samples, or
           score <= quantile(signal_low_q) for signal, most extreme first;
        B) borderline: the B rows whose score is closest to borderline_target;
        C) weird: rows of A whose n_pixels is at or below A's
           weird_lowpix_q quantile;
        D) controls: among rows with n_pixels >= the 75% quantile, the lowest
           scores (samples) or the highest scores (signal).
    border_frac : only affects the "B_border_0p5" tag threshold, not which
        rows are selected.

    Returns
    -------
    DataFrame of picked rows (unique on KEYS, first occurrence kept) with an
    extra "pick_reason" column. The tags describe which thresholds a row
    satisfies, joined with "+" in sorted order ("picked" if none), so a row
    can carry a tag for a category that did not select it. An empty input
    yields an empty frame that still has the "pick_reason" column.
    """
    df = df_folder.copy()

    # Nothing to choose from: return early with a consistent schema. Quantiles
    # of an empty column are NaN and a row-wise apply on an empty frame does
    # not give back a Series.
    if df.empty:
        df["pick_reason"] = pd.Series(dtype="object", index=df.index)
        return df

    is_samples = (kind == "samples")
    scores = df["signal_score"]

    # A) Tail events (different for samples vs signal)
    if is_samples:
        tail_cut = scores.quantile(samples_high_q)
        A_df = df[scores >= tail_cut].sort_values("signal_score", ascending=False).head(A)
    else:  # signal
        tail_cut = scores.quantile(signal_low_q)
        A_df = df[scores <= tail_cut].sort_values("signal_score", ascending=True).head(A)

    # B) Borderline: the B rows closest to the target score
    dist = (scores - borderline_target).abs()
    border_cut = dist.quantile(border_frac)  # used for tagging only
    B_df = (
        df.assign(__dist=dist)
        .sort_values("__dist", ascending=True)
        .head(B)
        .drop(columns="__dist")
    )

    # C) Weird: within the tail from A_df, pick those with low pixels
    if len(A_df) > 0:
        pix_cut = A_df["n_pixels"].quantile(weird_lowpix_q)
        C_df = A_df[A_df["n_pixels"] <= pix_cut].sort_values(
            ["n_pixels", "signal_score"],
            ascending=[True, (kind == "signal")]  # signal tail is low scores
        ).head(C)
    else:
        pix_cut = None
        C_df = df.iloc[0:0].copy()

    # D) Controls:
    # samples: low score but high pixels (busy background confidently background)
    # signal: high score but high pixels (easy signal)
    pix_hi = df["n_pixels"].quantile(0.75)
    busy = df[df["n_pixels"] >= pix_hi]
    D_df = busy.sort_values("signal_score", ascending=is_samples).head(D)

    # Combine + dedupe. Every piece is a slice of df, so all input columns
    # (including __file/__folder/__run/__kind when present) ride along.
    pick = pd.concat([A_df, B_df, C_df, D_df], ignore_index=True)
    pick = pick.drop_duplicates(subset=KEYS, keep="first")

    tail_tag = "A_high_tail" if is_samples else "A_low_tail"

    # Label reasons from the thresholds computed once above
    def reason(row):
        tags = []
        score = row["signal_score"]
        in_tail = (score >= tail_cut) if is_samples else (score <= tail_cut)
        if in_tail:
            tags.append(tail_tag)
        if abs(score - borderline_target) <= border_cut:
            tags.append("B_border_0p5")
        if pix_cut is not None and row["n_pixels"] <= pix_cut:
            tags.append("C_weird_lowpix")
        if row["n_pixels"] >= pix_hi:
            tags.append("D_busy")
        return "+".join(sorted(tags)) if tags else "picked"

    if pick.empty:
        # e.g. all of A/B/C/D are 0
        pick["pick_reason"] = pd.Series(dtype="object", index=pick.index)
    else:
        pick["pick_reason"] = pick.apply(reason, axis=1)

    return pick

def select_occlusion_set_stratified(df_folder, kind: str,
                                   files=None,
                                   per_file_counts=(4, 4, 4, 2),  # A,B,C,D per file
                                   **kwargs):
    """
    Run select_occlusion_set separately on each ``__file`` group and stack the picks.

    files: optional collection of ``__file`` names; when given, only rows from
        those files are considered.
    per_file_counts: the (A, B, C, D) budgets handed to select_occlusion_set
        for every file.
    kwargs: forwarded unchanged to select_occlusion_set.

    Returns the combined picks, unique on KEYS (first occurrence kept). If no
    rows remain after the file filter, that empty frame is returned as is.
    """
    n_tail, n_border, n_weird, n_control = per_file_counts

    pool = df_folder.copy()
    if files is not None:
        pool = pool[pool["__file"].isin(files)].copy()
    if pool.empty:
        return pool

    per_file = []
    for _, file_rows in pool.groupby("__file"):
        chosen = select_occlusion_set(
            file_rows, kind=kind,
            A=n_tail, B=n_border, C=n_weird, D=n_control,
            **kwargs,
        )
        if chosen is not None and not chosen.empty:
            per_file.append(chosen)

    if not per_file:
        return pool.iloc[0:0].copy()

    stacked = pd.concat(per_file, ignore_index=True)
    return stacked.drop_duplicates(subset=KEYS, keep="first")

In [12]:
def add_disagreement_picks(
    picks_ref: pd.DataFrame,
    run: str,
    kind: str,
    other_folder: str,
    E_abs: int = 10,
    E_flip: int = 10,
    thr: float = 0.5,
    margin: float = 0.20,
):
    """
    Add 'model disagreement' events between the reference folder (taken from
    picks_ref["__folder"]) and another folder (e.g. run1_samples_resnet34_gn).

    Events are matched between the two folders on KEYS within the rows of
    ``clean`` for this run/kind. Two groups are selected:
    - the E_abs events with the largest |score_other - score_ref|;
    - the E_flip largest-|delta| events that are a "strong flip": one score
      >= thr + margin while the other is <= thr - margin.

    Returns: (picks_plus, debug_df)
    - picks_plus is picks_ref with the disagreement events appended and then
      de-duplicated on KEYS keeping the first occurrence, so an event already
      in picks_ref keeps its original pick_reason. Appended rows are always
      taken from the reference folder and get a pick_reason made of
      "E_disagree", "E_vs_<other_folder>", optionally
      "E_strong_flip_m<margin>", and "E_other_gt_ref" / "E_ref_gt_other".
    - debug_df is the full matched table with score_ref, score_other, delta,
      abs_delta and flip_strong (empty DataFrame when nothing could be matched,
      in which case picks_ref is returned unchanged).

    Raises ValueError if picks_ref has no __folder column.
    """
    # Determine reference folder name from picks_ref metadata
    if "__folder" not in picks_ref.columns:
        raise ValueError("picks_ref must contain __folder column (you already propagate it).")
    ref_folder = picks_ref["__folder"].iloc[0]

    # All rows for ref + other folder
    subset = clean[(clean["__run"] == run) & (clean["__kind"] == kind)]
    rows_ref = subset[subset["__folder"] == ref_folder]
    rows_other = subset[subset["__folder"] == other_folder]
    if rows_other.empty or rows_ref.empty:
        print(f"[warn] disagreement: missing data for {ref_folder} or {other_folder}")
        return picks_ref, pd.DataFrame()

    # Minimal columns, with the two scores renamed so they can sit side by side
    df_ref = rows_ref[
        KEYS + ["signal_score", "n_pixels", "__file", "__folder", "entry_number"]
    ].rename(columns={"signal_score": "score_ref"})
    df_other = rows_other[KEYS + ["signal_score"]].rename(
        columns={"signal_score": "score_other"}
    )

    m = df_ref.merge(df_other, on=KEYS, how="inner")
    if m.empty:
        print("[warn] disagreement: no overlap on KEYS")
        return picks_ref, pd.DataFrame()

    # disagreement metrics
    m["delta"] = m["score_other"] - m["score_ref"]
    m["abs_delta"] = m["delta"].abs()

    # Strong flip: requires being confidently on opposite sides
    m["flip_strong"] = (
        ((m["score_ref"]   >= thr + margin) & (m["score_other"] <= thr - margin)) |
        ((m["score_other"] >= thr + margin) & (m["score_ref"]   <= thr - margin))
    )

    # Pick top |delta| + top strong flips
    top_abs  = m.sort_values("abs_delta", ascending=False).head(E_abs)
    top_flip = m[m["flip_strong"]].sort_values("abs_delta", ascending=False).head(E_flip)

    extra = pd.concat([top_abs, top_flip], ignore_index=True)
    extra = extra.drop_duplicates(subset=KEYS, keep="first")

    # Convert back into "event rows" with the same schema as the picks.
    # Only reference-folder rows are used: joining against every folder would
    # also pull in the other model's rows for the same event, and which copy
    # survived the final de-duplication would depend on row order.
    flags = extra[KEYS + ["delta", "flip_strong"]].rename(
        columns={"delta": "__delta", "flip_strong": "__flip"}
    )
    extra_rows = rows_ref.merge(flags, on=KEYS, how="inner")

    # Tag reasons straight from the merged flags (no per-row lookup needed)
    flip_tag = f"E_strong_flip_m{margin}"
    extra_rows["pick_reason"] = [
        "+".join(
            ["E_disagree", f"E_vs_{other_folder}"]
            + ([flip_tag] if is_flip else [])
            + ["E_other_gt_ref" if delta > 0 else "E_ref_gt_other"]
        )
        for is_flip, delta in zip(extra_rows["__flip"], extra_rows["__delta"])
    ]
    extra_rows = extra_rows.drop(columns=["__delta", "__flip"])

    # Combine + dedupe with original picks_ref
    out = pd.concat([picks_ref, extra_rows], ignore_index=True)
    out = out.drop_duplicates(subset=KEYS, keep="first")

    return out, m  # m is nice to inspect

In [13]:
# Score files the run1 "samples" selection is stratified over (dirt,
# nu overlay, off-beam). make_master_for passes this list as the file filter
# when run == "run1"; other runs use every file present.
RUN1_SAMPLES_FILES = [
    "run1_NuMI_dirt_larcv_cropped_scores.csv",
    "run1_NuMI_nu_overlay_larcv_cropped_scores.csv",
    "run1_offbeam_larcv_cropped_full_set_scores.csv",
]

def make_master_for(run: str, kind: str, reference_folder: str = None):
    """
    Build the master table of events to occlude for one dataset (run + kind).

    Steps:
    1. Take the rows of ``clean`` for this run/kind and choose the reference
       folder: ``reference_folder`` if it is present, else the first folder
       name in sorted order.
    2. Select events from the reference folder (stratified per file for
       "samples", plain selection for "signal").
    3. Append model-disagreement events against "<run>_<kind>_resnet34_gn".
    4. Attach every folder's signal_score for the picked events, matched on
       (entry_number, __file), as "<folder suffix>__signal_score" columns.

    Returns a DataFrame with columns, in order: pick_reason, entry_number, the
    per-folder score columns, n_pixels, scores_file, reference_folder,
    dataset, then KEYS. Returns None (after printing a warning) when there is
    no data or nothing was picked.
    """
    subset = clean[(clean["__run"] == run) & (clean["__kind"] == kind)].copy()
    folders = sorted(subset["__folder"].unique().tolist())
    if not folders:
        print(f"[warn] no data for {run}_{kind}")
        return None

    prefix = f"{run}_{kind}_"
    ref = reference_folder if (reference_folder in folders) else folders[0]
    ref_clean = ref.replace(prefix, "")
    df_ref = subset[subset["__folder"] == ref].copy()

    # ---- Choose selection strategy ----
    if kind == "samples":
        # pick a few per file so you always include dirt + nu_overlay + offbeam
        files = RUN1_SAMPLES_FILES if run == "run1" else None  # define RUN3 similarly if you want
        picks = select_occlusion_set_stratified(
            df_ref, kind="samples",
            files=files,
            per_file_counts=(4,4,4,2),   # totals: 3 files * (4+4+4+2)=42 max, then dedupe
        )
    else:
        # signal already filtered to ONE file via SIGNAL_KEEP
        picks = select_occlusion_set(df_ref, kind="signal")

    if picks is None or picks.empty:
        print(f"[warn] no picks for {run}_{kind} (ref={ref})")
        return None

    # --- Add disagreement events: MPID (ref) vs ResNet34_GN ---
    if kind in ["samples", "signal"]:
        other = f"{run}_{kind}_resnet34_gn"   # e.g. run1_samples_resnet34_gn
        picks, _disagree_debug = add_disagreement_picks(
            picks_ref=picks,
            run=run,
            kind=kind,
            other_folder=other,
            E_abs=10,
            E_flip=10,
            margin=0.20,
        )

    # ---- Build master ----
    # Carry n_pixels and KEYS through the merges. Assigning them from `picks`
    # afterwards would align on index labels, and `picks` keeps gaps left by
    # drop_duplicates while each merge renumbers `master` from 0, so values
    # could land on the wrong event or become NaN.
    merge_on = ["entry_number", "__file"]
    master = picks[["pick_reason"] + merge_on + ["n_pixels"] + KEYS].reset_index(drop=True)

    # Merge signal scores from all folders
    score_cols = []
    for folder in folders:
        score_col = f"{folder.replace(prefix, '')}__signal_score"
        folder_scores = subset.loc[
            subset["__folder"] == folder, merge_on + ["signal_score"]
        ].rename(columns={"signal_score": score_col})
        # entry_number AND __file together identify the matching row
        master = master.merge(folder_scores, on=merge_on, how="left")
        score_cols.append(score_col)

    master = master.rename(columns={"__file": "scores_file"})
    master["reference_folder"] = ref_clean
    master["dataset"] = f"{run}_{kind}"

    ordered = (
        ["pick_reason", "entry_number"]
        + score_cols
        + ["n_pixels", "scores_file", "reference_folder", "dataset"]
        + KEYS
    )
    return master[ordered]

# Build one master table per dataset and write it to OUT_BASE as
# master__<run>_<kind>.csv. Datasets for which make_master_for returns None
# (no data / no picks) are skipped. `masters` is keyed by "<run>_<kind>" and
# is consumed by the to-occlude export further down.
masters = {}
for run in ["run1","run3"]:
    for kind in ["samples","signal"]:
        m = make_master_for(run, kind)
        if m is not None:
            masters[f"{run}_{kind}"] = m
            out = OUT_BASE / f"master__{run}_{kind}.csv"
            m.to_csv(out, index=False)
            print("Wrote:", out, "rows=", len(m))


Wrote: /home/hep/an1522/dark_tridents_wspace/outputs/inference/_occlusion_selections/master__run1_samples.csv rows= 40
Wrote: /home/hep/an1522/dark_tridents_wspace/outputs/inference/_occlusion_selections/master__run1_signal.csv rows= 35
Wrote: /home/hep/an1522/dark_tridents_wspace/outputs/inference/_occlusion_selections/master__run3_samples.csv rows= 39
Wrote: /home/hep/an1522/dark_tridents_wspace/outputs/inference/_occlusion_selections/master__run3_signal.csv rows= 35


In [14]:
ROOT_BASE = "/vols/sbn/uboone/darkTridents/data/larcv_files"

def root_dir_for(run: str, kind: str):
    """Return the directory ``ROOT_BASE/<run>_<kind>`` as a string."""
    dataset_dir = f"{run}_{kind}"
    return str(Path(ROOT_BASE).joinpath(dataset_dir))
    
def write_to_occlude_files(master: pd.DataFrame):
    """
    Write one ``to_occlude__<dataset>__<folder suffix>.csv`` per model folder.

    The dataset ("<run>_<kind>") is read from the first row of
    ``master["dataset"]``. For every folder of that dataset in ``clean``, the
    master events are matched to the folder's rows on KEYS (first row per
    event), and the matched events are written to OUT_BASE with the folder's
    own entry_number, n_pixels and scores_file plus the derived root_file.

    Returns a DataFrame with one row per folder: dataset, folder,
    total_master_events, matched_events, wrote_csv.
    """
    dataset = master["dataset"].iloc[0]
    run, kind = dataset.split("_", 1)
    folder_prefix = f"{run}_{kind}_"
    root_dir = root_dir_for(run, kind)

    in_dataset = (clean["__run"] == run) & (clean["__kind"] == kind)
    subset = clean[in_dataset].copy()

    lookup_cols = KEYS + ["entry_number", "signal_score", "n_pixels", "__file"]
    trailing_cols = ["n_pixels_folder", "scores_file", "root_file", "reference_folder", "dataset"]

    report_rows = []
    for folder in sorted(subset["__folder"].unique().tolist()):
        # One row per event for this folder, with __file exposed as scores_file
        lookup = (
            subset.loc[subset["__folder"] == folder, lookup_cols]
            .copy()
            .rename(columns={"__file": "scores_file"})
            .drop_duplicates(subset=KEYS, keep="first")
        )

        # Columns present on both sides get "_ref" (master) / "_folder" suffixes
        merged = master.merge(lookup, on=KEYS, how="left", suffixes=("_ref", "_folder"))
        found = merged["entry_number_folder"].notna()

        # Keep matched events only and switch to the folder-specific values
        to_occ = merged.loc[found].copy()
        to_occ["entry_number"] = to_occ["entry_number_folder"].astype(int)
        to_occ["scores_file"] = to_occ["scores_file_folder"]
        to_occ["root_file"] = to_occ["scores_file"].apply(
            lambda name: scores_to_root(name, root_dir)
        )

        score_cols = [col for col in to_occ.columns if "__signal_score" in col]
        final_cols = KEYS + ["pick_reason", "entry_number"] + score_cols + trailing_cols
        to_occ = to_occ[final_cols].rename(columns={"n_pixels_folder": "n_pixels"})

        folder_clean = folder.replace(folder_prefix, "")
        out = OUT_BASE / f"to_occlude__{dataset}__{folder_clean}.csv"
        to_occ.to_csv(out, index=False)

        report_rows.append({
            "dataset": dataset,
            "folder": folder_clean,
            "total_master_events": len(merged),
            "matched_events": int(found.sum()),
            "wrote_csv": str(out),
        })

    return pd.DataFrame(report_rows)


# Export the per-folder to-occlude CSVs for every master table and collect the
# per-folder matching summaries returned by write_to_occlude_files.
reports = []
for name, master in masters.items():
    rep = write_to_occlude_files(master)
    reports.append(rep)

# Single report across all datasets, saved next to the selection CSVs.
# NOTE(review): pd.concat raises if `masters` is empty (no dataset produced picks).
report = pd.concat(reports, ignore_index=True)
report_path = OUT_BASE / "matching_report.csv"
report.to_csv(report_path, index=False)

print("Wrote report:", report_path)
report


Wrote report: /home/hep/an1522/dark_tridents_wspace/outputs/inference/_occlusion_selections/matching_report.csv


Unnamed: 0,dataset,folder,total_master_events,matched_events,wrote_csv
0,run1_samples,mpid,40,40,/home/hep/an1522/dark_tridents_wspace/outputs/...
1,run1_samples,resnet34_gn,40,40,/home/hep/an1522/dark_tridents_wspace/outputs/...
2,run1_signal,mpid,35,35,/home/hep/an1522/dark_tridents_wspace/outputs/...
3,run1_signal,resnet34_gn,35,35,/home/hep/an1522/dark_tridents_wspace/outputs/...
4,run3_samples,mpid,39,39,/home/hep/an1522/dark_tridents_wspace/outputs/...
5,run3_samples,resnet34_gn,39,39,/home/hep/an1522/dark_tridents_wspace/outputs/...
6,run3_signal,mpid,35,35,/home/hep/an1522/dark_tridents_wspace/outputs/...
7,run3_signal,resnet34_gn,35,35,/home/hep/an1522/dark_tridents_wspace/outputs/...


In [16]:
def query_events(folder_name: str, score_min=None, score_max=None, npx_min=None, npx_max=None, top=20, sort_by="signal_score"):
    """
    Look up events of one inference folder in ``clean`` with optional cuts.

    score_min / score_max: inclusive bounds on signal_score (None = no bound).
    npx_min / npx_max: inclusive bounds on n_pixels (None = no bound).
    sort_by: column to order by; "signal_score" sorts descending, any other
        column ascending.
    top: number of rows returned after sorting.

    Returns KEYS plus signal_score, n_pixels, entry_number and __file for the
    selected rows. When the folder has no rows or nothing passes the cuts, a
    message is printed and the (empty) frame is returned with all columns.
    """
    rows = clean[clean["__folder"] == folder_name].copy()
    if rows.empty:
        print("No rows for folder:", folder_name)
        return rows

    if score_min is not None:
        rows = rows[rows["signal_score"] >= score_min]
    if score_max is not None:
        rows = rows[rows["signal_score"] <= score_max]
    if npx_min is not None:
        rows = rows[rows["n_pixels"] >= npx_min]
    if npx_max is not None:
        rows = rows[rows["n_pixels"] <= npx_max]

    if rows.empty:
        print("No matches.")
        return rows

    ascending = sort_by != "signal_score"
    shown_cols = KEYS + ["signal_score","n_pixels","entry_number","__file"]
    ranked = rows.sort_values(sort_by, ascending=ascending)
    return ranked[shown_cols].head(top)

# Example:
# query_events("run1_samples_resnet34_gn", score_min=0.5, npx_min=500, top=10)
