# Bombcell Post-Run Analysis (Open Ephys + Kilosort4)

Assumes Bombcell has already been run and you exported per-probe CSV/JSON summaries.

**Expected folder convention**
- `{NP_recording_name}/bombcell_DEFAULT/`
  - `DUPLICATED_KILOSORT4_FILES/`
  - `batch_DEFAULT_results/`
- `{NP_recording_name}/bombcell_NP2.0/`
  - `DUPLICATED_KILOSORT4_FILES_ACD/`
  - `NP2_ReRun_results/`

In [1]:
NP_recording_name = 'Reach15_20260201_session007_NP_Recording_Number02_2026-02-01_18-25-00'

In [2]:
# New Code
from pathlib import Path

RECORDING_ROOT = Path(r"H:\Grant\Neuropixels\Kilosort_Recordings") / NP_recording_name

BOMBCELL_DEFAULT_ROOT = RECORDING_ROOT / 'bombcell' / "bombcell_DEFAULT"
BOMBCELL_NP20_ROOT = RECORDING_ROOT / 'bombcell'  / "bombcell_NP2.0"
BOMBCELL_SINGLEPROBE_ROOT = RECORDING_ROOT / 'bombcell'  / "bombcell_single_probe"


DEFAULT_KS_STAGING_ROOT = BOMBCELL_DEFAULT_ROOT 
NP20_KS_STAGING_ROOT = BOMBCELL_NP20_ROOT 
BOMBCELL_KS_SINGLEPROBE_STAGING_ROOT = BOMBCELL_SINGLEPROBE_ROOT 

DEFAULT_EXPORT_ROOT = BOMBCELL_DEFAULT_ROOT / "batch_DEFAULT_results"
NP20_EXPORT_ROOT = BOMBCELL_NP20_ROOT / "NP2_ReRun_results"
SINGLE_EXPORT_ROOT = BOMBCELL_SINGLEPROBE_ROOT / "single_probe_results"

# Make sure they exist
for p in [
    DEFAULT_KS_STAGING_ROOT,
    NP20_KS_STAGING_ROOT,
    BOMBCELL_KS_SINGLEPROBE_STAGING_ROOT,
    DEFAULT_EXPORT_ROOT,
    NP20_EXPORT_ROOT,
    SINGLE_EXPORT_ROOT

]:
    p.mkdir(parents=True, exist_ok=True)
    print('ALL Paths Exist')


ALL Paths Exist
ALL Paths Exist
ALL Paths Exist
ALL Paths Exist
ALL Paths Exist
ALL Paths Exist


In [3]:
# =========================
# Configure
# =========================
from pathlib import Path
import pandas as pd
import numpy as np
import json

PROBES_ALL  = ["A","B","C","D","E","F"]
PROBES_NP20 = ["A","C","D"]

print("RECORDING_ROOT:", RECORDING_ROOT)
print("DEFAULT_EXPORT_ROOT exists:", DEFAULT_EXPORT_ROOT.exists())
print(DEFAULT_EXPORT_ROOT)
print("NP20_EXPORT_ROOT exists:", NP20_EXPORT_ROOT.exists())
print(NP20_EXPORT_ROOT)
print("SINGLE_EXPORT_ROOT exists:", SINGLE_EXPORT_ROOT.exists())
print(SINGLE_EXPORT_ROOT)

RECORDING_ROOT: H:\Grant\Neuropixels\Kilosort_Recordings\Reach15_20260201_session007_NP_Recording_Number02_2026-02-01_18-25-00
DEFAULT_EXPORT_ROOT exists: True
H:\Grant\Neuropixels\Kilosort_Recordings\Reach15_20260201_session007_NP_Recording_Number02_2026-02-01_18-25-00\bombcell\bombcell_DEFAULT\batch_DEFAULT_results
NP20_EXPORT_ROOT exists: True
H:\Grant\Neuropixels\Kilosort_Recordings\Reach15_20260201_session007_NP_Recording_Number02_2026-02-01_18-25-00\bombcell\bombcell_NP2.0\NP2_ReRun_results
SINGLE_EXPORT_ROOT exists: True
H:\Grant\Neuropixels\Kilosort_Recordings\Reach15_20260201_session007_NP_Recording_Number02_2026-02-01_18-25-00\bombcell\bombcell_single_probe\single_probe_results


In [4]:
# =========================
# Helpers
# =========================
def load_probe_exports(export_root: Path, probe: str):
    # Loads Probe_{probe} exports: quality_metrics.csv, unit_type_counts.csv, param.json, checks.json.
    probe_dir = export_root / f"Probe_{probe}"
    qm_path = probe_dir / f"Probe_{probe}_quality_metrics.csv"
    counts_path = probe_dir / f"Probe_{probe}_unit_type_counts.csv"
    param_path = probe_dir / f"Probe_{probe}_param.json"
    checks_path = probe_dir / f"Probe_{probe}_checks.json"
    err_path = probe_dir / "ERROR.txt"

    if err_path.exists():
        return {"probe": probe, "status": "FAILED", "error": err_path.read_text(), "probe_dir": probe_dir}

    out = {"probe": probe, "status": "OK", "probe_dir": probe_dir}
    out["qm"] = pd.read_csv(qm_path) if qm_path.exists() else None
    out["counts"] = pd.read_csv(counts_path) if counts_path.exists() else None
    out["param"] = json.loads(param_path.read_text()) if param_path.exists() else {}
    out["checks"] = json.loads(checks_path.read_text()) if checks_path.exists() else {}

    out["cluster_id_col"] = None
    if out["qm"] is not None:
        for c in ["cluster_id","clusterID","cluster_id_ks","cluster_id_phy","cluster"]:
            if c in out["qm"].columns:
                out["cluster_id_col"] = c
                break

    return out

def load_batch_summary(export_root: Path):
    p = export_root / "batch_summary.csv"
    return pd.read_csv(p) if p.exists() else None

def summarize_unit_types(qm: pd.DataFrame, label_col="Bombcell_unit_type"):
    if qm is None or label_col not in qm.columns:
        return None
    return qm[label_col].value_counts().rename_axis("unit_type").reset_index(name="count")

def add_percentages(df_counts: pd.DataFrame):
    if df_counts is None or df_counts.empty:
        return df_counts
    total = df_counts["count"].sum()
    df_counts = df_counts.copy()
    df_counts["pct"] = 100 * df_counts["count"] / total
    return df_counts

def find_cluster_row(qm: pd.DataFrame, cluster_id: int, cluster_id_col: str):
    if qm is None:
        raise ValueError("qm is None")
    if cluster_id_col is None or cluster_id_col not in qm.columns:
        raise ValueError("No cluster_id column found in quality_metrics.csv")
    sub = qm.loc[qm[cluster_id_col] == cluster_id]
    if sub.empty:
        raise KeyError(f"Cluster id {cluster_id} not found in {cluster_id_col}")
    return sub.iloc[0]

def threshold_fail_report(row, qm_cols, param):
    # Common Bombcell gates; only checks metrics that exist in the CSV.
    rules = [
        ("rawAmplitude", "<", param.get("minAmplitude", 40)),
        ("signalToNoiseRatio", "<", param.get("minSNR", 5)),
        ("presenceRatio", "<", param.get("minPresenceRatio", 0.7)),
        ("fractionRPVs_estimatedTauR", ">", param.get("maxRPVviolations", 0.1)),
        ("percentageSpikesMissing_gaussian", ">", param.get("maxPercSpikesMissing", 20)),
        ("waveformDuration_peakTrough", "<", param.get("minWvDuration", 100)),
        ("waveformDuration_peakTrough", ">", param.get("maxWvDuration", 1150)),
        ("nPeaks", ">", param.get("maxNPeaks", 2)),
        ("nTroughs", ">", param.get("maxNTroughs", 1)),
        ("waveformBaselineFlatness", ">", param.get("maxWvBaselineFraction", 0.3)),
    ]
    fails = []
    for col, op, thr in rules:
        if col not in qm_cols:
            continue
        v = row[col]
        if pd.isna(v):
            continue
        if (op == "<" and v < thr) or (op == ">" and v > thr):
            fails.append((col, float(v), op, float(thr)))
    return fails

## Load DEFAULT exports (all probes)

In [5]:
default_summary = load_batch_summary(DEFAULT_EXPORT_ROOT)
default_summary

Unnamed: 0,probe,status,ks_dir,save_path,n_NOISE,n_MUA,n_NON-SOMA,n_GOOD,max_raw_metric_nan_frac
0,A,OK,H:\Grant\Neuropixels\Kilosort_Recordings\Reach...,H:\Grant\Neuropixels\Kilosort_Recordings\Reach...,392,109,51,2,0.012635
1,B,OK,H:\Grant\Neuropixels\Kilosort_Recordings\Reach...,H:\Grant\Neuropixels\Kilosort_Recordings\Reach...,247,764,286,19,0.00228
2,C,OK,H:\Grant\Neuropixels\Kilosort_Recordings\Reach...,H:\Grant\Neuropixels\Kilosort_Recordings\Reach...,154,135,30,36,0.014085
3,D,OK,H:\Grant\Neuropixels\Kilosort_Recordings\Reach...,H:\Grant\Neuropixels\Kilosort_Recordings\Reach...,93,540,109,78,0.002439
4,E,OK,H:\Grant\Neuropixels\Kilosort_Recordings\Reach...,H:\Grant\Neuropixels\Kilosort_Recordings\Reach...,100,394,251,119,0.008102
5,F,OK,H:\Grant\Neuropixels\Kilosort_Recordings\Reach...,H:\Grant\Neuropixels\Kilosort_Recordings\Reach...,95,273,210,113,0.007236


In [6]:
default_data = {p: load_probe_exports(DEFAULT_EXPORT_ROOT, p) for p in PROBES_ALL}

for p in PROBES_ALL:
    d = default_data[p]
    print("="*60, f"Probe {p} ({d['status']})")
    if d["status"] != "OK":
        print(d.get("error",""))
        continue
    counts = add_percentages(summarize_unit_types(d["qm"]))
    display(counts)



Unnamed: 0,unit_type,count,pct
0,NOISE,392,70.758123
1,MUA,109,19.67509
2,NON-SOMA,51,9.205776
3,GOOD,2,0.361011




Unnamed: 0,unit_type,count,pct
0,MUA,764,58.054711
1,NON-SOMA,286,21.732523
2,NOISE,247,18.768997
3,GOOD,19,1.443769




Unnamed: 0,unit_type,count,pct
0,NOISE,154,43.380282
1,MUA,135,38.028169
2,GOOD,36,10.140845
3,NON-SOMA,30,8.450704




Unnamed: 0,unit_type,count,pct
0,MUA,540,65.853659
1,NON-SOMA,109,13.292683
2,NOISE,93,11.341463
3,GOOD,78,9.512195




Unnamed: 0,unit_type,count,pct
0,MUA,394,45.601852
1,NON-SOMA,251,29.050926
2,GOOD,119,13.773148
3,NOISE,100,11.574074




Unnamed: 0,unit_type,count,pct
0,MUA,273,39.507959
1,NON-SOMA,210,30.390738
2,GOOD,113,16.353111
3,NOISE,95,13.748191


## Load NP2.0 rerun exports (A/C/D)

In [7]:
np20_summary = load_batch_summary(NP20_EXPORT_ROOT)
np20_summary

Unnamed: 0,probe,status,ks_dir,save_path,n_NOISE,n_MUA,n_NON-SOMA,n_GOOD,max_raw_metric_nan_frac,error
0,A,OK,H:\Grant\Neuropixels\Kilosort_Recordings\Reach...,H:\Grant\Neuropixels\Kilosort_Recordings\Reach...,297.0,178.0,72.0,7.0,0.012635,
1,C,OK,H:\Grant\Neuropixels\Kilosort_Recordings\Reach...,H:\Grant\Neuropixels\Kilosort_Recordings\Reach...,128.0,129.0,39.0,59.0,0.014085,
2,D,FAILED,,,,,,,,No quality_metrics found (unknown failure)


In [8]:
np20_data = {p: load_probe_exports(NP20_EXPORT_ROOT, p) for p in PROBES_NP20}

for p in PROBES_NP20:
    d = np20_data[p]
    print("="*60, f"Probe {p} ({d['status']})")
    if d["status"] != "OK":
        print(d.get("error",""))
        continue
    counts = add_percentages(summarize_unit_types(d["qm"]))
    display(counts)



Unnamed: 0,unit_type,count,pct
0,NOISE,297,53.610108
1,MUA,178,32.129964
2,NON-SOMA,72,12.99639
3,GOOD,7,1.263538




Unnamed: 0,unit_type,count,pct
0,MUA,129,36.338028
1,NOISE,128,36.056338
2,GOOD,59,16.619718
3,NON-SOMA,39,10.985915


No quality_metrics found (unknown failure)


## Load single probe data

In [9]:
single_summary = load_batch_summary(SINGLE_EXPORT_ROOT)
single_summary

print(SINGLE_EXPORT_ROOT)

H:\Grant\Neuropixels\Kilosort_Recordings\Reach15_20260201_session007_NP_Recording_Number02_2026-02-01_18-25-00\bombcell\bombcell_single_probe\single_probe_results


In [10]:
single_probe_data = {p: load_probe_exports(SINGLE_EXPORT_ROOT, p) for p in PROBES_ALL}

for p in PROBES_ALL:
    d = single_probe_data[p]
    print("="*60, f"Probe {p} ({d['status']})")
    if d["status"] != "OK":
        print(d.get("error",""))
        continue
    counts = add_percentages(summarize_unit_types(d["qm"]))
    display(counts)



None



Unnamed: 0,unit_type,count,pct
0,MUA,747,56.762918
1,NON-SOMA,286,21.732523
2,NOISE,247,18.768997
3,GOOD,36,2.735562




None



None



None



None

## Compare DEFAULT vs Single rerun

In [13]:
rows = []
for p in PROBES_ALL:
    d0 = default_data.get(p, {})
    d1 = single_probe_data.get(p, {})
    if d0.get("status") != "OK" or d1.get("status") != "OK":
        continue

    c0 = summarize_unit_types(d0["qm"])
    c1 = summarize_unit_types(d1["qm"])

    def _get(ct, name):
        if ct is None: 
            return 0
        sub = ct.loc[ct["unit_type"] == name, "count"]
        return int(sub.iloc[0]) if len(sub) else 0

    rows.append({
        "probe": p,
        "DEFAULT_GOOD": _get(c0,"GOOD"),
        "RERUN_GOOD": _get(c1,"GOOD"),
        "DEFAULT_MUA": _get(c0,"MUA"),
        "RERUN_MUA": _get(c1,"MUA"),
        "DEFAULT_NOISE": _get(c0,"NOISE"),
        "RERUN_NOISE": _get(c1,"NOISE"),
        "DEFAULT_NON-SOMA": _get(c0,"NON-SOMA"),
        "RERUN_NON-SOMA": _get(c1,"NON-SOMA"),
        "DEFAULT_TOTAL": len(d0["qm"]),
        # "RERUN_TOTAL": len(d1["qm"]),
    })
pd.DataFrame(rows)

Unnamed: 0,probe,DEFAULT_GOOD,RERUN_GOOD,DEFAULT_MUA,RERUN_MUA,DEFAULT_NOISE,RERUN_NOISE,DEFAULT_NON-SOMA,RERUN_NON-SOMA,DEFAULT_TOTAL
0,A,2,0,109,0,392,0,51,0,554
1,B,19,36,764,747,247,247,286,286,1316
2,C,36,0,135,0,154,0,30,0,355
3,D,78,0,540,0,93,0,109,0,820
4,E,119,0,394,0,100,0,251,0,864
5,F,113,0,273,0,95,0,210,0,691


## Compare DEFAULT vs NP2.0 rerun (A/C/D)

In [11]:
rows = []
for p in PROBES_NP20:
    d0 = default_data.get(p, {})
    d1 = np20_data.get(p, {})
    if d0.get("status") != "OK" or d1.get("status") != "OK":
        continue

    c0 = summarize_unit_types(d0["qm"])
    c1 = summarize_unit_types(d1["qm"])

    def _get(ct, name):
        if ct is None: 
            return 0
        sub = ct.loc[ct["unit_type"] == name, "count"]
        return int(sub.iloc[0]) if len(sub) else 0

    rows.append({
        "probe": p,
        "DEFAULT_GOOD": _get(c0,"GOOD"),
        "RERUN_GOOD": _get(c1,"GOOD"),
        "DEFAULT_MUA": _get(c0,"MUA"),
        "RERUN_MUA": _get(c1,"MUA"),
        "DEFAULT_NOISE": _get(c0,"NOISE"),
        "RERUN_NOISE": _get(c1,"NOISE"),
        "DEFAULT_NON-SOMA": _get(c0,"NON-SOMA"),
        "RERUN_NON-SOMA": _get(c1,"NON-SOMA"),
        "DEFAULT_TOTAL": len(d0["qm"]),
        "RERUN_TOTAL": len(d1["qm"]),
    })
pd.DataFrame(rows)

Unnamed: 0,probe,DEFAULT_GOOD,RERUN_GOOD,DEFAULT_MUA,RERUN_MUA,DEFAULT_NOISE,RERUN_NOISE,DEFAULT_NON-SOMA,RERUN_NON-SOMA,DEFAULT_TOTAL,RERUN_TOTAL
0,A,2,7,109,178,392,297,51,72,554,554
1,C,36,59,135,129,154,128,30,39,355,355


## Check single units 

#### Options for BC_SESSION

In [None]:
BOMBCELL_DEFAULT_ROOT = RECORDING_ROOT / 'bombcell' / "bombcell_DEFAULT"
BOMBCELL_NP20_ROOT = RECORDING_ROOT / 'bombcell'  / "bombcell_NP2.0"
BOMBCELL_SINGLEPROBE_ROOT = RECORDING_ROOT / 'bombcell'  / "bombcell_single_probe"

#### Set Probe and BC session

In [None]:
probe_letter = 'B'
BC_SESSION = BOMBCELL_SINGLEPROBE_ROOT

In [None]:

ks_dir = Path(fr"{BC_SESSION}\kilosort4_{probe_letter}")

if ks_dir.exists():
    print(f'Found BC session for probe {probe_letter}')
    print(ks_dir)
else:
    print('No Dir Found')


Found BC session for probe B
H:\Grant\Neuropixels\Kilosort_Recordings\Reach15_20260201_session007_NP_Recording_Number02_2026-02-01_18-25-00\bombcell\bombcell_single_probe\kilosort4_B


#### Select single unit to check

In [None]:
from pathlib import Path
import pandas as pd

cluster_id = 39

label_tsv = ks_dir / "cluster_bc_unitType.tsv"
df = pd.read_csv(label_tsv, sep="\t")

# Column is usually 'cluster_id' or 'id'
print(df.columns)

# Try both common names:
if "cluster_id" in df.columns:S
    print(df.loc[df["cluster_id"] == cluster_id])
elif "id" in df.columns:
    print(df.loc[df["id"] == cluster_id])
else:
    raise ValueError("Unexpected columns; print(df.head()) and inspect.")


Index(['cluster_id', 'bc_unitType'], dtype='str')
    cluster_id bc_unitType
39          39         MUA


## Drill-down: why Bombcell labeled a specific cluster as MUA (or not GOOD)

This reports which thresholds are violated for a chosen `probe` and `cluster_id`.

In [31]:
probe = "B"          # <-- edit
cluster_id = 39      # <-- edit
run = "SINGLE"      # "DEFAULT" or "NP20" or 'SINGLE'

if run == "DEFAULT":
    d = default_data[probe] 
if run == "NP20":
    d = np20_data[probe] 
if run == "SINGLE":
    d = single_probe_data[probe] 

d.keys()

dict_keys(['probe', 'status', 'probe_dir', 'qm', 'counts', 'param', 'checks', 'cluster_id_col'])

In [None]:
probe = "B"          # <-- edit
cluster_id = 39      # <-- edit
run = "SINGLE"      # "DEFAULT" or "NP20" or 'SINGLE'


if run == "DEFAULT":
    d = default_data[probe] 
if run == "NP20":
    d = np20_data[probe] 
if run == "SINGLE":
    d = single_probe_data[probe] 


qm = d["qm"]
param = d["param"]
cluster_id_col = d.get("cluster_id_col", None)

print("Run:", run)

print("Probe:", probe)

row = find_cluster_row(qm,cluster_id,cluster_id_col)
print("Bombcell label:", row.get("Bombcell_unit_type", "UNKNOWN"))

fails = threshold_fail_report(row, qm.columns, param)

print("\n---- FAILING GATES ----")
if not fails:
    print("No fails among common checks; expand rules or inspect full row.")
else:
    for col, v, op, thr in fails:
        print(f"{col:35s} {v:>10.4f}  FAIL ({op}{thr})")

print("\n---- Key values ----")
key_cols = [
    "rawAmplitude","signalToNoiseRatio","presenceRatio",
    "fractionRPVs_estimatedTauR","percentageSpikesMissing_gaussian",
    "waveformDuration_peakTrough","nPeaks","nTroughs","waveformBaselineFlatness"
]
for c in key_cols:
    if c in qm.columns:
        print(f"{c:35s} {row[c]}")

Run: SINGLE
Probe: B
cluster_id_col: None


TypeError: find_cluster_row() missing 1 required positional argument: 'cluster_id_col'

## Distributions (RPV, presenceRatio)

In [None]:
import matplotlib.pyplot as plt

probe = "A"      # <-- edit
run = "NP20"     # "DEFAULT" or "NP20"

d = default_data[probe] if run == "DEFAULT" else np20_data[probe]
qm = d["qm"]

for col in ["fractionRPVs_estimatedTauR", "presenceRatio"]:
    if col in qm.columns:
        plt.figure()
        plt.hist(qm[col].dropna(), bins=50)
        plt.title(f"{probe} {run}: {col}")
        plt.xlabel(col); plt.ylabel("count")
        plt.show()

## Metric-by-label medians

In [None]:
probe = "A"      # <-- edit
run = "NP20"     # "DEFAULT" or "NP20"

d = default_data[probe] if run == "DEFAULT" else np20_data[probe]
qm = d["qm"]

metrics = ["fractionRPVs_estimatedTauR","presenceRatio","rawAmplitude","signalToNoiseRatio"]
present = [m for m in metrics if m in qm.columns]
qm.groupby("Bombcell_unit_type")[present].median()

## Compact overview table

In [None]:
def overview_table(data_dict: dict):
    rows = []
    for p, d in data_dict.items():
        if d.get("status") != "OK":
            rows.append({"probe": p, "status": "FAILED"})
            continue
        qm = d["qm"]
        counts = qm["Bombcell_unit_type"].value_counts()
        total = len(qm)
        row = {
            "probe": p,
            "status": "OK",
            "n_total": int(total),
            "n_GOOD": int(counts.get("GOOD",0)),
            "pct_GOOD": 100*float(counts.get("GOOD",0))/total if total else np.nan,
        }
        for m in ["fractionRPVs_estimatedTauR","presenceRatio","rawAmplitude","signalToNoiseRatio"]:
            if m in qm.columns:
                row[f"median_{m}"] = float(qm[m].median())
        rows.append(row)
    return pd.DataFrame(rows)

default_overview = overview_table(default_data)
np20_overview = overview_table(np20_data)

print("DEFAULT overview")
display(default_overview.sort_values("probe"))

print("NP2.0 rerun overview")
display(np20_overview.sort_values("probe"))