# Export Classification Reason to Phy

This notebook extracts the main classification reason for each unit from Bombcell results and exports it as a TSV file that can be viewed in Phy's cluster view tab.

The classification reason shows why each unit was classified as GOOD, NOISE, MUA, or NON-SOMA.

## Usage:
1. Set the configuration parameters below (RUN_MODE, TARGET_PROBE, etc.)
2. Run all cells
3. The notebook will create a `cluster_bc_classificationReason.tsv` file in your Kilosort directory
4. Open the data in Phy - you'll see a new "bc_classificationReason" column in the cluster view

## Configuration

In [1]:
# Configuration - CHANGE THESE VALUES FOR YOUR DATA
CONFIG_FILE = r'C:\Users\user\Documents\github\bombcell\py_bombcell\grant\configs\grant_recording_config_reach15_20260201_session007.json'
RUN_MODE = 'single_probe'  # 'batch', 'single_probe', or 'np20_rerun'
TARGET_PROBE = 'B'  # Only used for single_probe mode

# Optional: Set to True to see detailed information about each unit's classification
VERBOSE = True

## Setup and Load Data

In [2]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd

# Add parent directory to path for grant_config import
sys.path.insert(0, str(Path.cwd().parent))
from grant_config import load_grant_config

import bombcell as bc

âœ… ipywidgets available - interactive GUI ready


In [3]:
# Load configuration
cfg = load_grant_config(CONFIG_FILE)

# Determine staging root based on run mode
if RUN_MODE == 'batch':
    staging_root = cfg['default_ks_staging_root']
elif RUN_MODE == 'np20_rerun':
    staging_root = cfg['np20_ks_staging_root']
else:  # single_probe
    staging_root = cfg['bombcell_singleprobe_root']

# Build paths
ks_dir = Path(staging_root) / f'kilosort4_{TARGET_PROBE}'
save_path = ks_dir / 'bombcell'

print('Kilosort directory:', ks_dir)
print('Bombcell save path:', save_path)
print()
print('TSV file will be saved to:', ks_dir / 'cluster_bc_classificationReason.tsv')

Kilosort directory: H:\Grant\Neuropixels\Kilosort_Recordings\Reach15_20260201_session007_NP_Recording_Number02_2026-02-01_18-25-00\bombcell\bombcell_single_probe\kilosort4_B
Bombcell save path: H:\Grant\Neuropixels\Kilosort_Recordings\Reach15_20260201_session007_NP_Recording_Number02_2026-02-01_18-25-00\bombcell\bombcell_single_probe\kilosort4_B\bombcell

TSV file will be saved to: H:\Grant\Neuropixels\Kilosort_Recordings\Reach15_20260201_session007_NP_Recording_Number02_2026-02-01_18-25-00\bombcell\bombcell_single_probe\kilosort4_B\cluster_bc_classificationReason.tsv


## Verify current cluster_info.tsv 

In [13]:
from pathlib import Path
import pandas as pd

# --- 1) Confirm export contents ---
export_path = ks_dir / "cluster_bc_classificationReason.tsv"
print("EXPORT:", export_path)

exp = pd.read_csv(export_path, sep="\t")
print("Export columns:", exp.columns.tolist())
assert "cluster_id" in exp.columns
assert "bc_unitType" in exp.columns
assert "bc_classificationReason" in exp.columns
assert exp["cluster_id"].is_unique

bad_exp = exp[(exp["bc_unitType"] == "GOOD") & (exp["bc_classificationReason"].astype(str).str.startswith(("MUA:", "NOISE:", "NON-SOMA:")))]
print(f"Export: GOOD w/ non-GOOD reason = {len(bad_exp)}")
if len(bad_exp): display(bad_exp.head(20))

# --- 2) Find conflicting cluster_*.tsv sources ---
cluster_tsvs = sorted(ks_dir.glob("cluster_*.tsv"))
print("\ncluster_*.tsv files:")
for p in cluster_tsvs:
    try:
        cols = pd.read_csv(p, sep="\t", nrows=1).columns.tolist()
        hits = [c for c in ["bc_unitType", "bc_classificationReason"] if c in cols]
        tag = f"  <-- defines {hits}" if hits else ""
        print(f"  {p.name}{tag}")
    except Exception as e:
        print(f"  {p.name}  (could not read: {e})")

# --- 3) Compare Phy's cached cluster_info.tsv to export ---
cluster_info_path = ks_dir / "cluster_info.tsv"
print("\nCLUSTER_INFO:", cluster_info_path)

if cluster_info_path.exists():
    ci = pd.read_csv(cluster_info_path, sep="\t")
    # Phy uses 'id' in cluster_info.tsv; export uses 'cluster_id'
    if "id" in ci.columns:
        merged = ci.merge(
            exp.rename(columns={"cluster_id": "id"}),
            on="id",
            how="left",
            suffixes=("_phy", "_export"),
        )

        # show mismatches where both values exist
        m1 = merged["bc_unitType_phy"].astype(str) != merged["bc_unitType_export"].astype(str) if "bc_unitType_phy" in merged.columns else pd.Series(False, index=merged.index)
        m2 = merged["bc_classificationReason_phy"].astype(str) != merged["bc_classificationReason_export"].astype(str) if "bc_classificationReason_phy" in merged.columns else pd.Series(False, index=merged.index)

        mism = merged[m1 | m2].copy()
        cols_show = [c for c in ["id", "bc_unitType_phy", "bc_unitType_export", "bc_classificationReason_phy", "bc_classificationReason_export"] if c in mism.columns]
        print(f"Mismatched rows vs export: {len(mism)}")
        if len(mism): display(mism[cols_show].head(50))
    else:
        print("cluster_info.tsv has no 'id' column; cannot compare.")
else:
    print("cluster_info.tsv does not exist (Phy hasn't generated it yet).")


EXPORT: H:\Grant\Neuropixels\Kilosort_Recordings\Reach15_20260201_session007_NP_Recording_Number02_2026-02-01_18-25-00\bombcell\bombcell_single_probe\kilosort4_B\cluster_bc_classificationReason.tsv
Export columns: ['cluster_id', 'bc_unitType', 'bc_classificationReason']
Export: GOOD w/ non-GOOD reason = 0

cluster_*.tsv files:
  cluster_Amplitude.tsv
  cluster_bc_classificationReason.tsv  <-- defines ['bc_unitType', 'bc_classificationReason']
  cluster_bc_unitType.tsv  <-- defines ['bc_unitType']
  cluster_ContamPct.tsv
  cluster_cumDriftEstimate.tsv
  cluster_fractionRPVs_estimatedTauR.tsv
  cluster_group.tsv
  cluster_isolationDistance.tsv
  cluster_KSLabel.tsv
  cluster_Lratio.tsv
  cluster_mainPeak_before_width.tsv
  cluster_mainPeakToTroughRatio.tsv
  cluster_mainTrough_width.tsv
  cluster_maxDriftEstimate.tsv
  cluster_nPeaks.tsv
  cluster_nTroughs.tsv
  cluster_peak1ToPeak2Ratio.tsv
  cluster_percentageSpikesMissing_gaussian.tsv
  cluster_percentageSpikesMissing_symmetric.tsv
  

## Load Bombcell Results

In [14]:
# Load Bombcell results
param, quality_metrics, _ = bc.load_bc_results(str(save_path))

# Get unit classifications
unit_type, unit_type_string = bc.qm.get_quality_unit_type(param, quality_metrics)

# Create DataFrame
qm_df = pd.DataFrame(quality_metrics).copy()
qm_df['bombcell_label'] = unit_type_string
qm_df['unit_index'] = np.arange(len(qm_df))

# Get cluster IDs
if 'cluster_id' not in qm_df.columns:
    if 'unique_templates' in param:
        qm_df['cluster_id'] = param['unique_templates']
    elif 'phy_clusterID' in quality_metrics:
        qm_df['cluster_id'] = quality_metrics['phy_clusterID']
    else:
        qm_df['cluster_id'] = qm_df['unit_index']

print(f'Loaded {len(qm_df)} units')
print('\nLabel distribution:')
print(qm_df['bombcell_label'].value_counts(dropna=False))

Loaded 1316 units

Label distribution:
bombcell_label
MUA         747
NON-SOMA    286
NOISE       247
GOOD         36
Name: count, dtype: int64


## Define Classification Reason Extraction Logic

In [15]:
def col(name):
    """Helper function to safely get column from DataFrame"""
    if name in qm_df.columns:
        return qm_df[name]
    return pd.Series(np.nan, index=qm_df.index)

# Define NOISE failure conditions
noise_fail = {
    'nPeaks>maxNPeaks': col('nPeaks') > param['maxNPeaks'],
    'nTroughs>maxNTroughs': col('nTroughs') > param['maxNTroughs'],
    'wvDuration<minWvDuration': col('waveformDuration_peakTrough') < param['minWvDuration'],
    'wvDuration>maxWvDuration': col('waveformDuration_peakTrough') > param['maxWvDuration'],
    'baselineFlatness>maxWvBaselineFraction': col('waveformBaselineFlatness') > param['maxWvBaselineFraction'],
    'scndPeakToTroughRatio>maxScndPeakToTroughRatio_noise': col('scndPeakToTroughRatio') > param['maxScndPeakToTroughRatio_noise'],
}

# Add spatial decay checks if computed
if bool(param.get('computeSpatialDecay', False)):
    if bool(param.get('spDecayLinFit', False)):
        noise_fail['spatialDecaySlope<minSpatialDecaySlope'] = col('spatialDecaySlope') < param['minSpatialDecaySlope']
    else:
        noise_fail['spatialDecaySlope<minSpatialDecaySlopeExp'] = col('spatialDecaySlope') < param['minSpatialDecaySlopeExp']
        noise_fail['spatialDecaySlope>maxSpatialDecaySlopeExp'] = col('spatialDecaySlope') > param['maxSpatialDecaySlopeExp']

# Define MUA failure conditions
mua_fail = {
    'percentageSpikesMissing_gaussian>maxPercSpikesMissing': col('percentageSpikesMissing_gaussian') > param['maxPercSpikesMissing'],
    'nSpikes<minNumSpikes': col('nSpikes') < param['minNumSpikes'],
    'fractionRPVs_estimatedTauR>maxRPVviolations': col('fractionRPVs_estimatedTauR') > param['maxRPVviolations'],
    'presenceRatio<minPresenceRatio': col('presenceRatio') < param['minPresenceRatio'],
}

# Add raw waveform checks if computed
if bool(param.get('extractRaw', False)):
    mua_fail['rawAmplitude<minAmplitude'] = col('rawAmplitude') < param['minAmplitude']
    mua_fail['signalToNoiseRatio<minSNR'] = col('signalToNoiseRatio') < param['minSNR']

# Add drift checks if computed
if bool(param.get('computeDrift', False)):
    mua_fail['maxDriftEstimate>maxDrift'] = col('maxDriftEstimate') > param['maxDrift']

# Add distance metric checks if computed
if bool(param.get('computeDistanceMetrics', False)):
    mua_fail['isolationDistance<isoDmin'] = col('isolationDistance') < param['isoDmin']
    mua_fail['Lratio>lratioMax'] = col('Lratio') > param['lratioMax']

# Define NON-SOMA failure conditions
non_soma_fail = {
    'troughToPeak2Ratio<minTroughToPeak2Ratio_nonSomatic': col('troughToPeak2Ratio') < param['minTroughToPeak2Ratio_nonSomatic'],
    'mainPeak_before_width<minWidthFirstPeak_nonSomatic': col('mainPeak_before_width') < param['minWidthFirstPeak_nonSomatic'],
    'mainTrough_width<minWidthMainTrough_nonSomatic': col('mainTrough_width') < param['minWidthMainTrough_nonSomatic'],
    'peak1ToPeak2Ratio>maxPeak1ToPeak2Ratio_nonSomatic': col('peak1ToPeak2Ratio') > param['maxPeak1ToPeak2Ratio_nonSomatic'],
    'mainPeakToTroughRatio>maxMainPeakToTroughRatio_nonSomatic': col('mainPeakToTroughRatio') > param['maxMainPeakToTroughRatio_nonSomatic'],
}

print('Defined classification rules:')
print(f'  NOISE: {len(noise_fail)} criteria')
print(f'  MUA: {len(mua_fail)} criteria')
print(f'  NON-SOMA: {len(non_soma_fail)} criteria')

Defined classification rules:
  NOISE: 8 criteria
  MUA: 6 criteria
  NON-SOMA: 5 criteria


## Extract Classification Reasons

In [16]:
# Convert to numpy arrays for faster processing
noise_fail_np = {k: np.asarray(v, dtype=bool) for k, v in noise_fail.items()}
mua_fail_np = {k: np.asarray(v, dtype=bool) for k, v in mua_fail.items()}
nonsoma_fail_np = {k: np.asarray(v, dtype=bool) for k, v in non_soma_fail.items()}
labels_np = qm_df['bombcell_label'].astype(str).to_numpy()

def get_main_reason(i: int) -> str:
    """
    Get the main (first) classification reason for a unit.
    
    Returns the primary reason why the unit was classified as NOISE, MUA, NON-SOMA, or GOOD.
    """
    label = labels_np[i]
    
    # Find which criteria failed for this unit
    noise_hits = [k for k, v in noise_fail_np.items() if bool(v[i])]
    mua_hits = [k for k, v in mua_fail_np.items() if bool(v[i])]
    nonsoma_hits = [k for k, v in nonsoma_fail_np.items() if bool(v[i])]
    
    # Return the first (main) reason based on the label
    if label == 'NOISE':
        if noise_hits:
            return f'NOISE: {noise_hits[0]}'
        return 'NOISE'
    
    elif label in ('MUA', 'NON-SOMA MUA'):
        if mua_hits:
            return f'MUA: {mua_hits[0]}'
        return 'MUA'
    
    elif label in ('NON-SOMA', 'NON-SOMA GOOD'):
        if nonsoma_hits:
            return f'NON-SOMA: {nonsoma_hits[0]}'
        return 'NON-SOMA'
    
    elif label == 'GOOD':
        return 'GOOD: passed all thresholds'
    
    else:
        return f'{label}'

def get_all_reasons(i: int) -> str:
    """
    Get all classification reasons for a unit (for verbose output).
    
    Returns all reasons separated by ' | '.
    """
    label = labels_np[i]
    reasons = []

    noise_hits = [k for k, v in noise_fail_np.items() if bool(v[i])]
    mua_hits = [k for k, v in mua_fail_np.items() if bool(v[i])]
    nonsoma_hits = [k for k, v in nonsoma_fail_np.items() if bool(v[i])]

    if label == 'NOISE':
        reasons.extend([f'NOISE: {r}' for r in noise_hits] or ['NOISE'])
    elif label in ('MUA', 'NON-SOMA MUA'):
        reasons.extend([f'MUA: {r}' for r in mua_hits] or ['MUA'])
    elif label in ('NON-SOMA', 'NON-SOMA GOOD'):
        reasons.extend([f'NON-SOMA: {r}' for r in nonsoma_hits] or ['NON-SOMA'])

    if label == 'GOOD':
        reasons.append('GOOD: passed all thresholds')

    return ' | '.join(reasons) if reasons else label

# Extract main reason for each unit
qm_df['main_reason'] = [get_main_reason(i) for i in range(len(qm_df))]

print('Classification reasons extracted!')
print(f'\nExample reasons (first 5 units):')
for i in range(min(5, len(qm_df))):
    print(f"  Unit {i} ({labels_np[i]}): {qm_df['main_reason'].iloc[i]}")

Classification reasons extracted!

Example reasons (first 5 units):
  Unit 0 (MUA): MUA: fractionRPVs_estimatedTauR>maxRPVviolations
  Unit 1 (MUA): MUA: fractionRPVs_estimatedTauR>maxRPVviolations
  Unit 2 (NOISE): NOISE: scndPeakToTroughRatio>maxScndPeakToTroughRatio_noise
  Unit 3 (MUA): MUA: fractionRPVs_estimatedTauR>maxRPVviolations
  Unit 4 (GOOD): GOOD: passed all thresholds


In [17]:
# NEW CODE: build + export Phy cluster TSV with BOTH columns (and sanity checks)

# 1) Build export table keyed by cluster_id (Phy merges by id/cluster_id)
export_df = pd.DataFrame({
    "cluster_id": qm_df["cluster_id"].astype(int),
    "bc_unitType": qm_df["bombcell_label"].astype(str),
    "bc_classificationReason": qm_df["main_reason"].astype(str),
})

# 2) Sanity checks (prevents silent misalignment)
assert export_df["cluster_id"].is_unique, "Duplicate cluster_id in qm_df/export_df; Phy merge will be ambiguous"

# 3) (Optional but recommended) enforce definitional consistency:
# if you want GOOD units to never show MUA/NOISE reasons in Phy, uncomment:
# export_df["bc_classificationReason"] = export_df["bc_classificationReason"].where(
#     export_df["bc_unitType"] != "GOOD", "GOOD: passed all thresholds"
# )

# 4) Quick mismatch report inside export_df
bad = export_df[
    (export_df["bc_unitType"] == "GOOD")
    & (export_df["bc_classificationReason"].str.startswith(("MUA:", "NOISE:", "NON-SOMA:")))
]
print(f"GOOD units with non-GOOD reason in export: {len(bad)}")
if len(bad):
    display(bad.head(20))

# 5) Write TSV for Phy
out_path = ks_dir / "cluster_bc_classificationReason.tsv"
export_df.to_csv(out_path, sep="\t", index=False)
print(f"Wrote: {out_path}")
display(export_df.head(20))


df = pd.read_csv(ks_dir / "cluster_bc_classificationReason.tsv", sep="\t")

bad = df[(df["bc_unitType"] == "GOOD") & (df["bc_classificationReason"].astype(str).str.startswith(("MUA:", "NOISE:", "NON-SOMA:")))]
print(f"GOOD units with non-GOOD reason in TSV: {len(bad)}")
if len(bad):
    display(bad.head(20))

# Run with Phy CLOSED
cluster_info_path = ks_dir / "cluster_info.tsv"
if cluster_info_path.exists():
    cluster_info_path.unlink()
    print(f"Deleted {cluster_info_path} (Phy will regenerate on next open).")
else:
    print("cluster_info.tsv not found (nothing to delete).")

GOOD units with non-GOOD reason in export: 0
Wrote: H:\Grant\Neuropixels\Kilosort_Recordings\Reach15_20260201_session007_NP_Recording_Number02_2026-02-01_18-25-00\bombcell\bombcell_single_probe\kilosort4_B\cluster_bc_classificationReason.tsv


Unnamed: 0,cluster_id,bc_unitType,bc_classificationReason
0,0,MUA,MUA: fractionRPVs_estimatedTauR>maxRPVviolations
1,1,MUA,MUA: fractionRPVs_estimatedTauR>maxRPVviolations
2,2,NOISE,NOISE: scndPeakToTroughRatio>maxScndPeakToTrou...
3,3,MUA,MUA: fractionRPVs_estimatedTauR>maxRPVviolations
4,4,GOOD,GOOD: passed all thresholds
5,5,MUA,MUA: fractionRPVs_estimatedTauR>maxRPVviolations
6,6,NON-SOMA,NON-SOMA: troughToPeak2Ratio<minTroughToPeak2R...
7,7,NOISE,NOISE: scndPeakToTroughRatio>maxScndPeakToTrou...
8,8,MUA,MUA: fractionRPVs_estimatedTauR>maxRPVviolations
9,9,MUA,MUA: fractionRPVs_estimatedTauR>maxRPVviolations


GOOD units with non-GOOD reason in TSV: 0
cluster_info.tsv not found (nothing to delete).


## View Classification Reason Summary

In [18]:
# Show distribution of main reasons
print('Main reason distribution:')
print(qm_df['main_reason'].value_counts())
print()

# Show example units for each label type
if VERBOSE:
    print('\n' + '='*80)
    print('DETAILED VIEW: Example units for each classification')
    print('='*80)
    
    for label in ['GOOD', 'NOISE', 'MUA', 'NON-SOMA', 'NON-SOMA GOOD', 'NON-SOMA MUA']:
        subset = qm_df[qm_df['bombcell_label'] == label]
        if len(subset) > 0:
            print(f'\n{label} units (showing up to 3 examples):')
            for idx in subset.index[:3]:
                cluster_id = qm_df.loc[idx, 'cluster_id']
                main_reason = qm_df.loc[idx, 'main_reason']
                all_reasons = get_all_reasons(idx)
                print(f'  Cluster {cluster_id}:')
                print(f'    Main: {main_reason}')
                if ' | ' in all_reasons:
                    print(f'    All:  {all_reasons}')
                print()

Main reason distribution:
main_reason
MUA: fractionRPVs_estimatedTauR>maxRPVviolations                 573
NON-SOMA: troughToPeak2Ratio<minTroughToPeak2Ratio_nonSomatic    286
NOISE: scndPeakToTroughRatio>maxScndPeakToTroughRatio_noise      130
MUA: presenceRatio<minPresenceRatio                              106
NOISE: nTroughs>maxNTroughs                                       61
NOISE: spatialDecaySlope<minSpatialDecaySlopeExp                  39
GOOD: passed all thresholds                                       36
MUA: rawAmplitude<minAmplitude                                    26
MUA: percentageSpikesMissing_gaussian>maxPercSpikesMissing        23
MUA: nSpikes<minNumSpikes                                         19
NOISE: nPeaks>maxNPeaks                                           11
NOISE                                                              3
NOISE: wvDuration<minWvDuration                                    2
NOISE: spatialDecaySlope>maxSpatialDecaySlopeExp                 

## Export to Phy TSV File

In [22]:
# OLD CODE
# (whatever export code you currently have here that writes cluster_bc_classificationReason.tsv)


# NEW CODE: Full export cell (unit type + reason) + remove conflicting TSV
import pandas as pd

# Build export table keyed by cluster_id (Phy merges by id/cluster_id)
export_df = pd.DataFrame({
    "cluster_id": qm_df["cluster_id"].astype(int),
    "bc_unitType": qm_df["bombcell_label"].astype(str),
    "bc_classificationReason": qm_df["main_reason"].astype(str),
})

# Sanity checks (avoid ambiguous merges)
assert export_df["cluster_id"].is_unique, "Duplicate cluster_id in qm_df/export_df; Phy merge will be ambiguous"

# Optional: enforce definitional consistency (uncomment if desired)
# export_df["bc_classificationReason"] = export_df["bc_classificationReason"].where(
#     export_df["bc_unitType"] != "GOOD", "GOOD: passed all thresholds"
# )

# Quick mismatch report inside export_df
bad = export_df[
    (export_df["bc_unitType"] == "GOOD")
    & (export_df["bc_classificationReason"].str.startswith(("MUA:", "NOISE:", "NON-SOMA:")))
]
print(f"GOOD units with non-GOOD reason in export: {len(bad)}")
if len(bad):
    display(bad.head(20))

# Write combined TSV for Phy
out_path = ks_dir / "cluster_bc_classificationReason.tsv"
export_df.to_csv(out_path, sep="\t", index=False)
print("Wrote:", out_path)

# Remove conflicting single-column TSV so Phy can't mix sources
conflict_path = ks_dir / "cluster_bc_unitType.tsv"
if conflict_path.exists():
    conflict_path.unlink()
    print("Deleted conflicting file:", conflict_path)
else:
    print("No conflicting file found:", conflict_path)

# Preview
display(export_df.head(20))

# NEW CODE: Add ROI label into the SAME TSV for Phy

# --- Get ROI labels into a dict: cluster_id -> "IN_ROI"/"OUT_ROI" ---
if "roi_label_by_cluster_id" in globals():
    roi_map = roi_label_by_cluster_id

elif "roi_df" in globals():
    roi_map = dict(zip(roi_df["cluster_id"].astype(int), roi_df["roi_label"].astype(str)))

else:
    raise ValueError(
        "Could not find ROI labels. Need either:\n"
        "  roi_label_by_cluster_id (dict)\n"
        "or\n"
        "  roi_df with columns ['cluster_id','roi_label']"
    )

# --- Build export table ---
export_df = pd.DataFrame({
    "cluster_id": qm_df["cluster_id"].astype(int),
    "bc_unitType": qm_df["bombcell_label"].astype(str),
    "bc_classificationReason": qm_df["main_reason"].astype(str),
})

# --- Add ROI column ---
export_df["bc_ROI"] = export_df["cluster_id"].map(roi_map).fillna("UNKNOWN")

# --- Write TSV ---
out_path = ks_dir / "cluster_bc_classificationReason.tsv"
export_df.to_csv(out_path, sep="\t", index=False)
print("Wrote:", out_path)

# --- Preview ---
display(export_df[["cluster_id", "bc_unitType", "bc_ROI", "bc_classificationReason"]].head(30))


GOOD units with non-GOOD reason in export: 0
Wrote: H:\Grant\Neuropixels\Kilosort_Recordings\Reach15_20260201_session007_NP_Recording_Number02_2026-02-01_18-25-00\bombcell\bombcell_single_probe\kilosort4_B\cluster_bc_classificationReason.tsv
No conflicting file found: H:\Grant\Neuropixels\Kilosort_Recordings\Reach15_20260201_session007_NP_Recording_Number02_2026-02-01_18-25-00\bombcell\bombcell_single_probe\kilosort4_B\cluster_bc_unitType.tsv


Unnamed: 0,cluster_id,bc_unitType,bc_classificationReason
0,0,MUA,MUA: fractionRPVs_estimatedTauR>maxRPVviolations
1,1,MUA,MUA: fractionRPVs_estimatedTauR>maxRPVviolations
2,2,NOISE,NOISE: scndPeakToTroughRatio>maxScndPeakToTrou...
3,3,MUA,MUA: fractionRPVs_estimatedTauR>maxRPVviolations
4,4,GOOD,GOOD: passed all thresholds
5,5,MUA,MUA: fractionRPVs_estimatedTauR>maxRPVviolations
6,6,NON-SOMA,NON-SOMA: troughToPeak2Ratio<minTroughToPeak2R...
7,7,NOISE,NOISE: scndPeakToTroughRatio>maxScndPeakToTrou...
8,8,MUA,MUA: fractionRPVs_estimatedTauR>maxRPVviolations
9,9,MUA,MUA: fractionRPVs_estimatedTauR>maxRPVviolations


ValueError: Could not find ROI labels. Need either:
  roi_label_by_cluster_id (dict)
or
  roi_df with columns ['cluster_id','roi_label']

## Preview the Exported Data

In [20]:
# NEW CODE
cluster_info_path = ks_dir / "cluster_info.tsv"

if cluster_info_path.exists():
    cluster_info_df = pd.read_csv(cluster_info_path, sep="\t")

    # Columns to prioritize
    priority_cols = ["id", "ch", "bc_unitType", "KSLabel", "bc_classificationReason"]

    # Keep only columns that actually exist
    existing_priority = [c for c in priority_cols if c in cluster_info_df.columns]

    # Append remaining columns
    remaining_cols = [c for c in cluster_info_df.columns if c not in existing_priority]

    reordered_df = cluster_info_df[existing_priority + remaining_cols]

    print("cluster_info.tsv columns:")
    print(reordered_df.columns.tolist())
    print("\ncluster_info.tsv preview:")
    display(reordered_df.head(20))
else:
    print(f"cluster_info.tsv not found at: {cluster_info_path}")
    print("Phy may not have saved cluster_info.tsv yet (open Phy once and close it).")

cluster_info.tsv not found at: H:\Grant\Neuropixels\Kilosort_Recordings\Reach15_20260201_session007_NP_Recording_Number02_2026-02-01_18-25-00\bombcell\bombcell_single_probe\kilosort4_B\cluster_info.tsv
Phy may not have saved cluster_info.tsv yet (open Phy once and close it).


In [21]:
# Add RIGHT AFTER you create export_df (before to_csv)

# sanity: unique ids
assert export_df["cluster_id"].is_unique, "Duplicate cluster_id in export_df"

# if you have a qm_df row per cluster, this should also be unique
assert qm_df["cluster_id"].is_unique, "Duplicate cluster_id in qm_df (unexpected)"

# show any internal inconsistencies you may have introduced
bad = export_df[(export_df["bc_unitType"] == "GOOD") & (export_df["bc_classificationReason"].astype(str).str.startswith("MUA"))]
display(bad.head(20))
print(f"GOOD with MUA reason: {len(bad)}")

Unnamed: 0,cluster_id,bc_unitType,bc_classificationReason


GOOD with MUA reason: 0
