## Verifying Exception Cases of OECD PFAS in Merged Dataset

In [None]:
import pandas as pd

merged_tsv_path = "/teamspace/studios/this_studio/files/merged_massspec_nist20_mona_fold.tsv"

# Parse the tab-separated merged table from an explicitly opened text handle.
with open(merged_tsv_path, "r") as tsv_handle:
    df_merged = pd.read_csv(tsv_handle, sep="\t")

# Notebook preview of the first three rows.
df_merged.head(3)

In [4]:
from rdkit import Chem
from rdkit.Chem import rdchem

# Definition of PFAS  based on OECD: https://pubs.acs.org/doi/10.1021/acs.est.1c06896
def is_pfas(smiles: str) -> int:
    """Return 1 if the SMILES contains a qualifying CF3 / -CF2- carbon, else 0.

    A carbon qualifies when it is sp3, has only single bonds, carries no
    hydrogens (implicit or explicit), has no Cl/Br/I neighbour, and has either
    three or more fluorine neighbours, or two fluorine neighbours plus at
    least one non-fluorine neighbour.

    SMILES that RDKit cannot parse, and any exception raised while inspecting
    the molecule, yield 0.
    """
    blocking_halogens = ("Cl", "Br", "I")
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return 0

        for carbon in mol.GetAtoms():
            if carbon.GetAtomicNum() != 6:
                continue

            partners = carbon.GetNeighbors()
            symbols = [p.GetSymbol() for p in partners]
            fluorines = symbols.count("F")

            # Fewer than two fluorines can never be CF2 or CF3.
            if fluorines < 2:
                continue
            # An attached hydrogen or Cl/Br/I disqualifies the carbon.
            if carbon.GetTotalNumHs() > 0:
                continue
            if any(sym in blocking_halogens for sym in symbols):
                continue
            # Only saturated carbons count (rules out alkenes like TFE).
            if carbon.GetHybridization() != rdchem.HybridizationType.SP3:
                continue
            carbon_idx = carbon.GetIdx()
            has_multiple_bond = any(
                mol.GetBondBetweenAtoms(carbon_idx, p.GetIdx()).GetBondType()
                != rdchem.BondType.SINGLE
                for p in partners
            )
            if has_multiple_bond:
                continue

            # CF3 (three or more F) always counts; CF2 needs something other
            # than F attached so that it is a genuine "-CF2-" unit.
            if fluorines >= 3 or len(symbols) > fluorines:
                return 1

        return 0
    except Exception:
        return 0


In [None]:
# Label every row with is_pfas, keep the positives, and split them by fold.
df_merged["is_pfas"] = df_merged["smiles"].apply(is_pfas)
pfas_df = df_merged.loc[df_merged["is_pfas"] == 1]

df_train = pfas_df.loc[pfas_df["fold"] == "train"]
df_val = pfas_df.loc[pfas_df["fold"] == "val"]

# Row counts, i.e. number of PFAS spectra per fold.
print(f"train PFAS spectra {len(df_train)}")
print(f"val PFAS spectra {len(df_val)}")

In [None]:
# These are counts of distinct SMILES strings per fold, not spectra counts,
# so the labels say so (the row counts are printed with a "spectra" label).
print(f"train PFAS unique SMILES {df_train['smiles'].nunique()}")
print(f"val PFAS unique SMILES {df_val['smiles'].nunique()}")

In [None]:
#pfas #1
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Draw

# Select up to 50 PFAS-labelled rows at random (fixed seed for reproducibility).
# Capping n at the pool size avoids the ValueError that Series.sample raises
# when asked for more rows than exist. Sampling is over rows, so the same
# SMILES can be drawn more than once.
pfas_smiles_pool = df_merged.loc[df_merged["is_pfas"] == 1, "smiles"]
pfas_sample = pfas_smiles_pool.sample(n=min(50, len(pfas_smiles_pool)), random_state=42)

# Build dataframe with SMILES + RDKit Mol, skipping SMILES RDKit cannot parse
pfas_selected = []
for smi in pfas_sample:
    mol = Chem.MolFromSmiles(smi)
    if mol is not None:
        pfas_selected.append({"smiles": smi, "mol": mol})

# NOTE: this rebinds `pfas_df`; from here on it holds the drawn sample.
# Explicit columns keep the frame usable even when nothing was selected.
pfas_df = pd.DataFrame(pfas_selected, columns=["smiles", "mol"])

def draw_batch(df, start_idx, batch_size=5):
    """Draw one row of molecules taken from ``df`` starting at ``start_idx``.

    Takes up to ``batch_size`` rows by position, reads the RDKit molecules
    from the ``mol`` column and uses the ``smiles`` column as legends.
    Returns whatever ``Draw.MolsToGridImage`` returns.
    """
    window = df.iloc[start_idx:start_idx + batch_size]
    return Draw.MolsToGridImage(
        window["mol"].tolist(),
        molsPerRow=batch_size,
        subImgSize=(300, 300),
        legends=window["smiles"].tolist(),
    )

# Render four grids of up to five molecules each, starting at rows 0, 5, 10 and 49.
img1, img2, img3, img4 = (
    draw_batch(pfas_df, offset) for offset in (0, 5, 10, 49)
)
# Last expression of the cell: the notebook displays the first grid.
img1


In [None]:
#non-pfas #1
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Draw

# Distinct SMILES among the rows labelled non-PFAS.
non_pfas = df_merged.loc[df_merged["is_pfas"] == 0, "smiles"].drop_duplicates()

# Collect the first 30 parseable molecules that contain more than one fluorine atom.
selected = []
for smi in non_pfas:
    mol = Chem.MolFromSmiles(smi)
    if not mol:
        continue
    fluorine_count = sum(atom.GetSymbol() == "F" for atom in mol.GetAtoms())
    if fluorine_count <= 1:
        continue
    selected.append({"smiles": smi, "mol": mol, "fluorine_count": fluorine_count})
    if len(selected) == 30:
        break

# One row per selected molecule.
selected_df = pd.DataFrame(selected)

def draw_batch(df, start_idx, batch_size=5):
    """Draw one row of molecules with their fluorine counts as captions.

    Takes up to ``batch_size`` rows of ``df`` by position starting at
    ``start_idx``. Molecules come from the ``mol`` column; each legend is
    built from the ``smiles`` and ``fluorine_count`` columns. Returns
    whatever ``Draw.MolsToGridImage`` returns.
    """
    window = df.iloc[start_idx:start_idx + batch_size]
    captions = [
        f"{smi} | F count: {n_fluorine}"
        for smi, n_fluorine in zip(window["smiles"], window["fluorine_count"])
    ]
    return Draw.MolsToGridImage(
        window["mol"].tolist(),
        molsPerRow=batch_size,
        subImgSize=(300, 300),
        legends=captions,
    )

# Render the first four batches (rows 0-4, 5-9, 10-14 and 15-19) of the selection.
img1, img2, img3, img4 = (
    draw_batch(selected_df, offset) for offset in (0, 5, 10, 15)
)
# Last expression of the cell: the notebook displays the first grid.
img1

## Create Oversampled Dataset


In [None]:
import pandas as pd
from rdkit import Chem

# Load the PFAS-labeled table and discard leftover index columns when present.
pfas_labeled_tsv_path = '/teamspace/studios/this_studio/files/merged_massspec_nist20_with_pfas_fold.tsv'
stale_index_columns = ["Unnamed: 0.1", "Unnamed: 0"]

pfas_labeled_df = pd.read_csv(pfas_labeled_tsv_path, sep='\t').drop(
    columns=stale_index_columns, errors="ignore"
)

In [None]:
# Notebook preview: show the last 10 rows of pfas_labeled_df.
pfas_labeled_df.tail(10)

In [None]:
import pandas as pd
# Read the labeled TSV into `df` (no columns are dropped here) and show its first row.
labeled_fold_tsv_path = '/teamspace/studios/this_studio/files/merged_massspec_nist20_with_pfas_fold.tsv'
df = pd.read_csv(labeled_fold_tsv_path, sep='\t')
first_record = df.iloc[0]
print(first_record)

import pandas as pd

# --- Step 1: Record original size ---
orig_size = pfas_labeled_df.shape[0]

# --- Step 2: Split PFAS entries from everything else ---
is_pfas_mask = pfas_labeled_df["is_PFAS"] == 1
pfas_only = pfas_labeled_df[is_pfas_mask].copy()

# --- Step 3: Count spectra per formula ---
# value_counts() skips missing formulas; those rows are carried over in Step 4.
formula_counts = pfas_only["formula"].value_counts()

# --- Step 4: Oversample PFAS formulas with < 50 spectra ---
min_spectra = 50
oversampled_pfas = []
# Group once instead of re-filtering the whole frame for every formula.
pfas_by_formula = pfas_only.groupby("formula", sort=False)

for formula, count in formula_counts.items():
    subset = pfas_by_formula.get_group(formula)
    if count < min_spectra:
        n_to_sample = min_spectra - count
        # Draw with replacement: a formula may have fewer rows than are needed.
        sampled = subset.sample(n=n_to_sample, replace=True, random_state=42).copy()

        # --- Assign new identifiers to oversampled rows ---
        # The running position makes repeated draws of the same row distinct.
        sampled["identifier"] = [
            f"{ident}_oversampled_{idx+1}"
            for idx, ident in enumerate(sampled["identifier"])
        ]

        subset = pd.concat([subset, sampled], ignore_index=True)
    oversampled_pfas.append(subset)

# PFAS rows without a formula cannot be grouped; keep them as they are
# instead of silently dropping them from the output.
pfas_without_formula = pfas_only[pfas_only["formula"].isna()]
if not pfas_without_formula.empty:
    oversampled_pfas.append(pfas_without_formula)

# Combine oversampled PFAS (pd.concat raises on an empty list, so guard it)
if oversampled_pfas:
    pfas_oversampled = pd.concat(oversampled_pfas, ignore_index=True)
else:
    pfas_oversampled = pfas_only.iloc[0:0]

# --- Step 5: Merge oversampled PFAS back with the rest of the dataset ---
# "The rest" is every row not labelled 1, so rows with any other label value
# are kept and the "Added rows" figure below stays accurate.
pfas_labeled_balanced_df = pd.concat(
    [pfas_labeled_df[~is_pfas_mask], pfas_oversampled],
    ignore_index=True
)

# --- Step 6: Print summary ---
new_size = pfas_labeled_balanced_df.shape[0]

print(f"Original dataset size: {orig_size}")
print(f"New dataset size after PFAS oversampling: {new_size}")
print(f"Added rows: {new_size - orig_size}")

# --- Step 7: Sanity checks ---
# Ensure identifiers are unique
n_unique_ids = pfas_labeled_balanced_df["identifier"].nunique()
print(f"\nUnique identifiers after balancing: {n_unique_ids}")
if n_unique_ids != new_size:
    print(f"WARNING: {new_size - n_unique_ids} rows have a duplicated or missing identifier")

# Optional: verify all PFAS formulas now have ≥50 spectra
pfas_check = (
    pfas_labeled_balanced_df[pfas_labeled_balanced_df["is_PFAS"] == 1]
    .groupby("formula").size().reset_index(name="n_spectra")
)
print("\nPFAS spectra per formula (after oversampling):")
display(pfas_check.sort_values("n_spectra"))


In [None]:
# Notebook preview: show the last 50 rows of pfas_labeled_balanced_df.
pfas_labeled_balanced_df.tail(50)

In [None]:
# Write the balanced table as a tab-separated file, without the index column.
output_path = "/teamspace/studios/this_studio/files/merged_massspec_nist20_pfas_labeled_oversampled_df.tsv"
pfas_labeled_balanced_df.to_csv(output_path, sep="\t", index=False)

rows_written = pfas_labeled_balanced_df.shape[0]
print(f"✅ Balanced dataset saved to: {output_path}")
print(f"Total rows written: {rows_written}")


## Inspect False Negatives from PFAS training

In [None]:
# diagnostics_tp_fn.py
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Draw, Descriptors, rdMolDescriptors, Crippen
from rdkit.DataStructs import TanimotoSimilarity
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
import os
from PIL import Image

# ---------- User inputs ----------
# tp_ids: identifiers (strings) treated as True Positives
# fn_ids: identifiers (strings) treated as False Negatives
#
# Both lists are passed to run_diagnostics together with a DataFrame that is
# already in memory. Edit the lists to analyse a different set of spectra.
tp_ids = [
    "NIST20_3349061",
    "NIST20_3197688",
    "NIST20_1954706",
    "MassSpecGymID0032207",
    "MassSpecGymID0032224",
    "NIST20_3189483",
    "NIST20_3368685",
    "NIST20_3396690",
    "NIST20_3400320",
    "NIST20_1339338",
    "NIST20_1706666",
    "NIST20_3444702",
    "NIST20_3368723",
    "NIST20_1162721",
    "NIST20_3349049",
    "NIST20_1605487",
    "NIST20_1867165",
    "NIST20_1605146",
    "NIST20_3396705",
    "NIST20_1797469",
    "NIST20_3042258",
    "NIST20_3258839",
    "NIST20_3222033",
    "MassSpecGymID0039672",
    "NIST20_3042264",
    "NIST20_3190676",
    "NIST20_3253542",
    "NIST20_3471394",
    "NIST20_1435058",
    "NIST20_3130269"
]


# Identifiers treated as False Negatives.
fn_ids = [
    "MassSpecGymID0091026","NIST20_1480275","NIST20_1728579","MassSpecGymID0209908","MassSpecGymID0212883",
    "NIST20_1713101","MassSpecGymID0214836","NIST20_3433004","NIST20_1778823","NIST20_3137787",
    "NIST20_1155792","MassSpecGymID0219350","NIST20_3330498","NIST20_1589147","NIST20_1328300",
    "NIST20_3222024","NIST20_1417984","NIST20_1555121","NIST20_1402971","MassSpecGymID0039702",
    "NIST20_1251439","MassSpecGymID0081084","NIST20_3263875","NIST20_1506767","NIST20_3195296",
    "MassSpecGymID0235891","MassSpecGymID0179665","NIST20_1714898","MassSpecGymID0039673","NIST20_1605254"
]

# ---------- Options ----------
# Morgan fingerprint settings (defaults of mol_to_fp).
FP_NBITS = 2048
FP_RADIUS = 2

# Resolution setting for saved images.
IMAGE_DPI = 150

# Directory where images and CSVs are saved; created right away.
OUTDIR = "tp_fn_diagnostics"
os.makedirs(OUTDIR, exist_ok=True)

# When True, verify_and_fix_labels sets is_PFAS=1 for listed IDs whose label is not 1.
FIX_LABELS_IF_MISMATCH = False

# ---------- Helper functions ----------
def safe_get_row_by_identifier(df, identifier_col, ident):
    """Return the first row of ``df`` whose ``identifier_col`` equals ``ident``.

    Returns None when no row matches.
    """
    matches = df.loc[df[identifier_col] == ident]
    return matches.iloc[0] if len(matches) else None

def smiles_to_mol(smiles):
    """Parse ``smiles`` into an RDKit molecule, or return None.

    None is returned for missing values (as judged by ``pd.isna``), for the
    empty string, for SMILES RDKit cannot parse, and whenever parsing or the
    follow-up formula calculation raises.
    """
    is_blank = pd.isna(smiles) or smiles == ""
    if is_blank:
        return None
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        # Result is discarded; the call only has to succeed.
        rdMolDescriptors.CalcMolFormula(mol)
    except Exception:
        return None
    return mol

def compute_descriptors(mol, smiles=None):
    """Return a dict of simple descriptors for ``mol``.

    Keys: ``MolWt``, ``NumF``, ``NumHeavy``, ``NumRotatable``,
    ``NumAromaticRings``, ``LogP``. When ``mol`` is None every value is NaN.
    ``smiles`` is accepted but not used.

    Note that ``MolWt`` holds ``Descriptors.ExactMolWt`` (the monoisotopic
    mass).
    """
    if mol is None:
        descriptor_names = (
            "MolWt", "NumF", "NumHeavy", "NumRotatable", "NumAromaticRings", "LogP",
        )
        return dict.fromkeys(descriptor_names, np.nan)

    return {
        "MolWt": float(Descriptors.ExactMolWt(mol)),
        "NumF": int(sum(1 for atom in mol.GetAtoms() if atom.GetSymbol() == "F")),
        "NumHeavy": int(rdMolDescriptors.CalcNumHeavyAtoms(mol)),
        "NumRotatable": int(rdMolDescriptors.CalcNumRotatableBonds(mol)),
        "NumAromaticRings": int(rdMolDescriptors.CalcNumAromaticRings(mol)),
        "LogP": float(Crippen.MolLogP(mol)),
    }

def mol_to_fp(mol, radius=FP_RADIUS, nBits=FP_NBITS):
    """Return the Morgan bit-vector fingerprint of ``mol``, or None if ``mol`` is None."""
    return None if mol is None else GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)

def fp_tanimoto(fp1, fp2):
    """Return the Tanimoto similarity of two fingerprints as a float.

    Returns NaN when either fingerprint is None.
    """
    both_present = fp1 is not None and fp2 is not None
    return float(TanimotoSimilarity(fp1, fp2)) if both_present else np.nan

def draw_mols_grid(mols, legends=None, outpath=None, molsPerRow=5, subImgSize=(300,300)):
    """
    Draws RDKit molecules in a grid and optionally saves the grid as a PNG.

    Parameters:
        mols: iterable of RDKit molecules; any entry that is not a molecule
            (None, NaN, ...) is drawn as an empty placeholder cell.
        legends: optional list of captions, one per molecule.
        outpath: if truthy, path the PNG is written to.
        molsPerRow, subImgSize: forwarded to ``Draw.MolsToGridImage``.

    Returns the image object produced by ``Draw.MolsToGridImage``.
    """
    # Anything that is not a Mol (e.g. None or a NaN cell from a DataFrame)
    # would break the drawing call, so swap it for an empty molecule.
    mols_clean = [m if isinstance(m, Chem.Mol) else Chem.MolFromSmiles("") for m in mols]
    img = Draw.MolsToGridImage(mols_clean, molsPerRow=molsPerRow, subImgSize=subImgSize, legends=legends, useSVG=False)
    if outpath:
        if hasattr(img, "save"):
            # PIL image (returned outside the notebook integration): it has
            # no ``.data`` attribute, so save through PIL instead.
            img.save(outpath, dpi=(IMAGE_DPI, IMAGE_DPI))
        else:
            # IPython display image: ``.data`` holds the encoded PNG bytes.
            with open(outpath, "wb") as f:
                f.write(img.data)
    return img

# ---------- 1) Verify/fix labels ----------
def verify_and_fix_labels(df, tp_ids, fn_ids, identifier_col="identifier", fix=FIX_LABELS_IF_MISMATCH):
    """Check that every TP/FN identifier is present in ``df`` with ``is_PFAS == 1``.

    Parameters:
        df: table holding ``identifier_col`` and (optionally) ``is_PFAS``.
        tp_ids, fn_ids: identifiers to check; both groups are expected to be
            labelled 1.
        identifier_col: name of the identifier column.
        fix: when True, set ``is_PFAS = 1`` in ``df`` (in place) for every row
            of a mismatching identifier.

    Returns a DataFrame with one row per problem: ``identifier``,
    ``expected_label_type`` ("TP" or "FN") and ``found_is_PFAS``, which is
    "MISSING" when the identifier is absent, NaN when the label is missing,
    and the integer label otherwise. Only the first matching row is inspected.
    """
    mismatches = []
    for label_list, label_name in [(tp_ids, "TP"), (fn_ids, "FN")]:
        for ident in label_list:
            row = safe_get_row_by_identifier(df, identifier_col, ident)
            if row is None:
                mismatches.append((ident, label_name, "MISSING"))
                continue
            raw_label = row.get("is_PFAS", 0)
            # int(NaN) raises ValueError; report an unlabeled row as a
            # mismatch instead of aborting the whole check.
            is_pfas_val = np.nan if pd.isna(raw_label) else int(raw_label)
            if is_pfas_val != 1:
                mismatches.append((ident, label_name, is_pfas_val))
                if fix:
                    df.loc[df[identifier_col] == ident, "is_PFAS"] = 1
    mismatches_df = pd.DataFrame(mismatches, columns=["identifier", "expected_label_type", "found_is_PFAS"])
    return mismatches_df

# ---------- 2) Build molecules and descriptors table ----------
def build_mols_and_descriptors(df, ids, identifier_col="identifier"):
    """Build one record per identifier with its molecule, fingerprint and descriptors.

    Parameters:
        df: source table, looked up through ``safe_get_row_by_identifier``.
        ids: identifiers to collect, in output order.
        identifier_col: name of the identifier column in ``df``.

    Returns a DataFrame with the columns ``identifier``, ``found``, ``smiles``,
    ``mol``, ``fp``, the descriptor columns of ``compute_descriptors``, and
    ``precursor_mz``, ``formula``, ``adduct``.

    Identifiers that are not in ``df`` get ``found=False`` and the same set of
    columns, with ``mol``/``fp`` set to None and NaN descriptors. The schema
    is therefore stable even when some, all, or no identifiers are present.
    """
    records = []
    for ident in ids:
        row = safe_get_row_by_identifier(df, identifier_col, ident)
        found = row is not None
        smiles = row.get("smiles", None) if found else None
        # smiles_to_mol returns None for a missing SMILES, and the two helpers
        # below accept a None molecule, so missing rows share this code path.
        mol = smiles_to_mol(smiles)
        records.append({
            "identifier": ident,
            "found": found,
            "smiles": smiles,
            "mol": mol,
            "fp": mol_to_fp(mol),
            **compute_descriptors(mol, smiles),
            # carry context columns if present:
            "precursor_mz": row.get("precursor_mz", np.nan) if found else np.nan,
            "formula": row.get("formula", None) if found else None,
            "adduct": row.get("adduct", None) if found else None,
        })

    # Spell the columns out so that an empty ``ids`` still yields every column.
    columns = [
        "identifier", "found", "smiles", "mol", "fp",
        *compute_descriptors(None),
        "precursor_mz", "formula", "adduct",
    ]
    return pd.DataFrame(records, columns=columns)

# ---------- 3) Similarity analysis: for each FN find nearest TP ----------
def _usable_fp(value):
    """Return ``value`` unless it is None or a float (NaN cell), in which case return None."""
    return None if value is None or isinstance(value, float) else value


def nearest_tp_for_each_fn(df_tp, df_fn):
    """For every FN row, find the TP row with the highest Tanimoto similarity.

    Parameters:
        df_tp, df_fn: tables with at least ``identifier`` and ``fp`` columns;
            ``NumF``, ``MolWt``, ``precursor_mz`` and ``formula`` are copied
            into the output when present.

    Returns a DataFrame with one row per FN: ``fn_id``, ``best_tp_id``,
    ``best_tanimoto`` and the fn_/tp_ context columns. When no TP can be
    compared (missing fingerprints or no TP rows), ``best_tp_id`` is None and
    ``best_tanimoto`` is NaN rather than a numeric sentinel.
    """
    # Materialize the TP rows once instead of re-iterating the frame per FN.
    tp_rows = [tp_row for _, tp_row in df_tp.iterrows()]

    rows = []
    for _, fn_row in df_fn.iterrows():
        fn_fp = _usable_fp(fn_row.get("fp"))
        best_sim = np.nan
        best_tp = None
        for tp_row in tp_rows:
            sim = fp_tanimoto(fn_fp, _usable_fp(tp_row.get("fp")))
            if np.isnan(sim):
                continue
            # Strict ">" keeps the first TP on ties.
            if best_tp is None or sim > best_sim:
                best_sim = sim
                best_tp = tp_row

        has_match = best_tp is not None
        rows.append({
            "fn_id": fn_row['identifier'],
            "best_tp_id": best_tp['identifier'] if has_match else None,
            "best_tanimoto": best_sim,
            "fn_numF": fn_row.get("NumF"),
            "tp_numF": best_tp.get("NumF", np.nan) if has_match else np.nan,
            "fn_mw": fn_row.get("MolWt"),
            "tp_mw": best_tp.get("MolWt", np.nan) if has_match else np.nan,
            "fn_precursor_mz": fn_row.get("precursor_mz"),
            "tp_precursor_mz": best_tp.get("precursor_mz", np.nan) if has_match else np.nan,
            "fn_formula": fn_row.get("formula"),
            "tp_formula": best_tp.get("formula", None) if has_match else None
        })
    return pd.DataFrame(rows)

# ---------- 4) Main orchestration ----------
def _grid_legends(df_subset):
    """Build one legend per row: identifier, plus F count and mass when the row was found."""
    return [
        f"{row.identifier}\nF={row.NumF}, MW={row.MolWt:.1f}" if row.found else row.identifier
        for _, row in df_subset.iterrows()
    ]


def _grid_mols(df_subset):
    """Return the ``mol`` column as a list, or all-None when the column is absent."""
    if "mol" in df_subset.columns:
        return df_subset["mol"].tolist()
    return [None] * len(df_subset)


def run_diagnostics(pfas_df, tp_ids, fn_ids, identifier_col="identifier", outdir=OUTDIR, n_draw_each=50):
    """Run the TP/FN diagnostics and write all artifacts into ``outdir``.

    Steps: check the labels of the listed identifiers, build descriptor
    tables for TPs and FNs, draw a molecule grid for each group, match every
    FN to its most similar TP, and write a short text report.

    Parameters:
        pfas_df: table with the spectra metadata (may be modified in place
            when FIX_LABELS_IF_MISMATCH is True).
        tp_ids, fn_ids: identifiers of the true positives / false negatives.
        identifier_col: name of the identifier column in ``pfas_df``.
        outdir: output directory, created if needed.
        n_draw_each: maximum number of molecules drawn per group.

    Files written: label_mismatches.csv, tp_descriptors.csv,
    fn_descriptors.csv, tp_mols_grid.png, fn_mols_grid.png,
    fn_to_nearest_tp_similarity.csv, diagnostic_report.txt.

    Returns a dict with ``df_tp``, ``df_fn``, ``sim_df`` and ``outdir``.
    """
    os.makedirs(outdir, exist_ok=True)

    # 1) Verify & (optionally) fix labels
    mismatches_df = verify_and_fix_labels(pfas_df, tp_ids, fn_ids, identifier_col=identifier_col, fix=FIX_LABELS_IF_MISMATCH)
    print("--mismatches = " + str(len(mismatches_df)))
    mismatches_df.to_csv(os.path.join(outdir, "label_mismatches.csv"), index=False)

    # 2) Build TP / FN molecule tables with descriptors
    df_tp = build_mols_and_descriptors(pfas_df, tp_ids, identifier_col=identifier_col)
    df_fn = build_mols_and_descriptors(pfas_df, fn_ids, identifier_col=identifier_col)

    # save descriptor tables (without heavy RDKit objects); errors="ignore"
    # keeps this from raising when the object columns are absent
    for table, filename in ((df_tp, "tp_descriptors.csv"), (df_fn, "fn_descriptors.csv")):
        table.drop(columns=["mol", "fp"], errors="ignore").to_csv(
            os.path.join(outdir, filename), index=False
        )

    # 3) Draw SMILES images (first n_draw_each rows of each group)
    tp_to_draw = df_tp.head(n_draw_each)
    tp_img_path = os.path.join(outdir, "tp_mols_grid.png")
    draw_mols_grid(_grid_mols(tp_to_draw), legends=_grid_legends(tp_to_draw), outpath=tp_img_path)

    fn_to_draw = df_fn.head(n_draw_each)
    fn_img_path = os.path.join(outdir, "fn_mols_grid.png")
    draw_mols_grid(_grid_mols(fn_to_draw), legends=_grid_legends(fn_to_draw), outpath=fn_img_path)

    # 4) Similarity analysis
    sim_df = nearest_tp_for_each_fn(df_tp, df_fn)
    sim_df.to_csv(os.path.join(outdir, "fn_to_nearest_tp_similarity.csv"), index=False)

    # 5) Produce a short textual report
    report_lines = []
    report_lines.append("PFAS TP/FN Diagnostic Report\n")
    report_lines.append(f"Total TP provided: {len(df_tp)}")
    report_lines.append(f"Total FN provided: {len(df_fn)}")
    report_lines.append("\nLabel mismatches (if any) saved to label_mismatches.csv\n")
    report_lines.append(f"TP molecule grid: {tp_img_path}")
    report_lines.append(f"FN molecule grid: {fn_img_path}")
    report_lines.append("Similarity file: fn_to_nearest_tp_similarity.csv\n")
    # Fraction of FNs with at most one fluorine. Rows without a fluorine count
    # stay in the denominator but never count as "low"; no FNs at all gives NaN.
    n_fn = len(df_fn)
    if "NumF" in df_fn.columns:
        n_low_f = int((pd.to_numeric(df_fn["NumF"], errors="coerce") <= 1).sum())
    else:
        n_low_f = 0
    fn_low_F_frac = n_low_f / n_fn if n_fn else float("nan")
    report_lines.append(f"Fraction of FNs with <=1 fluorine atoms: {fn_low_F_frac:.3f}")

    report_path = os.path.join(outdir, "diagnostic_report.txt")
    with open(report_path, "w") as f:
        f.write("\n".join(report_lines))

    print("Diagnostics complete.")
    print("\n".join(report_lines))

    return {
        "df_tp": df_tp,
        "df_fn": df_fn,
        "sim_df": sim_df,
        "outdir": outdir
    }

# ---------- Run the diagnostics on `df` with the tp_ids / fn_ids lists ----------
results = run_diagnostics(
    df,
    tp_ids,
    fn_ids,
    identifier_col="identifier",
    outdir="tp_fn_diagnostics",
    n_draw_each=50,
)


## Hard Negative Mining

In [None]:
import pandas as pd

# Load the labeled TSV that the hard-negative analysis works on.
df_preds_tsv_path = '/teamspace/studios/this_studio/files/merged_massspec_nist20_with_pfas_fold.tsv'
df_preds = pd.read_csv(df_preds_tsv_path, sep='\t')


In [None]:

# --- Step 1: Identify Negatives ---
neg_df = df_preds[df_preds["is_PFAS"] == 0].copy()

print(f"Negatives: {len(neg_df)}")

# --- Step 2: Filter Negatives that contain Fluorine ---
# Match an upper-case "F" that is not the start of another element symbol
# (Fe, Fl, Fm, Fr). The match is case-sensitive so that symbols ending in a
# lower-case "f" (Cf, Hf, Rf) are not mistaken for fluorine either.
fluorine_atom_pattern = r"F(?![elmr])"
hard_neg_with_fluorine = neg_df[
    neg_df["smiles"].str.contains(fluorine_atom_pattern, regex=True, na=False)
].copy()

print(f"Hard Negatives containing Fluorine: {len(hard_neg_with_fluorine)}")

# --- Step 3: Group by training/validation fold ---
summary_by_fold = (
    hard_neg_with_fluorine.groupby("fold")
    .size()
    .reset_index(name="count")
    .sort_values("count", ascending=False)
)

print("\n📊 Hard Negatives with Fluorine by Fold:")
print(summary_by_fold.to_string(index=False))

# --- Step 4: (Optional) Inspect a few examples from each ---
for fold_name, subset in hard_neg_with_fluorine.groupby("fold"):
    print(f"\n=== {fold_name.upper()} SET ===")
    display(subset[["identifier", "formula", "smiles"]].head(10))

# --- Step 5: Save for further analysis ---
hard_neg_with_fluorine.to_csv("hard_negatives_with_fluorine_by_fold.csv", index=False)
print("\n✅ Saved hard negatives with fluorine → hard_negatives_with_fluorine_by_fold.csv")
