In [None]:
## Sage: target/decoy score histogram and PP-plot

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# --- Configuration -----------------------------------------------------------
# Input CSV with the merged Sage scores and the PDF the figure is saved to.
csv_path = "/ajun/DDA_BERT_manu/revision/percentile_percentile_plot/sage/Sage_human_merged_all_score.csv"
out_pdf = "/ajun/DDA_BERT_manu/revision/percentile_percentile_plot/sage/sage_human_score.pdf"

# Optional row filters; leaving a value as None disables that filter.
chosen_file = None
chosen_lengthgro = None

# Column names read from the CSV.
score_col = "Score"
type_col = "Type"
file_col = "file_name"
len_col = "LengthGro"

# Plot colours for the two populations.
COLOR_DECOY = "#9970ab"
COLOR_TARGET = "#35978f"

# --- Load and filter ---------------------------------------------------------
df = pd.read_csv(csv_path)

if chosen_file is not None:
    df = df[df[file_col] == chosen_file]
if chosen_lengthgro is not None:
    df = df[df[len_col] == chosen_lengthgro]

# Keep only the two columns needed and drop rows missing either of them.
df = df[[score_col, type_col]].dropna()
# Normalise the label spelling so " target", "TARGET", ... all become "Target".
df[type_col] = df[type_col].astype(str).str.strip().str.capitalize()

# Coerce scores to numeric (unparseable entries become NaN and are dropped),
# matching how the FragPipe and DDA-BERT cells prepare their scores.
scores_target = pd.to_numeric(
    df.loc[df[type_col] == "Target", score_col], errors="coerce"
).dropna().to_numpy()
scores_decoy = pd.to_numeric(
    df.loc[df[type_col] == "Decoy", score_col], errors="coerce"
).dropna().to_numpy()

if len(scores_target) == 0 or len(scores_decoy) == 0:
    raise ValueError("Target or Decoy is empty, cannot plot. Please check filters or data.")

# --- Left panel: overlaid score histograms -----------------------------------
fig = plt.figure(figsize=(10, 4))

ax1 = fig.add_subplot(1, 2, 1)
bins = 50
# Both populations must be binned on the SAME edges. Passing an integer to
# each hist() call would derive the edges from that sample's own min/max,
# giving the two overlays different bar widths and incomparable counts.
bin_edges = np.histogram_bin_edges(
    np.concatenate([scores_target, scores_decoy]), bins=bins
)
ax1.hist(scores_target, bins=bin_edges, alpha=0.9, label="Target", color=COLOR_TARGET)
ax1.hist(scores_decoy, bins=bin_edges, alpha=0.9, label="Decoy", color=COLOR_DECOY)
ax1.set_xlabel("score")
ax1.set_ylabel("count")
ax1.legend(title="decoy", loc="best")

def ecdf(x: np.ndarray):
    """Build the empirical cumulative distribution function of a sample.

    Parameters
    ----------
    x : np.ndarray
        One-dimensional sample of values.

    Returns
    -------
    callable
        ``F(t)`` returning the fraction of sample values that are less than
        or equal to ``t``. ``t`` may be a scalar or an array.
    """
    ordered = np.sort(x)
    total = len(ordered)

    def F(t):
        # side="right" places t after any ties, so the index equals the
        # number of sample values <= t.
        at_or_below = np.searchsorted(ordered, t, side="right")
        return at_or_below / total

    return F

# --- Right panel: PP-plot (target ECDF against decoy ECDF) -------------------
F_t = ecdf(scores_target)
F_d = ecdf(scores_decoy)

# Evaluate both ECDFs at every distinct score value in the table.
grid = np.unique(df[score_col].to_numpy())
# The ECDF closures are built on np.searchsorted, which accepts an array of
# query points, so evaluate the whole grid in one call instead of looping in
# Python over what may be a very large number of unique scores.
ftp = np.asarray(F_t(grid), dtype=float)
fdp = np.asarray(F_d(grid), dtype=float)

ax2 = fig.add_subplot(1, 2, 2)
ax2.plot(fdp, ftp, ".", markersize=2, color=COLOR_TARGET)
# Diagonal reference: where the curve would lie if both ECDFs were equal.
ax2.plot([0, 1], [0, 1], color=COLOR_DECOY)
ax2.set_xlabel("Fdp")
ax2.set_ylabel("Ftp")
ax2.set_title("PP-plot")

plt.tight_layout()

# dirname() is "" for a bare file name; fall back to the current directory.
os.makedirs(os.path.dirname(out_pdf) or ".", exist_ok=True)
fig.savefig(out_pdf, format="pdf", bbox_inches="tight")

print(f"Saved: {out_pdf}")

plt.show()
plt.close(fig)


In [None]:
## FragPipe: target/decoy score histogram and PP-plot

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# --- Configuration -----------------------------------------------------------
# Input CSV with the merged FragPipe scores and the PDF the figure is saved to.
csv_path = "/ajun/DDA_BERT_manu/revision/percentile_percentile_plot/fp/score/fp_human_merged_all_score.csv"
out_pdf = "/ajun/DDA_BERT_manu/revision/percentile_percentile_plot/fp/score/fp_human_score.pdf"

# Optional row filters; leaving a value as None disables that filter.
chosen_file = None
chosen_lengthgro = None

# Column names read from the CSV.
score_col = "Score"
type_col = "Type"
file_col = "file_name"
len_col = "LengthGro"

# Plot colours for the two populations.
COLOR_DECOY = "#9970ab"
COLOR_TARGET = "#35978f"

# How the raw score column is transformed before plotting:
# "pep" (unchanged), "1-pep", or "neglog10".
SCORE_MODE = "pep"
# Offset added before log10 in "neglog10" mode so that a raw value of 0 stays finite.
EPS = 1e-12

# Upper bound on the number of points drawn in the PP-plot.
PP_MAX_POINTS = 2000


def transform_score(raw: np.ndarray, mode=None, eps=None) -> np.ndarray:
    """Coerce raw scores to finite floats and apply the selected transform.

    Parameters
    ----------
    raw : np.ndarray
        Raw score values; non-numeric entries are coerced to NaN and dropped
        together with any other non-finite values.
    mode : str, optional
        One of ``"pep"`` (values returned unchanged), ``"1-pep"``
        (``1 - value``) or ``"neglog10"`` (``-log10(value + eps)``).
        Defaults to the module-level ``SCORE_MODE``.
    eps : float, optional
        Offset used in ``"neglog10"`` mode. Defaults to the module-level ``EPS``.

    Returns
    -------
    np.ndarray
        Transformed scores containing only finite values. The result may be
        shorter than ``raw``.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported modes.
    """
    if mode is None:
        mode = SCORE_MODE
    if eps is None:
        eps = EPS
    # Validate up front so a typo in the mode fails before any work is done.
    if mode not in ("pep", "1-pep", "neglog10"):
        raise ValueError(f"Unknown SCORE_MODE: {mode}")

    values = np.asarray(pd.to_numeric(raw, errors="coerce"), dtype=float)
    values = values[np.isfinite(values)]

    if mode == "pep":
        return values
    if mode == "1-pep":
        return 1.0 - values

    # "neglog10": log10 is only defined for positive arguments. Drop entries
    # whose shifted value is not positive so that no NaN/-inf reaches the
    # histogram or ECDF code downstream.
    shifted = values + eps
    shifted = shifted[shifted > 0]
    return -np.log10(shifted)

# --- Load and filter ---------------------------------------------------------
df = pd.read_csv(csv_path)

if chosen_file is not None:
    df = df[df[file_col] == chosen_file]
if chosen_lengthgro is not None:
    df = df[df[len_col] == chosen_lengthgro]

# Keep only the two columns needed and drop rows missing either of them.
df = df[[score_col, type_col]].dropna()
# Normalise the label spelling so " target", "TARGET", ... all become "Target".
df[type_col] = df[type_col].astype(str).str.strip().str.capitalize()

raw_target = df.loc[df[type_col] == "Target", score_col].to_numpy()
raw_decoy = df.loc[df[type_col] == "Decoy", score_col].to_numpy()

# transform_score coerces to numeric, drops non-finite values and applies SCORE_MODE.
scores_target = transform_score(raw_target)
scores_decoy = transform_score(raw_decoy)

if len(scores_target) == 0 or len(scores_decoy) == 0:
    raise ValueError("Target or Decoy is empty, cannot plot. Please check filters or data.")

# --- Left panel: overlaid score histograms -----------------------------------
fig = plt.figure(figsize=(10, 4))

ax1 = fig.add_subplot(1, 2, 1)
bins = 50
# Both populations must be binned on the SAME edges. Passing an integer to
# each hist() call would derive the edges from that sample's own min/max,
# giving the two overlays different bar widths and incomparable counts.
bin_edges = np.histogram_bin_edges(
    np.concatenate([scores_target, scores_decoy]), bins=bins
)
ax1.hist(scores_target, bins=bin_edges, alpha=0.9, label="Target", color=COLOR_TARGET)
ax1.hist(scores_decoy, bins=bin_edges, alpha=0.9, label="Decoy", color=COLOR_DECOY)
ax1.set_xlabel("score" if SCORE_MODE != "pep" else "PEP")
ax1.set_ylabel("count")
ax1.legend(title="decoy", loc="best")

def ecdf(x: np.ndarray):
    """Return the empirical CDF of ``x`` as a callable.

    The returned function maps ``t`` (scalar or array) to the proportion of
    values in ``x`` that are less than or equal to ``t``.
    """
    sorted_sample = np.sort(x)
    sample_size = len(sorted_sample)

    def F(t):
        # Insertion index to the right of ties == count of values <= t.
        return np.searchsorted(sorted_sample, t, side="right") / sample_size

    return F

# --- Right panel: PP-plot (target ECDF against decoy ECDF) -------------------
F_t, F_d = ecdf(scores_target), ecdf(scores_decoy)

# Pool both populations and keep only finite values for building the grid.
all_scores = np.concatenate((scores_target, scores_decoy))
all_scores = all_scores[np.isfinite(all_scores)]

distinct_scores = np.unique(all_scores)
unique_cnt = distinct_scores.size
if unique_cnt <= PP_MAX_POINTS:
    # Few enough distinct values: evaluate at every one of them.
    grid = distinct_scores
else:
    # Too many distinct values: thin the grid to evenly spaced quantiles
    # of the pooled scores.
    qs = np.linspace(0.0, 1.0, num=PP_MAX_POINTS)
    grid = np.unique(np.quantile(all_scores, qs))

# The ECDF closures accept an array of query points.
ftp = F_t(grid)
fdp = F_d(grid)

ax2 = fig.add_subplot(1, 2, 2)
ax2.plot(fdp, ftp, ".", color=COLOR_TARGET, markersize=2)
# Diagonal reference: where the curve would lie if both ECDFs were equal.
ax2.plot([0, 1], [0, 1], color=COLOR_DECOY)
ax2.set(xlabel="Fdp", ylabel="Ftp", title=f"PP-plot ({SCORE_MODE})")

fig.tight_layout()

# dirname() is "" for a bare file name; fall back to the current directory.
out_dir = os.path.dirname(out_pdf) or "."
os.makedirs(out_dir, exist_ok=True)
fig.savefig(out_pdf, format="pdf", bbox_inches="tight")

print(
    f"SCORE_MODE={SCORE_MODE}",
    f"PP points: {len(grid)} (cap={PP_MAX_POINTS}, unique_scores={unique_cnt})",
    f"Saved: {out_pdf}",
    sep="\n",
)

plt.show()
plt.close(fig)


In [None]:
## DDA-BERT: target/decoy score histogram (broken y-axis) and PP-plot

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# --- Configuration -----------------------------------------------------------
# Input CSV with the merged DDA-BERT scores and the PDF the figure is saved to.
csv_path = "/ajun/DDA_BERT_manu/revision/percentile_percentile_plot/dda-bert/dda-bert_human_merged.csv"
out_pdf = "/ajun/DDA_BERT_manu/revision/percentile_percentile_plot/dda-bert/dda-bert_human_score_broken_y.pdf"

# Optional row filters; leaving a value as None disables that filter.
chosen_file = None
chosen_lengthgro = None

# Column names read from the CSV.
score_col = "Score"
type_col = "Type"
file_col = "file_name"

# Plot colours; the axis-break marks reuse the target colour.
COLOR_DECOY = "#9970ab"
COLOR_TARGET = "#35978f"
COLOR_MISC = COLOR_TARGET

# The length-group column may appear under either of these names.
len_col_candidates = ["LengthGro", "LengthGroup"]

# --- Load and filter ---------------------------------------------------------
df = pd.read_csv(csv_path)

# Resolve the length-group column from the frame that was just loaded, rather
# than re-reading the CSV header once per candidate name.
len_col = next((c for c in len_col_candidates if c in df.columns), None)

if chosen_file is not None:
    df = df[df[file_col] == chosen_file]

if chosen_lengthgro is not None:
    if len_col is None:
        raise KeyError("No LengthGro/LengthGroup column found, but chosen_lengthgro is set.")
    df = df[df[len_col] == chosen_lengthgro]

# Keep only the two columns needed and drop rows missing either of them.
df = df[[score_col, type_col]].dropna()
# Normalise the label spelling so " target", "TARGET", ... all become "Target".
df[type_col] = df[type_col].astype(str).str.strip().str.capitalize()

# Coerce to numeric; unparseable scores become NaN and are dropped.
scores_target = pd.to_numeric(df.loc[df[type_col] == "Target", score_col], errors="coerce").dropna().to_numpy()
scores_decoy = pd.to_numeric(df.loc[df[type_col] == "Decoy", score_col], errors="coerce").dropna().to_numpy()

if len(scores_target) == 0 or len(scores_decoy) == 0:
    raise ValueError("Target or Decoy is empty, cannot plot. Please check filters or data.")

# --- Bin counts used to position the y-axis break ----------------------------
bins = 50
# Derive ONE set of edges from the pooled scores and use it both for these
# counts and for the plotted histograms below. Edges taken from the target
# scores alone would silently drop decoys outside the target range, and
# plotting with an integer bin count would draw bars that do not match the
# counts the break limits are computed from.
edges = np.histogram_bin_edges(np.concatenate([scores_target, scores_decoy]), bins=bins)
t_hist, _ = np.histogram(scores_target, bins=edges)
d_hist, _ = np.histogram(scores_decoy, bins=edges)
max_bin = int(max(t_hist.max(), d_hist.max()))

# Lower panel shows counts up to 25% of the tallest bin; the upper panel
# starts at 30% of it (and always strictly above the lower panel's top).
low_max = max(1, int(max_bin * 0.25))
high_min = max(low_max + 1, int(max_bin * 0.30))

# --- Left column: histogram split across two stacked axes --------------------
fig = plt.figure(figsize=(10, 4))

ax1_top = fig.add_subplot(2, 2, 1)
ax1_bot = fig.add_subplot(2, 2, 3, sharex=ax1_top)

# Draw the same histograms on both axes; only the y-limits differ.
for ax in (ax1_top, ax1_bot):
    ax.hist(scores_decoy, bins=edges, alpha=0.6, label="Decoy", color=COLOR_DECOY)
    ax.hist(
        scores_target,
        bins=edges,
        histtype="step",
        linewidth=1.8,
        label="Target",
        color=COLOR_TARGET,
    )

ax1_bot.set_ylim(0, low_max)
# Keep the autoscaled upper limit, but never let the top panel collapse.
ax1_top.set_ylim(high_min, max(ax1_top.get_ylim()[1], high_min + 1))

# Hide the facing spines so the two axes read as one broken axis.
ax1_top.spines["bottom"].set_visible(False)
ax1_bot.spines["top"].set_visible(False)
ax1_top.tick_params(labelbottom=False)

# Short diagonal marks at the break, drawn in axes coordinates.
d = 0.008
kwargs = dict(transform=ax1_top.transAxes, color=COLOR_MISC, clip_on=False)
ax1_top.plot((-d, +d), (-d, +d), **kwargs)
ax1_top.plot((1 - d, 1 + d), (-d, +d), **kwargs)

kwargs.update(transform=ax1_bot.transAxes)
ax1_bot.plot((-d, +d), (1 - d, 1 + d), **kwargs)
ax1_bot.plot((1 - d, 1 + d), (1 - d, 1 + d), **kwargs)

ax1_bot.set_xlabel("score")
ax1_bot.set_ylabel("count")
ax1_top.legend(title="decoy", loc="best")

# Right column: a single axes spanning both rows, used for the PP-plot.
ax2 = fig.add_subplot(1, 2, 2)

def ecdf(x: np.ndarray):
    """Construct the empirical distribution function of the sample ``x``.

    Returns a function ``F`` such that ``F(t)`` is the share of values in
    ``x`` not exceeding ``t``; ``t`` may be a scalar or an array.
    """
    values = np.sort(x)
    count = len(values)

    def F(t):
        # With side="right" the insertion point lands after equal values,
        # so it counts every sample value <= t.
        rank = np.searchsorted(values, t, side="right")
        return rank / count

    return F

# --- Right panel: PP-plot (target ECDF against decoy ECDF) -------------------
F_t = ecdf(scores_target)
F_d = ecdf(scores_decoy)

# Evaluate both ECDFs at every distinct numeric score value in the table.
grid = np.unique(pd.to_numeric(df[score_col], errors="coerce").dropna().to_numpy())
# The ECDF closures are built on np.searchsorted, which accepts an array of
# query points, so evaluate the whole grid in one call instead of looping in
# Python over what may be a very large number of unique scores.
ftp = np.asarray(F_t(grid), dtype=float)
fdp = np.asarray(F_d(grid), dtype=float)

ax2.plot(fdp, ftp, ".", markersize=2, color=COLOR_TARGET)
# Diagonal reference: where the curve would lie if both ECDFs were equal.
ax2.plot([0, 1], [0, 1], color=COLOR_DECOY)

ax2.set_xlabel("Fdp")
ax2.set_ylabel("Ftp")
ax2.set_title("PP-plot")

plt.tight_layout()

# dirname() is "" for a bare file name; fall back to the current directory.
os.makedirs(os.path.dirname(out_pdf) or ".", exist_ok=True)
fig.savefig(out_pdf, format="pdf", bbox_inches="tight")

print(f"Target max bin={t_hist.max()}, Decoy max bin={d_hist.max()}")
print(f"Auto broken-y: low_max={low_max}, high_min={high_min}")
print(f"Colors: decoy={COLOR_DECOY}, target={COLOR_TARGET}")
print(f"Saved: {out_pdf}")

plt.show()
plt.close(fig)
