# Analysis of aggregated search results and plot creation

In [54]:
import pandas as pd
from pyopenms import MzMLFile, MSExperiment, OnDiscMSExperiment
import numpy as np
import re
import matplotlib.pyplot as plt

In [2]:
# Every modification is a (tick label, residue + UniMod tag) pair: the label is
# shown on the x axis of the bar plots, the tag is the substring looked up in
# the "Modified.Sequence" column of the reports.

# Modifications on lysine (K)
mod_acetyl = ("Acetyl\nK", "K(UniMod:1)")
mod_biotin = ("Biotin\nK", "K(UniMod:3)")
mod_carbamidomethyl = ("Carbami.\nK", "K(UniMod:4)")
mod_methyl = ("Methyl\nK", "K(UniMod:34)")
mod_dimethyl = ("Dimethyl\nK", "K(UniMod:36)")
mod_formyl = ("Formyl\nK", "K(UniMod:122)")
mod_malonyl = ("Malonyl\nK", "K(UniMod:747)")

# Modifications on the remaining residues (P, M, Y)
mod_oxid = ("Oxid\nP", "P(UniMod:35)")
mod_oxid_m = ("Oxid\nM", "M(UniMod:35)")
mod_phospho = ("Phospho\nY", "Y(UniMod:21)")
mod_nitro = ("Nitro\nY", "Y(UniMod:354)")

In [3]:
def fix_decoy_report(df):
    """Re-map three columns of a report in place.

    "Q.Value" receives the numeric content of "Precursor.Id", "Precursor.Id"
    receives "Modified.Sequence" and "CScore" receives "RT.Start".
    Presumably this repairs a decoy report whose columns are shifted -
    TODO confirm against the file that needs it.

    Args:
        df: Report to modify; nothing is returned.
    """
    # Must be read before "Precursor.Id" is overwritten below.
    q_values = pd.to_numeric(df["Precursor.Id"])
    df.loc[:, "Q.Value"] = q_values

    for target_column, source_column in (
        ("Precursor.Id", "Modified.Sequence"),
        ("CScore", "RT.Start"),
    ):
        df.loc[:, target_column] = df[source_column]

In [None]:
def _load_report_pair(aggregated_csv, filtered_csv):
    """Read the unfiltered and the FDR-filtered aggregated report of one run."""
    aggregated_report = pd.read_csv(aggregated_csv)
    filtered_report = pd.read_csv(filtered_csv)
    return aggregated_report, filtered_report


# Every modification searched on its own
report_aggregated_single, report_filtered_single = _load_report_pair(
    "../../data/workflow_test_tolerance_snr3_10ppm_all_mods_with_decoys/report_aggregated_all_targets_with_decoys.csv",
    "../../data/workflow_test_tolerance_snr3_10ppm_all_mods_with_decoys/report_aggregated_fdr_filtered.csv",
)

# Modifications searched in combination
report_aggregated_comb, report_filtered_comb = _load_report_pair(
    "../../data/workflow_test_tolerance_snr3_10ppm_all_mods_with_combinations_aggregated/report_aggregated_all_targets_with_decoys.csv",
    "../../data/workflow_test_tolerance_snr3_10ppm_all_mods_with_combinations_aggregated/report_aggregated_fdr_filtered.csv",
)

# Combinations including oxidation on M
report_aggregated_moxid, report_filtered_moxid = _load_report_pair(
    "../../data/workflow_test_tolerance_snr3_10ppm_all_mods_combinations_m_oxid/report_aggregated_all_targets_with_decoys.csv",
    "../../data/workflow_test_tolerance_snr3_10ppm_all_mods_combinations_m_oxid/report_aggregated_fdr_filtered.csv",
)

# Automatic modification selection, limit 500
report_aggregated_automatic_500, report_filtered_automatic_500 = _load_report_pair(
    "../../data/workflow_test_tolerance_snr3_10ppm_automatic_mod_selection_limit_500/report_aggregated_all_targets_with_decoys.csv",
    "../../data/workflow_test_tolerance_snr3_10ppm_automatic_mod_selection_limit_500/report_aggregated_fdr_filtered.csv",
)

# Automatic modification selection
report_aggregated_automatic, report_filtered_automatic = _load_report_pair(
    "../../data/workflow_test_tolerance_snr3_10ppm_automatic_mod_selection/report_aggregated_all_targets_with_decoys.csv",
    "../../data/workflow_test_tolerance_snr3_10ppm_automatic_mod_selection/report_aggregated_fdr_filtered.csv",
)

# Combinations, normalized C-score
report_aggregated_comb_normalized, report_filtered_comb_normalized = _load_report_pair(
    "../../data/workflow_test_tolerance_snr3_10ppm_all_mods_combinations_normalized_cscore/report_aggregated_all_targets_with_decoys.csv",
    "../../data/workflow_test_tolerance_snr3_10ppm_all_mods_combinations_normalized_cscore/report_aggregated_fdr_filtered.csv",
)

# Automatic modification selection, normalized C-score
report_aggregated_automatic_normalized, report_filtered_automatic_normalized = _load_report_pair(
    "../../data/workflow_test_tolerance_snr3_10ppm_automatic_mod_selection_normalized_cscore/report_aggregated_all_targets_with_decoys.csv",
    "../../data/workflow_test_tolerance_snr3_10ppm_automatic_mod_selection_normalized_cscore/report_aggregated_fdr_filtered.csv",
)

## C-score and FDR analysis

In [None]:
# Lowest score among the entries with an aggregated q-value of at most 0.01,
# printed once per run (the normalized runs use their normalized score column).
for _report, _score_column in (
    (report_aggregated_single, "CScore"),
    (report_aggregated_comb, "CScore"),
    (report_aggregated_moxid, "CScore"),
    (report_aggregated_automatic_500, "CScore"),
    (report_aggregated_automatic, "CScore"),
    (report_aggregated_comb_normalized, "CScore_normalized"),
    (report_aggregated_automatic_normalized, "CScore_normalized"),
):
    _passing_scores = _report.loc[_report["q_value_aggregated"] <= 0.01, _score_column]
    print(_passing_scores.min())

In [None]:
# Per-run number of entries of the automatic (limit 500) search whose C-score
# exceeds 1 and whose run name contains "lower_energy_windows".
_high_score_in_lower_energy_run = np.logical_and(
    report_aggregated_automatic_500["CScore"] > 1,
    report_aggregated_automatic_500["Run"].str.contains("lower_energy_windows"),
)
np.unique(
    report_aggregated_automatic_500[_high_score_in_lower_energy_run]["Run"].to_numpy(),
    return_counts=True,
)

In [10]:
def plot_cscore_vs_fdr(report, cscore_column="CScore", fig_name = None):
    """Scatter the score of every report entry against its aggregated q-value.

    Entries with a q-value of at most 0.01 are drawn in orange, all others in
    blue.

    Args:
        report: DataFrame with a "q_value_aggregated" column and the score column.
        cscore_column: Name of the score column used for the x axis.
        fig_name: If given, the figure is also saved as
            "../../data/plots/Cscore_vs_fdr_mod_dataset_<fig_name>.png".
    """
    is_passing = report["q_value_aggregated"] <= 0.01

    # The failing entries go first so that the passing ones are drawn on top.
    for selection, colour, legend_text in (
        (~is_passing, "blue", "FDR > 0.01"),
        (is_passing, "orange", "FDR <= 0.01"),
    ):
        selected_rows = report[selection]
        plt.scatter(
            selected_rows[cscore_column],
            selected_rows["q_value_aggregated"],
            color=colour,
            label=legend_text,
        )

    plt.legend()
    plt.xlabel("C-score")
    plt.ylabel("FDR")

    should_save = fig_name is not None
    if should_save:
        plt.savefig(f"../../data/plots/Cscore_vs_fdr_mod_dataset_{fig_name}.png", bbox_inches="tight")
    plt.show()

In [None]:
# One score-vs-FDR scatter plot per run; the normalized runs plot their
# normalized score column instead of the default one.
# NOTE(review): the last entry saves under "automatic_limit_500_normalized"
# although the report variable carries no "_500" suffix - confirm the name.
for _report, _plot_options in (
    (report_aggregated_single, {"fig_name": "single"}),
    (report_aggregated_comb, {"fig_name": "combinations"}),
    (report_aggregated_moxid, {"fig_name": "m_oxid"}),
    (report_aggregated_automatic_500, {"fig_name": "automatic_limit_500"}),
    (report_aggregated_automatic, {"fig_name": "automatic"}),
    (
        report_aggregated_comb_normalized,
        {"cscore_column": "CScore_normalized", "fig_name": "combinations_normalized"},
    ),
    (
        report_aggregated_automatic_normalized,
        {"cscore_column": "CScore_normalized", "fig_name": "automatic_limit_500_normalized"},
    ),
):
    plot_cscore_vs_fdr(_report, **_plot_options)

In [None]:
# For every FDR-filtered report: total number of rows, followed by the number
# of rows whose modified sequence carries at least one UniMod tag.
for _filtered_report in (
    report_filtered_single,
    report_filtered_comb,
    report_filtered_moxid,
    report_filtered_automatic_500,
    report_filtered_automatic,
    report_filtered_comb_normalized,
    report_filtered_automatic_normalized,
):
    _num_modified = _filtered_report["Modified.Sequence"].str.contains("UniMod").sum()
    print(len(_filtered_report), _num_modified)

## Results on dataset without PTMs

In [25]:
# FDR-filtered reports of the three searches on the dataset without PTMs
# (single, combinations, automatic), read in that order.
(
    report_filtered_unmod_single,
    report_filtered_unmod_combinations,
    report_filtered_unmod_automatic,
) = (
    pd.read_csv(_csv_path)
    for _csv_path in (
        "../../data/workflow_test_tolerance_snr3_10ppm_unmod/report_aggregated_fdr_filtered.csv",
        "../../data/workflow_test_tolerance_snr3_10ppm_unmod_combinations/report_aggregated_fdr_filtered.csv",
        "../../data/workflow_test_tolerance_snr3_10ppm_automatic_unmod/report_aggregated_fdr_filtered.csv",
    )
)

In [23]:
def plot_mods_result_unmod_dataset(unmod_targets, mods, fig_name):
    """Bar plot of the number of precursors found per modification.

    For every modification the rows of ``unmod_targets`` whose
    "Modified.Sequence" contains the modification tag are counted and drawn
    as one labelled bar.

    Args:
        unmod_targets: Report DataFrame with a "Modified.Sequence" column.
        mods: Iterable of (tick label, residue + UniMod tag) pairs.
        fig_name: Prefix of the figure written to
            "../../data/plots/<fig_name>_unmod.svg".
    """
    mod_names = []
    nums_unmod_dataset_precursors = []

    for mod_name, mod_unimod in mods:
        mod_names.append(mod_name)
        # The tag contains parentheses, so it is escaped before being used as
        # a regular expression.
        has_mod = unmod_targets["Modified.Sequence"].str.contains(re.escape(mod_unimod))
        nums_unmod_dataset_precursors.append(has_mod.sum())

    x = np.arange(len(mod_names))
    width = 0.25

    _, ax = plt.subplots()

    bars = ax.bar(x, nums_unmod_dataset_precursors, width, label="Dataset without mods")
    ax.bar_label(bars, padding=3)

    ax.set_ylabel('Number of found precursors with modifications')
    # The bars are centred on x, so the ticks belong at x as well; the former
    # offset of width/2 put every tick label on the right edge of its bar.
    ax.set_xticks(x, mod_names)

    plt.savefig(f"../../data/plots/{fig_name}_unmod.svg", bbox_inches="tight")
    plt.show()


In [None]:
# Modifications shown for the single and the combination search.
_base_mods = [
    mod_acetyl,
    mod_biotin,
    mod_dimethyl,
    mod_methyl,
    mod_oxid,
    mod_nitro,
    mod_phospho,
]
# Modifications shown for the search with automatic modification selection.
_automatic_mods = [
    mod_acetyl,
    mod_biotin,
    mod_carbamidomethyl,
    mod_formyl,
    mod_malonyl,
    mod_methyl,
    mod_oxid,
    mod_nitro,
    mod_phospho,
]

for _unmod_report, _mods, _fig_name in (
    (report_filtered_unmod_single, _base_mods, "all_mods_single"),
    (report_filtered_unmod_combinations, _base_mods, "all_mods_together"),
    (report_filtered_unmod_automatic, _automatic_mods, "automatic"),
):
    plot_mods_result_unmod_dataset(_unmod_report, _mods, _fig_name)

## Comparison to baseline on dataset with PTMs

In [55]:
# Tab-separated baseline reports: modifications searched singly, together,
# and together including oxidation on M.
baseline_single = pd.read_csv(
    "../../data/diann_results/dia_nn_results_filtered/report_filtered_5mods_all_spectra.tsv",
    sep="\t",
)
baseline_comb = pd.read_csv(
    "../../data/diann_results/dia_nn_results_filtered/report_filtered_5mods_together_all_spectra.tsv",
    sep="\t",
)
baseline_m_oxid = pd.read_csv(
    "../../data/diann_results/dia_nn_results_filtered/report_filtered_5mods_together_m_oxid_all_spectra.tsv",
    sep="\t",
)

In [19]:
def get_num_single_mod_and_combinations(df_for_mod, mod_unimod):
    """Count sequences with one distinct modified residue versus all others.

    A sequence counts as "single" when all of its "<residue>(UniMod:<id>)"
    occurrences are the same residue/modification pair; every other sequence
    counts as "combination".

    Args:
        df_for_mod: DataFrame whose "Modified.Sequence" entries all contain
            ``mod_unimod`` (checked with an assertion).
        mod_unimod: Residue + UniMod tag, e.g. "K(UniMod:1)".

    Returns:
        Tuple ``(num_single, num_combination)``.
    """
    sequences = df_for_mod["Modified.Sequence"]
    assert sequences.str.contains(re.escape(mod_unimod)).sum() == len(df_for_mod)

    # One arbitrary character (the modified residue) followed by its UniMod tag.
    site_pattern = re.compile(r".\(UniMod:[0-9]+\)")

    num_single = sum(
        1 for sequence in sequences if len(set(site_pattern.findall(sequence))) == 1
    )
    num_combination = len(sequences) - num_single
    return num_single, num_combination


In [31]:
def plot_all_spectra_vs_subset_combination(
    mod_targets_subset, targets_baseline, mods, fig_name):
    """Compare baseline and split search per modification, split by co-occurrence.

    For every modification the matching precursors are divided into those
    counted as "single" and those counted as "combination" by
    ``get_num_single_mod_and_combinations``. Baseline and split-search counts
    are drawn as two stacked bars next to each other, and the fraction of
    "single" precursors per modification is written to a CSV file.

    Args:
        mod_targets_subset: Split-search report with a "Modified.Sequence" column.
        targets_baseline: Baseline report with "Q.Value" and
            "Modified.Sequence" columns; only rows with Q.Value <= 0.01 count.
        mods: Sequence of (tick label, residue + UniMod tag) pairs.
        fig_name: Prefix of the files written to "../../data/plots/"
            ("<fig_name>_vs_all_spectra.csv" and "<fig_name>_vs_all_spectra.svg").
    """
    percentage_label = "percentage_single_occurence"
    baseline_row = ("baseline", percentage_label)
    split_search_row = ("split_search", percentage_label)
    column_names = [mod_name.replace("\n", "-") for mod_name, _ in mods]

    df = pd.DataFrame(columns=column_names, index=[baseline_row, split_search_row])

    def fraction_or_dash(part, total):
        # "-" marks a modification without any precursor. The guard is on the
        # total: testing the numerator would also report "-" for a modification
        # that only occurs in combinations, where the fraction is really 0.
        return part / total if total > 0 else "-"

    mod_names = []
    nums_subset_mod_precursors_single = []
    nums_subset_mod_precursors_combination = []
    nums_all_mod_precursors_single = []
    nums_all_mod_precursors_combination = []

    for (mod_name, mod_unimod), column_name in zip(mods, column_names):
        mod_names.append(mod_name)

        subset_has_mod = mod_targets_subset["Modified.Sequence"].str.contains(
            re.escape(mod_unimod)
        )
        num_subset_single, num_subset_combination = get_num_single_mod_and_combinations(
            mod_targets_subset[subset_has_mod], mod_unimod
        )
        nums_subset_mod_precursors_single.append(num_subset_single)
        nums_subset_mod_precursors_combination.append(num_subset_combination)

        baseline_has_mod = np.logical_and(
            targets_baseline["Q.Value"] <= 0.01,
            targets_baseline["Modified.Sequence"].str.contains(re.escape(mod_unimod)),
        )
        num_all_single, num_all_combination = get_num_single_mod_and_combinations(
            targets_baseline[baseline_has_mod], mod_unimod
        )
        nums_all_mod_precursors_single.append(num_all_single)
        nums_all_mod_precursors_combination.append(num_all_combination)

        df.at[split_search_row, column_name] = fraction_or_dash(
            num_subset_single, num_subset_single + num_subset_combination
        )
        df.at[baseline_row, column_name] = fraction_or_dash(
            num_all_single, num_all_single + num_all_combination
        )

    x = np.arange(len(mod_names))
    width = 0.25

    _, ax = plt.subplots()

    def print_nonzero_labels(labels):
        # Suppress the "0" label of empty bar segments.
        labels = np.array(labels).astype(str)
        labels[labels == "0"] = ""
        return labels

    def draw_stacked_bars(positions, lower_counts, upper_counts,
                          lower_label, upper_label, lower_color, upper_color):
        # Lower segment with its default value labels, upper segment stacked
        # on top of it with labels only where the count is non-zero.
        ax.bar_label(
            ax.bar(positions, lower_counts, width, label=lower_label, color=lower_color),
            padding=3,
        )
        ax.bar_label(
            ax.bar(
                positions,
                upper_counts,
                width,
                bottom=lower_counts,
                label=upper_label,
                color=upper_color,
            ),
            labels=print_nonzero_labels(upper_counts),
            padding=3,
        )

    draw_stacked_bars(
        x,
        nums_all_mod_precursors_single,
        nums_all_mod_precursors_combination,
        "Baseline - occurs alone",
        "Baseline - occurs in comb.",
        "#00004B",
        "#B0C0FF",
    )
    draw_stacked_bars(
        x + width+0.05,
        nums_subset_mod_precursors_single,
        nums_subset_mod_precursors_combination,
        "Split search - occurs alone",
        "Split search - occurs in comb.",
        "#9F3400",
        "#FF935F",
    )

    ax.set_ylabel("Number of found precursors with modifications")
    ax.set_xticks(x + width / 2, mod_names)
    ax.legend(loc="upper right")

    df.to_csv(f"../../data/plots/{fig_name}_vs_all_spectra.csv")
    plt.savefig(f"../../data/plots/{fig_name}_vs_all_spectra.svg", bbox_inches="tight")
    plt.show()

In [None]:
# Combination search against the combination baseline.
plot_all_spectra_vs_subset_combination(
    mod_targets_subset=report_filtered_comb,
    targets_baseline=baseline_comb,
    mods=[
        mod_acetyl,
        mod_biotin,
        mod_dimethyl,
        mod_methyl,
        mod_oxid,
        mod_nitro,
        mod_phospho,
    ],
    fig_name="all_mods_together",
)

# Same comparison for the runs that also include oxidation on M.
plot_all_spectra_vs_subset_combination(
    mod_targets_subset=report_filtered_moxid,
    targets_baseline=baseline_m_oxid,
    mods=[
        mod_acetyl,
        mod_biotin,
        mod_dimethyl,
        mod_methyl,
        mod_oxid,
        mod_oxid_m,
        mod_nitro,
        mod_phospho,
    ],
    fig_name="all_mods_together_m_oxid",
)

In [33]:
def get_num_synthetic_and_human(df_for_mod, mod_unimod):
    """Count the rows whose "Protein.Ids" does / does not contain "X0000".

    Rows containing "X0000" are counted as synthetic, all others as human.

    Args:
        df_for_mod: DataFrame with "Modified.Sequence" and "Protein.Ids"
            columns; every sequence must contain ``mod_unimod`` (asserted).
        mod_unimod: Residue + UniMod tag, e.g. "K(UniMod:1)".

    Returns:
        Tuple ``(num_synthetic, num_human)``.
    """
    carries_mod = df_for_mod["Modified.Sequence"].str.contains(re.escape(mod_unimod))
    assert carries_mod.sum() == len(df_for_mod)

    is_synthetic = df_for_mod["Protein.Ids"].str.contains("X0000")
    return is_synthetic.sum(), (~is_synthetic).sum()

In [56]:
def plot_all_spectra_vs_subset_synthetic(
    mod_targets_subset, targets_all_spectra, fig_name
):
    """Compare baseline and split search per modification, split by origin.

    For every modification the matching precursors are divided into synthetic
    and human ones by ``get_num_synthetic_and_human``. Baseline and
    split-search counts are drawn as two stacked bars next to each other, and
    the fraction of synthetic precursors per modification is written to a CSV
    file.

    Args:
        mod_targets_subset: Split-search report with "Modified.Sequence" and
            "Protein.Ids" columns.
        targets_all_spectra: Baseline report with "Q.Value",
            "Modified.Sequence" and "Protein.Ids" columns; only rows with
            Q.Value <= 0.01 count.
        fig_name: Prefix of the files written to "../../data/plots/"
            ("<fig_name>_vs_all_spectra_synthetic_human.csv" and
            "<fig_name>_vs_all_spectra_legend_synthetic_human.svg").
    """
    mods_list = [
        mod_acetyl,
        mod_biotin,
        mod_dimethyl,
        mod_methyl,
        mod_oxid,
        mod_nitro,
        mod_phospho,
    ]
    percentage_label = "percentage_synthetic"
    baseline_row = ("baseline", percentage_label)
    split_search_row = ("split_search", percentage_label)

    df = pd.DataFrame(columns=[mod_name.replace("\n", "-") for mod_name, _ in mods_list],
                      index=[baseline_row, split_search_row])

    def fraction_or_dash(part, total):
        # "-" marks a modification without any precursor. Guarding on the
        # total keeps a genuine 0 when only human precursors were found and
        # avoids the 0/0 the split-search row used to compute unguarded.
        return part / total if total > 0 else "-"

    mod_names = []
    nums_subset_mod_precursors_synthetic = []
    nums_subset_mod_precursors_human = []
    nums_all_mod_precursors_synthetic = []
    nums_all_mod_precursors_human = []

    for mod_name, mod_unimod in mods_list:
        mod_names.append(mod_name)
        column_name = mod_name.replace("\n", "-")

        subset_has_mod = mod_targets_subset["Modified.Sequence"].str.contains(
            re.escape(mod_unimod)
        )
        num_subset_synthetic, num_subset_human = get_num_synthetic_and_human(
            mod_targets_subset[subset_has_mod], mod_unimod
        )
        nums_subset_mod_precursors_synthetic.append(num_subset_synthetic)
        nums_subset_mod_precursors_human.append(num_subset_human)

        # Inclusive cut-off, matching the "<= 0.01" used by the other
        # q-value filters in this notebook.
        baseline_has_mod = np.logical_and(
            targets_all_spectra["Q.Value"] <= 0.01,
            targets_all_spectra["Modified.Sequence"].str.contains(re.escape(mod_unimod)),
        )
        num_all_synthetic, num_all_human = get_num_synthetic_and_human(
            targets_all_spectra[baseline_has_mod], mod_unimod
        )
        nums_all_mod_precursors_synthetic.append(num_all_synthetic)
        nums_all_mod_precursors_human.append(num_all_human)

        df.at[split_search_row, column_name] = fraction_or_dash(
            num_subset_synthetic, num_subset_synthetic + num_subset_human
        )
        df.at[baseline_row, column_name] = fraction_or_dash(
            num_all_synthetic, num_all_synthetic + num_all_human
        )

    x = np.arange(len(mod_names))
    width = 0.25

    _, ax = plt.subplots()

    def print_nonzero_labels(labels):
        # Suppress the "0" label of empty bar segments.
        labels = np.array(labels).astype(str)
        labels[labels == "0"] = ""
        return labels

    def draw_stacked_bars(positions, lower_counts, upper_counts,
                          lower_label, upper_label, lower_color, upper_color):
        # Lower segment with its default value labels, upper segment stacked
        # on top of it with labels only where the count is non-zero.
        ax.bar_label(
            ax.bar(positions, lower_counts, width, label=lower_label, color=lower_color),
            padding=3,
        )
        ax.bar_label(
            ax.bar(
                positions,
                upper_counts,
                width,
                bottom=lower_counts,
                label=upper_label,
                color=upper_color,
            ),
            labels=print_nonzero_labels(upper_counts),
            padding=3,
        )

    draw_stacked_bars(
        x,
        nums_all_mod_precursors_synthetic,
        nums_all_mod_precursors_human,
        "Baseline - synthetic",
        "Baseline - human",
        "#00004B",
        "#B0C0FF",
    )
    draw_stacked_bars(
        x + width+0.05,
        nums_subset_mod_precursors_synthetic,
        nums_subset_mod_precursors_human,
        "Split search - synthetic",
        "Split search - human",
        "#9F3400",
        "#FF935F",
    )

    ax.set_ylabel("Number of found precursors with modifications")
    ax.set_xticks(x + width / 2, mod_names)
    ax.legend(loc="upper right")

    df.to_csv(f"../../data/plots/{fig_name}_vs_all_spectra_synthetic_human.csv")
    plt.savefig(f"../../data/plots/{fig_name}_vs_all_spectra_legend_synthetic_human.svg", bbox_inches="tight")
    plt.show()

In [None]:
# Synthetic/human comparison for the single search and the combination search.
for _split_search_report, _baseline_report, _fig_name in (
    (report_filtered_single, baseline_single, "all_mods_single"),
    (report_filtered_comb, baseline_comb, "all_mods_together"),
):
    plot_all_spectra_vs_subset_synthetic(_split_search_report, _baseline_report, _fig_name)

In [36]:
def plot_all_spectra_vs_subset_combination_synthetic(
    mod_targets_subset, targets_all_spectra
):
    """Compare baseline and split search on synthetic precursors only.

    Only rows whose "Protein.Ids" contains "X0000" are considered. For every
    modification they are divided into "single" and "combination" precursors
    by ``get_num_single_mod_and_combinations``, and baseline and split-search
    counts are drawn as two stacked bars next to each other. The figure is
    saved to a fixed path below "../../data/plots/".

    Args:
        mod_targets_subset: Split-search report with "Modified.Sequence" and
            "Protein.Ids" columns.
        targets_all_spectra: Baseline report with "Q.Value",
            "Modified.Sequence" and "Protein.Ids" columns; only rows with
            Q.Value <= 0.01 count.
    """
    mods_list = [
        mod_acetyl,
        mod_biotin,
        mod_dimethyl,
        mod_methyl,
        mod_oxid,
        mod_nitro,
        mod_phospho,
    ]

    mod_names = []
    nums_subset_mod_precursors_single = []
    nums_subset_mod_precursors_combination = []
    nums_all_mod_precursors_single = []
    nums_all_mod_precursors_combination = []

    for mod_name, mod_unimod in mods_list:
        mod_names.append(mod_name)

        subset_selection = np.logical_and(
            mod_targets_subset["Modified.Sequence"].str.contains(re.escape(mod_unimod)),
            mod_targets_subset["Protein.Ids"].str.contains("X0000"),
        )
        num_subset_single, num_subset_combination = get_num_single_mod_and_combinations(
            mod_targets_subset[subset_selection], mod_unimod
        )
        nums_subset_mod_precursors_single.append(num_subset_single)
        nums_subset_mod_precursors_combination.append(num_subset_combination)

        # Inclusive cut-off, matching the "<= 0.01" used by the other
        # q-value filters in this notebook.
        baseline_selection = np.logical_and(
            np.logical_and(
                targets_all_spectra["Q.Value"] <= 0.01,
                targets_all_spectra["Modified.Sequence"].str.contains(re.escape(mod_unimod)),
            ),
            targets_all_spectra["Protein.Ids"].str.contains("X0000"),
        )
        num_all_single, num_all_combination = get_num_single_mod_and_combinations(
            targets_all_spectra[baseline_selection], mod_unimod
        )
        nums_all_mod_precursors_single.append(num_all_single)
        nums_all_mod_precursors_combination.append(num_all_combination)

    x = np.arange(len(mod_names))
    width = 0.25

    _, ax = plt.subplots()

    def print_nonzero_labels(labels):
        # Suppress the "0" label of empty bar segments.
        labels = np.array(labels).astype(str)
        labels[labels == "0"] = ""
        return labels

    def draw_stacked_bars(positions, lower_counts, upper_counts,
                          lower_label, upper_label, lower_color, upper_color):
        # Lower segment with its default value labels, upper segment stacked
        # on top of it with labels only where the count is non-zero.
        ax.bar_label(
            ax.bar(positions, lower_counts, width, label=lower_label, color=lower_color),
            padding=3,
        )
        ax.bar_label(
            ax.bar(
                positions,
                upper_counts,
                width,
                bottom=lower_counts,
                label=upper_label,
                color=upper_color,
            ),
            labels=print_nonzero_labels(upper_counts),
            padding=3,
        )

    draw_stacked_bars(
        x,
        nums_all_mod_precursors_single,
        nums_all_mod_precursors_combination,
        "Baseline synthetic - occurs alone",
        "Baseline synthetic - occurs in comb.",
        "#00004B",
        "#B0C0FF",
    )
    draw_stacked_bars(
        x + width+0.05,
        nums_subset_mod_precursors_single,
        nums_subset_mod_precursors_combination,
        "Split search synthetic - occurs alone",
        "Split search synthetic- occurs in comb.",
        "#9F3400",
        "#FF935F",
    )

    ax.set_ylabel("Number of found precursors with modifications")
    ax.set_xticks(x + width / 2, mod_names)
    ax.legend(loc="upper right")
    # Fixed y range chosen by the author for this figure.
    ax.set_ylim(0, 1010)

    plt.savefig("../../data/plots/all_mods_together_vs_all_spectra_synthetic_combination_legend.svg", bbox_inches="tight")
    plt.show()

In [None]:
# Synthetic precursors only: combination search against its baseline.
plot_all_spectra_vs_subset_combination_synthetic(
    mod_targets_subset=report_filtered_comb,
    targets_all_spectra=baseline_comb,
)

## Results of the run with automatic PTM selection on the dataset with PTMs

In [38]:
def plot_subset(
    mod_targets_subset
):
    """Stacked bar plot of single versus combined occurrences per modification.

    For every modification the matching precursors of ``mod_targets_subset``
    are divided into "single" and "combination" ones by
    ``get_num_single_mod_and_combinations`` and drawn as one stacked bar. The
    fraction of "single" precursors per modification is written to a CSV file.

    NOTE(review): the output files are always named
    "automatic_mod_selection_limit_500", whatever report is passed in -
    confirm that this is intended.

    Args:
        mod_targets_subset: Report DataFrame with a "Modified.Sequence" column.
    """
    mods_list = [
        mod_acetyl,
        mod_biotin,
        mod_carbamidomethyl,
        mod_formyl,
        mod_malonyl,
        mod_methyl,
        mod_oxid,
        mod_nitro,
        mod_phospho
    ]

    df = pd.DataFrame(columns=[mod_name.replace("\n", "-") for mod_name, _ in mods_list], index=["percentage_single_occurence"])

    mod_names = []
    nums_subset_mod_precursors_single = []
    nums_subset_mod_precursors_combination = []

    for mod_name, mod_unimod in mods_list:
        mod_names.append(mod_name)

        has_mod = mod_targets_subset["Modified.Sequence"].str.contains(re.escape(mod_unimod))
        num_single, num_combination = get_num_single_mod_and_combinations(
            mod_targets_subset[has_mod], mod_unimod
        )
        nums_subset_mod_precursors_single.append(num_single)
        nums_subset_mod_precursors_combination.append(num_combination)

        # "-" marks a modification without any precursor. The guard is on the
        # total: testing num_single would also report "-" for a modification
        # that only occurs in combinations, where the fraction is really 0.
        num_total = num_single + num_combination
        df.at["percentage_single_occurence", mod_name.replace("\n", "-")] = (
            num_single / num_total if num_total > 0 else "-"
        )

    x = np.arange(len(mod_names))
    width = 0.25

    _, ax = plt.subplots()

    def print_nonzero_labels(labels):
        # Suppress the "0" label of empty bar segments.
        labels = np.array(labels).astype(str)
        labels[labels == "0"] = ""
        return labels

    single_bars = ax.bar(
        x + width/2,
        nums_subset_mod_precursors_single,
        width,
        label="Occurs alone",
        color="#9F3400",
    )
    ax.bar_label(single_bars, padding=3)

    combination_bars = ax.bar(
        x + width/2,
        nums_subset_mod_precursors_combination,
        width,
        bottom=nums_subset_mod_precursors_single,
        label="Occurs in comb.",
        color="#FF935F",
    )
    ax.bar_label(
        combination_bars,
        labels=print_nonzero_labels(nums_subset_mod_precursors_combination),
        padding=3,
    )

    ax.set_ylabel("Number of found precursors with modifications")
    ax.set_xticks(x + width / 2, mod_names)
    ax.legend(loc="upper right")

    df.to_csv("../../data/plots/automatic_mod_selection_limit_500.csv")
    plt.savefig("../../data/plots/automatic_mod_selection_limit_500.svg", bbox_inches="tight")
    plt.show()

In [None]:
# Single versus combined occurrences in the automatic modification selection run.
plot_subset(mod_targets_subset=report_filtered_automatic)

In [51]:
def plot_subset_synthetic(
    mod_targets_subset
):
    """Stacked bar plot of synthetic versus human precursors per modification.

    For every modification the matching precursors of ``mod_targets_subset``
    are divided into synthetic and human ones by
    ``get_num_synthetic_and_human`` and drawn as one stacked bar. The fraction
    of synthetic precursors per modification is written to a CSV file.

    Args:
        mod_targets_subset: Report DataFrame with "Modified.Sequence" and
            "Protein.Ids" columns.
    """
    mods_list = [
        mod_acetyl,
        mod_biotin,
        mod_carbamidomethyl,
        mod_formyl,
        mod_malonyl,
        mod_methyl,
        mod_oxid,
        mod_nitro,
        mod_phospho
    ]

    df = pd.DataFrame(columns=[mod_name.replace("\n", "-") for mod_name, _ in mods_list], index=["percentage_synthetic"])

    mod_names = []
    nums_subset_mod_precursors_synthetic = []
    nums_subset_mod_precursors_human = []

    # Iterate the same list that defines the CSV columns instead of a second,
    # duplicated literal that could drift out of sync.
    for mod_name, mod_unimod in mods_list:
        mod_names.append(mod_name)

        has_mod = mod_targets_subset["Modified.Sequence"].str.contains(re.escape(mod_unimod))
        num_synthetic, num_human = get_num_synthetic_and_human(
            mod_targets_subset[has_mod], mod_unimod
        )
        nums_subset_mod_precursors_synthetic.append(num_synthetic)
        nums_subset_mod_precursors_human.append(num_human)

        # "-" marks a modification without any precursor; this avoids the 0/0
        # division the fraction would otherwise run into.
        num_total = num_synthetic + num_human
        df.at["percentage_synthetic", mod_name.replace("\n", "-")] = (
            num_synthetic / num_total if num_total > 0 else "-"
        )

    x = np.arange(len(mod_names))
    width = 0.25

    _, ax = plt.subplots()

    def print_nonzero_labels(labels):
        # Suppress the "0" label of empty bar segments.
        labels = np.array(labels).astype(str)
        labels[labels == "0"] = ""
        return labels

    synthetic_bars = ax.bar(
        x + width/2,
        nums_subset_mod_precursors_synthetic,
        width,
        label="Synthetic",
        color="#9F3400",
    )
    ax.bar_label(synthetic_bars, padding=3)

    human_bars = ax.bar(
        x + width/2,
        nums_subset_mod_precursors_human,
        width,
        bottom=nums_subset_mod_precursors_synthetic,
        label="Human",
        color="#FF935F",
    )
    ax.bar_label(
        human_bars,
        labels=print_nonzero_labels(nums_subset_mod_precursors_human),
        padding=3,
    )

    ax.set_ylabel("Number of found precursors with modifications")
    ax.set_xticks(x + width / 2, mod_names)
    ax.legend(loc="upper right")

    df.to_csv("../../data/plots/automatic_mod_selection_synthetic.csv")
    plt.savefig("../../data/plots/automatic_mod_selection_synthetic.svg", bbox_inches="tight")
    plt.show()

In [None]:
# Synthetic versus human precursors in the automatic modification selection run.
plot_subset_synthetic(mod_targets_subset=report_filtered_automatic)

In [44]:
def plot_subset_synthetic_and_combination(
    mod_targets_subset
):
    """Stacked bar plot of single versus combined occurrences, synthetic rows only.

    Only rows whose "Protein.Ids" contains "X0000" are considered. For every
    modification they are divided into "single" and "combination" precursors
    by ``get_num_single_mod_and_combinations`` and drawn as one stacked bar.
    The figure is saved to a fixed path below "../../data/plots/".

    Args:
        mod_targets_subset: Report DataFrame with "Modified.Sequence" and
            "Protein.Ids" columns.
    """
    mods_list = [
        mod_acetyl,
        mod_biotin,
        mod_carbamidomethyl,
        mod_formyl,
        mod_malonyl,
        mod_methyl,
        mod_oxid,
        mod_nitro,
        mod_phospho,
    ]

    # One (single, combination) count pair per modification.
    count_pairs = []
    for _, mod_unimod in mods_list:
        synthetic_with_mod = np.logical_and(
            mod_targets_subset["Modified.Sequence"].str.contains(re.escape(mod_unimod)),
            mod_targets_subset["Protein.Ids"].str.contains("X0000"),
        )
        count_pairs.append(
            get_num_single_mod_and_combinations(
                mod_targets_subset[synthetic_with_mod], mod_unimod
            )
        )

    mod_names = [mod_name for mod_name, _ in mods_list]
    single_counts = [num_single for num_single, _ in count_pairs]
    combination_counts = [num_combination for _, num_combination in count_pairs]

    x = np.arange(len(mod_names))
    width = 0.25
    bar_positions = x + width / 2

    _, ax = plt.subplots()

    single_bars = ax.bar(
        bar_positions,
        single_counts,
        width,
        label="Synthetic\n- occurs alone",
        color="#9F3400",
    )
    ax.bar_label(single_bars, padding=3)

    combination_bars = ax.bar(
        bar_positions,
        combination_counts,
        width,
        bottom=single_counts,
        label="Synthetic\n- occurs in comb.",
        color="#FF935F",
    )
    # Empty segments get an empty label instead of "0".
    combination_labels = np.array(combination_counts).astype(str)
    combination_labels[combination_labels == "0"] = ""
    ax.bar_label(combination_bars, labels=combination_labels, padding=3)

    ax.set_ylabel("Number of found precursors with modifications")
    ax.set_xticks(bar_positions, mod_names)
    ax.legend(loc="upper right")
    ax.set_ylim(0, 820)

    plt.savefig("../../data/plots/automatic_mod_selection_synthetic_combination.svg", bbox_inches="tight")
    plt.show()

In [None]:
# Synthetic precursors of the automatic modification selection run, split
# into single and combined occurrences.
plot_subset_synthetic_and_combination(mod_targets_subset=report_filtered_automatic)