# Immonium ion detection - PTM combinations
Evaluation of immonium ions from different PTMs that occur together in the same immonium ion MS2 window.

In [2]:
from collections import Counter

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from matplotlib import cm

In [3]:
# Short axis labels for modifications, keyed by "<amino_acid>,<mod_name>"
# (the same comma-joined form the combination-counting functions produce).
mods_abbrev_dict = {
    "Lysine,Acetyl": "K-acetyl",
    "Lysine,Formyl": "K-formyl",
    "Lysine,Biotin": "K-biotin",
    "Lysine,Methyl": "K-methyl",
    "Lysine,Carbamidomethyl": "K-carbami",
    "Lysine,Malonyl": "K-malonyl",
    "Lysine,Diethyl": "K-diethyl",
    "Lysine,Dimethyl": "K-dimethyl",
    # Trimethyl is part of the "wanted" PTM list used further down, so it
    # needs a label too; otherwise the label lookup fails once it is detected.
    "Lysine,Trimethyl": "K-trimethyl",
    "Proline,Oxidation": "P-oxid",
    "Tyrosine,Nitro": "Y-nitro",
    "Tyrosine,Phospho": "Y-phospho",
    "Arginine,Deamidated": "R-deamid",
    "Cysteine,Carbamidomethyl": "C-carbami",
    "Histidine,Carbamidomethyl": "H-carbami",
    "Tryptophan,Methyl": "W-methyl",
}

In [None]:
# Load the diagnostic-ion detection results that the cells below analyse.
# NOTE(review): the file name suggests a 10 ppm tolerance and an SNR threshold of 3 - confirm.
mods_df = pd.read_csv("../../data/result_csvs/230928_JL_Immonium_ions_Modified_DIA.mzML_diagnostic_ions_ppm_tolerance_10_snr_threshold_3_unimod.csv")

In [11]:
def get_mods_df_by_scan_window(result_df):
    """Group the distinct modification rows of ``result_df`` by spectrum.

    Only the ``spectrum_id``, ``amino_acid`` and ``mod_name`` columns are
    kept, and exact duplicate rows are removed before grouping.

    Returns:
        A pandas ``DataFrameGroupBy`` keyed by ``spectrum_id``.
    """
    relevant_columns = ["spectrum_id", "amino_acid", "mod_name"]
    distinct_rows = result_df[relevant_columns].drop_duplicates()
    grouped_by_window = distinct_rows.groupby("spectrum_id")
    return grouped_by_window

In [6]:
def get_mods_combinations_with_counts(
    detected_ions_df: pd.DataFrame, return_unimod=False
):
    """Count how many spectra show each combination of modifications.

    For every ``spectrum_id`` the distinct modifications are collected into
    one combination; identical combinations are then counted across spectra.

    Args:
        detected_ions_df: Detection results. Must contain ``spectrum_id``
            plus either ``amino_acid`` and ``mod_name`` or, when
            ``return_unimod`` is true, ``letter_and_unimod_format_mod``.
        return_unimod: If true, label modifications by the
            ``letter_and_unimod_format_mod`` column instead of the
            comma-joined ``"<amino_acid>,<mod_name>"`` form.

    Returns:
        ``(combinations, counts)``: a 1-D object array whose elements are
        lists of modification labels (each list sorted, the array sorted
        ascending), and an integer array with the number of spectra per
        combination.
    """
    mod_columns = (
        ["letter_and_unimod_format_mod"]
        if return_unimod
        else ["amino_acid", "mod_name"]
    )
    # After this global de-duplication every (spectrum, modification) pair
    # occurs once, so no further de-duplication is needed inside a group.
    window_mods = detected_ions_df[["spectrum_id"] + mod_columns].drop_duplicates()

    # Sorting the labels makes a combination independent of the row order in
    # the input; tuples are hashable and therefore countable.
    combination_counter = Counter(
        tuple(sorted(",".join(row) for row in group[mod_columns].to_numpy()))
        for _, group in window_mods.groupby("spectrum_id")
    )

    ordered_combinations = sorted(combination_counter)
    # Fill the object array element by element: np.array(list_of_lists) would
    # become 2-D whenever all combinations have the same length, which turns
    # "combinations" into single modifications.
    unique_combinations = np.empty(len(ordered_combinations), dtype=object)
    for position, combination in enumerate(ordered_combinations):
        unique_combinations[position] = list(combination)
    counts = np.array(
        [combination_counter[combination] for combination in ordered_combinations],
        dtype=np.intp,
    )
    return unique_combinations, counts

In [7]:
def get_mods_combinations_with_counts_subset(result_df, wanted_mods):
    """Count modification combinations per spectrum, restricted to a subset.

    Works like counting all combinations, but modifications whose
    ``"<amino_acid>,<mod_name>"`` label is not in ``wanted_mods`` are ignored,
    and spectra left without any wanted modification are skipped.

    Args:
        result_df: Detection results with ``spectrum_id``, ``amino_acid`` and
            ``mod_name`` columns.
        wanted_mods: Iterable of ``"<amino_acid>,<mod_name>"`` labels to keep.

    Returns:
        ``(combinations, counts)``: a 1-D object array whose elements are
        lists of modification labels (each list sorted, the array sorted
        ascending), and an integer array with the number of spectra per
        combination.
    """
    wanted = set(wanted_mods)  # built once: O(1) membership test per row
    label_columns = ["amino_acid", "mod_name"]
    window_mods = result_df[["spectrum_id"] + label_columns].drop_duplicates()

    combination_counter = Counter()
    for _, group in window_mods.groupby("spectrum_id"):
        labels = (",".join(row) for row in group[label_columns].to_numpy())
        # Sorted so that the same set of modifications always yields the same
        # combination, whatever the row order in the input.
        combination = tuple(sorted(label for label in labels if label in wanted))
        if combination:
            combination_counter[combination] += 1

    ordered_combinations = sorted(combination_counter)
    # Fill element by element: np.array(list_of_lists) would become 2-D when
    # all combinations have equal length, and counting would then be done on
    # single modifications instead of combinations.
    unique_combinations = np.empty(len(ordered_combinations), dtype=object)
    for position, combination in enumerate(ordered_combinations):
        unique_combinations[position] = list(combination)
    counts = np.array(
        [combination_counter[combination] for combination in ordered_combinations],
        dtype=np.intp,
    )
    return unique_combinations, counts

In [8]:
def get_num_mods_per_scan_window(result_df):
    """Return, per spectrum, the number of distinct modification rows.

    The result is a Series named ``"count"`` indexed by ``spectrum_id``; it
    counts the non-missing ``amino_acid`` entries of each spectrum group.
    """
    grouped_by_window = get_mods_df_by_scan_window(result_df)
    amino_acid_counts = grouped_by_window["amino_acid"].count()
    return amino_acid_counts.rename("count")
    

In [9]:
def get_windows_single_multiple_mods(result_df):
    """Split the per-spectrum modification counts into single vs. the rest.

    Returns:
        ``(single, multiple)``: the counts equal to 1, and all counts that
        are not equal to 1, each as a Series indexed by ``spectrum_id``.
    """
    mods_per_window = get_num_mods_per_scan_window(result_df)
    has_single_mod = mods_per_window == 1
    return mods_per_window[has_single_mod], mods_per_window[~has_single_mod]

In [None]:
# Number of distinct spectrum_id groups in the loaded results.
len(get_mods_df_by_scan_window(mods_df))

In [13]:
# All detected combinations and their per-spectrum counts, over every detected immonium ion
# (modifications labelled as "<amino_acid>,<mod_name>").
all_mod_combinations, combination_counts = get_mods_combinations_with_counts(mods_df, return_unimod=False)

In [None]:
# Show the detected combinations as the cell output.
all_mod_combinations

In [14]:
# Combinations and counts when only the immonium ions of the 8 PTMs listed below are considered;
# spectra without any of these PTMs are left out.
all_mod_combinations_wanted, combination_counts_wanted = get_mods_combinations_with_counts_subset(
    mods_df, ['Lysine,Acetyl', 'Lysine,Biotin', 'Lysine,Dimethyl', 'Lysine,Methyl', 
              'Lysine,Trimethyl', 'Proline,Oxidation', 'Tyrosine,Nitro', 'Tyrosine,Phospho'])

In [None]:
# Show the combinations restricted to the wanted PTMs as the cell output.
all_mod_combinations_wanted

In [52]:
def plot_combination_counts(
    combinations,
    counts,
    fig_name=None,
    plot_fractions=None,
    topk=None,
    figsize=(5, 15),
    xlim=None,
    line_label_pos_fraction=3,
):
    """Draw a horizontal bar chart of modification combinations by count.

    Bars are ordered by count; each bar is labelled with its value and each
    tick with the abbreviated modifications of the combination, separated by
    a bold ``|``.

    Args:
        combinations: NumPy array of combinations, each an iterable of
            ``"<amino_acid>,<mod_name>"`` labels.
        counts: NumPy array with the count of each combination.
        fig_name: If given, the figure is saved to
            ``../../data/plots/<fig_name>``.
        plot_fractions: Optional iterable of fractions. For each one a
            horizontal line marks where the cumulative count, accumulated
            from the largest bar downwards, first reaches that fraction of
            the total. No line is drawn if the fraction is not reached
            within the plotted bars.
        topk: If given, only the ``topk`` most frequent combinations are
            plotted. The total used for ``plot_fractions`` still covers all
            combinations.
        figsize: Figure size passed to ``plt.subplots``.
        xlim: Optional x-axis limits.
        line_label_pos_fraction: The fraction-line label is placed at
            ``max_count / line_label_pos_fraction`` on the x-axis.
    """
    ascending_order = np.argsort(counts)
    combinations_sorted = combinations[ascending_order]
    counts_sorted = counts[ascending_order]
    # Taken before the top-k cut so that fractions refer to all combinations.
    total_count = counts_sorted.sum()

    if topk is not None:
        combinations_sorted = combinations_sorted[-topk:]
        counts_sorted = counts_sorted[-topk:]

    fig, ax = plt.subplots(figsize=figsize)
    # Fall back to the raw label for modifications without an abbreviation
    # instead of aborting the whole plot with a KeyError.
    combination_names = [
        r" $\bf{|}$ ".join(mods_abbrev_dict.get(mod, mod) for mod in combination)
        for combination in combinations_sorted
    ]
    num_combinations = len(combination_names)
    bar_positions = range(num_combinations)
    bars = ax.barh(bar_positions, counts_sorted)
    ax.bar_label(bars, padding=3)
    ax.set_yticks(bar_positions, labels=combination_names)
    ax.set_ylim(-1, num_combinations)
    ax.set_xlabel("Number of windows")
    if xlim is not None:
        ax.set_xlim(xlim)
    ax.invert_yaxis()

    # Nothing to mark when there are no bars (also avoids max() on empty data).
    if plot_fractions is not None and num_combinations > 0:
        counts_descending = counts_sorted[::-1]
        cumulative_counts = np.cumsum(counts_descending)
        max_count = counts_descending.max()
        # pyplot.get_cmap replaces matplotlib.cm.get_cmap, which was removed
        # in Matplotlib 3.9.
        color_map = plt.get_cmap("Dark2")

        for color_index, fraction in enumerate(plot_fractions):
            reached = np.flatnonzero(cumulative_counts >= total_count * fraction)
            if reached.size == 0:
                continue
            rank = reached[0]  # 0-based rank of the bar that reaches the fraction
            color = color_map(color_index)
            # Bars sit at y = num_combinations - 1 - rank; the line is drawn
            # half a unit beyond the bar that reaches the fraction.
            ax.axhline(num_combinations - (rank + 1.5), label=fraction, color=color)
            ax.text(
                max_count / line_label_pos_fraction,
                num_combinations - (rank + 1.75),
                f"Count percentile\n   >= {fraction}",
                color=color,
            )

    if fig_name is not None:
        fig.savefig(f"../../data/plots/{fig_name}", bbox_inches="tight")

In [None]:
# Plot the 50 most frequent combinations over all detected ions, with markers at
# 80 %, 90 % and 95 % of the total count; the figure is saved as SVG.
plot_combination_counts(all_mod_combinations, combination_counts, fig_name="num_combinations_sorted_snr3_10ppm_top50.svg", plot_fractions=[0.8, 0.9, 0.95], topk=50, figsize=(2, 15), xlim=(0, 7000))

In [None]:
# Plot every combination of the wanted PTMs (no top-k cut), with markers at
# 80 %, 90 % and 95 % of the total count; the figure is saved as SVG.
plot_combination_counts(all_mod_combinations_wanted, combination_counts_wanted, plot_fractions=[0.8, 0.9, 0.95], line_label_pos_fraction =2, figsize=(3, 15), xlim=(0, 9000), fig_name="num_combinations_sorted_snr3_10ppm_wanted_combinations.svg")

In [None]:
# Counts for immonium ions of the 8 PTMs occurring alone in a spectrum:
# keep only the combinations that consist of exactly one modification.
single_mod_combinations_idcs = [
    (position, combination)
    for position, combination in enumerate(all_mod_combinations_wanted)
    if len(combination) == 1
]
single_mod_combinations = np.array(
    [combination for _, combination in single_mod_combinations_idcs]
)
single_mod_positions = [position for position, _ in single_mod_combinations_idcs]
single_mod_counts = np.array(combination_counts_wanted[single_mod_positions])
# Spectra with a single wanted PTM vs. all spectra with at least one wanted PTM.
single_mod_counts.sum(), combination_counts_wanted.sum()

In [None]:
# Show the single-modification combinations as the cell output.
single_mod_combinations

In [None]:
# Plot the counts of the wanted PTMs that occur alone in a spectrum; the figure is saved as SVG.
plot_combination_counts(single_mod_combinations, single_mod_counts, fig_name="num_combinations_sorted_snr3_10ppm_wanted_single.svg", figsize=(7, 4), xlim=(0, 6700))