In [None]:
import pyteomics.mgf
from tqdm.notebook import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

import pandas_utils as pu
import numpy as np

In [None]:
def save_fig(file_name, out_dir=r"C:\git\msn_library\figures", formats=("png", "pdf", "svg"), dpi=300):
    """Save the current matplotlib figure once per requested file format.

    Parameters
    ----------
    file_name : str
        Base name of the output files, without extension.
    out_dir : str, optional
        Directory the files are written to. The default is the folder that
        used to be hard-coded in this function.
    formats : iterable of str, optional
        File extensions to export. The default reproduces the previous
        behaviour (PNG, PDF and SVG).
    dpi : int, optional
        Resolution handed to ``plt.savefig`` for the PNG export only; the
        other formats are saved without an explicit dpi, as before.
    """
    for extension in formats:
        target = "{}/{}.{}".format(out_dir, file_name, extension)
        if extension == "png":
            plt.savefig(target, dpi=dpi)
        else:
            plt.savefig(target)

# Notebook-wide seaborn theme: "ticks" style with the font scale doubled.
sns.set_theme(style="ticks", font_scale=2)

In [None]:
# MGF exports of the three acquisition methods being compared. Going by the
# file names: method1 = AGC40/Res15K, method2 = AGC100/Res15K,
# method3 = AGC100/Res60K.
method1, method2, method3 = (
    r"C:\git\msn_library\library\method_comparison\20231113_mce_library_pos_AGC40_Res15K_lib_MSn.mgf",
    r"C:\git\msn_library\library\method_comparison\20231113_mce_library_pos_AGC100_Res15K_lib_MSn.mgf",
    r"C:\git\msn_library\library\method_comparison\20231113_mce_library_pos_AGC100_Res60K_lib_MSn.mgf",
)

In [None]:
def read_mgf(infile):
    """Load the per-spectrum metadata of an MGF file into a DataFrame.

    Only the ``params`` dictionary of each spectrum is kept, one row per
    spectrum; the peak arrays are not stored.

    Parameters
    ----------
    infile : str
        Path of the MGF file, passed to ``pyteomics.mgf.MGF``.

    Returns
    -------
    pandas.DataFrame
        One row per spectrum. The columns ``inchikey``, ``compound_name`` and
        ``monoisotopic_mass`` are filled from ``inchiaux``, ``name`` and
        ``exactmass`` respectively when the file does not provide them under
        the first name. If a ``usi`` column exists, ``unique_sample_id`` holds
        the ``pluskal..._id`` token found in each USI, or NaN when a USI is
        missing or does not contain such a token.

    Raises
    ------
    KeyError
        If one of the three columns above is absent and its fallback column
        is absent as well.
    """
    with pyteomics.mgf.MGF(infile) as f_in:
        # Entries that come back as None are skipped.
        rows = [spectrum["params"] for spectrum in tqdm(f_in) if spectrum is not None]

    df = pd.DataFrame(rows)

    # Column name used downstream -> alternative name to copy it from.
    fallback_columns = {
        "inchikey": "inchiaux",
        "compound_name": "name",
        "monoisotopic_mass": "exactmass",
    }
    for wanted, fallback in fallback_columns.items():
        if wanted not in df.columns:
            df[wanted] = df[fallback]

    if "usi" in df.columns:
        # The whole match equals "pluskal" + lazy group + "_id", i.e. what the
        # previous per-row re.search/format produced. Vectorised extraction
        # yields NaN instead of raising when a USI is missing or has no match.
        df["unique_sample_id"] = (
            df["usi"].astype(str).str.extract(r"(pluskal.*?_id)", expand=False)
        )
    return df

In [None]:
# Parse the three exports in order; one metadata frame per method.
method1_df, method2_df, method3_df = [
    read_mgf(path) for path in (method1, method2, method3)
]

In [None]:
# Label every row with the acquisition method it came from, so the frames can
# be stacked and still told apart.
for _frame, _label in zip(
    (method1_df, method2_df, method3_df),
    ("40AGC", "100AGC", "100AGC_60KRes"),
):
    _frame["method"] = _label

In [None]:
# Stack the three method frames and cast the two columns used numerically
# below to integers.
merged_df = pd.concat(
    [method1_df, method2_df, method3_df],
    ignore_index=True,
).astype({"num peaks": int, "mslevel": int})


In [None]:
# Inspect the stacked table of all three methods.
merged_df

In [None]:
# Number of spectra (rows) contributed by each method.
merged_df.value_counts(subset=["method"])

In [None]:
# Right-closed bin edges for the peak count; the first bin also includes its
# left edge, so a count of 0 lands in "0". Twelve edges give eleven bins, one
# per label.
evaluation_bins = [
    0, 0.5, 1, 2, 3, 4,
    6, 10, 20, 50, 100, np.inf,
]
group_names = [
    "0", "1", "2", "3", "4",
    "5-6", "7-10", "11-20", "21-50", "51-100", ">100",
]

merged_df["signals_simple"] = pd.cut(
    merged_df["num peaks"],
    bins=evaluation_bins,
    right=True,
    labels=group_names,
    include_lowest=True,
)

In [None]:
# Display the table again to check the "signals_simple" category column.
merged_df

In [None]:
# Average number of peaks per spectrum for each method.
test = merged_df.groupby(by="method")
test["num peaks"].mean()

In [None]:
# Distribution of the binned signal count over all scans, one outline per method.
figure_name = "n_signals_categories_method_comparison"
fig, ax = plt.subplots(figsize=(20, 9))
sns.histplot(data=merged_df, x="signals_simple", hue="method", fill=False, ax=ax)
ax.set_xlabel("Number of signals")
ax.set_ylabel("Count")

save_fig(figure_name)

In [None]:
# Subsets by MS level. The second filter is ">= 3" with no upper bound, so it
# keeps every level from 3 upwards.
mslevel3 = merged_df[merged_df["mslevel"].eq(3)]
mslevel3to5 = merged_df[merged_df["mslevel"].ge(3)]

In [None]:
# Same histogram as for all scans, restricted to the MS3 subset.
figure_name = "n_signals_categories_method_comparison_mslevel3"
fig, ax = plt.subplots(figsize=(20, 9))
sns.histplot(data=mslevel3, x="signals_simple", hue="method", fill=False, ax=ax)
ax.set_xlabel("Number of signals")
ax.set_ylabel("Count")

save_fig(figure_name)

In [None]:
# Same histogram as for all scans, restricted to scans with MS level >= 3.
figure_name = "n_signals_categories_method_comparison_mslevel3to5"
fig, ax = plt.subplots(figsize=(20, 9))
sns.histplot(data=mslevel3to5, x="signals_simple", hue="method", fill=False, ax=ax)
ax.set_xlabel("Number of signals")
ax.set_ylabel("Count")

save_fig(figure_name)

In [None]:
# Per-method scan counts and column means for all scans, for MS level >= 3,
# and for MS3 only.
# The frames contain non-numeric columns (MGF metadata and the categorical
# "signals_simple"), so the means are explicitly restricted to numeric columns.
# A bare mean() raises TypeError on pandas >= 2.0, where numeric_only defaults
# to False and non-numeric columns are no longer dropped silently.
statistics = {
    "number scans": merged_df[["method"]].value_counts(),
    "mean msn":  merged_df.groupby('method').mean(numeric_only=True),
    "number of MS3to5 scans": mslevel3to5[["method"]].value_counts(),
    "mean ms3to5":  mslevel3to5.groupby('method').mean(numeric_only=True),
    "number of MS3 scans": mslevel3[["method"]].value_counts(),
    "mean ms3":  mslevel3.groupby('method').mean(numeric_only=True),
}

print("\n")
for key, v in statistics.items():
    print("{}\t{}".format(key, v))

In [None]:
# For each (sample, compound, method) combination keep only the spectrum with
# the most peaks, then restore the original row order.
best_df = (
    merged_df
    .sort_values(by="num peaks", ascending=False)
    .drop_duplicates(subset=["unique_sample_id", "compound_name", "method"], keep="first")
    .sort_index()
)

In [None]:
# Number of rows left per method after the de-duplication.
best_df["method"].value_counts()

In [None]:
# Binned signal count of the de-duplicated spectra, one outline per method.
fig, ax = plt.subplots(figsize=(20, 9))
ax = sns.histplot(data=best_df, x="signals_simple", hue="method", fill=False, ax=ax)
# Saving is currently disabled:
# save_fig("n_signals_best_filtered_categories_pp")