In [None]:
import re

import matplotlib.pyplot as plt
import pandas as pd
import pyteomics.mgf
import seaborn as sns
from tqdm.notebook import tqdm

import pandas_utils as pu

In [None]:
def save_fig(file_name, figures_dir=r"C:\git\msn_library\figures", formats=("png", "pdf", "svg"), dpi=300):
    """Save the current matplotlib figure once per requested format.

    Args:
        file_name: Base name of the output files (without extension).
        figures_dir: Directory the files are written to; created if missing.
            Defaults to the location previously hard-coded in this function.
        formats: File extensions to write; one file is saved per extension.
        dpi: Resolution used for the raster (png) export only.
    """
    import os

    # Create the target directory up front so plt.savefig does not fail on a fresh checkout.
    os.makedirs(figures_dir, exist_ok=True)
    for extension in formats:
        target = os.path.join(figures_dir, "{}.{}".format(file_name, extension))
        if extension == "png":
            plt.savefig(target, dpi=dpi)
        else:
            plt.savefig(target)

In [None]:
# Spectral library (MGF) to analyse. Alternative inputs are kept commented out;
# exactly one `infile` assignment should be active.
# infile = r"C:\git\msn_library\data\gnpslib\small_gnps.mgf"
# infile = r"C:\git\msn_library\data\gnpslib\ALL_GNPS_NO_PROPOGATED.mgf"
# infile = r"C:\git\msn_library\data\masst\20230312_mce_library_pos_all_lib_MS2.mgf"
infile = r"C:\git\msn_library\library\20230811_mce_library_pos_all_lib_MSn.mgf"
# Library metadata table that is later loaded into `libdf`.
libraryfile = r"C:\git\msn_library\data\library\mce_library_all_cleaned.tsv"

In [None]:
# Collect the parameter block (metadata) of every spectrum in the MGF file.
rows = []
counter = 0  # number of entries the reader yielded as None

with pyteomics.mgf.MGF(infile) as f_in:
    for spectrum_dict in tqdm(f_in):
        if spectrum_dict is not None:
            rows.append(spectrum_dict["params"])
        else:
            counter += 1

df = pd.DataFrame(rows)

# Fall back to alternative column names when the expected ones are absent.
fallback_columns = {
    "inchikey": "inchiaux",
    "compound_name": "name",
    "monoisotopic_mass": "exactmass",
}
for wanted_column, fallback_column in fallback_columns.items():
    if wanted_column not in df.columns:
        df[wanted_column] = df[fallback_column]

# Derive the sample id ("pluskal..._id") from the USI. str.extract yields NaN
# for a USI without such a token (or a missing USI) instead of raising
# AttributeError on a failed regex match.
if "usi" in df.columns:
    df["unique_sample_id"] = df["usi"].str.extract(r"(pluskal.*?_id)", expand=False)


In [None]:
# Load the library metadata table from `libraryfile` using the project helper module pandas_utils.
libdf = pu.read_dataframe(libraryfile)

## NIH: remove missing plate

In [None]:
# libdf = pu.read_dataframe(libraryfile)
# libdf = libdf[(libdf["monoisotopic_mass"]> 114) & (libdf["plate_id"]!= "07P")]

In [None]:
# One row per (compound, sample) combination.
df.drop_duplicates(subset=["inchikey", "unique_sample_id"])

In [None]:
# One row per distinct inchikey.
df.drop_duplicates(subset=["inchikey"])

In [None]:
# One row per distinct compound name.
df.drop_duplicates(subset=["compound_name"])

In [None]:
# Non-null counts of every column, per value of the chimeric-quality flag.
df.groupby(by=["quality_chimeric"]).count()

In [None]:
# Compare numerically: the original compared against the string "1", which is a
# lexicographic comparison and raises TypeError when the column contains NaN.
# Values that cannot be parsed as numbers become NaN and are excluded.
other_match_count = pd.to_numeric(df["other_matched_compounds"], errors="coerce")
multimatch_df = df.loc[other_match_count >= 1]

In [None]:
# One multi-match row per compound name (the first one after sorting by
# quality_chimeric), listed in order of sample id.
(
    multimatch_df
    .sort_values(by="quality_chimeric")
    .drop_duplicates(subset=["compound_name"])
    .sort_values(by="unique_sample_id")
)

In [None]:
# Display all rows of multimatch_df (spectra selected on other_matched_compounds).
multimatch_df

## Keep only one scan for each compound and unique_sample_id

In [None]:
# Keep the first row for every (inchikey, unique_sample_id) pair.
filtered_df = df.drop_duplicates(subset=["inchikey", "unique_sample_id"])

In [None]:
# Non-null counts of every metadata column per sample id.
libdf.groupby(by=["unique_sample_id"]).count()

## Get missing compounds (comparing library spectra with metadata)

In [None]:
# Outer-join metadata and spectra on compound + sample; the indicator column
# "Exist" records whether a key pair occurs in the metadata only ("left_only"),
# the spectra only ("right_only") or in both.
df_diff = pd.merge(
    libdf,
    filtered_df,
    on=["inchikey", "unique_sample_id"],
    how="outer",
    indicator="Exist",
    suffixes=("", "_data"),
)
# Keep only the entries that are missing on one side.
df_diff = df_diff.loc[df_diff["Exist"] != "both"]

# "Exist" is listed once; listing it twice produced a duplicated column.
overview_columns = [
    "unique_sample_id",
    "monoisotopic_mass",
    "inchikey",
    "formula",
    "Exist",
    "structure_source",
    "compound_name",
    "molecular_species",
    "logp",
]
df_diff[overview_columns]

In [None]:
# Metadata rows of one specific sample, restricted to the identifying columns.
libdf.loc[
    libdf["unique_sample_id"] == "pluskal_mce_1D1_A8_id",
    ["unique_sample_id", "monoisotopic_mass", "inchikey", "formula", "compound_name"],
]

In [None]:
# Display df_diff with all of its columns.
df_diff

In [None]:
# Non-null counts per molecular species in the full metadata table.
libdf.groupby(by=["molecular_species"]).count()

In [None]:
# Non-null counts per molecular species among the entries in df_diff.
df_diff.groupby(by=["molecular_species"]).count()

## Keep compounds with logp >= 5

In [None]:
# Take an explicit copy so that columns can be added to df_diff5 afterwards
# without pandas raising SettingWithCopyWarning.
df_diff5 = df_diff.loc[df_diff["logp"] >= 5].copy()

In [None]:
# Number of rows per sample id, attached to every row of that sample.
# assign() builds a new frame instead of writing into what may be a slice of
# df_diff, which avoids SettingWithCopyWarning and keeps the cell re-runnable.
df_diff5 = df_diff5.assign(
    group_count=df_diff5.groupby("unique_sample_id")["unique_sample_id"].transform("count")
)
df_diff5["unique_sample_id"].unique()

In [None]:
# Export df_diff as a tab-separated file (path is relative to the working directory).
df_diff.to_csv(
    "data/nih/nih_ms_library_missing_entry.tsv",
    index=False,
    sep="\t",
)

In [None]:
# One row per (product name, plate well) combination in df_diff.
df_diff.drop_duplicates(subset=["Product Name", "lib_plate_well"])

## Add metadata to library spectra

In [None]:
# Non-null counts per (compound name, sample id) pair in the metadata.
libdf.groupby(by=["compound_name", "unique_sample_id"]).count()

In [None]:
# Clean the metadata table: sort, drop rows without a mass, then keep the first
# row for every (sample, compound name, mass) combination.
# NOTE(review): this sorts by a column literally named "none" — confirm that
# libdf really has such a column; otherwise sort_values raises KeyError.
libdf = libdf.sort_values(by="none")
libdf = libdf[libdf["monoisotopic_mass"].notna()]
libdf = libdf.drop_duplicates(["unique_sample_id", "compound_name", "monoisotopic_mass"])
libdf

In [None]:
# Input name and sample id of the metadata rows for one specific compound name.
libdf.loc[libdf["compound_name"] == "1610358-53-6", ["input_name", "unique_sample_id"]]

In [None]:
# One metadata row per distinct inchikey.
libdf.drop_duplicates(subset=["inchikey"])

In [None]:
# One metadata row per (sample id, compound name) pair.
libdf.drop_duplicates(subset=["unique_sample_id", "compound_name"])

In [None]:
# Cast the spectrum parameters used below to numeric dtypes.
numeric_dtypes = (
    ("monoisotopic_mass", "float"),
    ("quality_explained_intensity", "float"),
    ("num peaks", "int"),
    ("quality_explained_signals", "float"),
    ("precursor_purity", "float"),
)
for column_name, target_dtype in numeric_dtypes:
    df[column_name] = df[column_name].astype(target_dtype)

# Attach the library metadata to every spectrum (left join keeps all spectra).
# NOTE(review): monoisotopic_mass is a float join key, so rows only match on
# exactly equal values — confirm this is intended.
columns = ["unique_sample_id", "inchikey", "monoisotopic_mass", "compound_name"]
df_meta = df.merge(libdf, how="left", on=columns)
df_meta

In [None]:
# First spectrum row per (sample id, inchikey, mass) combination.
df_uni = df_meta.drop_duplicates(
    subset=["unique_sample_id", "inchikey", "monoisotopic_mass"]
)
df_uni

## Filtering by phase

In [None]:
# Spectra whose compound has a clinical phase above 0.
# The filter runs on df_meta, the frame that carries the joined library
# metadata. The original filtered `df`, which in this notebook only holds the
# parameters read from the MGF file.
# NOTE(review): presumably clinical_phase is a numeric column coming from the
# library metadata — confirm.
phase_df = df_meta[df_meta["clinical_phase"] > 0]
phase_df

In [None]:
# Non-null counts per inchi_key_x among the rows of phase_df.
phase_df.groupby(by="inchi_key_x").count()

In [None]:
# Spectra whose compound has clinical phase 4.
# The filter runs on df_meta, the frame that carries the joined library
# metadata. The original filtered `df`, which in this notebook only holds the
# parameters read from the MGF file.
# NOTE(review): presumably clinical_phase is a numeric column coming from the
# library metadata — confirm.
approved_df = df_meta[df_meta["clinical_phase"] == 4]
approved_df

In [None]:
# Non-null counts per inchi_key_x among the rows of approved_df.
approved_df.groupby(by="inchi_key_x").count()

In [None]:
# Non-null counts per name among the rows of approved_df.
approved_df.groupby(by="name").count()

In [None]:
# Non-null counts per library plate well.
# NOTE(review): lib_plate_well looks like a library-metadata column; if it is
# not part of the MGF parameters this should probably group df_meta instead.
df.groupby(by="lib_plate_well").count()

## Figures

In [None]:
# Make sure the plotted quality columns are numeric before plotting.
for quality_column, target_dtype in (
    ("quality_explained_intensity", "float"),
    ("num peaks", "int"),
    ("quality_explained_signals", "float"),
):
    df[quality_column] = df[quality_column].astype(target_dtype)

# Joint histogram of explained intensity vs. number of peaks, restricted to
# spectra with fewer than 500 peaks. Note that `ax` is the grid object returned
# by sns.jointplot.
ax = sns.jointplot(
    data=df[df["num peaks"] < 500],
    x="quality_explained_intensity",
    y="num peaks",
    kind="hist",
    ratio=2,
)
ax.set_axis_labels("Explained Intensity", "Number of Peaks")
# plt.savefig("figures/mce_msn_library/data_quality_msn.png", dpi=300)

In [None]:
# Global seaborn theme for the figures created from here on.
sns.set_theme(style="ticks", font_scale=2)

In [None]:
# Histogram of the explained intensity over all spectra (not saved to disk).
file_name = "explained_intensity"

plt.figure(figsize=(13, 9))
ax = sns.histplot(
    data=df,
    x="quality_explained_intensity",
    binwidth=0.02,
    kde=False,
)
ax.set(xlabel="Explained Intensity")
# save_fig(file_name)

In [None]:
# Same explained-intensity histogram with the y-axis clipped to 0-10000.
file_name = "explained_intensity_zoom"

plt.figure(figsize=(13, 9))
ax = sns.histplot(
    data=df,
    x="quality_explained_intensity",
    binwidth=0.02,
    kde=False,
)
ax.set(ylim=(0, 10000), xlabel="Explained Intensity")
save_fig(file_name)

In [None]:
# Histogram of the explained signals over all spectra.
file_name = "explained_signals"

plt.figure(figsize=(13, 9))
ax = sns.histplot(
    data=df,
    x="quality_explained_signals",
    binwidth=0.02,
    kde=False,
)
# ax.set_xlim(0,2000)
ax.set_xlabel("Explained Signals")
save_fig(file_name)

In [None]:
# Histogram of the number of peaks per spectrum, x-axis limited to 0-50.
file_name = "number_of_signals"

plt.figure(figsize=(13, 9))
ax = sns.histplot(
    data=df,
    x="num peaks",
    binwidth=1,
    kde=False,
)
ax.set(xlim=(0, 50), xlabel="Number of Signals")
save_fig(file_name)

In [None]:
# Hexbin joint plot of precursor purity vs. explained intensity.
file_name = "Purity_comparison"

# sns.jointplot creates its own figure, so a preceding plt.figure() call only
# left an empty extra figure behind; the size is set through `height` instead
# (joint plots are square).
sns.jointplot(
    data=df,
    x="precursor_purity",
    y="quality_explained_intensity",
    kind="hex",
    xlim=(0, 1.0),
    ylim=(0, 1.0),
    height=9,
)

In [None]:
# Histogram of the precursor purity over all spectra.
file_name = "precursor_purity"
plt.figure(figsize=(13, 9))

sns.histplot(data=df, x="precursor_purity", binwidth=0.01)
save_fig(file_name)

In [None]:
# Same precursor-purity histogram with the y-axis clipped to 0-40000.
file_name = "precursor_purity_zoom"
plt.figure(figsize=(13, 9))

purity_ax = sns.histplot(data=df, x="precursor_purity", binwidth=0.01)
purity_ax.set_ylim(0, 40000)
save_fig(file_name)

In [None]:
# Non-null counts per chimeric-quality flag among spectra with an explained
# intensity below 0.25.
df.loc[df["quality_explained_intensity"] < 0.25].groupby(by="quality_chimeric").count()