In [None]:
import os
import re

import matplotlib.pyplot as plt
import pandas as pd
import pyteomics.mgf
import seaborn as sns
from tqdm.notebook import tqdm

In [None]:
# Path of the MGF file parsed in the next cell; alternative inputs are kept
# commented out so they can be switched by hand.
# infile = r"C:\git\msn_library\data\gnpslib\small_gnps.mgf"
infile = r"C:\git\msn_library\data\gnpslib\ALL_GNPS_NO_PROPOGATED.mgf"
# infile = r"C:\git\msn_library\data\masst\20230312_mce_library_pos_all_lib_MS2.mgf"

In [None]:
def extract_lib_plate_well(usi):
    """Return the ``pluskal...`` tag embedded in a USI string.

    The tag is the text between ``pluskal`` and the literal ``.mzML``
    extension, prefixed again with ``pluskal``.

    Parameters
    ----------
    usi : object
        One value of the ``usi`` column; normally a string.

    Returns
    -------
    str or None
        The tag, or ``None`` when ``usi`` is not a string (e.g. NaN) or does
        not contain the pattern.
    """
    if not isinstance(usi, str):
        return None
    # The dot is escaped so that only a literal ".mzML" terminates the tag.
    match = re.search(r"pluskal(.*?)\.mzML", usi)
    if match is None:
        return None
    return "pluskal{}".format(match.group(1))


# Collect the parameter dictionary of every spectrum in the MGF file;
# entries yielded as None are only counted.
rows = []
counter = 0

with pyteomics.mgf.MGF(infile) as f_in:
    for spectrum_dict in tqdm(f_in):
        if spectrum_dict is not None:
            rows.append(spectrum_dict["params"])
        else:
            counter += 1

df = pd.DataFrame(rows)

# Fill the column names used by the rest of the notebook from their
# alternatives when the MGF file does not provide them directly.
for target_col, source_col in (
    ("inchi_key", "inchiaux"),
    ("compound_name", "name"),
    ("exact_mass", "pepmass"),
):
    if target_col not in df.columns:
        df[target_col] = df[source_col]

# A USI without the expected pattern (or a missing USI) yields None instead
# of aborting the whole cell.
if "usi" in df.columns:
    df["lib_plate_well"] = df["usi"].map(extract_lib_plate_well)

df

In [None]:
# Rows unique per (inchi_key, lib_plate_well) pair; displayed only, df is not modified.
df.drop_duplicates(["inchi_key", "lib_plate_well"])

In [None]:
# Rows unique per inchi_key; displayed only, df is not modified.
df.drop_duplicates(["inchi_key"])

In [None]:
# Non-null value count of every column for each quality_chimeric value.
df.groupby(["quality_chimeric"]).count()

In [None]:
# Keep the spectra whose other_matched_compounds value is the string "1".
has_other_match = df["other_matched_compounds"] == "1"
multimatch_df = df.loc[has_other_match]

In [None]:
# One row per compound_name (the first after sorting by quality_chimeric),
# shown in lib_plate_well order.
(
    multimatch_df
    .sort_values(by="quality_chimeric")
    .drop_duplicates(subset=["compound_name"])
    .sort_values(by="lib_plate_well")
)

In [None]:
# Display the full multimatch_df subset.
multimatch_df

In [None]:
# Deduplicated view of df, one row per (inchi_key, lib_plate_well) pair.
dedup_keys = ["inchi_key", "lib_plate_well"]
filtered_df = df.drop_duplicates(subset=dedup_keys)
filtered_df

In [None]:
# Tab-separated library table that the spectra are compared against.
library_table_path = r"C:\git\msn_library\data\final_tables\mce_library_all_final_unique_id.tsv"
libdf = pd.read_csv(library_table_path, sep="\t")
libdf

In [None]:
# Non-null value count of every column for each lib_plate_well_unique value.
libdf.groupby(['lib_plate_well_unique']).count()

In [None]:
# Outer-join the library table with the deduplicated spectra; the "Exist"
# indicator column records which side each row came from. Only rows found
# on a single side are kept.
df_diff = libdf.merge(
    filtered_df,
    how="outer",
    on=["inchi_key", "lib_plate_well"],
    indicator="Exist",
)
only_one_side = df_diff["Exist"] != "both"
df_diff = df_diff.loc[only_one_side]
df_diff

In [None]:
# Non-null value count of every column of the unmatched rows, per molecular_species value.
df_diff.groupby(['molecular_species']).count()

In [None]:
# Write the unmatched rows to a tab-separated file, without the index column.
# The path is relative to the current working directory.
df_diff.to_csv("data/final_tables/mce_missing_ms_missing_entry.tsv", sep="\t", index=False)

In [None]:
# Library rows whose clinical_phase equals 4.
libdf.loc[libdf["clinical_phase"]==4]

In [None]:
# Unmatched rows whose clinical_phase equals 4.
df_diff.loc[df_diff["clinical_phase"]==4]

In [None]:
# Unmatched rows unique per (Product Name, lib_plate_well) pair; displayed only.
df_diff.drop_duplicates(["Product Name", "lib_plate_well"])

In [None]:
# Non-null value count of every column per (lib_plate_well, compound_name) pair.
libdf.groupby(["lib_plate_well", "compound_name"]).count()

In [None]:
# Sort, drop rows without an exact mass, then keep the first row of every
# (lib_plate_well, compound_name, exact_mass) combination.
# NOTE(review): by="none" sorts on a column literally named "none" - confirm
# that this column exists in the library table.
libdf = (
    libdf
    .sort_values(by="none")
    .dropna(subset=["exact_mass"])
    .drop_duplicates(subset=["lib_plate_well", "compound_name", "exact_mass"])
)
libdf

In [None]:
# Look up the library rows of one compound by its exact name.
libdf[libdf["compound_name"]=="V5 Epitope Tag Peptide (Trifluoroacetate)"]

In [None]:
# libdf deduplicated on (lib_plate_well, compound_name) only; displayed, not assigned.
libdf.drop_duplicates(["lib_plate_well", "compound_name"])

In [None]:
# The original cell held an empty subscript (a SyntaxError); the intended
# selection is unknown, so a preview of the table is shown instead.
libdf.head()

In [None]:
# NOTE(review): bare attribute access - this evaluates to the .loc indexer
# object itself and selects no rows; looks like an unfinished selection.
libdf.loc

In [None]:
# Attach the library annotations to every spectrum. exact_mass is rebuilt as
# a float from the exactmass column because it is one of the join keys.
columns = ["lib_plate_well", "compound_name", "exact_mass"]
df["exact_mass"] = df["exactmass"].astype(float)
df = pd.merge(df, libdf, how="left", on=columns)
df

In [None]:
# Spectra whose clinical_phase is greater than zero.
has_clinical_phase = df["clinical_phase"] > 0
phase_df = df[has_clinical_phase]
phase_df

In [None]:
# Non-null value count of every column of phase_df per inchi_key_x value.
phase_df.groupby("inchi_key_x").count()

In [None]:
# Spectra whose clinical_phase equals 4.
is_phase_four = df["clinical_phase"] == 4
approved_df = df[is_phase_four]
approved_df

In [None]:
# Non-null value count of every column of approved_df per inchi_key_x value.
approved_df.groupby("inchi_key_x").count()

In [None]:
# Non-null value count of every column of approved_df per name value.
approved_df.groupby("name").count()

In [None]:
# Non-null value count of every column per lib_plate_well value.
df.groupby("lib_plate_well").count()

In [None]:
# Cast the quality columns to numeric dtypes before plotting.
for column_name, target_dtype in (
    ("quality_explained_intensity", "float"),
    ("num peaks", "int"),
    ("quality_explained_signals", "float"),
):
    df[column_name] = df[column_name].astype(target_dtype)

# Joint histogram of explained intensity against peak count, restricted to
# spectra with fewer than 500 peaks. jointplot returns a grid object, which
# is why the labels are set through set_axis_labels.
below_500_peaks = df[df["num peaks"] < 500]
ax = sns.jointplot(
    data=below_500_peaks,
    x="quality_explained_intensity",
    y="num peaks",
    kind="hist",
    ratio=2,
)
ax.set_axis_labels("Explained Intensity", "Number of Peaks")
plt.savefig("figures/mce_msn_library/data_quality_msn.png", dpi=300)

In [None]:
def save_fig(file_name, out_dir="figures/mce_msn_library", formats=("png", "pdf", "svg"), dpi=300):
    """Save the current matplotlib figure once per requested format.

    Parameters
    ----------
    file_name : str
        Base name of the output files, without extension.
    out_dir : str, optional
        Directory the files are written to; created (with parents) if missing.
    formats : iterable of str, optional
        File extensions to write, in order.
    dpi : int, optional
        Resolution passed to ``plt.savefig`` for the PNG output only; the
        other formats are saved with matplotlib's defaults.
    """
    # plt.savefig does not create directories, so make sure the target exists.
    os.makedirs(out_dir, exist_ok=True)
    for ext in formats:
        extra_kwargs = {"dpi": dpi} if ext == "png" else {}
        plt.savefig("{}/{}.{}".format(out_dir, file_name, ext), **extra_kwargs)

# Seaborn theme for the figures created after this point: "ticks" style with a font scale of 2.
sns.set_theme(font_scale=2, style="ticks")

In [None]:
file_name = "explained_intensity"

# Histogram of quality_explained_intensity in 0.02-wide bins, saved through save_fig.
fig, ax = plt.subplots(figsize=(13, 9))
sns.histplot(data=df, x="quality_explained_intensity", binwidth=0.02, kde=False, ax=ax)
ax.set_xlabel("Explained Intensity")
save_fig(file_name)

In [None]:
file_name = "explained_signals"

# Histogram of quality_explained_signals in 0.02-wide bins, saved through save_fig.
fig, ax = plt.subplots(figsize=(13, 9))
sns.histplot(data=df, x="quality_explained_signals", binwidth=0.02, kde=False, ax=ax)
ax.set_xlabel("Explained Signals")
save_fig(file_name)

In [None]:
file_name = "number_of_signals"

# Histogram of the peak count with one bin per integer; the x axis is
# limited to the range 0-50.
fig, ax = plt.subplots(figsize=(13, 9))
sns.histplot(data=df, x="num peaks", binwidth=1, kde=False, ax=ax)
ax.set_xlim(0, 50)
ax.set_xlabel("Number of Signals")
save_fig(file_name)

In [None]:
# Among spectra with quality_explained_intensity below 0.25, count the
# non-null values of every column per quality_chimeric value.
df[df["quality_explained_intensity"]<0.25].groupby("quality_chimeric").count()

In [None]:
# Load the GNPS phase table and show the rows with a clinical phase above zero.
# The column cannot be used as the indexer directly: a non-boolean column is
# interpreted as a list of column labels, not as a row mask. The explicit
# comparison matches the other clinical_phase filters in this notebook.
file = r"C:\git\msn_library\data\final_tables\gnps_phase.tsv"
df = pd.read_csv(file, sep="\t")
df[df["clinical_phase"] > 0]

In [None]:
# Replace df with the tab-separated "approved" library table and display it.
approved_table_path = "data/final_tables/mce_library_all_approved.tsv"
df = pd.read_csv(approved_table_path, sep="\t")
df

In [None]:
# Rows with prodrug equal to 1, reduced to one row per inchi_key.
df[df["prodrug"]==1].drop_duplicates("inchi_key")

In [None]:
# Replace df with the "none" library table, casting the cas column to str.
none_table_path = "data/final_tables/mce_library_none.tsv"
df = pd.read_csv(none_table_path, sep="\t").astype({"cas": str})


In [None]:
# Show the dtype of every column of the current df.
df.dtypes