In [None]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Load the spectra table. The list-valued columns arrive as JSON strings in the
# CSV, so they are decoded into Python lists right after reading.
spectra_df = pd.read_csv("data/msn1D1A_pos_100AGC.csv", sep=",")
for json_column in ("INTENSITIES", "MZS", "PRECURSOR_LIST"):
    spectra_df[json_column] = spectra_df[json_column].apply(json.loads)

# Everything before the first "." of the file name.
spectra_df["FILENAME_NO_EXT"] = spectra_df["FILENAME"].apply(lambda name: name.split(".")[0])

# Key used to join against the library table: "pluskal_" followed by the text
# after the first "pluskal_" in the file name.
spectra_df["lib_plate_well"] = spectra_df["FILENAME_NO_EXT"].apply(
    lambda stem: "pluskal_{}".format(stem.split("pluskal_")[1])
)
spectra_df.info()

In [None]:
def get_intensities(df, min_ms_level, max_ms_level=None):
    """Collect the intensities of all spectra in an MS-level range into one flat array.

    Parameters
    ----------
    df : pd.DataFrame
        Table with an ``MS_LEVEL`` column and an ``INTENSITIES`` column whose
        cells are sequences of numbers.
    min_ms_level : int
        Lowest MS level to include.
    max_ms_level : int, optional
        Highest MS level to include (inclusive). ``None``, or a value that is
        not greater than ``min_ms_level``, selects ``min_ms_level`` only.

    Returns
    -------
    np.ndarray
        Concatenated intensities; an empty float array when no spectrum matches.
    """
    if max_ms_level is None or max_ms_level <= min_ms_level:
        selected = df[df["MS_LEVEL"] == min_ms_level]
    else:
        selected = df[df["MS_LEVEL"].between(min_ms_level, max_ms_level, 'both')]

    if selected.empty:
        # np.concatenate raises ValueError when given no arrays at all.
        return np.array([], dtype=float)
    return np.concatenate(selected["INTENSITIES"].values)

def get_mzs(df: pd.DataFrame, min_ms_level: int, max_ms_level=None):
    """Collect the m/z values of all spectra in an MS-level range into one flat array.

    Parameters
    ----------
    df : pd.DataFrame
        Table with an ``MS_LEVEL`` column and an ``MZS`` column whose cells are
        sequences of numbers.
    min_ms_level : int
        Lowest MS level to include.
    max_ms_level : int, optional
        Highest MS level to include (inclusive). ``None``, or a value that is
        not greater than ``min_ms_level``, selects ``min_ms_level`` only.

    Returns
    -------
    np.ndarray
        Concatenated m/z values; an empty float array when no spectrum matches.
    """
    if max_ms_level is None or max_ms_level <= min_ms_level:
        selected = df[df["MS_LEVEL"] == min_ms_level]
    else:
        selected = df[df["MS_LEVEL"].between(min_ms_level, max_ms_level, 'both')]

    if selected.empty:
        # np.concatenate raises ValueError when given no arrays at all.
        return np.array([], dtype=float)
    return np.concatenate(selected["MZS"].values)

def get_datapoints(df, min_ms_level, max_ms_level=None):
    """Collect m/z values and intensities of all spectra in an MS-level range.

    Parameters
    ----------
    df : pd.DataFrame
        Table with ``MS_LEVEL``, ``MZS`` and ``INTENSITIES`` columns; the cells
        of the latter two are sequences of numbers.
    min_ms_level : int
        Lowest MS level to include.
    max_ms_level : int, optional
        Highest MS level to include (inclusive). ``None``, or a value that is
        not greater than ``min_ms_level``, selects ``min_ms_level`` only.

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        Flat m/z array and flat intensity array, both built from the same
        selected rows. Both are empty float arrays when no spectrum matches.
    """
    if max_ms_level is None or max_ms_level <= min_ms_level:
        selected = df[df["MS_LEVEL"] == min_ms_level]
    else:
        selected = df[df["MS_LEVEL"].between(min_ms_level, max_ms_level, 'both')]

    if selected.empty:
        # np.concatenate raises ValueError when given no arrays at all.
        return np.array([], dtype=float), np.array([], dtype=float)
    return np.concatenate(selected["MZS"].values), np.concatenate(selected["INTENSITIES"].values)

In [None]:
def save_fig(file_name):
    """Save the current matplotlib figure as PNG (300 dpi), PDF and SVG.

    The files are written to ``figures/<file_name>.<ext>``. The target
    directory is created when it is missing, so the function also works on a
    fresh checkout.

    Parameters
    ----------
    file_name : str
        File name without extension, relative to the ``figures`` directory.
    """
    for extension, options in (("png", {"dpi": 300}), ("pdf", {}), ("svg", {})):
        target = "figures/{}.{}".format(file_name, extension)
        # plt.savefig does not create missing directories.
        os.makedirs(os.path.dirname(target), exist_ok=True)
        plt.savefig(target, **options)

In [None]:
# Baseline per spectrum: its weakest signal. `default=np.nan` keeps a spectrum
# with an empty intensity list from raising ValueError; such a spectrum gets NaN
# here and a signal count of 0 below.
spectra_df["MIN_INTENSITY"] = spectra_df["INTENSITIES"].apply(
    lambda intensities: min(intensities, default=np.nan)
)
# Number of signals that are more than three times the spectrum's minimum intensity.
spectra_df["N_SIGNALS_FILTERED"] = [
    sum(value > min_intensity * 3 for value in intensities)
    for intensities, min_intensity in zip(spectra_df["INTENSITIES"], spectra_df["MIN_INTENSITY"])
]
spectra_df

## Add compound matches to spectra_df

In [None]:
# Load the tab-separated compound library selected by `lib_id`.
lib_id = "mce"
filename = "data/lib_formatted_{}.csv".format(lib_id)
lib_df = pd.read_csv(filename, sep="\t")


def _collect_ion_mzs(row):
    """Return the row's two ion m/z columns as a list.

    Presumably mz_h_smiles / mz_na_smiles are the [M+H]+ and [M+Na]+ values --
    TODO confirm against the library file.
    """
    return [row["mz_h_smiles"], row["mz_na_smiles"]]


lib_df["all_ions"] = lib_df.apply(_collect_ion_mzs, axis=1)
lib_df.head()

In [None]:
def find_match(spectrum_row, lib_df, tolerance=None):
    """Find library entries that match a spectrum by well and precursor m/z.

    Parameters
    ----------
    spectrum_row : mapping (e.g. a row of ``spectra_df``)
        Must provide ``MS_LEVEL``; for MS level <= 2 also ``PRECURSOR_MS2`` and
        ``lib_plate_well``.
    lib_df : pd.DataFrame
        Library table with ``lib_plate_well`` and ``mz_h_smiles`` columns.
    tolerance : float, optional
        Absolute m/z tolerance around the precursor. When omitted, the
        notebook-level ``mz_tolerance`` is used, which keeps existing
        two-argument calls working.

    Returns
    -------
    list
        Index labels of the matching ``lib_df`` rows. Always empty for spectra
        with MS level above 2.
    """
    if spectrum_row["MS_LEVEL"] > 2:
        return []

    if tolerance is None:
        # Backward-compatible fallback to the global defined in a later cell.
        tolerance = mz_tolerance

    precursor_mz = spectrum_row["PRECURSOR_MS2"]
    same_well = lib_df["lib_plate_well"] == spectrum_row["lib_plate_well"]
    mz_in_window = lib_df["mz_h_smiles"].between(precursor_mz - tolerance, precursor_mz + tolerance)

    return lib_df[same_well & mz_in_window].index.tolist()

def propagate_match(spectrum_row, spectra_df):
    """Return the library match of the MS2 spectrum that shares this row's tree.

    Parameters
    ----------
    spectrum_row : mapping (e.g. a row of ``spectra_df``)
        Must provide ``TREE_ID``.
    spectra_df : pd.DataFrame
        Table with ``TREE_ID``, ``MS_LEVEL`` and ``LIB_INDEX`` columns.

    Returns
    -------
    list
        The ``LIB_INDEX`` value of the first MS2 row with the same ``TREE_ID``,
        or an empty list when the tree has no MS2 row.
    """
    tree_id = spectrum_row["TREE_ID"]
    is_tree_ms2 = (spectra_df["TREE_ID"] == tree_id) & (spectra_df["MS_LEVEL"] == 2)
    ms2_matches = spectra_df[is_tree_ms2]["LIB_INDEX"]
    if ms2_matches.empty:
        # Without this guard, indexing the first element raises IndexError.
        return []
    return ms2_matches.values[0]

In [None]:
# Absolute m/z window used by find_match.
mz_tolerance = 0.05

# Step 1: look up matching lib_df rows for every spectrum (MS level > 2 gets []).
spectra_df["LIB_INDEX"] = spectra_df.apply(find_match, axis=1, args=(lib_df,))
# Step 2: copy the match of each tree's MS2 spectrum onto all spectra of that tree.
spectra_df["LIB_INDEX"] = spectra_df.apply(propagate_match, axis=1, args=(spectra_df,))

# Step 3: transfer metadata of the first match; unmatched spectra get the filler value.
for spectra_column, lib_column, filler in (
    ("PRODUCT_NAME", "Product Name", ""),
    ("SMILES", "Smiles", ""),
    ("EXACT_MASS", "exact_mass_smiles", np.nan),
):
    spectra_df[spectra_column] = spectra_df["LIB_INDEX"].apply(
        lambda index: lib_df.at[index[0], lib_column] if len(index) >= 1 else filler
    )

# Step 4: summary flags.
spectra_df["ANNOTATED"] = spectra_df["LIB_INDEX"].apply(lambda index: len(index) >= 1)
spectra_df["N_MATCHES"] = spectra_df["LIB_INDEX"].apply(lambda index: len(index))

In [None]:
# find all with match, keep best spectrum (highest N of signals) for each precursor of each compound
# Keep only annotated spectra. The explicit copy makes matched_df independent of
# spectra_df, so the column assignments below do not trigger
# SettingWithCopyWarning.
matched_df = spectra_df[spectra_df["N_MATCHES"] >= 1].copy()
matched_df["FIRST_LIB_INDEX"] = [lib_ids[0] if len(lib_ids) > 0 else "" for lib_ids in matched_df["LIB_INDEX"]]
# Precursor m/z rounded to two decimals, used as the grouping key for one precursor.
matched_df["PRECURSOR_MZ_LOW_RES"] = [round(mz, 2) for mz in matched_df["PRECURSOR_MZ"]]
# Per (compound, precursor) keep the spectrum with the most filtered signals,
# then restore the original row order.
matched_df = (
    matched_df.sort_values("N_SIGNALS_FILTERED", ascending=False)
    .drop_duplicates(["FIRST_LIB_INDEX", "PRECURSOR_MZ_LOW_RES"], keep="first")
    .sort_index()
)

In [None]:
# Histogram of the filtered signal count per matched spectrum, one bin per count.
plt.figure(figsize=(20, 6))
ax = sns.histplot(x="N_SIGNALS_FILTERED", data=matched_df, binwidth=1)
ax.set(xlim=(0, 150), ylim=(0, 125))
save_fig("histo_signals_gr_3xmin_intensity")

In [None]:
# Show the matched-spectra table as this cell's output for visual inspection.
matched_df

In [None]:
# Write the annotated spectra table as a tab-separated file named after the library id.
output_path = "data/spectra_match_{}.csv".format(lib_id)
spectra_df.to_csv(output_path, sep="\t", index=False)

## Analyze

In [None]:
# One precursor m/z histogram per MS level, stacked vertically on a shared x axis.
levels = [2, 3, 4, 5]
fig, axs = plt.subplots(len(levels), figsize=(15, 15), sharex="all")

fig.suptitle('Vertically stacked subplots')

for level, level_axes in zip(levels, axs):
    level_spectra = spectra_df[spectra_df["MS_LEVEL"] == level]
    ax = sns.histplot(data=level_spectra, x="PRECURSOR_MZ", binwidth=0.1, ax=level_axes)
    ax.set(xlabel='precursor m/z', ylabel='MS{} counts'.format(level))
    ax.set_xlim(70, 1000)

In [None]:
# Fragment m/z distribution over MS3 to MS6, restricted to signals with intensity above 1500.
mzs, intensities = get_datapoints(spectra_df, 3, 6)
df = pd.DataFrame({"mz": mzs, "intensity": intensities})
above_threshold = df["intensity"] > 1500

plt.figure(figsize=(20, 6))
ax = sns.histplot(x="mz", data=df[above_threshold], binwidth=0.1)
ax.set_xlim(40, 800)

In [None]:
# Histogram of MS3 signal intensities below 20000, in bins of width 100.
intensities = get_intensities(spectra_df, 3)
df = pd.DataFrame({"INTENSITIES": intensities})
below_cutoff = df[df["INTENSITIES"] < 20000]

plt.figure(figsize=(20, 6))
count_intensities_MS3 = sns.histplot(x="INTENSITIES", data=below_cutoff, binwidth=100)
count_intensities_MS3.set_xlim(0, 20000)
save_fig("histo_intensity_ms3")

In [None]:
# Histogram of MS2 signal intensities below 20000, in bins of width 50.
intensities = get_intensities(spectra_df, 2)
df = pd.DataFrame({"INTENSITIES": intensities})
is_below_cutoff = df["INTENSITIES"] < 20000

plt.figure(figsize=(20, 6))
count_intensities_MS2 = sns.histplot(
    data=df[is_below_cutoff],
    x="INTENSITIES",
    binwidth=50,
)
count_intensities_MS2.set_xlim(0, 20000)
save_fig("histo_intensities_MS2")

In [None]:
# Histogram of MS4 signal intensities below 20000, in bins of width 50.
# The axes handle is named for MS4; the previous name `count_intensities_MS2`
# was a copy-paste leftover that overwrote the handle of the MS2 plot.
plt.figure(figsize=(20, 6))
intensities = get_intensities(spectra_df, 4)
df = pd.DataFrame({"INTENSITIES": intensities})
count_intensities_MS4 = sns.histplot(data=df[df["INTENSITIES"] < 20000], x="INTENSITIES", binwidth=50)
count_intensities_MS4.set_xlim(0, 20000)
save_fig("histo_intensities_MS4")

In [None]:
# Violin plot of precursor m/z grouped by MS level, saved in all three formats.
Precursor_against_mslevel = sns.violinplot(
    x="MS_LEVEL",
    y="PRECURSOR_MZ",
    data=spectra_df,
    cut=0,
)
save_fig("Violin_Precursor_against_mslevel")

In [None]:
# Violin plot of the signal count grouped by MS level.
# NOTE(review): N_SIGNALS is not created in the visible cells; presumably it
# comes from the input CSV -- verify.
Signals_against_MSlevel = sns.violinplot(
    x="MS_LEVEL",
    y="N_SIGNALS",
    data=spectra_df,
    cut=0,
)
save_fig("Violin_Signals_against_MSlevel")

In [None]:
# Precursor m/z histogram with the MS levels stacked on top of each other.
sns.histplot(
    x="PRECURSOR_MZ",
    hue="MS_LEVEL",
    data=spectra_df,
    multiple="stack",
)
save_fig("histo_MSlevel")

In [None]:

# List every MS2 signal whose intensity exceeds 10000.
intensities = get_intensities(spectra_df, 2)
df = pd.DataFrame(data={"INTENSITIES": intensities})
is_high_intensity = df["INTENSITIES"] > 10000
df[is_high_intensity]