In [None]:
import numpy as np
import pandas as pd
import glob
import os

#### `1) GNPS annotations`

This step requires an independent job at GNPS Spectral Library Search (input: raw mzML files)
See documentation: https://ccms-ucsd.github.io/GNPSDocumentation/librarysearch/ 

When the job is finished, import all identifications from GNPS, save the .TSV table under the resources directory and "clean up" the table:

In [None]:
# Load every identification exported by the GNPS library-search job.
gnps_table_path = os.path.join("resources", "MS2_LIBRARYSEARCH_all_identifications.tsv")
df = pd.read_csv(gnps_table_path, sep="\t", encoding="latin-1")

# Remove negative-ion-mode hits first, then hits whose m/z error exceeds 20 ppm.
# Rows with a missing value in either column are kept by both filters.
df = df[df["IonMode"] != "negative"]
df = df[~(df["MZErrorPPM"] > 20.0)]

# Keep only the columns needed for matching and retain one row per compound name.
GNPS = (
    df.filter(["Compound_Name", "RT_Query", "Precursor_MZ"])
    .rename(columns={"RT_Query": "RetentionTime"})
    .drop_duplicates(subset="Compound_Name", keep="first")
)
GNPS

In [None]:
# Read the tab-separated feature matrix from the results directory.
feature_matrix_path = os.path.join("results", "features", "FeatureMatrix.tsv")
FeatureMatrix = pd.read_csv(feature_matrix_path, sep="\t")
FeatureMatrix

Annotate the features with the GNPS identifications by matching mz and RT (mz tolerance: 10 ppm; RT tolerance: 20 seconds; both values are instrument- and method-dependent):

In [None]:
# Matching windows (instrument- and method-dependent).
GNPS_MZ_TOLERANCE_PPM = 10.0
GNPS_RT_TOLERANCE = 20.0  # same unit as the "RT" / "RetentionTime" columns (presumably seconds)

# For every feature, collect the names of all GNPS hits that fall inside both
# the RT window and the ppm window. Results are gathered in a list and written
# as one column afterwards; assigning cell by cell via FeatureMatrix[col][i]
# is chained indexing and is not guaranteed to modify the frame.
gnps_ids = []
for mz, rt in zip(FeatureMatrix["mz"], FeatureMatrix["RT"]):
    hits = []
    for name, GNPS_mz, GNPS_rt in zip(GNPS["Compound_Name"], GNPS["Precursor_MZ"], GNPS["RetentionTime"]):
        # ppm error relative to the library precursor; NaN never satisfies the
        # comparison below, so a zero precursor m/z cannot produce a match.
        mass_delta = (abs(GNPS_mz - mz) / GNPS_mz) * 1000000.0 if GNPS_mz != 0 else np.nan
        in_rt_window = rt - GNPS_RT_TOLERANCE <= GNPS_rt <= rt + GNPS_RT_TOLERANCE
        if in_rt_window and mass_delta <= GNPS_MZ_TOLERANCE_PPM:
            hit = f"{name}"
            if hit not in hits:
                hits.append(hit)
    gnps_ids.append(" ## ".join(hits))

# Drop a column left over from a previous run so the cell can be re-executed,
# then place the annotations in the first column.
if "GNPS_IDs" in FeatureMatrix.columns:
    FeatureMatrix = FeatureMatrix.drop(columns="GNPS_IDs")
FeatureMatrix.insert(0, "GNPS_IDs", gnps_ids)

# Make sure the output directory exists before writing the annotated matrix.
os.makedirs(os.path.join("results", "annotations"), exist_ok=True)
FeatureMatrix.to_csv(os.path.join("results", "annotations", "GNPS_annotated_feature_matrix.tsv"), sep="\t", index=False)
FeatureMatrix

Keep the unannotated features only

In [None]:
# Select the features that received no GNPS identification, remove the (now
# empty) annotation column and index the remainder by retention time and m/z.
has_no_gnps_hit = FeatureMatrix["GNPS_IDs"] == ""
FeatureMatrix = (
    FeatureMatrix.loc[has_no_gnps_hit]
    .drop(columns="GNPS_IDs")
    .set_index(["RT", "mz"])
)

# Write the table with RT and mz restored as ordinary columns.
unknowns_path = os.path.join("results", "annotations", "FeatureMatrix_unknowns.tsv")
FeatureMatrix_tocsv = FeatureMatrix.reset_index()
FeatureMatrix_tocsv.to_csv(unknowns_path, sep="\t", index=None)
FeatureMatrix

#### `2) SIRIUS and CSI:FingerID annotations`

Create a matrix with all SIRIUS and CSI:FingerID formula and structural predictions, only choose #1 rankings predictions and combine the dataframes to annotate formula and structural predictions according to RT and mz:

In [1]:
# Locate the per-file SIRIUS (formulas_*.csv) and CSI:FingerID (structures_*.csv)
# exports. glob returns paths in arbitrary order, and the de-duplication steps
# further down keep the *first* occurrence of a repeated entry, so the lists are
# sorted to make the result independent of the file-system listing order.
input_formulas = sorted(glob.glob(os.path.join("results", "formulas", "formulas_*.csv")))
input_structures = sorted(glob.glob(os.path.join("results", "formulas", "structures_*.csv")))

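`df_formulas` will likely contain duplicate formulas, which may be isomeric compounds, isobaric compounds, or identical compounds (with matching RT and mz). Here, we want to collapse only the identical, repeated compounds (repeats of a formula whose mz is within 20 ppm and whose RT is within 30 seconds of its first occurrence):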

In [None]:
# --- Load the rank-1 formula prediction of every SIRIUS export -------------
list_of_df = []
for csv_path in input_formulas:
    df = pd.read_csv(csv_path, sep=",", index_col="Unnamed: 0")
    # Compare the rank numerically (the converted values are actually used here).
    rank = pd.to_numeric(df["opt_global_rank"])
    df = df.loc[rank == 1]
    # Remove every score column and every optional ("opt") column in one pass,
    # so a column matching both patterns cannot trigger a KeyError.
    df = df.drop(columns=df.filter(regex=r"Score|opt").columns)
    list_of_df.append(df.reset_index(drop=True))

if not list_of_df:
    raise FileNotFoundError(
        "input_formulas is empty: no SIRIUS formula tables (formulas_*.csv) were found"
    )

DF_SIRIUS = pd.concat(list_of_df, ignore_index=True)
df_formulas = DF_SIRIUS.rename(
    columns={"chemical_formula": "formulas", "exp_mass_to_charge": "mz", "retention_time": "RT"}
).set_index("formulas")

# --- Split formulas that occur once from formulas that are repeated --------
is_repeated = df_formulas.index.duplicated(keep=False)
df_singletons = df_formulas.loc[~is_repeated]
df_sirius = df_formulas.loc[is_repeated]

# --- Collapse repeats that describe the same compound -----------------------
# The first occurrence of each formula is always kept. A later occurrence is
# kept only when it differs from that first occurrence by more than 30 (RT
# units) or by more than 20 ppm; otherwise it is treated as the same compound.
# Row positions are collected and selected once at the end (DataFrame.append
# no longer exists in pandas >= 2.0 and grew the frame quadratically).
first_seen = {}      # formula -> (mz, RT) of its first occurrence
keep_first = []      # positions of first occurrences
keep_distinct = []   # positions of later occurrences that are distinct compounds
for pos, (formula, mz_0, time_0) in enumerate(
    zip(df_sirius.index, df_sirius["mz"], df_sirius["RT"])
):
    if formula not in first_seen:
        first_seen[formula] = (mz_0, time_0)
        keep_first.append(pos)
        continue
    mz_1, time_1 = first_seen[formula]
    # A zero m/z cannot be expressed in ppm; treat it as "not the same compound".
    mass_delta = (abs(mz_0 - mz_1) / mz_0) * 1000000 if mz_0 != 0 else np.inf
    same_compound = (time_0 - 30.0 <= time_1 <= time_0 + 30.0) and (mass_delta <= 20.0)
    if not same_compound:
        keep_distinct.append(pos)

# Same row order as before: first occurrences, distinct repeats, then singletons.
DF_SIRIUS = df_sirius.iloc[keep_first + keep_distinct]
DF_SIRIUS_final = pd.concat([DF_SIRIUS, df_singletons], axis=0).reset_index()

# NOTE: the file is tab-separated despite its .csv extension (name kept as is).
os.makedirs(os.path.join("results", "annotations"), exist_ok=True)
DF_SIRIUS_final.to_csv(os.path.join("results", "annotations", "SIRIUS_library.csv"), sep="\t", index=False)
DF_SIRIUS_final

Repeat for the structural predictions (remove duplicates that share the same inchi_key, which means they represent the same structure):

In [None]:
# --- Load the rank-1 structure prediction of every CSI:FingerID export ------
structure_tables = []
for structures_path in input_structures:
    df = pd.read_csv(structures_path, index_col="Unnamed: 0")
    rank = pd.to_numeric(df["opt_global_rank"])
    df = df.loc[rank == 1]
    # Remove the search-engine score columns and every optional ("opt") column.
    df = df.drop(columns=df.filter(regex=r"best_search_engine_score|opt").columns)
    df = df.drop(columns="identifier")
    structure_tables.append(df.reset_index(drop=True))

if not structure_tables:
    raise FileNotFoundError(
        "input_structures is empty: no CSI:FingerID structure tables (structures_*.csv) were found"
    )

# A stable sort keeps the input order among rows with the same formula, so
# "keep first" below refers to the first row encountered in the input files.
df_structures = pd.concat(structure_tables, axis=0).sort_values("chemical_formula", kind="mergesort")
# Identical inchi_key means identical structure: keep one row per key.
df_structures = df_structures.drop_duplicates(subset=["inchi_key"], keep="first")
df_structures = df_structures.drop(columns=["inchi_key"])  # smiles is kept for visualisation
# Only the formula column is renamed: the annotation step reads
# "exp_mass_to_charge" and "retention_time" under their original names.
df_structures = df_structures.rename(columns={"chemical_formula": "formulas"}).set_index("formulas")

# --- Split formulas that occur once from formulas that are repeated --------
is_repeated = df_structures.index.duplicated(keep=False)
df_singletons = df_structures.loc[~is_repeated]
df_CSI = df_structures.loc[is_repeated]

# --- Collapse repeats that describe the same compound -----------------------
# The first occurrence of each formula is always kept. A later occurrence is
# kept only when it differs from that first occurrence by more than 30 (RT
# units) or by more than 20 ppm. Row positions are collected and selected once
# (DataFrame.append no longer exists in pandas >= 2.0).
first_seen = {}      # formula -> (mz, RT) of its first occurrence
keep_first = []      # positions of first occurrences
keep_distinct = []   # positions of later occurrences that are distinct compounds
for pos, (formula, mz_0, time_0) in enumerate(
    zip(df_CSI.index, df_CSI["exp_mass_to_charge"], df_CSI["retention_time"])
):
    if formula not in first_seen:
        first_seen[formula] = (mz_0, time_0)
        keep_first.append(pos)
        continue
    mz_1, time_1 = first_seen[formula]
    # A zero m/z cannot be expressed in ppm; treat it as "not the same compound".
    mass_delta = (abs(mz_0 - mz_1) / mz_0) * 1000000 if mz_0 != 0 else np.inf
    same_compound = (time_0 - 30.0 <= time_1 <= time_0 + 30.0) and (mass_delta <= 20.0)
    if not same_compound:
        keep_distinct.append(pos)

# Same row order as before: first occurrences, distinct repeats, then singletons.
DF_CSI = df_CSI.iloc[keep_first + keep_distinct]
DF_CSI_final = pd.concat([DF_CSI, df_singletons], axis=0).reset_index()

# NOTE: the file is tab-separated despite its .csv extension (name kept as is).
os.makedirs(os.path.join("results", "annotations"), exist_ok=True)
DF_CSI_final.to_csv(os.path.join("results", "annotations", "CSI_library.csv"), sep="\t", index=False)
DF_CSI_final

Annotate the formulas and structural predictions to the feature matrix according to SIRIUS and CSI (you can also only choose the feature matrix with unknown features generated above):

In [None]:
# Reload the complete (tab-separated) feature matrix for SIRIUS/CSI annotation.
features_path = os.path.join("results", "features", "FeatureMatrix.tsv")
DF_features = pd.read_csv(features_path, sep="\t")
DF_features

In [None]:
# Matching windows for the CSI:FingerID predictions.
CSI_MZ_TOLERANCE_PPM = 20.0
CSI_RT_TOLERANCE = 30.0  # same unit as the "RT" / "retention_time" columns

# For every feature, collect name, formula and SMILES of each structure
# prediction inside both windows. Hits are de-duplicated by name; the formula
# and SMILES lists stay aligned with the name list.
csi_names = []
csi_formulas = []
csi_smiles = []
for mz, rt in zip(DF_features["mz"], DF_features["RT"]):
    hits1 = []
    hits2 = []
    hits3 = []
    for name, smiles, formula, Pred_mz, Pred_rt in zip(
        DF_CSI_final["description"],
        DF_CSI_final["smiles"],
        DF_CSI_final["formulas"],
        DF_CSI_final["exp_mass_to_charge"],
        DF_CSI_final["retention_time"],
    ):
        # NaN (not 0) for a zero predicted m/z: a 0 ppm fallback would make such
        # a prediction match every feature inside the RT window.
        mass_delta = (abs(Pred_mz - mz) / Pred_mz) * 1000000.0 if Pred_mz != 0 else np.nan
        in_rt_window = rt - CSI_RT_TOLERANCE <= Pred_rt <= rt + CSI_RT_TOLERANCE
        if in_rt_window and mass_delta <= CSI_MZ_TOLERANCE_PPM:
            hit1 = f"{name}"
            if hit1 not in hits1:
                hits1.append(hit1)
                hits2.append(f"{formula}")
                hits3.append(f"{smiles}")
    csi_names.append(" ## ".join(hits1))
    csi_formulas.append(" ## ".join(hits2))
    csi_smiles.append(" ## ".join(hits3))

# Write whole columns at once (cell-wise DF_features[col][i] = ... is chained
# indexing and is not guaranteed to modify the frame). Columns left over from a
# previous run are removed first so the cell can be re-executed. Each column is
# inserted at position 0, so the final order is smiles, formula, name.
csi_columns = {
    "CSI_predictions_name": csi_names,
    "CSI_predictions_formula": csi_formulas,
    "CSI_predictions_smiles": csi_smiles,
}
DF_features = DF_features.drop(columns=[col for col in csi_columns if col in DF_features.columns])
for column, values in csi_columns.items():
    DF_features.insert(0, column, values)
DF_features

In [None]:
# Matching windows for the SIRIUS formula predictions.
SIRIUS_MZ_TOLERANCE_PPM = 20.0
SIRIUS_RT_TOLERANCE = 30.0  # same unit as the "RT" columns

# For every feature, collect the distinct predicted formulas that fall inside
# both the RT window and the ppm window.
sirius_predictions = []
for mz, rt in zip(DF_features["mz"], DF_features["RT"]):
    hits = []
    for name, Pred_mz, Pred_rt in zip(DF_SIRIUS_final["formulas"], DF_SIRIUS_final["mz"], DF_SIRIUS_final["RT"]):
        # NaN (not 0) for a zero predicted m/z: a 0 ppm fallback would make such
        # a prediction match every feature inside the RT window.
        mass_delta = (abs(Pred_mz - mz) / Pred_mz) * 1000000.0 if Pred_mz != 0 else np.nan
        in_rt_window = rt - SIRIUS_RT_TOLERANCE <= Pred_rt <= rt + SIRIUS_RT_TOLERANCE
        if in_rt_window and mass_delta <= SIRIUS_MZ_TOLERANCE_PPM:
            hit = f"{name}"
            if hit not in hits:
                hits.append(hit)
    sirius_predictions.append(" ## ".join(hits))

# Write the whole column at once (cell-wise DF_features[col][i] = ... is chained
# indexing and is not guaranteed to modify the frame); drop a column left over
# from a previous run so the cell can be re-executed.
if "SIRIUS_predictions" in DF_features.columns:
    DF_features = DF_features.drop(columns="SIRIUS_predictions")
DF_features.insert(0, "SIRIUS_predictions", sirius_predictions)
DF_features

In [None]:
# Export the annotated feature matrix (tab-separated, without the row index).
annotated_matrix_path = os.path.join("results", "annotations", "FeatureMatrix_SIRIUS_CSI.csv")
DF_features.to_csv(annotated_matrix_path, sep="\t", index=None)