In [None]:
import numpy as np
import pandas as pd
import glob
import os

In [None]:
# Output directory for interim analysis results.
# os.makedirs creates any missing parent directories ("results", "results/interim")
# as well, and exist_ok=True makes the cell safe to re-run (no exists/mkdir race).
path = os.path.join("results", "interim", "analysis")
os.makedirs(path, exist_ok=True)

First, import the feature table:

In [None]:
# Read the tab-separated feature table, index it by (mz, RT), drop the
# bookkeeping columns and replace missing values with 0.
FeatureMatrix = os.path.join("results", "features", "FeatureMatrix.tsv")
DF_features = (
    pd.read_csv(FeatureMatrix, sep="\t")
    .set_index(["mz", "RT"])
    .drop(columns=["charge", "quality", "id"])
    .fillna(0)
)
# "feature_ids" is read as one string per row: strip the first and last
# character (the enclosing brackets, presumably) and split the rest on commas.
DF_features["feature_ids"] = DF_features["feature_ids"].map(
    lambda raw_ids: raw_ids[1:-1].split(",")
)
DF_features

In [None]:
# Turn the index levels back into ordinary columns so that "mz" and "RT"
# can be read as columns by the annotation cells below.
DF_features = DF_features.reset_index(drop=False)
DF_features

#### `1) GNPS annotations`

This step requires an independent job at GNPS Spectral Library Search (input: raw mzML files)
See documentation: https://ccms-ucsd.github.io/GNPSDocumentation/librarysearch/ 

When the job is finished, import all identifications from GNPS, save the .TSV table under the resources directory and "clean up" the table:

In [None]:
# Load the GNPS library-search export (latin-1 encoded, tab-separated).
df = pd.read_csv(
    os.path.join("resources", "MS2_LIBRARYSEARCH_all_identifications.tsv"),
    sep="\t",
    encoding="latin-1",
)
# Discard negative-ion-mode hits and hits whose mass error exceeds 20 ppm.
# The masks are negated (rather than inverted to != / <=) so that rows with a
# missing value in either column are kept.
df = df[~(df["IonMode"] == "negative")]
df = df[~(df["MZErrorPPM"] > 20.0)]
# Keep name, retention time and precursor m/z; one row per compound name
# (the first occurrence wins).
GNPS = (
    df.filter(["Compound_Name", "RT_Query", "Precursor_MZ"])
    .rename(columns={"RT_Query": "RetentionTime"})
    .drop_duplicates(subset="Compound_Name", keep="first")
)
GNPS

Annotate the features detected by GNPS according to mz and RT (mz tolerance 10 ppm and RT tolerance 20 seconds: instrument and method-dependent)

In [None]:
# Output directory for all annotation tables.
# os.makedirs also creates a missing "results" parent, and exist_ok=True makes
# the cell safe to re-run (no exists/mkdir race).
path = os.path.join("results", "annotations")
os.makedirs(path, exist_ok=True)

In [None]:
# Annotate every feature with the GNPS compound names that match it within the
# m/z and RT tolerances. Tolerances are instrument- and method-dependent.
GNPS_MZ_TOLERANCE_PPM = 10.0
GNPS_RT_TOLERANCE_SEC = 20.0

gnps_names = [f"{name}" for name in GNPS["Compound_Name"]]
gnps_mz = GNPS["Precursor_MZ"].to_numpy(dtype=float)
gnps_rt = GNPS["RetentionTime"].to_numpy(dtype=float)

gnps_ids = []
for mz, rt in zip(DF_features["mz"], DF_features["RT"]):
    # Mass error in ppm relative to the library precursor m/z. A library m/z of
    # 0 yields inf/nan here, which can never satisfy the tolerance test.
    with np.errstate(divide="ignore", invalid="ignore"):
        mass_delta = (np.abs(gnps_mz - mz) / gnps_mz) * 1000000.0
    matched = (
        (gnps_rt >= rt - GNPS_RT_TOLERANCE_SEC)
        & (gnps_rt <= rt + GNPS_RT_TOLERANCE_SEC)
        & (mass_delta <= GNPS_MZ_TOLERANCE_PPM)
    )
    # dict.fromkeys de-duplicates while preserving the order of first occurrence.
    hits = dict.fromkeys(gnps_names[k] for k in np.flatnonzero(matched))
    gnps_ids.append(" ## ".join(hits))

# Assign the whole column at once: element-wise chained assignment
# (DF["col"][i] = ...) triggers SettingWithCopyWarning and is a silent no-op
# under pandas Copy-on-Write. Dropping a pre-existing column first keeps the
# cell re-runnable (DataFrame.insert raises on duplicate column names).
if "GNPS_IDs" in DF_features.columns:
    DF_features = DF_features.drop(columns="GNPS_IDs")
DF_features.insert(0, "GNPS_IDs", gnps_ids)

# The output path is spelled out so the cell does not depend on the current
# value of the re-used global `path`.
DF_features.to_csv(
    os.path.join("results", "annotations", "GNPS_annotated_feature_matrix.tsv"),
    sep="\t",
    index=False,
)
DF_features

Keep the unannotated features only

In [None]:
# Features that received no GNPS hit carry an empty "GNPS_IDs" string: select
# those rows, drop the (now empty) annotation column and index by (RT, mz).
FeatureMatrix = (
    DF_features.loc[DF_features["GNPS_IDs"] == ""]
    .drop(columns="GNPS_IDs")
    .set_index(["RT", "mz"])
)
# Write a flat copy (RT and mz as regular columns, no index column) to disk.
FeatureMatrix_tocsv = FeatureMatrix.reset_index()
FeatureMatrix_tocsv.to_csv(
    os.path.join("results", "annotations", "FeatureMatrix_unknowns.tsv"),
    sep="\t",
    index=None,
)
FeatureMatrix

#### `2) SIRIUS and CSI:FingerID annotations`

Create a matrix with all SIRIUS and CSI:FingerID formula and structural predictions. Keep only the rank #1 predictions, then combine the dataframes to annotate the features with formula and structural predictions according to RT and mz:

The formula table (`DF_SIRIUS`) will likely contain duplicate formulas, which may correspond to isomeric compounds, isobaric compounds, or identical compounds (with identical RT and mz). Here, we want to collapse the identical, repeating compounds.

In [None]:
# Collect the rank-1 SIRIUS formula predictions from every per-sample table.
# Files are sorted so the row order of the combined table is reproducible.
input_formulas = sorted(glob.glob(os.path.join("results", "Sirius", "formulas_*.tsv")))
if not input_formulas:
    # pd.concat([]) would otherwise fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(
        "No SIRIUS formula tables found: " + os.path.join("results", "Sirius", "formulas_*.tsv")
    )

list_of_df = []
for formula_file in input_formulas:
    df = pd.read_csv(formula_file, sep="\t", index_col="Unnamed: 0")
    # Use the numeric conversion for the filter (the original discarded its
    # result), so ranks read as text are still compared as numbers.
    rank = pd.to_numeric(df["opt_global_rank"])
    df = df.loc[rank == 1]
    # Rename before dropping the "opt*" columns so the feature ids survive.
    df = df.rename(columns={"opt_global_featureId": "featureId"})
    df = df.drop(columns=df.filter(regex="Score").columns)
    df = df.drop(columns=df.filter(regex="opt").columns)
    list_of_df.append(df)

# ignore_index=True discards the per-file row labels, so no helper "index"
# column has to be created and dropped again.
DF_SIRIUS = pd.concat(list_of_df, ignore_index=True)
DF_SIRIUS = DF_SIRIUS.rename(
    columns={"chemical_formula": "formulas", "exp_mass_to_charge": "mz", "retention_time": "RT"}
)
# Strip the "id_" prefix (literal match, independent of the pandas version's
# regex default) and split the comma-separated ids into a list per row. This is
# done column-wise because element-wise chained assignment is a silent no-op
# under pandas Copy-on-Write.
DF_SIRIUS["featureId"] = (
    DF_SIRIUS["featureId"].str.replace("id_", "", regex=False).str.split(",")
)
DF_SIRIUS.to_csv(os.path.join("results", "annotations", "SIRIUS_library.tsv"), sep="\t", index=False)
DF_SIRIUS

Repeat for structural predictions (remove duplicates with the same inchi_keys, which means they represent the same structure):

In [None]:
# Collect the rank-1 CSI:FingerID structure predictions from every per-sample
# table. Files are sorted so the row order of the combined table is reproducible.
input_structures = sorted(glob.glob(os.path.join("results", "Sirius", "structures_*.tsv")))
if not input_structures:
    # pd.concat([]) would otherwise fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(
        "No CSI:FingerID structure tables found: "
        + os.path.join("results", "Sirius", "structures_*.tsv")
    )

list_of_df = []
for structure_file in input_structures:
    df = pd.read_csv(structure_file, sep="\t", index_col="Unnamed: 0")
    # Use the numeric conversion for the filter (the original discarded its
    # result), so ranks read as text are still compared as numbers.
    rank = pd.to_numeric(df["opt_global_rank"])
    df = df.loc[rank == 1]
    # Rename before dropping the "opt*" columns so the feature ids survive.
    df = df.rename(columns={"opt_global_featureId": "featureId"})
    df = df.drop(columns=df.filter(regex="Score").columns)
    df = df.drop(columns=df.filter(regex="opt").columns)
    list_of_df.append(df)

# ignore_index=True discards the per-file row labels, so no helper "index"
# column has to be created and dropped again.
DF_CSI = pd.concat(list_of_df, ignore_index=True)
DF_CSI = DF_CSI.rename(
    columns={
        "chemical_formula": "formulas",
        "exp_mass_to_charge": "mz",
        "retention_time": "RT",
        "description": "name",
    }
)
# Strip the "id_" prefix (literal match, independent of the pandas version's
# regex default) and split the comma-separated ids into a list per row. This is
# done column-wise because element-wise chained assignment is a silent no-op
# under pandas Copy-on-Write.
DF_CSI["featureId"] = (
    DF_CSI["featureId"].str.replace("id_", "", regex=False).str.split(",")
)
DF_CSI.to_csv(os.path.join("results", "annotations", "CSI_library.tsv"), sep="\t", index=False)
DF_CSI

Annotate the formulas and structural predictions to the feature matrix according to SIRIUS and CSI:

In [None]:
# Annotate every feature with the SIRIUS formulas predicted for any of its
# feature ids.
#
# A prediction matches a feature when the two id collections share at least one
# id. The previous zip()-based comparison only compared ids at the same list
# position, so a shared id at a different position was missed. Ids are compared
# as whitespace-stripped strings because splitting "[a, b]" on "," leaves a
# leading space on every id but the first.
sirius_formulas = [f"{formula}" for formula in DF_SIRIUS["formulas"]]
sirius_id_sets = []
for pred_ids in DF_SIRIUS["featureId"]:
    if isinstance(pred_ids, str):
        # Tolerate a column that is still a comma-separated string.
        pred_ids = pred_ids.split(",")
    elif not isinstance(pred_ids, (list, tuple, set)):
        # Missing value: this prediction cannot match anything.
        pred_ids = []
    sirius_id_sets.append({str(fid).strip() for fid in pred_ids} - {""})

sirius_predictions = []
for feature_ids in DF_features["feature_ids"]:
    own_ids = {str(fid).strip() for fid in feature_ids} - {""}
    # dict.fromkeys de-duplicates while keeping the DF_SIRIUS row order.
    hits = dict.fromkeys(
        formula
        for formula, pred_ids in zip(sirius_formulas, sirius_id_sets)
        if not own_ids.isdisjoint(pred_ids)
    )
    sirius_predictions.append(" ## ".join(hits))

# Assign the whole column at once (element-wise chained assignment is a silent
# no-op under pandas Copy-on-Write); dropping a pre-existing column first keeps
# the cell re-runnable.
if "SIRIUS_predictions" in DF_features.columns:
    DF_features = DF_features.drop(columns="SIRIUS_predictions")
DF_features.insert(0, "SIRIUS_predictions", sirius_predictions)
DF_features

In [None]:
# Annotate every feature with the CSI:FingerID structures (name, formula,
# SMILES) predicted for any of its feature ids, restricted to structures whose
# formula is one of the feature's SIRIUS formulas.
#
# Fixes relative to the previous implementation:
#  * ids are matched by membership, not by list position (zip() missed shared
#    ids that sat at different positions), and compared whitespace-stripped;
#  * the formula must equal one of the " ## "-separated SIRIUS formulas; the
#    old substring test let e.g. "C6H12O" match "C6H12O6";
#  * columns are assigned in one go instead of element-wise chained assignment,
#    which is a silent no-op under pandas Copy-on-Write.
csi_rows = []
for name, formula, smiles, pred_ids in zip(
    DF_CSI["name"], DF_CSI["formulas"], DF_CSI["smiles"], DF_CSI["featureId"]
):
    if isinstance(pred_ids, str):
        # Tolerate a column that is still a comma-separated string.
        pred_ids = pred_ids.split(",")
    elif not isinstance(pred_ids, (list, tuple, set)):
        # Missing value: this prediction cannot match anything.
        pred_ids = []
    pred_id_set = {str(fid).strip() for fid in pred_ids} - {""}
    csi_rows.append((f"{name}", f"{formula}", f"{smiles}", pred_id_set))

csi_names, csi_formulas, csi_smiles = [], [], []
for feature_ids, sirius in zip(DF_features["feature_ids"], DF_features["SIRIUS_predictions"]):
    own_ids = {str(fid).strip() for fid in feature_ids} - {""}
    allowed_formulas = set(sirius.split(" ## ")) if isinstance(sirius, str) and sirius else set()
    # name -> (formula, smiles); the first occurrence of a name wins, which
    # keeps the three output columns aligned entry by entry.
    hits = {}
    for name, formula, smiles, pred_id_set in csi_rows:
        if name in hits or formula not in allowed_formulas:
            continue
        if not own_ids.isdisjoint(pred_id_set):
            hits[name] = (formula, smiles)
    csi_names.append(" ## ".join(hits))
    csi_formulas.append(" ## ".join(formula for formula, _ in hits.values()))
    csi_smiles.append(" ## ".join(smiles for _, smiles in hits.values()))

# Inserted at position 0 in this order, the final column order is
# smiles, formula, name (as before). Dropping existing columns first keeps the
# cell re-runnable.
csi_columns = {
    "CSI_predictions_name": csi_names,
    "CSI_predictions_formula": csi_formulas,
    "CSI_predictions_smiles": csi_smiles,
}
DF_features = DF_features.drop(
    columns=[column for column in csi_columns if column in DF_features.columns]
)
for column, values in csi_columns.items():
    DF_features.insert(0, column, values)
DF_features

In [None]:
# Export the fully annotated feature matrix without the row index.
# Note: the file is tab-separated even though its name ends in ".csv".
DF_features.to_csv(
    os.path.join("results", "annotations", "FeatureMatrix_SIRIUS_CSI.csv"),
    sep="\t",
    index=None,
)