In [1]:
import numpy as np
import pandas as pd
import glob
import os
from pyteomics import mgf, auxiliary
from pyopenms import *
import pyteomics
from pyteomics import mztab

Determination of memory status is not supported on this 
 platform, measuring for memoryleaks will never fail


In [None]:
# Folder for intermediate annotation files (the spectral-matching mzTab is
# written here later in the notebook).
# os.makedirs creates any missing parent folders as well, and exist_ok=True
# makes the call a no-op when the folder is already there, so the cell is safe
# to re-run and also works on a fresh checkout.
path = os.path.join("results", "interim", "annotations")
os.makedirs(path, exist_ok=True)

First, import the feature table:

In [None]:
# Load the tab-separated feature matrix, drop the "charge" and "quality"
# columns and replace missing values with 0.
FeatureMatrix = os.path.join("results", "features", "FeatureMatrix.tsv")
DF_features = pd.read_csv(FeatureMatrix, sep="\t")
DF_features = DF_features.drop(columns=["charge", "quality"])
DF_features = DF_features.fillna(0)

# Turn the "feature_ids" text into one list of IDs per row:
#   * the square brackets are removed literally (regex=False, because a lone
#     "[" is not a valid regular expression and the default of `regex`
#     differs between pandas versions);
#   * the remainder is split on commas with the vectorised .str accessor.
# Doing this on the whole column avoids the row-wise chained assignment
# `DF_features["feature_ids"][i] = ...`, which triggers SettingWithCopyWarning
# and has no effect once pandas copy-on-write is enabled.
# NOTE(review): the pieces are not stripped, so any whitespace or quotes that
# follow the commas in the file stay part of the IDs - confirm against the
# actual file format.
DF_features["feature_ids"] = (
    DF_features["feature_ids"]
    .str.replace("[", "", regex=False)
    .str.replace("]", "", regex=False)
    .str.split(",")
)
DF_features

#### `1) SIRIUS and CSI:FingerID annotations`

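Create one table with all SIRIUS formula predictions and one with all CSI:FingerID structural predictions, keeping only the rank #1 prediction in each case. These tables are then used to annotate the feature matrix with the predicted formulas and structures (predictions are matched to features through their feature IDs; RT and mz are kept in the tables for reference):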

The formula table (`DF_SIRIUS`) will likely contain duplicate formulas that could be isomeric compounds, isobaric compounds, or identical compounds (with identical RT and mz). Here, we want to collapse the identical, repeating compounds.

In [None]:
# Build DF_SIRIUS: the rank-1 SIRIUS formula prediction rows of every
# "formulas_*.tsv" table, stacked into one dataframe.
formulas_pattern = os.path.join("results", "SiriusCSI", "formulas_*.tsv")
# glob returns files in arbitrary order; sorting keeps the row order (and so
# the order of the joined annotations later on) reproducible between runs.
input_formulas = sorted(glob.glob(formulas_pattern))
if not input_formulas:
    # pd.concat([]) would otherwise fail with "No objects to concatenate".
    raise FileNotFoundError(f"No SIRIUS formula tables found for pattern: {formulas_pattern}")

list_of_df = []
for formula_table in input_formulas:
    table = pd.read_csv(formula_table, sep="\t", index_col="Unnamed: 0")
    # Store the numeric conversion back into the column: the original call
    # discarded its result, so ranks read as text would never equal 1.
    table["opt_global_rank"] = pd.to_numeric(table["opt_global_rank"])
    table = table.loc[table["opt_global_rank"] == 1]
    # Rename first so the feature ID survives the removal of "opt" columns.
    table = table.rename(columns={"opt_global_featureId": "featureId"})
    table = table.drop(columns=table.filter(regex="Score").columns)
    table = table.drop(columns=table.filter(regex="opt").columns)
    list_of_df.append(table)

# ignore_index=True discards the per-file row labels, so no reset_index /
# drop("index") round-trip is needed.
DF_SIRIUS = pd.concat(list_of_df, ignore_index=True)
DF_SIRIUS = DF_SIRIUS.rename(
    columns={"chemical_formula": "formulas", "exp_mass_to_charge": "mz", "retention_time": "RT"}
)
# Remove the literal "id_" text from the feature IDs.
DF_SIRIUS["featureId"] = DF_SIRIUS["featureId"].str.replace("id_", "", regex=False)
DF_SIRIUS

Repeat for the structural predictions (duplicates with the same InChIKey represent the same structure and should be removed):

In [None]:
# Build DF_CSI: the rank-1 CSI:FingerID structure prediction rows of every
# "structures_*.tsv" table, stacked into one dataframe.
structures_pattern = os.path.join("results", "SiriusCSI", "structures_*.tsv")
# glob returns files in arbitrary order; sorting keeps the row order (and so
# the order of the joined annotations later on) reproducible between runs.
input_structures = sorted(glob.glob(structures_pattern))
if not input_structures:
    # pd.concat([]) would otherwise fail with "No objects to concatenate".
    raise FileNotFoundError(f"No CSI:FingerID structure tables found for pattern: {structures_pattern}")

list_of_df = []
for structure_table in input_structures:
    table = pd.read_csv(structure_table, sep="\t", index_col="Unnamed: 0")
    # Store the numeric conversion back into the column: the original call
    # discarded its result, so ranks read as text would never equal 1.
    table["opt_global_rank"] = pd.to_numeric(table["opt_global_rank"])
    table = table.loc[table["opt_global_rank"] == 1]
    # Rename first so the feature ID survives the removal of "opt" columns.
    table = table.rename(columns={"opt_global_featureId": "featureId"})
    table = table.drop(columns=table.filter(regex="Score").columns)
    table = table.drop(columns=table.filter(regex="opt").columns)
    list_of_df.append(table)

# ignore_index=True discards the per-file row labels, so no reset_index /
# drop("index") round-trip is needed.
DF_CSI = pd.concat(list_of_df, ignore_index=True)
DF_CSI = DF_CSI.rename(
    columns={
        "chemical_formula": "formulas",
        "exp_mass_to_charge": "mz",
        "retention_time": "RT",
        "description": "name",
    }
)
# Remove the literal "id_" text from the feature IDs.
DF_CSI["featureId"] = DF_CSI["featureId"].str.replace("id_", "", regex=False)
DF_CSI

Annotate the formulas and structural predictions to the feature matrix according to SIRIUS and CSI:

In [None]:
# Annotate each feature with the SIRIUS formulas predicted for any of the
# IDs listed in its "feature_ids" entry.

# Guarded insert: DataFrame.insert raises if the column already exists, which
# would make this cell fail when it is run a second time.
if "SIRIUS_predictions" not in DF_features.columns:
    DF_features.insert(0, "SIRIUS_predictions", "")

sirius_rows = list(zip(DF_SIRIUS["formulas"], DF_SIRIUS["featureId"]))
sirius_predictions = []
for feature_ids in DF_features["feature_ids"]:
    # dict.fromkeys removes repeated formulas while keeping first-seen order.
    unique_formulas = dict.fromkeys(
        f"{formula}" for formula, pred_id in sirius_rows if pred_id in feature_ids
    )
    sirius_predictions.append(" ## ".join(unique_formulas))

# Assign the whole column in one step instead of the chained
# `DF_features[col][i] = ...`, which raises SettingWithCopyWarning and is not
# guaranteed to write into the dataframe.
DF_features["SIRIUS_predictions"] = sirius_predictions
DF_features

In [None]:
# Annotate each feature with the CSI:FingerID structures (name, formula,
# SMILES) predicted for any of its feature IDs, keeping only structures whose
# formula is one of the SIRIUS formulas already annotated for that feature.

# Guarded inserts keep the cell re-runnable (DataFrame.insert raises on an
# existing column); the insertion order matches the original column layout.
for column in ("CSI_predictions_name", "CSI_predictions_formula", "CSI_predictions_smiles"):
    if column not in DF_features.columns:
        DF_features.insert(0, column, "")

csi_rows = list(zip(DF_CSI["name"], DF_CSI["formulas"], DF_CSI["smiles"], DF_CSI["featureId"]))
csi_names, csi_formulas, csi_smiles = [], [], []
for feature_ids, sirius in zip(DF_features["feature_ids"], DF_features["SIRIUS_predictions"]):
    # Compare against the individual SIRIUS formulas. A plain `formula in sirius`
    # is a substring test on the joined text, so e.g. "C6H12O" would wrongly
    # match a feature annotated with "C6H12O6".
    sirius_formulas = sirius.split(" ## ")
    names, formulas, smiles_hits = [], [], []
    for name, formula, smiles, pred_id in csi_rows:
        if pred_id in feature_ids and formula in sirius_formulas:
            hit_name = f"{name}"
            # De-duplicate on the structure name; the three lists stay aligned.
            if hit_name not in names:
                names.append(hit_name)
                formulas.append(f"{formula}")
                smiles_hits.append(f"{smiles}")
    csi_names.append(" ## ".join(names))
    csi_formulas.append(" ## ".join(formulas))
    csi_smiles.append(" ## ".join(smiles_hits))

# Whole-column assignment replaces the chained `DF_features[col][i] = ...`,
# which raises SettingWithCopyWarning and is not guaranteed to take effect.
DF_features["CSI_predictions_name"] = csi_names
DF_features["CSI_predictions_formula"] = csi_formulas
DF_features["CSI_predictions_smiles"] = csi_smiles
DF_features

In [None]:
# Save the SIRIUS/CSI-annotated feature matrix as a tab-separated file.
annotated_matrix_path = os.path.join("results", "annotations", "SiriusCSI_annotated_FeatureMatrix.tsv")
# The setup cell of this notebook only creates "results/interim/annotations",
# so make sure the target folder exists before writing.
os.makedirs(os.path.dirname(annotated_matrix_path), exist_ok=True)
# index=False (rather than None) states the intent explicitly and matches the
# final export at the end of the notebook.
DF_features.to_csv(annotated_matrix_path, sep="\t", index=False)

#### `2) Spectral matching`

This step matches spectra with an MGF library and annotates the feature matrix:

In [None]:
# Reload the feature matrix that already carries the SIRIUS and CSI:FingerID
# predictions, so this section can start from the saved file.
sirius_csi_matrix_path = os.path.join(
    "results", "annotations", "SiriusCSI_annotated_FeatureMatrix.tsv"
)
DF_features = pd.read_csv(sirius_csi_matrix_path, sep="\t")
DF_features

Load the MGF file into an MSExperiment object:

In [None]:
# Read the exported MS/MS spectra (MGF) into a fresh MSExperiment container.
exp = MSExperiment()
mgf_file = os.path.join("results", "GNPSexport", "MSMS.mgf")
mgf_loader = MascotGenericFile()
mgf_loader.load(mgf_file, exp)

Perform spectral matching with a library in MGF format that is located under "resources":

In [None]:
# Load the spectral library (MGF) into its own MSExperiment.
database = os.path.join("resources", "GNPS-LIBRARY.mgf")
speclib = MSExperiment()
MascotGenericFile().load(database, speclib)

# Configure the matcher: start from its defaults and set 'merge_spectra' to 'false'.
MSMS_match = MetaboliteSpectralMatching()
MSMS_match_par = MSMS_match.getDefaults()
MSMS_match_par.setValue('merge_spectra', 'false')
MSMS_match.setParameters(MSMS_match_par)

# The result container is deliberately not called `mztab`: that name is bound
# to the pyteomics module by `from pyteomics import mztab` in the import cell,
# and rebinding it here would hide the module for the rest of the notebook.
spectral_match_mztab = MzTab()
out_merged = ""  # empty string is handed to run() as its last argument
MSMS_match.run(exp, speclib, spectral_match_mztab, String(out_merged))

# Store the matches as mzTab; create the target folder first so the write does
# not depend on the setup cell having succeeded.
output_mztab = os.path.join("results", "interim", "annotations", "MSMS.mzTab")
os.makedirs(os.path.dirname(output_mztab), exist_ok=True)
MzTabFile().store(output_mztab, spectral_match_mztab)

Read the mzTab file and clean it up into a dataframe:

In [None]:
# Thresholds used to keep a spectral match.
MAX_ABS_PPM_ERROR = 10   # keep matches with -10 <= opt_ppm_error <= 10
MIN_MATCH_SCORE = 60     # keep matches with opt_match_score >= 60

# Parse the mzTab written by the spectral-matching step; table_format="df"
# makes pyteomics expose the tables as pandas dataframes.
spectralmatch = pyteomics.mztab.MzTab(output_mztab, encoding="UTF8", table_format="df")
small_molecules = spectralmatch.small_molecule_table

# Drop the columns that are not used for the annotation.
spectralmatch_DF = small_molecules.drop(
    columns=[
        "identifier", "inchi_key", "modifications", "calc_mass_to_charge",
        "opt_adduct_ion", "taxid", "species", "database", "spectra_ref",
        "search_engine", "opt_sec_id",
        "smallmolecule_abundance_std_error_study_variable[1]",
        "smallmolecule_abundance_stdev_study_variable[1]",
        "smallmolecule_abundance_study_variable[1]",
        "chemical_formula",
    ]
)

# Apply all filters in one step. Series.between is inclusive on both ends,
# which is the same as the separate `<= 10` and `>= -10` filters.
keep = (
    spectralmatch_DF["opt_ppm_error"].between(-MAX_ABS_PPM_ERROR, MAX_ABS_PPM_ERROR)
    & (spectralmatch_DF["opt_match_score"] >= MIN_MATCH_SCORE)
)
# .copy() makes the filtered frame independent, so the column assignment below
# does not raise SettingWithCopyWarning.
spectralmatch_DF = spectralmatch_DF.loc[keep].copy()

# Remove the literal "index=" text from the native spectrum IDs.
spectralmatch_DF["opt_spec_native_id"] = (
    spectralmatch_DF["opt_spec_native_id"].str.replace("index=", "", regex=False)
)
spectralmatch_DF

Annotate the feature matrix with the spectral matches:

In [None]:
def _group_unique(keys, values):
    """Map each key to its values as ordered, de-duplicated strings.

    Returns a dict ``{key: {str(value): None, ...}}``; the inner dict is used
    as an insertion-ordered set. Missing (NaN/None) keys are skipped, so they
    can never be looked up by accident.
    """
    grouped = {}
    for key, value in zip(keys, values):
        if pd.isna(key):
            continue
        grouped.setdefault(key, {})[f"{value}"] = None
    return grouped


def _scan_tokens(joined_scans):
    """Split a ' ## '-joined scan string into a set of non-empty scan numbers."""
    return {token for token in joined_scans.split(" ## ") if token}


# --- 1) Spectrum headers of the exported MGF (scan number <-> feature id) ----
path = os.path.join("results", "GNPSexport", "MSMS.mgf")
# The context manager closes the MGF file once all headers have been read.
with mgf.MGF(source=path, use_header=True, convert_arrays=2, read_charges=True,
             read_ions=False, dtype=None, encoding=None) as reader:
    parameters = [spectrum["params"] for spectrum in reader]
mgf_file = pd.DataFrame(parameters)
mgf_file["feature_id"] = mgf_file["feature_id"].str.replace("e_", "", regex=False)

# --- 2) Native spectrum id <-> scan number, taken from `exp` ------------------
df = exp.get_df()  # spectra of the MGF file that was loaded into `exp`
# Build both columns once; the original rebuilt the full columns inside a loop
# over every spectrum, which is quadratic in the number of spectra.
df["index"] = [spec.getNativeID() for spec in exp]
df["SCANS"] = [spec.getMetaValue("Scan_ID") for spec in exp]
df["index"] = df["index"].str.replace("index=", "", regex=False)

# --- 3) Scan number(s) for every spectral match -------------------------------
scans_by_native_id = _group_unique(df["index"], df["SCANS"])
# Work on an independent copy so adding a column never hits a filtered view.
spectralmatch_DF = spectralmatch_DF.copy()
if "SCANS" not in spectralmatch_DF.columns:  # guard keeps the cell re-runnable
    spectralmatch_DF.insert(0, "SCANS", "")
spectralmatch_DF["SCANS"] = [
    " ## ".join(scans_by_native_id.get(native_id, ()))
    for native_id in spectralmatch_DF["opt_spec_native_id"]
]

# --- 4) Scan number(s) for every feature --------------------------------------
DF_features["id"] = DF_features["id"].astype(str)
scans_by_feature_id = _group_unique(mgf_file["feature_id"], mgf_file["scans"])
# Same insertion order as before, so the final column layout is unchanged.
for column in ("SCANS", "SpectralMatch", "SpectralMatch_smiles"):
    if column not in DF_features.columns:
        DF_features.insert(0, column, "")
# Whole-column assignments replace the chained `DF_features[col][i] = ...`,
# which raises SettingWithCopyWarning and is not guaranteed to take effect.
DF_features["SCANS"] = [
    " ## ".join(scans_by_feature_id.get(feature_id, ()))
    for feature_id in DF_features["id"]
]

# --- 5) Annotate features with the spectral matches that share a scan ---------
# A feature and a match are linked when they share at least one scan number.
# Comparing the joined strings for equality would (a) link every feature
# without MS2 (SCANS == "") to every match whose scan could not be resolved,
# and (b) miss features that carry more than one scan.
match_records = [
    (f"{name}", f"{smiles}", _scan_tokens(match_scans))
    for name, smiles, match_scans in zip(
        spectralmatch_DF["description"], spectralmatch_DF["smiles"], spectralmatch_DF["SCANS"]
    )
]
match_names, match_smiles = [], []
for feature_scans in DF_features["SCANS"]:
    feature_scan_set = _scan_tokens(feature_scans)
    names, smiles_hits = [], []
    for name, smiles, match_scan_set in match_records:
        # De-duplicate on the match name; the two lists stay aligned.
        if feature_scan_set & match_scan_set and name not in names:
            names.append(name)
            smiles_hits.append(smiles)
    match_names.append(" ## ".join(names))
    match_smiles.append(" ## ".join(smiles_hits))
DF_features["SpectralMatch"] = match_names
DF_features["SpectralMatch_smiles"] = match_smiles

# --- 6) Save the fully annotated feature matrix --------------------------------
final_matrix_path = os.path.join("results", "annotations", "SiriusCSI_MSMS_annotated_FeatureMatrix.tsv")
os.makedirs(os.path.dirname(final_matrix_path), exist_ok=True)
DF_features.to_csv(final_matrix_path, sep="\t", index=False)