In [None]:
import numpy as np
import pandas as pd
import glob
import os

First, import the feature table:

In [None]:
# Load the requantified feature matrix and index it by (mz, RT).
FeatureMatrix = os.path.join("results", "features", "FeatureMatrix_Requantified.tsv")
DF_features = pd.read_csv(FeatureMatrix, sep="\t")
DF_features = DF_features.set_index(["mz", "RT"])
DF_features = DF_features.drop(columns=["charge", "quality", "id"])
DF_features = DF_features.fillna(0)

# Strip the braces and quotes around the stored ids and split the string into a list.
# regex=True is spelled out because pandas >= 2.0 treats the pattern literally by default.
# The column is assigned in one go instead of cell-by-cell through chained indexing
# (DF_features["id_list"][i] = ...), which is unreliable and breaks on repeated (mz, RT) labels.
id_lists = DF_features["id_list"].str.replace(r"[{}']", "", regex=True).str.split(",")
# Entries without an id string (e.g. missing values replaced by 0 above) become an empty list
# so that later cells can iterate over every entry.
DF_features["id_list"] = id_lists.apply(lambda ids: ids if isinstance(ids, list) else [])
DF_features

#### `1) Filter the feature matrix (optional)`

In [None]:
# Folder that receives the intermediate, filtered feature tables.
path = os.path.join("results", "interim", "analysis")
isExist = os.path.exists(path)
# makedirs creates missing parent folders as well (os.mkdir raised FileNotFoundError when
# "results/interim" did not exist yet) and exist_ok makes the cell safe to re-run.
os.makedirs(path, exist_ok=True)

1) a) Remove all features detected in negative controls (make sure there is no cross-contamination)

In [None]:
media_list = [r"ISP2", r"FPY12", r"DNPM"]  # different media/conditions (treatments)
for medium in media_list:
    # Columns whose name matches this medium; rows are the features indexed by (mz, RT).
    Features_flt = DF_features.filter(regex=fr"{medium}")
    # Negative controls of this medium, with missing intensities counted as "not detected".
    blanks = Features_flt.filter(regex="blank", axis=1).fillna(0)
    blank_cols = blanks.columns
    # Fraction of the blanks in which each feature has a non-zero intensity
    # (NaN when the medium has no blank column, in which case nothing is removed).
    blank_fraction = (blanks != 0).mean(axis=1)
    # Remove features that appear in at least 50% of the negative controls.
    contaminated = (blank_fraction >= 0.5).to_numpy()
    print(f"{medium}: removing {int(contaminated.sum())} of {len(contaminated)} features found in the blanks")
    # Drop the contaminated features (rows) and then the blank samples (columns).
    # The earlier version dropped the blank sample names from the transposed frame, which
    # raises a KeyError, and it also discarded the feature-filtered table.
    Features_nb = Features_flt.loc[~contaminated].drop(columns=blank_cols)
    Features_nb = Features_nb.dropna(how="all")
    filename = os.path.join(path, "No_NC_" + medium + "_DF_features.csv")
    Features_nb.to_csv(filename, sep="\t")

1) b) Or if there are multiple replicates, remove only the features detected in more than 50% of all the negative controls (or blanks)

In [None]:
media_list = [r"ISP2", r"FPY12", r"DNPM"]  # different media/conditions (treatments)
for medium in media_list:
    # Columns whose name matches this medium; rows are the features indexed by (mz, RT).
    Features_flt = DF_features.filter(regex=fr"{medium}")
    # Negative controls of this medium, with missing intensities counted as "not detected".
    blanks = Features_flt.filter(regex="blank", axis=1).fillna(0)
    blank_cols = blanks.columns
    # Fraction of the blanks in which each feature has a non-zero intensity
    # (NaN when the medium has no blank column, in which case nothing is removed).
    blank_fraction = (blanks != 0).mean(axis=1)
    # Remove only the features detected in more than 50% of the negative controls.
    # NOTE(review): the earlier version computed a feature-filtered table and then overwrote it,
    # so no feature was ever removed here; the strict ">" follows the wording of the section
    # heading -- confirm this is the intended cutoff.
    in_most_blanks = (blank_fraction > 0.5).to_numpy()
    # Drop those features (rows) and then the blank samples themselves (columns).
    Features_nb = Features_flt.loc[~in_most_blanks].drop(columns=blank_cols)
    Features_nb = Features_nb.dropna(how="all")
    filename = os.path.join(path, "No_NC_" + medium + "_DF_features.csv")
    Features_nb.to_csv(filename, sep="\t")

2. Replace the features that have lower intensity than 10^4 with NaN (noise for Orbitrap instruments)

In [None]:
def noise_flt(csvfile, threshold=10000):
    """Remove intensities below a noise threshold and save the filtered table.

    Reads the tab-separated feature table ``csvfile`` (it must contain ``mz`` and
    ``RT`` columns), treats zeros and every intensity below ``threshold`` as missing,
    drops the features that have no intensity left, fills the remaining gaps with 0
    and writes the result next to the input file. The output name is ``noise_thr_``
    followed by the input file name without its first six characters (the length of
    the ``No_NC_`` prefix used by the blank-filtering step).

    Parameters
    ----------
    csvfile : str
        Path of the tab-separated input table.
    threshold : float, optional
        Intensities strictly below this value are treated as noise (default 10000).

    Returns
    -------
    str
        Path of the file that was written.
    """
    Features = pd.read_csv(csvfile, sep="\t")
    Features = Features.set_index(["mz", "RT"]).sort_index(axis=1)
    # Zeros mean "not detected"; make them missing so they cannot keep a feature alive.
    Features = Features.replace({0: np.nan})
    Features[Features < threshold] = np.nan
    # Features without any remaining intensity are dropped, remaining gaps go back to 0.
    Featuresnew = Features.dropna(how="all").fillna(0)
    DF = Featuresnew.reset_index()
    file_path = os.path.join(os.path.dirname(csvfile), 'noise_thr_' + os.path.basename(csvfile)[6:])
    # The default integer index is written on purpose: rep_filter reads it back
    # through index_col="Unnamed: 0".
    DF.to_csv(file_path, sep="\t")
    return file_path

In [None]:
# Apply the noise filter to every blank-filtered table ("No_NC_*.csv") found in `path`.
no_nc_pattern = os.path.join(path, "No_NC_*.csv")
csvfiles = glob.glob(no_nc_pattern)
for csvfile in csvfiles:
    noise_flt(csvfile)

3) Within each group of replicates, set a feature's intensities to NaN if the feature is detected in fewer than 2 of the replicates (e.g. in only 1 out of 3)

In [None]:
def rep_filter(csvfile):
    """Blank out features that are detected in fewer than two rows of a genome group.

    Reads the tab-separated table ``csvfile`` (its ``Unnamed: 0`` column is used as
    row index), indexes it by ``mz``/``RT`` and transposes it so that the former
    column labels become rows. A ``genomeID`` is extracted from each of those labels
    with the regular expressions below, the rows are grouped by it, and within every
    group each feature with fewer than two non-zero values is replaced by NaN.
    The result is transposed back and written, without index, into the same folder
    under the input file name minus its first ten characters (the length of the
    ``noise_thr_`` prefix written by ``noise_flt``). Nothing is returned.
    """
    Features= pd.read_csv(csvfile, sep="\t", index_col="Unnamed: 0")
    Features= Features.set_index(["mz", "RT"])
    Features= Features.sort_index(axis=1)
    cols= Features.columns  # not used further in this function
    Features= Features.fillna(0)
    # After the transpose every row is one former column (presumably one sample -- TODO confirm)
    Features= Features.transpose()
    Features= Features.reset_index()
    # Derive the group key from the row label: an "NBC..." id first, otherwise an "MDNA..." id
    Features['genomeID']=Features['index'].str.extract(r'(NBC_?\d*)')
    Features['genomeID_MDNA']=Features['index'].str.extract(r'(MDNAWGS?\d*|MDNA_WGS_?\d*)')
    Features['genomeID']=Features['genomeID'].fillna(Features['genomeID_MDNA'])
    Features= Features.drop(columns=["genomeID_MDNA"])
    Features=Features.set_index(["index"])
    # NOTE(review): labels matching neither pattern end up with a missing genomeID; how the
    # groupby below treats those rows is not visible here -- verify they are not lost silently.
    Grouped= Features.groupby("genomeID")
    # Per group and feature: fewer than two non-zero values -> NaN, otherwise values unchanged
    DF= Grouped.transform(lambda x: np.nan if np.count_nonzero(x)<2 else x)
    DF=DF.transpose()
    DF=DF.reset_index()
    # [10:] removes the first ten characters of the file name (len("noise_thr_") == 10)
    file_path = os.path.join(os.path.dirname(csvfile), os.path.basename(csvfile)[10:])
    DF.to_csv(file_path, sep="\t", index=None)

In [None]:
# Apply the replicate filter to every noise-filtered table ("noise_thr_*.csv") found in `path`.
noise_thr_pattern = os.path.join(path, "noise_thr_*.csv")
csvfiles = glob.glob(noise_thr_pattern)
for csvfile in csvfiles:
    rep_filter(csvfile)

- Merge all tables on mz and RT:

In [None]:
def _read_medium_matrix(file_name):
    """Load one per-medium table from `path`: indexed by (mz, RT), gaps as 0, columns sorted."""
    table = pd.read_csv(os.path.join(path, file_name), sep="\t")
    return table.set_index(["mz", "RT"]).fillna(0).sort_index(axis=1)


Matrix_FPY12 = _read_medium_matrix("FPY12_DF_features.csv")
Matrix_ISP2 = _read_medium_matrix("ISP2_DF_features.csv")
Matrix_DNPM = _read_medium_matrix("DNPM_DF_features.csv")

# Outer-join the three media on the feature coordinates so that no feature is lost.
merge_keys = ["mz", "RT"]
Matrix_ISP2_FPY12 = Matrix_FPY12.merge(Matrix_ISP2, on=merge_keys, how="outer")
Matrix = Matrix_ISP2_FPY12.merge(Matrix_DNPM, on=merge_keys, how="outer")

# Zeros become missing values again; features that are missing everywhere are removed.
cols = Matrix.columns
Matrix[cols] = Matrix[cols].replace({0: np.nan})
Matrix = Matrix.dropna(how="all").reset_index()
Matrix.to_csv(os.path.join(path, "Matrix_Clean.csv"), sep="\t", index=None)

#### `2) GNPS annotations`

This step requires an independent job at GNPS Spectral Library Search (input: raw mzML files)
See documentation: https://ccms-ucsd.github.io/GNPSDocumentation/librarysearch/ 

When the job is finished, import all identifications from GNPS, save the .TSV table under the resources directory and "clean up" the table:

In [None]:
# Load the GNPS library-search hits.
gnps_file = os.path.join("resources", "MS2_LIBRARYSEARCH_all_identifications.tsv")
df = pd.read_csv(gnps_file, sep="\t", encoding="latin-1")

# Discard negative-mode hits, then hits whose precursor m/z error exceeds 20 ppm.
is_negative = df["IonMode"] == "negative"
df = df[~is_negative]
above_ppm_limit = df["MZErrorPPM"] > 20.0
df = df[~above_ppm_limit]

# Keep name, retention time and precursor m/z; only the first hit per compound name is retained.
GNPS = (
    df.filter(["Compound_Name", "RT_Query", "Precursor_MZ"])
    .rename(columns={"RT_Query": "RetentionTime"})
    .drop_duplicates(subset="Compound_Name", keep="first")
)
GNPS

In [None]:
# Reload the merged, cleaned matrix. `path` must still point to the analysis folder here;
# it is reassigned further down in the notebook.
matrix_clean_file = os.path.join(path, "Matrix_Clean.csv")
FeatureMatrix = pd.read_csv(matrix_clean_file, sep="\t")
FeatureMatrix

Annotate the features detected by GNPS according to mz and RT (mz tolerance 10 ppm and RT tolerance 20 seconds: instrument and method-dependent)

In [None]:
# Folder that receives the annotation tables. Note that this reassigns `path`.
path = os.path.join("results", "annotations")
isExist = os.path.exists(path)
# makedirs creates a missing "results" parent as well (os.mkdir raised FileNotFoundError
# in that case) and exist_ok makes the cell safe to re-run.
os.makedirs(path, exist_ok=True)

In [None]:
# Matching tolerances (instrument and method dependent).
GNPS_PPM_TOLERANCE = 10.0  # maximum relative m/z deviation in ppm
GNPS_RT_TOLERANCE = 20.0   # maximum retention-time deviation, in the unit of the RT columns

# Materialise the library once instead of re-zipping three columns for every feature.
gnps_library = list(zip(GNPS["Compound_Name"], GNPS["Precursor_MZ"], GNPS["RetentionTime"]))

gnps_ids = []
for mz, rt in zip(FeatureMatrix["mz"], FeatureMatrix["RT"]):
    hits = []
    for name, GNPS_mz, GNPS_rt in gnps_library:
        if GNPS_mz == 0:
            continue  # the ppm error is undefined for a zero reference mass
        mass_delta = (abs(GNPS_mz - mz) / GNPS_mz) * 1000000.0
        within_rt = rt - GNPS_RT_TOLERANCE <= GNPS_rt <= rt + GNPS_RT_TOLERANCE
        if within_rt and mass_delta <= GNPS_PPM_TOLERANCE:
            hit = f"{name}"
            if hit not in hits:
                hits.append(hit)
    gnps_ids.append(" ## ".join(hits))

# The column is written in one step: cell-wise chained assignment
# (FeatureMatrix["GNPS_IDs"][i] = ...) is unreliable and is ignored under pandas copy-on-write.
# A column left over from an earlier run is dropped first, because insert() refuses duplicates.
if "GNPS_IDs" in FeatureMatrix.columns:
    FeatureMatrix = FeatureMatrix.drop(columns="GNPS_IDs")
FeatureMatrix.insert(0, "GNPS_IDs", gnps_ids)

FeatureMatrix.to_csv(os.path.join(path, "GNPS_annotated_feature_matrix.tsv"), sep="\t", index = False)
FeatureMatrix

Keep the unannotated features only

In [None]:
# Features without any GNPS hit carry an empty annotation string.
without_gnps_hit = FeatureMatrix["GNPS_IDs"] == ""
FeatureMatrix = (
    FeatureMatrix[without_gnps_hit]
    .drop(columns="GNPS_IDs")
    .set_index(["RT", "mz"])
)
# The exported table carries RT and mz as ordinary leading columns.
FeatureMatrix_tocsv = FeatureMatrix.reset_index()
unknowns_file = os.path.join("results", "annotations", "FeatureMatrix_unknowns.tsv")
FeatureMatrix_tocsv.to_csv(unknowns_file, sep="\t", index =None)
FeatureMatrix

#### `3) SIRIUS and CSI:FingerID annotations`

Create a matrix with all SIRIUS and CSI:FingerID formula and structural predictions. Keep only the top-ranked (rank #1) prediction for each feature, then combine the dataframes to annotate formula and structural predictions according to RT and mz:

In [None]:
# Collect the per-sample SIRIUS (formulas_*) and CSI:FingerID (structures_*) result tables.
sirius_dir = os.path.join("results", "Sirius")
input_formulas = glob.glob(os.path.join(sirius_dir, "formulas_*.csv"))
input_structures = glob.glob(os.path.join(sirius_dir, "structures_*.csv"))

df_formulas will likely contain duplicate formulas that could be either isomeric, isobaric compounds, or identical compounds (with identical RT and mz). Here, we want to collapse the identical, repeating compounds

In [None]:
# Read every SIRIUS formula table and keep only the top-ranked prediction of each feature.
list_of_df = []
for csv in input_formulas:
    df = pd.read_csv(csv, sep=",", index_col="Unnamed: 0")
    # Compare the rank numerically (the converted values used to be computed and thrown away).
    df = df.loc[pd.to_numeric(df["opt_global_rank"]) == 1]
    # Rename before the "opt" columns are removed so that the feature id survives.
    df = df.rename(columns={"opt_global_featureId": "featureId"})
    # Remove all score columns and the remaining optional ("opt") columns in a single drop;
    # the union avoids dropping a column twice when its name matches both patterns.
    unwanted = df.filter(regex=r"Score").columns.union(df.filter(regex=r"opt").columns)
    list_of_df.append(df.drop(columns=unwanted))

if not list_of_df:
    raise FileNotFoundError("No SIRIUS formula tables found: input_formulas is empty")

DF_SIRIUS = pd.concat(list_of_df, ignore_index=True)
df_formulas = DF_SIRIUS.rename(
    columns={"chemical_formula": "formulas", "exp_mass_to_charge": "mz", "retention_time": "RT"}
).set_index("formulas")

# Formulas that occur exactly once ...
df_singletons = (
    df_formulas.reset_index()
    .drop_duplicates(subset="formulas", keep=False)
    .set_index("formulas")
)
idx = df_singletons.index
# ... and the remaining rows, whose formula occurs more than once.
df_sirius = df_formulas.drop(idx)  # drop the singletons and keep the duplicated formulas
# Two predictions of the same formula are treated as the same compound when they agree
# within these tolerances (Delta mz <= 10 ppm, Delta RT <= 30, in the unit of the RT column).
SIRIUS_PPM_TOLERANCE = 10.0
SIRIUS_RT_TOLERANCE = 30.0

# The earlier loop never collapsed anything: its `else` belonged to the `for` statement and
# therefore always ran, the comparison ignored the formula, and it relied on DataFrame.append
# (removed in pandas 2.0) and on chained assignment. Here one record is kept per distinct
# compound and the feature ids of identical repeats are merged into that record.
sirius_kept_by_formula = {}  # formula -> records already kept for that formula
sirius_records = []          # all kept records: duplicated formulas first, then singletons
for frame in (df_sirius, df_singletons):
    for formula, row in frame.iterrows():
        # "id_123" -> "123"; ids are stripped so that no stray spaces end up in the lists.
        row_ids = [part.strip() for part in str(row["featureId"]).replace("id_", "").split(",")]
        same_formula = sirius_kept_by_formula.setdefault(formula, [])
        for kept in same_formula:
            mass_delta = (abs(row["mz"] - kept["mz"]) / row["mz"]) * 1000000
            if abs(row["RT"] - kept["RT"]) <= SIRIUS_RT_TOLERANCE and mass_delta <= SIRIUS_PPM_TOLERANCE:
                # Identical compound: only its feature ids are added to the kept record.
                for feature_id in row_ids:
                    if feature_id not in kept["featureId"]:
                        kept["featureId"].append(feature_id)
                break
        else:
            # No kept record of this formula is close enough: keep the row as a new compound.
            record = {"formulas": formula, **row.to_dict(), "featureId": row_ids}
            same_formula.append(record)
            sirius_records.append(record)

DF_SIRIUS_final = pd.DataFrame(sirius_records, columns=["formulas", *df_formulas.columns])
DF_SIRIUS_final.to_csv(os.path.join("results", "annotations", "SIRIUS_library.csv"), sep="\t", index=None)
DF_SIRIUS_final

Repeat for structural predictions (remove duplicates with the same inchi_key, which means they represent the same structure):

In [None]:
# Read every CSI:FingerID structure table and keep only the top-ranked prediction per feature.
DF_CSI = []
for formulas in input_structures:
    df = pd.read_csv(formulas, index_col="Unnamed: 0")
    df = df.loc[df["opt_global_rank"] == 1]
    # Rename before the "opt" columns are removed so that the feature id survives.
    df = df.rename(columns={"opt_global_featureId": "featureId"})
    # Remove the search-engine score columns and the remaining optional ("opt") columns at once.
    unwanted = df.filter(regex=r"best_search_engine_score").columns.union(df.filter(regex=r"opt").columns)
    df = df.drop(columns=unwanted).drop(columns="identifier")
    DF_CSI.append(df.reset_index(drop=True))

if not DF_CSI:
    raise FileNotFoundError("No CSI:FingerID structure tables found: input_structures is empty")

df_structures = pd.concat(DF_CSI, axis=0).sort_values("chemical_formula")
# Rows sharing an inchi_key describe the same structure; keep the first of each.
df_structures = df_structures.drop_duplicates(subset=["inchi_key"], keep="first")
df_structures = df_structures.drop(columns=["inchi_key"])  # leave smiles for visualisation
# This rename had been fused onto the end of the comment above, so the "mz" and "RT" columns
# used by the duplicate check further down were never created (KeyError on "mz").
df_structures = df_structures.rename(
    columns={"chemical_formula": "formulas", "exp_mass_to_charge": "mz", "retention_time": "RT"}
)
df_structures = df_structures.set_index("formulas")

# Formulas that occur exactly once ...
df_singletons = (
    df_structures.reset_index()
    .drop_duplicates(subset="formulas", keep=False)
    .set_index("formulas")
)
idx = df_singletons.index
# ... and the remaining rows, whose formula occurs more than once.
df_CSI = df_structures.drop(labels=idx, axis=0)
# Two predictions of the same formula are treated as the same compound when they agree
# within these tolerances (Delta mz <= 10 ppm, Delta RT <= 30, in the unit of the RT column).
CSI_PPM_TOLERANCE = 10.0
CSI_RT_TOLERANCE = 30.0
# Use the renamed coordinate columns when present, otherwise the names found in the input tables.
csi_mz_col = "mz" if "mz" in df_structures.columns else "exp_mass_to_charge"
csi_rt_col = "RT" if "RT" in df_structures.columns else "retention_time"

# The earlier loop never collapsed anything: its `else` belonged to the `for` statement and
# therefore always ran, the comparison ignored the formula, and it relied on DataFrame.append
# (removed in pandas 2.0) and on chained assignment. Here one record is kept per distinct
# compound and the feature ids of identical repeats are merged into that record.
csi_kept_by_formula = {}  # formula -> records already kept for that formula
csi_records = []          # all kept records: duplicated formulas first, then singletons
for frame in (df_CSI, df_singletons):
    for formula, row in frame.iterrows():
        # "id_123" -> "123"; ids are stripped so that no stray spaces end up in the lists.
        row_ids = [part.strip() for part in str(row["featureId"]).replace("id_", "").split(",")]
        same_formula = csi_kept_by_formula.setdefault(formula, [])
        for kept in same_formula:
            mass_delta = (abs(row[csi_mz_col] - kept[csi_mz_col]) / row[csi_mz_col]) * 1000000
            if abs(row[csi_rt_col] - kept[csi_rt_col]) <= CSI_RT_TOLERANCE and mass_delta <= CSI_PPM_TOLERANCE:
                # Identical compound: only its feature ids are added to the kept record.
                for feature_id in row_ids:
                    if feature_id not in kept["featureId"]:
                        kept["featureId"].append(feature_id)
                break
        else:
            # No kept record of this formula is close enough: keep the row as a new compound.
            record = {"formulas": formula, **row.to_dict(), "featureId": row_ids}
            same_formula.append(record)
            csi_records.append(record)

DF_CSI_final = pd.DataFrame(csi_records, columns=["formulas", *df_structures.columns])
DF_CSI_final.to_csv(os.path.join("results", "annotations", "CSI_library.csv"), sep="\t", index= None)
DF_CSI_final

In [None]:
# Turn the (mz, RT) index back into ordinary columns for the annotation cells below.
# The guard keeps the cell idempotent: resetting twice would add a spurious "index" column.
if "mz" not in DF_features.columns:
    DF_features = DF_features.reset_index()
DF_features

Annotate the formulas and structural predictions to the feature matrix according to SIRIUS and CSI:

In [None]:
def _csi_id_set(ids):
    """Return the ids of a list-like as a set of whitespace-stripped strings (empty otherwise)."""
    if not isinstance(ids, (list, tuple, set)):
        return set()
    return {str(value).strip() for value in ids} - {""}


# Prepare the library once: (name, formula, smiles, ids of the features it was predicted for).
csi_library = [
    (f"{name}", f"{formula}", f"{smiles}", _csi_id_set(CSI_ids))
    for name, smiles, formula, CSI_ids in zip(
        DF_CSI_final["description"], DF_CSI_final["smiles"], DF_CSI_final["formulas"], DF_CSI_final["featureId"]
    )
]

csi_names, csi_formulas, csi_smiles = [], [], []
for feature_ids in DF_features["id_list"]:
    # Ids are compared after stripping whitespace; compared verbatim, ids that carry a stray
    # space from the "," separators never match their library entry.
    wanted = _csi_id_set(feature_ids)
    hits1, hits2, hits3 = [], [], []
    for name, formula, smiles, library_ids in csi_library:
        # A prediction is reported once per feature, keyed on its name.
        if wanted & library_ids and name not in hits1:
            hits1.append(name)
            hits2.append(formula)
            hits3.append(smiles)
    csi_names.append(" ## ".join(hits1))
    csi_formulas.append(" ## ".join(hits2))
    csi_smiles.append(" ## ".join(hits3))

# Whole columns are written at once instead of through chained assignment
# (DF_features[col][i] = ...). Columns left from an earlier run are dropped first because
# insert() refuses duplicates; the insertion order keeps smiles, formula, name at the front.
for column, values in (
    ("CSI_predictions_name", csi_names),
    ("CSI_predictions_formula", csi_formulas),
    ("CSI_predictions_smiles", csi_smiles),
):
    if column in DF_features.columns:
        DF_features = DF_features.drop(columns=column)
    DF_features.insert(0, column, values)
DF_features

In [None]:
def _sirius_id_set(ids):
    """Return the ids of a list-like as a set of whitespace-stripped strings (empty otherwise)."""
    if not isinstance(ids, (list, tuple, set)):
        return set()
    return {str(value).strip() for value in ids} - {""}


# Prepare the library once: (formula, ids of the features it was predicted for).
sirius_library = [
    (f"{name}", _sirius_id_set(Pred_id))
    for name, Pred_id in zip(DF_SIRIUS_final["formulas"], DF_SIRIUS_final["featureId"])
]

sirius_predictions = []
for feature_ids in DF_features["id_list"]:
    # Ids are compared after stripping whitespace; compared verbatim, ids that carry a stray
    # space from the "," separators never match their library entry.
    wanted = _sirius_id_set(feature_ids)
    hits = []
    for name, library_ids in sirius_library:
        if wanted & library_ids and name not in hits:
            hits.append(name)
    sirius_predictions.append(" ## ".join(hits))

# The column is written once, after all features are processed. Before, the assignment sat
# inside the inner loop and ran once per library row through chained indexing.
# A column left from an earlier run is dropped first because insert() refuses duplicates.
if "SIRIUS_predictions" in DF_features.columns:
    DF_features = DF_features.drop(columns="SIRIUS_predictions")
DF_features.insert(0, "SIRIUS_predictions", sirius_predictions)
DF_features

In [None]:
# Save the feature matrix together with its SIRIUS and CSI:FingerID annotations.
annotated_matrix_file = os.path.join("results", "annotations", "FeatureMatrix_SIRIUS_CSI.csv")
DF_features.to_csv(annotated_matrix_file, sep="\t", index= None)