## LOOP THROUGH DIRECTORY and run pyOpenMS workflow 

In [1]:
from pyopenms import *

def pyopenms_WF(filename):
    exp = MSExperiment()
    MzMLFile().load(filename, exp)
    exp.sortSpectra(True)
    
    mass_traces = []
    mtd = MassTraceDetection()
    mtd_par = mtd.getDefaults()
    mtd_par.setValue("mass_error_ppm", 10.0) 
    mtd_par.setValue("noise_threshold_int", 1.0e04)
    mtd.setParameters(mtd_par)
    mtd.run(exp, mass_traces, 0)
    
    mass_traces_split = []
    mass_traces_final = []
    epd = ElutionPeakDetection()
    epd_par = epd.getDefaults()
    epd_par.setValue("width_filtering", "fixed")
    epd.setParameters(epd_par)
    epd.detectPeaks(mass_traces, mass_traces_split)
    
    if (epd.getParameters().getValue("width_filtering") == "auto"):
        epd.filterByPeakWidth(mass_traces_split, mass_traces_final)
    else:
        mass_traces_final = mass_traces_split
        
    feature_map_FFM = FeatureMap()
    feat_chrom = []
    ffm = FeatureFindingMetabo()
    ffm_par = ffm.getDefaults() 
    ffm_par.setValue("isotope_filtering_model", "none")
    ffm_par.setValue("remove_single_traces", "true")
    ffm_par.setValue("mz_scoring_by_elements", "true")
    ffm.setParameters(ffm_par)
    ffm.run(mass_traces_final, feature_map_FFM, feat_chrom)
    feature_map_FFM.setUniqueIds()
    fh = FeatureXMLFile()
    fh.store('./wf_testing/FeatureFindingMetabo.featureXML', feature_map_FFM)
    
    mfd = MetaboliteFeatureDeconvolution()
    mdf_par = mfd.getDefaults()
    mdf_par.setValue("potential_adducts",  [b"H:+:0.6",b"Na:+:0.2",b"NH4:+:0.1", b"H2O:-:0.1"])
    mdf_par.setValue("charge_min", 1, "Minimal possible charge")
    mdf_par.setValue("charge_max", 1, "Maximal possible charge")
    mdf_par.setValue("charge_span_max", 1)
    mdf_par.setValue("max_neutrals", 1)
    mfd.setParameters(mdf_par)
    
    feature_map_DEC = FeatureMap()
    cons_map0 = ConsensusMap()
    cons_map1 = ConsensusMap()
    mfd.compute(feature_map_FFM, feature_map_DEC, cons_map0, cons_map1)
    fxml = FeatureXMLFile()
    fxml.store("./wf_testing/deconvoluted.featureXML", feature_map_DEC)
    
    delta_mzs= []
    mzs = []
    rts= []
    PrecursorCorrection.correctToHighestIntensityMS1Peak(exp, 100.0, True, delta_mzs, mzs, rts)
    
    sirius_algo = SiriusAdapterAlgorithm()
    sirius_algo_par = sirius_algo.getDefaults()
    sirius_algo_par.setValue("preprocessing:filter_by_num_masstraces", 2) 
    sirius_algo_par.setValue("preprocessing:precursor_mz_tolerance", 10.0)
    sirius_algo_par.setValue("preprocessing:precursor_mz_tolerance_unit", "ppm")
    sirius_algo_par.setValue("preprocessing:precursor_rt_tolerance", 5.0)
    sirius_algo_par.setValue("preprocessing:feature_only", "true")
    sirius_algo_par.setValue("sirius:profile", "orbitrap")
    sirius_algo_par.setValue("sirius:db", "all")
    sirius_algo_par.setValue("sirius:ions_considered", "[M+H]+, [M-H2O+H]+, [M+Na]+, [M+NH4]+")
    sirius_algo_par.setValue("sirius:candidates", 5)
    sirius_algo_par.setValue("sirius:elements_enforced", "CHN[30]OP") 
    sirius_algo_par.setValue("project:processors", 2)
    sirius_algo.setParameters(sirius_algo_par)
    
    featureinfo = "./wf_testing/deconvoluted.featureXML"
    fm_info = FeatureMapping_FeatureMappingInfo()
    feature_mapping = FeatureMapping_FeatureToMs2Indices() 
    sirius_algo.preprocessingSirius(featureinfo,
                                    exp,
                                    fm_info,
                                    feature_mapping)
    sirius_algo.logFeatureSpectraNumber(featureinfo, 
                                    feature_mapping,
                                    exp)
    msfile = SiriusMSFile()
    debug_level = 10
    sirius_tmp = SiriusTemporaryFileSystemObjects(debug_level)
    siriusstring= String(sirius_tmp.getTmpMsFile())
    feature_only = sirius_algo.isFeatureOnly()
    isotope_pattern_iterations = sirius_algo.getIsotopePatternIterations()
    no_mt_info = sirius_algo.isNoMasstraceInfoIsotopePattern()
    compound_info = []
    msfile.store(exp, 
                 String(sirius_tmp.getTmpMsFile()),
                 feature_mapping, 
                 feature_only,
                 isotope_pattern_iterations, 
                 no_mt_info, 
                 compound_info)
    out_csifingerid = "./wf_testing/csifingerID.mzTab" 
    executable= "/Users/eeko/Desktop/software/THIRDPARTY/MacOS/64bit/Sirius/sirius"
    subdirs = sirius_algo.callSiriusQProcess(String(sirius_tmp.getTmpMsFile()),
                                             String(sirius_tmp.getTmpOutDir()),
                                             String(executable),
                                             String(out_csifingerid),
                                             False)
    candidates = sirius_algo.getNumberOfSiriusCandidates()
    sirius_result = MzTab()
    siriusfile = MzTabFile()
    SiriusMzTabWriter.read(subdirs,
                            input_mzML,
                            candidates,
                            sirius_result)
    siriusfile.store("./wf_testing/out_sirius_test.mzTab", sirius_result)
    top_hits= 5
    csi_result=MzTab()
    csi_file=MzTabFile()
    CsiFingerIdMzTabWriter.read(subdirs,
                        input_mzML,
                        top_hits,
                        csi_result)
    csi_file.store("./wf_testing/csifingerID.mzTab", csi_result)
    return "./wf_testing/out_sirius_test.mzTab"

Determination of memory status is not supported on this 
 platform, measuring for memoryleaks will never fail


In [None]:
import os
directory= "./"
for filename in os.listdir(directory):
    if filename.endswith(".mzML"): 
        pyopenms_WF(filename)
        print(os.path.join(directory, filename))
        continue
    else:
        continue

In [59]:
from pandas import DataFrame
import pandas as pd

import pyteomics
from pyteomics.openms import featurexml
with featurexml.read("./wf_testing/devoncoluted.featureXML") as f:
    features_list = [FXML for FXML in f]
    
df = pd.DataFrame() 

for feat in features_list:
    idx = feat['id']
    for key in feat.keys():
        if key == 'id':
            pass
        # For col with dictionary do the following
        elif key == 'position':
            pos_list = feat['position']
            for pos in pos_list:
                if pos['dim'] == '0':
                    df.loc[idx, 'position_0'] = pos['position']
                elif pos['dim'] == '1':
                    df.loc[idx, 'position_1'] = pos['position']
        elif key == 'quality':
            qual_list = feat['quality']
            for qual in qual_list:
                if qual['dim'] == '0':
                    df.loc[idx, 'quality_0'] = qual['quality']
                elif qual['dim'] == '1':
                    df.loc[idx, 'quality_1'] = qual['quality']
        else:
            df.loc[idx, key] = feat[key]
df_tidy = df.rename(columns = {'position_0': 'mz', 'position_1': 'RT'}, inplace = False)
df_tidy=df_tidy.drop(columns= ["quality_0", "quality_1", "overallquality", "label", "legal_isotope_pattern", "Group", "is_ungrouped_with_charge", "map_idx", "adducts", "is_backbone", "dc_charge_adduct_mass", "dc_charge_adducts", "charge"] )
df_tidy.reset_index(drop=True, inplace=True) 
df_tidy

Unnamed: 0,mz,RT,intensity,FWHM,num_of_masstraces,masstrace_intensity,masstrace_centroid_rt,masstrace_centroid_mz,isotope_distances
0,45.363959952000002,164.07962840572273,1.040512e07,4.345682,2.0,"[1.040511761915735e07, 5.185541039769124e06]","[45.363959952000002, 49.049522447999998]","[164.07962840572273, 165.076340599304586]",[0.996712193581857]
1,46.594569856020002,225.123022735374661,1.885127e07,4.028248,2.0,"[1.88512695028333e07, 2.209631519695911e06]","[46.594569856020002, 45.363959952000002]","[225.123022735374661, 226.111085683240958]",[0.988062947866297]
2,49.049522447999998,236.109758813306456,2.032929e06,5.484940,2.0,"[2.032928725485437e06, 2.050767850861186e06]","[49.049522447999998, 50.283078943980001]","[236.109758813306456, 237.096823449431326]",[0.98706463612487]
3,50.283078943980001,230.104815495306468,5.875936e06,6.925932,4.0,"[5.875935802767379e06, 1.423431610895428e06, 7...","[50.283078943980001, 46.594569856020002, 46.59...","[230.104815495306468, 231.099793598237198, 232...","[0.99497810293073, 1.009776357790855, 0.989782..."
4,52.732693775999998,241.088845715144032,1.450073e06,8.214966,2.0,"[1.450073112043142e06, 7.304221857743776e05]","[52.732693775999998, 46.594569856020002]","[241.088845715144032, 242.1102383507193]",[1.021392635575268]
...,...,...,...,...,...,...,...,...,...
273,628.324208815980001,454.431447176522454,1.300941e06,8.163534,2.0,"[1.300940665560015e06, 3.834820095867582e05]","[628.324208815980001, 627.146143807980025]","[454.431447176522454, 455.434751941959803]",[1.003304765437349]
274,649.307166352019976,702.225882911624581,6.671246e05,9.478681,3.0,"[6.6712457710756e05, 4.053470682705274e05, 2.0...","[649.307166352019976, 649.307166352019976, 646...","[702.225882911624581, 703.225715714389253, 704...","[0.999832802764672, 0.996860799070078]"
275,650.483665183979952,144.983060615625391,2.72629e08,3.863073,3.0,"[2.726290032128139e08, 1.794009343807278e07, 1...","[650.483665183979952, 650.483665183979952, 650...","[144.983060615625391, 145.991959197398359, 146...","[1.008898581772968, 0.989150046298789]"
276,650.483665183979952,261.127783843722739,1.589671e06,2.551173,2.0,"[1.589670786637312e06, 2.619513818027474e05]","[650.483665183979952, 648.151223887979995]","[261.127783843722739, 262.173406629024839]",[1.045622785302101]


### Explanation of columns
#### mz= mass-to-charge ratio (m/z)
#### RT= retention time (min)
#### intensity = intensity of the feature (AU-arbitrary units)
#### FWHM= Full Width of the peak at Half its Maximum height
#### num_of_masstraces	= number of mass traces detected (single mass traces are excluded). This is relevant to the isotopic pattern
#### isotope_distances = distance in mz between the isotopes (jumps of app. 1 is important to confirm that this is a real feature)
#### 

In [81]:
import pandas as pd
import numpy as np
import sys
import pyteomics
from pyteomics import mztab
filename= "./wf_testing/out_sirius_test.mzTab"
sirius=  pyteomics.mztab.MzTab(filename, encoding='UTF8', table_format='df')
sirius.metadata
df= sirius.small_molecule_table
data= df.drop(columns= ["identifier", "smiles", "inchi_key", "description", "calc_mass_to_charge", "charge", "taxid", "species","database", "database_version", "spectra_ref", "search_engine", "modifications"])
data

Unnamed: 0,chemical_formula,exp_mass_to_charge,retention_time,best_search_engine_score[1],best_search_engine_score[2],best_search_engine_score[3],opt_global_adduct,opt_gobal_precursorFormula,opt_global_rank,opt_global_explainedPeaks,opt_global_explainedIntensity,opt_global_median_mass_error_fragment_peaks_ppm,opt_global_median_absolute_mass_error_fragment_peaks_ppm,opt_global_mass_error_precursor_ppm,opt_global_compoundId,opt_global_compoundScanNumber,opt_global_featureId,opt_global_native_id
0,C17H25BN2O2S,349.209026,123.805954,39.461411,39.461411,0.0,[M + H3N + H]+,C17H28BN3O2S,1,18,0.904774,3.793038,5.609411,-4.038156,745,746,id_6128946280250909851,controllerType=0 controllerNumber=1 scan=746
1,C17H28BN3O2S,349.209026,123.805954,39.461411,39.461411,0.0,[M + H]+,C17H28BN3O2S,1,18,0.904774,3.793038,5.609411,-4.038156,745,746,id_6128946280250909851,controllerType=0 controllerNumber=1 scan=746
2,C17H30BN3O3S,349.209026,123.805954,39.461411,39.461411,0.0,[M - H2O + H]+,C17H28BN3O2S,1,19,0.904774,3.793038,5.609411,-51579.350708,745,746,id_6128946280250909851,controllerType=0 controllerNumber=1 scan=746
3,C15H26FN3O2S,349.209026,123.805954,35.360942,35.360942,0.0,[M + H3N + H]+,C15H29FN4O2S,2,17,0.824253,1.643125,7.922279,6.370361,745,746,id_6128946280250909851,controllerType=0 controllerNumber=1 scan=746
4,C15H31FN4O3S,349.209026,123.805954,35.360942,35.360942,0.0,[M - H2O + H]+,C15H29FN4O2S,2,18,0.824253,1.643125,7.922279,-51568.942191,745,746,id_6128946280250909851,controllerType=0 controllerNumber=1 scan=746
5,C15H29FN4O2S,349.209026,123.805954,35.360942,35.360942,0.0,[M + H]+,C15H29FN4O2S,2,17,0.824253,1.643125,7.922279,6.370361,745,746,id_6128946280250909851,controllerType=0 controllerNumber=1 scan=746
6,C15H30BN3O2S,349.209026,123.805954,35.143974,35.143974,0.0,[M + Na]+,C15H30BN3O2S,3,19,0.910858,0.935115,6.076396,2.850807,745,746,id_6128946280250909851,controllerType=0 controllerNumber=1 scan=746
7,C14H25N3O6,349.209026,123.805954,29.005809,27.444925,1.560885,[M + H3N + H]+,C14H28N4O6,4,13,0.591163,-3.824959,7.868406,2.478056,745,746,id_6128946280250909851,controllerType=0 controllerNumber=1 scan=746
8,C14H28N4O6,349.209026,123.805954,29.005809,27.444925,1.560885,[M + H]+,C14H28N4O6,4,13,0.591163,-3.824959,7.868406,2.478056,745,746,id_6128946280250909851,controllerType=0 controllerNumber=1 scan=746
9,C14H30N4O7,349.209026,123.805954,29.005809,27.444925,1.560885,[M - H2O + H]+,C14H28N4O6,4,14,0.591163,-3.824959,7.868406,-51572.834496,745,746,id_6128946280250909851,controllerType=0 controllerNumber=1 scan=746


In [84]:
filename= "./wf_testing/csifingerID.mzTab"
CSI=  pyteomics.mztab.MzTab(filename, encoding='UTF8', table_format='df')
CSI.metadata
df= CSI.small_molecule_table
csifingerID= df.drop(columns= ["calc_mass_to_charge", "charge", "taxid", "species","database", "database_version", "spectra_ref", "search_engine", "modifications"])
csifingerID

Unnamed: 0,identifier,chemical_formula,smiles,inchi_key,description,exp_mass_to_charge,retention_time,best_search_engine_score[1],opt_global_rank,opt_global_compoundId,opt_global_compoundScanNumber,opt_global_featureId,opt_global_native_id,opt_global_adduct,opt_global_dblinks,opt_global_dbflags
0,123571071,C17H25BN2O2S,B1(OC(C(O1)(C)C)(C)C)C2=CC3=C(C=C2)N(SN3C)CC4CC4,QCQPLSTXPJUAAM,,349.209026,123.805954,-260.2015,1,745,746,id_6128946280250909851,controllerType=0 controllerNumber=1 scan=746,[M + H3N + H]+,PubChem:(123571071),2
1,131432082,C17H28BN3O2S,B1(OC(C(O1)(C)C)(C)C)C2=CN=C(S2)N3CCN4CCCCC4C3,SECXVOZRGXMKEM,,349.209026,123.805954,-299.350229,1,745,746,id_6128946280250909851,controllerType=0 controllerNumber=1 scan=746,[M + H]+,PubChem:(131432082),2
2,109567509,C15H31FN4O3S,CCNC(=NCCCF)N1CCN(CC1)S(=O)(=O)CCOC(C)C,BRHUEMIRBTXUSW,,349.209026,123.805954,-267.945346,1,745,746,id_6128946280250909851,controllerType=0 controllerNumber=1 scan=746,[M - H2O + H]+,PubChem:(109567509),2
3,70910984,C15H26FN3O2S,CC(C)(C)S(=O)(=O)NCCCCCNC1=NC=C(C=C1)CF,PEYBFSDPVVZPQL,,349.209026,123.805954,-244.285143,1,745,746,id_6128946280250909851,controllerType=0 controllerNumber=1 scan=746,[M + H3N + H]+,PubChem:(70910984),2
4,128994375,C15H26FN3O2S,CC1=NC(=CS1)CN2CC(CC2CN(C)CC(COC)O)F,CQZCDUGNYLCVRO,,349.209026,123.805954,-279.329205,2,745,746,id_6128946280250909851,controllerType=0 controllerNumber=1 scan=746,[M + H3N + H]+,PubChem:(128994375),2
5,138722980,C11H17FN6O3,CC(C(CO)OC(CF)N1C=NC2=C(N=C(N=C21)N)N)O,JJXXHWZKDWLJHM,,323.122389,356.847436,-270.225119,1,2846,2847,id_11377481446250170993,controllerType=0 controllerNumber=1 scan=2847,[M + Na]+,PubChem:(138722980),2
6,135244532,C11H17FN6O3,CC1C(N2C(=N1)C(=NC(=N2)N)N)C3C(C(C(O3)CO)O)F,VNYAFPCEJVZLAO,,323.122389,356.847436,-299.163846,2,2846,2847,id_11377481446250170993,controllerType=0 controllerNumber=1 scan=2847,[M + Na]+,PubChem:(135244532),2
7,137397861|137397935,C11H17FN6O3,COC(CN)(CO)C(C(N1C=NC2=C(N=CN=C21)N)F)O,RQTBEDYRRNIRGZ,,323.122389,356.847436,-314.805122,3,2846,2847,id_11377481446250170993,controllerType=0 controllerNumber=1 scan=2847,[M + Na]+,PubChem:(137397861 137397935),2
8,118595172,C11H17FN6O3,C=C1NC(C=CN1C2C(C(C(O2)(CN=[N+]=[N-])CO)O)F)N,MYSZRXFXRIVGPS,,323.122389,356.847436,-438.401003,4,2846,2847,id_11377481446250170993,controllerType=0 controllerNumber=1 scan=2847,[M + Na]+,PubChem:(118595172),2
9,25000395,C11H20BN3O4S,B1(OC(C(O1)(C)C)(C)C)C2=CN(C=N2)S(=O)(=O)N(C)C,ZWQXNFBXPCSFAO,"1-(N,N-Dimethylsulfamoyl)imidazole-4-boronic a...",323.122389,356.847436,-259.475588,1,2846,2847,id_11377481446250170993,controllerType=0 controllerNumber=1 scan=2847,[M + Na]+,PubChem:(25000395),2
