## LOOP THROUGH DIRECTORY and run pyOpenMS workflow 

In [1]:
from pyopenms import *

def pyopenms_WF(filename):
    exp = MSExperiment()
    MzMLFile().load(filename, exp)
    exp.sortSpectra(True)
    
    mass_traces = []
    mtd = MassTraceDetection()
    mtd_par = mtd.getDefaults()
    mtd_par.setValue("mass_error_ppm", 10.0) 
    mtd_par.setValue("noise_threshold_int", 1.0e04)
    mtd.setParameters(mtd_par)
    mtd.run(exp, mass_traces, 0)
    
    mass_traces_split = []
    mass_traces_final = []
    epd = ElutionPeakDetection()
    epd_par = epd.getDefaults()
    epd_par.setValue("width_filtering", "fixed")
    epd.setParameters(epd_par)
    epd.detectPeaks(mass_traces, mass_traces_split)
    
    if (epd.getParameters().getValue("width_filtering") == "auto"):
        epd.filterByPeakWidth(mass_traces_split, mass_traces_final)
    else:
        mass_traces_final = mass_traces_split
        
    feature_map_FFM = FeatureMap()
    feat_chrom = []
    ffm = FeatureFindingMetabo()
    ffm_par = ffm.getDefaults() 
    ffm_par.setValue("isotope_filtering_model", "none")
    ffm_par.setValue("remove_single_traces", "true")
    ffm_par.setValue("mz_scoring_by_elements", "true")
    ffm.setParameters(ffm_par)
    ffm.run(mass_traces_final, feature_map_FFM, feat_chrom)
    feature_map_FFM.setUniqueIds()
    fh = FeatureXMLFile()
    fh.store('./mzML_files/wf_testing/FeatureFindingMetabo.featureXML', feature_map_FFM)
    
    mfd = MetaboliteFeatureDeconvolution()
    mdf_par = mfd.getDefaults()
    mdf_par.setValue("potential_adducts",  [b"H:+:0.6",b"Na:+:0.2",b"NH4:+:0.1", b"H2O:-:0.1"])
    mdf_par.setValue("charge_min", 1, "Minimal possible charge")
    mdf_par.setValue("charge_max", 1, "Maximal possible charge")
    mdf_par.setValue("charge_span_max", 1)
    mdf_par.setValue("max_neutrals", 1)
    mfd.setParameters(mdf_par)
    
    feature_map_DEC = FeatureMap()
    cons_map0 = ConsensusMap()
    cons_map1 = ConsensusMap()
    mfd.compute(feature_map_FFM, feature_map_DEC, cons_map0, cons_map1)
    fxml = FeatureXMLFile()
    fxml.store("./mzML_files/wf_testing/deconvoluted.featureXML", feature_map_DEC)
    
    out_mzml= "./mzML_files/wf_testing/PrecursorCorrectedGermB.mzML"
    features= FeatureMap()
    FeatureXMLFile().load("./mzML_files/wf_testing/deconvolutedGermB.featureXML", features)
    PrecursorCorrection.correctToNearestFeature(features, exp, 0.0, 100.0, True, False, False, False, 3, 0)
    MzMLFile().store(out_mzml, exp)True, delta_mzs, mzs, rts)
    
    sirius_algo = SiriusAdapterAlgorithm()
    sirius_algo_par = sirius_algo.getDefaults()
    sirius_algo_par.setValue("preprocessing:filter_by_num_masstraces", 2) 
    sirius_algo_par.setValue("preprocessing:precursor_mz_tolerance", 10.0)
    sirius_algo_par.setValue("preprocessing:precursor_mz_tolerance_unit", "ppm")
    sirius_algo_par.setValue("preprocessing:precursor_rt_tolerance", 5.0)
    sirius_algo_par.setValue("preprocessing:feature_only", "true")
    sirius_algo_par.setValue("sirius:profile", "orbitrap")
    sirius_algo_par.setValue("sirius:db", "all")
    sirius_algo_par.setValue("sirius:ions_considered", "[M+H]+, [M-H2O+H]+, [M+Na]+, [M+NH4]+")
    sirius_algo_par.setValue("sirius:candidates", 5)
    sirius_algo_par.setValue("sirius:elements_enforced", "CHNOP") 
    sirius_algo_par.setValue("project:processors", 2)
    sirius_algo.setParameters(sirius_algo_par)
    
    featureinfo = "./mzML_files/wf_testing/deconvoluted.featureXML"
    fm_info = FeatureMapping_FeatureMappingInfo()
    feature_mapping = FeatureMapping_FeatureToMs2Indices() 
    sirius_algo.preprocessingSirius(featureinfo,
                                    exp,
                                    fm_info,
                                    feature_mapping)
    sirius_algo.logFeatureSpectraNumber(featureinfo, 
                                    feature_mapping,
                                    exp)
    msfile = SiriusMSFile()
    debug_level = 10
    sirius_tmp = SiriusTemporaryFileSystemObjects(debug_level)
    siriusstring= String(sirius_tmp.getTmpMsFile())
    feature_only = sirius_algo.isFeatureOnly()
    isotope_pattern_iterations = sirius_algo.getIsotopePatternIterations()
    no_mt_info = sirius_algo.isNoMasstraceInfoIsotopePattern()
    compound_info = []
    msfile.store(exp, 
                 String(sirius_tmp.getTmpMsFile()),
                 feature_mapping, 
                 feature_only,
                 isotope_pattern_iterations, 
                 no_mt_info, 
                 compound_info)
    out_csifingerid = "./mzML_files/wf_testing/csifingerID.mzTab" 
    executable= "/Users/eeko/Desktop/software/Contents/MacOS/sirius"
    subdirs = sirius_algo.callSiriusQProcess(String(sirius_tmp.getTmpMsFile()),
                                             String(sirius_tmp.getTmpOutDir()),
                                             String(executable),
                                             String(out_csifingerid),
                                             False)
    candidates = sirius_algo.getNumberOfSiriusCandidates()
    sirius_result = MzTab()
    siriusfile = MzTabFile()
    SiriusMzTabWriter.read(subdirs,
                            filename,
                            candidates,
                            sirius_result)
    siriusfile.store("./mzML_files/wf_testing/out_sirius_test.mzTab", sirius_result)
    top_hits= 5
    csi_result=MzTab()
    csi_file=MzTabFile()
    CsiFingerIdMzTabWriter.read(subdirs,
                        filename,
                        top_hits,
                        csi_result)
    csi_file.store("./mzML_files/wf_testing/csifingerID.mzTab", csi_result)
    return "FINITO"

Determination of memory status is not supported on this 
 platform, measuring for memoryleaks will never fail


In [2]:
import glob
for filename in glob.glob("./mzML_files/**/*.mzML"):
    if filename.endswith(".mzML"): 
        pyopenms_WF(filename)
        print(filename)
        continue
    else:
        continue

NameError: name 'os' is not defined

In [9]:
from pandas import DataFrame
import pandas as pd

import pyteomics
from pyteomics.openms import featurexml
with featurexml.read("./mzML_files/wf_testing/deconvolutedEpemicins.featureXML") as f:
    features_list = [FXML for FXML in f]
    
df = pd.DataFrame() 

for feat in features_list:
    idx = feat['id']
    for key in feat.keys():
        if key == 'id':
            pass
        # For col with dictionary do the following
        elif key == 'position':
            pos_list = feat['position']
            for pos in pos_list:
                if pos['dim'] == '0':
                    df.loc[idx, 'position_0'] = pos['position']
                elif pos['dim'] == '1':
                    df.loc[idx, 'position_1'] = pos['position']
        elif key == 'quality':
            qual_list = feat['quality']
            for qual in qual_list:
                if qual['dim'] == '0':
                    df.loc[idx, 'quality_0'] = qual['quality']
                elif qual['dim'] == '1':
                    df.loc[idx, 'quality_1'] = qual['quality']
        else:
            df.loc[idx, key] = feat[key]
df_tidy = df.rename(columns = {'position_0': 'mz', 'position_1': 'RT'}, inplace = False)
df_tidy=df_tidy.drop(columns= ["quality_0", "quality_1", "overallquality", "label", "legal_isotope_pattern"])
df_tidy.reset_index(drop=True, inplace=True) 
df_tidy

Unnamed: 0,mz,RT,intensity,charge,FWHM,max_height,num_of_masstraces,masstrace_intensity,masstrace_centroid_rt,masstrace_centroid_mz,isotope_distances,Group,is_ungrouped_monoisotopic,dc_charge_adducts,dc_charge_adduct_mass,is_ungrouped_with_charge,map_idx,adducts,is_backbone,old_charge
0,39.529802865000001,349.172005427819727,6.483638e05,0,8.939569,75668.257812,1.0,[6.48363806042836e05],[39.529802865000001],[349.172005427819727],[],7344662219423233780,1.0,,,,,,,
1,42.379768303980001,391.182592020985794,1.612096e06,0,10.832493,183479.296875,1.0,[1.612096255726351e06],[42.379768303980001],[391.182592020985794],[],9580216245756088089,1.0,,,,,,,
2,42.379768303980001,404.190102738177075,3.534303e05,1,6.644110,129759.664062,2.0,"[3.534302611248636e05, 8.032523655859887e05]","[42.379768303980001, 42.379768303980001]","[404.190102738177075, 405.198191650642855]",[1.00808891246578],3143517962179844245,,H1,1.007276,1.0,,,,
3,42.379768303980001,461.235803441382416,3.050932e05,0,4.296739,88459.125000,1.0,[3.050931746679126e05],[42.379768303980001],[461.235803441382416],[],4920700711826062937,1.0,,,,,,,
4,43.264481632020001,361.208104366155453,2.066669e06,0,5.489381,487036.281250,1.0,[2.066669269290893e06],[43.264481632020001],[361.208104366155453],[],4564980864005611065,1.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14064,917.212863695999999,1321.983679608177454,2.423242e05,0,11.887800,29230.892578,1.0,[2.423241585391494e05],[917.212863695999999],[1321.983679608177454],[],13327628041150387155,1.0,,,,,,,
14065,918.210342559979949,420.319015627257613,2.520901e05,0,6.803956,42458.042969,1.0,[2.520900600146166e05],[918.210342559979949],[420.319015627257613],[],2577607210725735607,1.0,,,,,,,
14066,918.210342559979949,1421.977424770406287,1.326787e05,0,4.103276,38798.972656,1.0,[1.3267872324558e05],[918.210342559979949],[1421.977424770406287],[],17437542222900682844,1.0,,,,,,,
14067,918.210342559979949,1521.970953123048957,2.119047e05,0,6.773869,37788.941406,1.0,[2.119047253454692e05],[918.210342559979949],[1521.970953123048957],[],1198728414200903008,1.0,,,,,,,


### Explanation of columns
#### mz= mass-to-charge ratio (m/z)
#### RT= retention time (min)
#### intensity = intensity of the feature (AU-arbitrary units)
#### FWHM= Full Width of the peak at Half its Maximum height
#### num_of_masstraces	= number of mass traces detected (single mass traces are excluded). This is relevant to the isotopic pattern
#### isotope_distances = distance in mz between the isotopes (jumps of app. 1 is important to confirm that this is a real feature)
#### 

In [None]:
import pandas as pd
import numpy as np
import sys
import pyteomics
from pyteomics import mztab
filename= "./mzML_files/wf_testing/out_sirius_testGermB.mzTab.mzTab"
sirius=  pyteomics.mztab.MzTab(filename, encoding='UTF8', table_format='df')
sirius.metadata
df= sirius.small_molecule_table
data= df.drop(columns= ["identifier", "smiles", "inchi_key", "description", "calc_mass_to_charge", "charge", "taxid", "species","database", "database_version", "spectra_ref", "search_engine", "modifications"])
data

In [4]:
filename= "./mzML_files/wf_testing/csifingerID.mzTab"
CSI=  pyteomics.mztab.MzTab(filename, encoding='UTF8', table_format='df')
CSI.metadata
df= CSI.small_molecule_table
csifingerID= df.drop(columns= ["calc_mass_to_charge", "charge", "taxid", "species","database", "database_version", "spectra_ref", "search_engine", "modifications"])
csifingerID

Unnamed: 0,identifier,chemical_formula,smiles,inchi_key,description,exp_mass_to_charge,retention_time,best_search_engine_score[1],opt_global_rank,opt_global_compoundId,opt_global_compoundScanNumber,opt_global_featureId,opt_global_native_id,opt_global_adduct,opt_global_dblinks,opt_global_dbflags
0,123571071,C17H25BN2O2S,B1(OC(C(O1)(C)C)(C)C)C2=CC3=C(C=C2)N(SN3C)CC4CC4,QCQPLSTXPJUAAM,,349.209026,123.805954,-260.2015,1,745,746,id_6128946280250909851,controllerType=0 controllerNumber=1 scan=746,[M + H3N + H]+,PubChem:(123571071),2
1,131432082,C17H28BN3O2S,B1(OC(C(O1)(C)C)(C)C)C2=CN=C(S2)N3CCN4CCCCC4C3,SECXVOZRGXMKEM,,349.209026,123.805954,-299.350229,1,745,746,id_6128946280250909851,controllerType=0 controllerNumber=1 scan=746,[M + H]+,PubChem:(131432082),2
2,109567509,C15H31FN4O3S,CCNC(=NCCCF)N1CCN(CC1)S(=O)(=O)CCOC(C)C,BRHUEMIRBTXUSW,,349.209026,123.805954,-267.945346,1,745,746,id_6128946280250909851,controllerType=0 controllerNumber=1 scan=746,[M - H2O + H]+,PubChem:(109567509),2
3,70910984,C15H26FN3O2S,CC(C)(C)S(=O)(=O)NCCCCCNC1=NC=C(C=C1)CF,PEYBFSDPVVZPQL,,349.209026,123.805954,-244.285143,1,745,746,id_6128946280250909851,controllerType=0 controllerNumber=1 scan=746,[M + H3N + H]+,PubChem:(70910984),2
4,128994375,C15H26FN3O2S,CC1=NC(=CS1)CN2CC(CC2CN(C)CC(COC)O)F,CQZCDUGNYLCVRO,,349.209026,123.805954,-279.329205,2,745,746,id_6128946280250909851,controllerType=0 controllerNumber=1 scan=746,[M + H3N + H]+,PubChem:(128994375),2
5,138722980,C11H17FN6O3,CC(C(CO)OC(CF)N1C=NC2=C(N=C(N=C21)N)N)O,JJXXHWZKDWLJHM,,323.122389,356.847436,-270.225119,1,2846,2847,id_11377481446250170993,controllerType=0 controllerNumber=1 scan=2847,[M + Na]+,PubChem:(138722980),2
6,135244532,C11H17FN6O3,CC1C(N2C(=N1)C(=NC(=N2)N)N)C3C(C(C(O3)CO)O)F,VNYAFPCEJVZLAO,,323.122389,356.847436,-299.163846,2,2846,2847,id_11377481446250170993,controllerType=0 controllerNumber=1 scan=2847,[M + Na]+,PubChem:(135244532),2
7,137397861|137397935,C11H17FN6O3,COC(CN)(CO)C(C(N1C=NC2=C(N=CN=C21)N)F)O,RQTBEDYRRNIRGZ,,323.122389,356.847436,-314.805122,3,2846,2847,id_11377481446250170993,controllerType=0 controllerNumber=1 scan=2847,[M + Na]+,PubChem:(137397861 137397935),2
8,118595172,C11H17FN6O3,C=C1NC(C=CN1C2C(C(C(O2)(CN=[N+]=[N-])CO)O)F)N,MYSZRXFXRIVGPS,,323.122389,356.847436,-438.401003,4,2846,2847,id_11377481446250170993,controllerType=0 controllerNumber=1 scan=2847,[M + Na]+,PubChem:(118595172),2
9,25000395,C11H20BN3O4S,B1(OC(C(O1)(C)C)(C)C)C2=CN(C=N2)S(=O)(=O)N(C)C,ZWQXNFBXPCSFAO,"1-(N,N-Dimethylsulfamoyl)imidazole-4-boronic a...",323.122389,356.847436,-259.475588,1,2846,2847,id_11377481446250170993,controllerType=0 controllerNumber=1 scan=2847,[M + Na]+,PubChem:(25000395),2
