In [1]:
from pyopenms import *
import pyopenms as pms
import pandas as pd
import numpy as np
import os
import glob

Determination of memory status is not supported on this 
 platform, measuring for memoryleaks will never fail


In [2]:
import csv
# read tsv file and create list of FeatureFinderMetaboIdentCompound
def metaboTableFromFile(path_to_library_file):
    """Read a tab-separated compound library into a list of compounds.

    The first line of the file is treated as a header and skipped. Every
    following row must provide at least seven columns in this order: name,
    sum formula, mass, charges, RTs, RT ranges, isotope distribution. The
    last four columns hold comma-separated lists inside a single cell.

    Rows without any content (for example a trailing blank line) are skipped,
    and an empty file yields an empty list instead of raising StopIteration.

    Args:
        path_to_library_file: path of the TSV library file.

    Returns:
        A list with one FeatureFinderMetaboIdentCompound per non-empty data row.

    Raises:
        IndexError: if a non-empty row has fewer than seven columns.
        ValueError: if a numeric cell cannot be converted.
    """
    def split_as(cell, convert):
        # One cell carries several values separated by commas.
        return [convert(item) for item in cell.split(',')]

    metaboTable = []
    # newline='' lets the csv module handle line endings itself.
    with open(path_to_library_file, 'r', newline='') as tsv_file:
        tsv_reader = csv.reader(tsv_file, delimiter="\t")
        next(tsv_reader, None)  # skip header; tolerate a completely empty file
        for row in tsv_reader:
            if not any(cell.strip() for cell in row):
                continue  # blank line, nothing to parse
            metaboTable.append(FeatureFinderMetaboIdentCompound(
                row[0],                  # name
                row[1],                  # sum formula
                float(row[2]),           # mass
                split_as(row[3], int),   # charges
                split_as(row[4], float), # RTs
                split_as(row[5], float), # RT ranges
                split_as(row[6], float)  # isotope distributions
            ))
    return metaboTable

In [12]:
import glob
from pyopenms import *

# Collect the input files in a stable order so repeated runs process them identically.
input_mzml_files = sorted(glob.glob("results/interim/*.mzML"))

# The compound library does not depend on the mzML file, so it is read once
# up front instead of once per loop iteration.
metabo_table = metaboTableFromFile('MetaboliteIdentification.tsv')

# Make sure the output folder exists before the first store() call.
ffmi_dir = os.path.join("results", "FFMI", "interim")
os.makedirs(ffmi_dir, exist_ok=True)

for mzml_file in input_mzml_files:
    # load ms data from mzML file into MSExperiment
    spectra = MSExperiment()
    MzMLFile().load(mzml_file, spectra)

    # create FeatureFinderAlgorithmMetaboIdent and assign ms data
    ff = FeatureFinderAlgorithmMetaboIdent()
    ff.setMSData(spectra)

    # FeatureMap to store results
    fm = FeatureMap()

    # edit some parameters (values are passed to the algorithm unchanged;
    # see the FeatureFinderMetaboIdent parameter documentation for their units)
    params = ff.getParameters()
    params[b'extract:mz_window'] = 5.0
    params[b'model:type'] = b'none'
    #params[b'extract:rt_window'] = 5.0
    params[b'detect:peak_width'] = 60.0
    ff.setParameters(params)

    # run the FeatureFinderMetaboIdent with the metabo_table and store results in fm
    ff.run(metabo_table, fm)

    # save FeatureMap to file
    # [19:-5] drops the first 19 characters of the file name and the trailing
    # ".mzML"; presumably the inputs share a fixed-length prefix -- TODO confirm.
    ff_file = os.path.join(ffmi_dir, 'FFMI_' + os.path.basename(mzml_file)[19:-5] + ".featureXML")
    FeatureXMLFile().store(ff_file, fm)

RT window size calculated as 240 seconds.
Extracting chromatograms...
<RT window size calculated as 240 seconds.> occurred 2 times
Detecting chromatographic peaks...
Found 4 feature candidates in total.
3 features left after selection of best candidates.
No overlaps between features found.
RT window size calculated as 240 seconds.
Extracting chromatograms...


Found no signal. The Gaussian width is probably smaller than the spacing in your chromatogram data. Try to use a bigger width. The error occurred in the chromatogram with m/z time 566.277276466771013.
Found no signal. The Gaussian width is probably smaller than the spacing in your chromatogram data. Try to use a bigger width. The error occurred in the chromatogram with m/z time 567.280631304570989.
Found no signal. The Gaussian width is probably smaller than the spacing in your chromatogram data. Try to use a bigger width. The error occurred in the chromatogram with m/z time 561.357276466771054.
Found no signal. The Gaussian width is probably smaller than the spacing in your chromatogram data. Try to use a bigger width. The error occurred in the chromatogram with m/z time 562.360631304571029.
Found no signal. The Gaussian width is probably smaller than the spacing in your chromatogram data. Try to use a bigger width. The error occurred in the chromatogram with m/z time 184.180631304570

<RT window size calculated as 240 seconds.> occurred 2 times
Detecting chromatographic peaks...
Found 16 feature candidates in total.
11 features left after selection of best candidates.
No overlaps between features found.
RT window size calculated as 240 seconds.


Found no signal. The Gaussian width is probably smaller than the spacing in your chromatogram data. Try to use a bigger width. The error occurred in the chromatogram with m/z time 562.360631304571029.
Found no signal. The Gaussian width is probably smaller than the spacing in your chromatogram data. Try to use a bigger width. The error occurred in the chromatogram with m/z time 183.177276466770991.
Found no signal. The Gaussian width is probably smaller than the spacing in your chromatogram data. Try to use a bigger width. The error occurred in the chromatogram with m/z time 184.180631304570994.
Found no signal. The Gaussian width is probably smaller than the spacing in your chromatogram data. Try to use a bigger width. The error occurred in the chromatogram with m/z time 197.117276466771017.
Found no signal. The Gaussian width is probably smaller than the spacing in your chromatogram data. Try to use a bigger width. The error occurred in the chromatogram with m/z time 198.120631304571

Extracting chromatograms...
<RT window size calculated as 240 seconds.> occurred 2 times
Detecting chromatographic peaks...
Found 3 feature candidates in total.
3 features left after selection of best candidates.
No overlaps between features found.


In [13]:
from collections import defaultdict
from functools import reduce
from pathlib import Path
from time import perf_counter
import sys

from IPython.core.display import display
from pandas import CategoricalDtype
import numpy as np
from pyopenms import *
import pandas as pd
import os

# NumPy dtype codes for well-known feature meta values, keyed by the meta
# value name as bytes. FeatureMapDF.get_df looks names up here and falls back
# to a 'U50' text column for anything that is not listed.
common_meta_value_types = dict((
    (b'label', 'U30'),
    (b'spectrum_index', 'i'),
    (b'score_fit', 'f'),
    (b'score_correlation', 'f'),
    (b'FWHM', 'f'),
    (b'spectrum_native_id', 'U30'),
    (b"num_of_masstraces", "f"),
))

class FeatureMapDF(FeatureMap):
    """FeatureMap that can export its features as a pandas DataFrame."""

    def __init__(self):
        super().__init__()

    # meta_values = None (default), 'all' or list of meta value names
    def get_df(self, meta_values = None):
        """Build a DataFrame with one row per feature in this map.

        The frame is indexed by the feature's unique id, sorted by "mz", and
        has the columns charge, RT, mz, RTstart, RTend, PeptideRef, quality
        and intensity, followed by one column per requested meta value.
        RTstart/RTend are taken from the first coordinate of the two corners
        returned by the convex hull's bounding box.

        Args:
            meta_values: None (default) for no extra columns, 'all' to add
                every meta value key found on any feature, or an iterable of
                meta value names (bytes or str).

        Returns:
            pandas.DataFrame as described above.

        Note:
            A feature that lacks a requested meta value gets NaN in that
            column. For the integer-typed entries of common_meta_value_types
            NumPy cannot store NaN, so those values must be present on every
            feature.
        """
        def as_text(name):
            # Column names are text even when the key arrives as bytes.
            return name.decode() if isinstance(name, bytes) else str(name)

        def as_key(name):
            # common_meta_value_types is keyed by bytes.
            return name if isinstance(name, bytes) else str(name).encode()

        # get all possible meta value keys
        if meta_values == 'all':
            found = set()
            for f in self:
                keys = []
                f.getKeys(keys)
                found.update(keys)
            # fixed order so the column layout is reproducible between runs
            meta_values = sorted(found, key=as_text)
        elif not meta_values:  # if None, set to empty list
            meta_values = []
        else:
            # drop duplicates (they would produce duplicate field names) but keep order
            meta_values = list(dict.fromkeys(meta_values))

        def feature_row(f: Feature):
            pep = f.getPeptideIdentifications()  # type: list[PeptideIdentification]
            bb = f.getConvexHull().getBoundingBox2D()

            # sequence of the first hit of the first identification, if there is one
            sequence = None
            if len(pep) != 0:
                hits = pep[0].getHits()
                if len(hits) != 0:
                    sequence = hits[0].getSequence().toString()

            vals = [f.getMetaValue(m) if f.metaValueExists(m) else np.nan for m in meta_values]

            # The tuple layout must match mddtypes below, including the trailing
            # meta values.
            return (f.getUniqueId(), sequence, f.getCharge(), f.getRT(), f.getMZ(),
                    bb[0][0], bb[1][0], f.getMetaValue("PeptideRef"),
                    f.getOverallQuality(), f.getIntensity(), *vals)

        mddtypes = [('id', np.dtype('uint64')), ('sequence', 'U200'), ('charge', 'i4'), ('RT', 'f'), ('mz', 'f'),
                    ('RTstart', 'f'), ('RTend', 'f'), ("PeptideRef", 'U200'),
                    ('quality', 'f'), ('intensity', 'f')]

        for meta_value in meta_values:
            # known meta values get their dedicated dtype, everything else is text
            mddtypes.append((as_text(meta_value), common_meta_value_types.get(as_key(meta_value), 'U50')))

        mdarr = np.fromiter((feature_row(f) for f in self), dtype=mddtypes, count=self.size())
        df = pd.DataFrame(mdarr).set_index('id').sort_values("mz").drop(columns="sequence")
        #df= df[df["num_of_masstraces"]>=2]
        return df

In [14]:
# Collect the feature files in a stable order so repeated runs process them identically.
input_feature_files = sorted(glob.glob('results/FFMI/interim/*.featureXML'))

# Make sure the output folder exists before the first to_csv() call.
features_dir = os.path.join("results", "features")
os.makedirs(features_dir, exist_ok=True)

for filename in input_feature_files:
    # load the stored features and convert them to a DataFrame
    fmap = FeatureMapDF()
    FeatureXMLFile().load(filename, fmap)
    DF = fmap.get_df()

    # features_<input name without ".featureXML">.csv
    stem = os.path.splitext(os.path.basename(filename))[0]
    feature_csv = os.path.join(features_dir, 'features_' + stem + ".csv")
    DF.to_csv(feature_csv)
    display(DF)

Unnamed: 0_level_0,charge,RT,mz,RTstart,RTend,PeptideRef,quality,intensity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1339109921760416090,1,170.770813,155.077271,151.374359,189.513565,pyracrimycinB_m154.070007_z1_rt69.540001,-39.089714,2090.289
7380814914779683766,1,419.851776,183.177277,397.155243,442.770538,Geosmin_m182.169998_z1_rt383.359985,-46.314556,684250.5
16462145729752871259,1,97.29303,197.117279,74.907112,131.431396,GermicidinA_m196.110001_z1_rt52.169998,1.25164,217918800.0


Unnamed: 0_level_0,charge,RT,mz,RTstart,RTend,PeptideRef,quality,intensity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
7729778572771227138,1,112.350494,155.077271,88.442154,138.2444,pyracrimycinB_m154.070007_z1_rt69.540001,-46.945381,56278.04
2347631166783273420,1,85.384026,402.207275,78.484856,94.855621,bioxalomycina2_m401.200012_z1_rt48.18,-1.147034,47812.77
4963020928692728669,1,96.082985,474.207275,84.971153,107.522148,Unknown_m473.200012_z1_rt44.200001,1.16475,51872.55
13345230329355082380,1,205.358475,566.277283,182.577362,228.045013,Coelichelin_m565.27002_z1_rt268.049988,1.699761,28393.19
5129719394038064128,1,322.761993,585.3573,299.740387,345.75885,dehydroxynocardamine_m584.349976_z1_rt351.070007,-2.600719,303829.1
214447840835502322,1,292.005249,601.3573,268.920013,318.552521,desferrioxamineE_m600.349976_z1_rt329.5,-45.063755,93752.97
218027988267685097,2,379.832764,712.417297,350.739044,410.304291,epemicinB_m1422.819946_z2_rt405.769989,-4.927414,5534543000.0
7031124889291022658,2,367.947662,793.447266,325.8526,405.027466,epemicinA_m1584.880005_z2_rt386.850006,1.013541,7007122000.0
1810027935436755859,1,562.085938,797.417297,536.413635,594.306641,Kirromycin_m796.409973_z1_rt618.98999,-1.465651,1030751.0
7297192142527356428,1,379.606842,1423.827271,356.964844,402.691406,epemicinB_m1422.819946_z1_rt405.769989,1.255799,89919900.0


Unnamed: 0_level_0,charge,RT,mz,RTstart,RTend,PeptideRef,quality,intensity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
4751818285601435604,1,46.820068,155.077271,42.465252,72.085068,pyracrimycinB_m154.070007_z1_rt69.540001,-31.816729,4516.973633
9210174291602166478,1,43.790009,197.117279,42.465252,67.320007,GermicidinA_m196.110001_z1_rt52.169998,-1.258918,34066.988281
12062139129896561461,1,231.206726,561.3573,208.560028,253.926498,DesferrioxamineB_m560.349976_z1_rt225.770004,-44.783695,44792.410156
