In [1]:
from pyopenms import *
import pandas as pd
import numpy as np

Determination of memory status is not supported on this 
 platform, measuring for memoryleaks will never fail


In [3]:
class ConsensusMapDF(ConsensusMap):

    def __init__(self):
        super().__init__()

    def get_intensity_df(self):
        labelfree = self.getExperimentType() == "label-free"
        filemeta = self.getColumnHeaders()  # type: dict[int, ColumnHeader]
        labels = list(set([header.label for header in
                           filemeta.values()]))  # TODO could be more efficient. Do we require same channels in all files?
        files = list(set([header.filename for header in filemeta.values()]))
        label_to_idx = {k: v for v, k in enumerate(labels)}
        file_to_idx = {k: v for v, k in enumerate(files)}

        def gen(cmap: ConsensusMap, fun):
            for f in cmap:
                yield from fun(f)

        if not labelfree:
            # TODO write two functions for LF and labelled. One has only one channel, the other has only one file per CF
            def extractRowBlocksChannelWideFileLong(f: ConsensusFeature):
                subfeatures = f.getFeatureList()  # type: list[FeatureHandle]
                filerows = defaultdict(lambda: [0] * len(labels))  # TODO use numpy array?
                for fh in subfeatures:
                    header = filemeta[fh.getMapIndex()]
                    row = filerows[header.filename]
                    row[label_to_idx[header.label]] = fh.getIntensity()
                return (f.getUniqueId(), filerows)

            def extractRowsChannelWideFileLong(f: ConsensusFeature):
                uniqueid, rowdict = extractRowBlocksChannelWideFileLong(f)
                for file, row in rowdict.items():
                    row.append(file)
                    yield tuple([uniqueid] + row)

            if len(labels) == 1:
                labels[0] = "intensity"
            dtypes = [('id', np.dtype('uint64'))] + list(zip(labels, ['f'] * len(labels)))
            dtypes.append(('file', 'U300'))
            # For TMT we know that every feature can only be from one file, since feature = PSM
            #cnt = 0
            #for f in self:
            #    cnt += f.size()

            intyarr = np.fromiter(iter=gen(self, extractRowsChannelWideFileLong), dtype=dtypes, count=self.size())
            return pd.DataFrame(intyarr).set_index('id')
        else:
            # Specialized for LabelFree which has to have only one channel
            def extractRowBlocksChannelLongFileWideLF(f: ConsensusFeature):
                subfeatures = f.getFeatureList()  # type: list[FeatureHandle]
                row = [0.] * len(files)  # TODO use numpy array?
                for fh in subfeatures:
                    header = filemeta[fh.getMapIndex()]
                    row[file_to_idx[header.filename]] = fh.getIntensity()
                yield tuple([f.getUniqueId()] + row)

            dtypes = [('id', np.dtype('uint64'))] + list(zip(files, ['f'] * len(files)))
            # cnt = self.size()*len(files) # TODO for this to work, we would need to fill with NAs for CFs that do not go over all files
            cnt = self.size()

            intyarr = np.fromiter(iter=gen(self, extractRowBlocksChannelLongFileWideLF), dtype=dtypes, count=cnt)
            return pd.DataFrame(intyarr).set_index('id')

    def get_metadata_df(self):
        def gen(cmap: ConsensusMap, fun):
            for f in cmap:
                yield from fun(f)

        def extractMetaData(f: ConsensusFeature):
            # subfeatures = f.getFeatureList()  # type: list[FeatureHandle]
            pep = f.getPeptideIdentifications()  # type: list[PeptideIdentification]
            if len(pep) != 0:
                hits = pep[0].getHits()
                if len(hits) != 0:
                    besthit = hits[0]  # type: PeptideHit
                    # TODO what else
                    yield f.getUniqueId(), besthit.getSequence().toString(), f.getCharge(), f.getRT(), f.getMZ(), f.getQuality(), f.getWidth()
                else:
                    yield f.getUniqueId(), None, f.getCharge(), f.getRT(), f.getMZ(), f.getQuality(), f.getWidth()
            else:
                yield f.getUniqueId(), None, f.getCharge(), f.getRT(), f.getMZ(), f.getQuality(), f.getWidth()

        cnt = self.size()

        mddtypes = [('id', np.dtype('uint64')), ('sequence', 'U200'), ('charge', 'f'), ('RT', 'f'), ('mz', 'f'),
                    ('quality', 'f'), ('width', 'f')]
        mdarr = np.fromiter(iter=gen(self, extractMetaData), dtype=mddtypes, count=cnt)
        return pd.DataFrame(mdarr).set_index('id').drop(columns="sequence")

In [108]:
input_consensus= "results/GNPSexport/interim/filtered.consensusXML"
cmap = ConsensusMap()
ConsensusXMLFile().load(input_consensus, cmap)

file_descriptions = cmap.getColumnHeaders()

consensus_map= ConsensusMapDF()
for f in cmap:
    consensus_map.push_back(f)
    # get intensities as a DataFrame
    consensus_map.setColumnHeaders(file_descriptions)

    intensities = consensus_map.get_intensity_df()

    # get meta data as DataFrame
    meta_data = consensus_map.get_metadata_df()

    # you can concatenate these two for a "result" DataFrame
    result = pd.concat([meta_data, intensities], axis=1)

    # if you don't need labeled index, remove it (and/or save with index = False)
    result.reset_index(drop=True, inplace=True)

    idx = 0
    new_col = "CONSENSUS"  # can be a list, a Series, an array or a scalar   
    result.insert(loc=idx, column='#CONSENSUS', value=new_col)   
    result= result.rename(columns= {"charge": "charge_cf", "RT": "rt_cf", "mz": "mz_cf", "quality": "quality_cf", "width": "width_cf"})

In [109]:
print(result)

     #CONSENSUS  charge_cf       rt_cf       mz_cf  quality_cf  width_cf  \
0     CONSENSUS        1.0  296.581818  393.223236    0.000446       0.0   
1     CONSENSUS        1.0  141.109558  379.207520    0.000301       0.0   
2     CONSENSUS        1.0   79.803093  254.161209    0.003344       0.0   
3     CONSENSUS        1.0  315.842651  407.238861    0.000433       0.0   
4     CONSENSUS        1.0  151.219208  328.139069    0.001450       0.0   
...         ...        ...         ...         ...         ...       ...   
6682  CONSENSUS        1.0  144.461273  450.255981    0.000009       0.0   
6683  CONSENSUS        1.0  364.670105  380.218140    0.000024       0.0   
6684  CONSENSUS        1.0  283.771118  482.272247    0.000082       0.0   
6685  CONSENSUS        1.0  343.010742  399.223816    0.000128       0.0   
6686  CONSENSUS        1.0  245.589066  361.207977    0.000035       0.0   

      results/interim/PCpeak_20211006_UMETAB222_POS_41_DNPM_Plate-11_NBC_00843_blank.mz

In [110]:
filemeta= cmap.getColumnHeaders()
mapIDs = [k for k in filemeta.keys()]
filename= []
size=[]
label= []
for header in filemeta.values():
    files= header.filename
    sizes= header.size
    labels= header.label
    filename.append(files)
    size.append(sizes)
    label.append(labels)

dict = {'id': mapIDs, 'filename': filename,'label': label,'size': size}
DF= pd.DataFrame(dict)
idx = 0
new_col = "MAP"  # can be a list, a Series, an array or a scalar   
DF.insert(loc=idx, column='#MAP', value=new_col)   
DF

Unnamed: 0,#MAP,id,filename,label,size
0,MAP,0,results/interim/PCpeak_20210826_UMETAB219_POS_...,,2776
1,MAP,1,results/interim/PCpeak_20210826_UMETAB219_POS_...,,1253
2,MAP,2,results/interim/PCpeak_20210826_UMETAB219_POS_...,,1345
3,MAP,3,results/interim/PCpeak_20210826_UMETAB219_POS_...,,1353
4,MAP,4,results/interim/PCpeak_20210826_UMETAB219_POS_...,,3290
5,MAP,5,results/interim/PCpeak_20210826_UMETAB219_POS_...,,2979
6,MAP,6,results/interim/PCpeak_20210826_UMETAB219_POS_...,,3070
7,MAP,7,results/interim/PCpeak_20210826_UMETAB219_POS_...,,3044
8,MAP,8,results/interim/PCpeak_20210826_UMETAB219_POS_...,,1351
9,MAP,9,results/interim/PCpeak_20210826_UMETAB219_POS_...,,1026


In [113]:
cols= result.columns
for col in cols:
    for i, path in enumerate(filename):
        if path== col:
            name= DF["id"][i]
            result.rename(columns={col: name}, inplace=True)

df_flt= result.filter(regex=fr"[0-9]").sort_index(axis=1)
cols_flt= df_flt.columns
for col_flt in cols_flt:
    for col in cols:
        if col == col_flt:
            col_new = "intensity_"+ str(col)
            result.rename(columns={col: col_new}, inplace=True)
result

Unnamed: 0,#CONSENSUS,charge_cf,rt_cf,mz_cf,quality_cf,width_cf,intensity_31,intensity_41,intensity_19,intensity_16,...,intensity_44,intensity_37,intensity_35,intensity_5,intensity_18,intensity_2,intensity_21,intensity_11,intensity_4,intensity_6
0,CONSENSUS,1.0,296.581818,393.223236,0.000446,0.0,1.300701e+06,5242951.0,4781326.0,8919085.0,...,3228996.0,9957446.0,1.795107e+07,22427880.0,6956869.0,13423620.0,2087019.0,3217179.0,30551950.0,23460470.0
1,CONSENSUS,1.0,141.109558,379.207520,0.000301,0.0,9.040833e+05,2025389.0,2618161.0,2824778.0,...,3579418.0,6640277.0,3.621176e+06,8378248.0,2312600.0,9434231.0,4913860.0,3032641.0,12793220.0,8881861.0
2,CONSENSUS,1.0,79.803093,254.161209,0.003344,0.0,1.113518e+08,16124980.0,7042243.0,12672740.0,...,13681260.0,79692448.0,4.539911e+07,49578168.0,7442163.0,137001200.0,10477380.0,33497830.0,48087800.0,55878080.0
3,CONSENSUS,1.0,315.842651,407.238861,0.000433,0.0,8.482626e+05,2733415.0,4937243.0,6982376.0,...,3818414.0,15664220.0,5.336683e+06,15748400.0,2697249.0,19377870.0,1094928.0,2050646.0,19822080.0,15444460.0
4,CONSENSUS,1.0,151.219208,328.139069,0.001450,0.0,6.652861e+06,8776934.0,10517510.0,19129270.0,...,42571260.0,8819608.0,2.678875e+07,50182540.0,9347569.0,53082512.0,13239720.0,12750610.0,55692720.0,48305968.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6682,CONSENSUS,1.0,144.461273,450.255981,0.000009,0.0,0.000000e+00,0.0,0.0,0.0,...,0.0,0.0,3.391787e+05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6683,CONSENSUS,1.0,364.670105,380.218140,0.000024,0.0,0.000000e+00,0.0,0.0,0.0,...,0.0,0.0,8.230814e+05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6684,CONSENSUS,1.0,283.771118,482.272247,0.000082,0.0,0.000000e+00,0.0,0.0,0.0,...,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6685,CONSENSUS,1.0,343.010742,399.223816,0.000128,0.0,0.000000e+00,0.0,0.0,0.0,...,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [114]:
result= result.sort_index(axis=1)
cols = result.columns
preordered = ["#CONSENSUS", "charge_cf", "rt_cf", "mz_cf", "quality_cf", "width_cf"]
new_cols = preordered + [c for c in result.columns if c not in preordered]
new_df = result.reindex(columns=new_cols)
new_df.to_csv('results/GNPSexport/FeatureQuantificationTable.txt', sep = '\t', index = False)
print(new_df)

     #CONSENSUS  charge_cf       rt_cf       mz_cf  quality_cf  width_cf  \
0     CONSENSUS        1.0  296.581818  393.223236    0.000446       0.0   
1     CONSENSUS        1.0  141.109558  379.207520    0.000301       0.0   
2     CONSENSUS        1.0   79.803093  254.161209    0.003344       0.0   
3     CONSENSUS        1.0  315.842651  407.238861    0.000433       0.0   
4     CONSENSUS        1.0  151.219208  328.139069    0.001450       0.0   
...         ...        ...         ...         ...         ...       ...   
6682  CONSENSUS        1.0  144.461273  450.255981    0.000009       0.0   
6683  CONSENSUS        1.0  364.670105  380.218140    0.000024       0.0   
6684  CONSENSUS        1.0  283.771118  482.272247    0.000082       0.0   
6685  CONSENSUS        1.0  343.010742  399.223816    0.000128       0.0   
6686  CONSENSUS        1.0  245.589066  361.207977    0.000035       0.0   

      intensity_0  intensity_1  intensity_10  intensity_11  ...  intensity_43  \
0     