In [2]:
from pyopenms import *
import pyopenms as pms
import pandas as pd
import numpy as np

Determination of memory status is not supported on this 
 platform, measuring for memoryleaks will never fail


In [3]:
class ConsensusMapDF(ConsensusMap):
    """ConsensusMap subclass that exports its consensus features as pandas DataFrames.

    ``get_intensity_df`` returns the per-file / per-channel intensities and
    ``get_metadata_df`` returns per-feature metadata. Both frames are indexed
    by the consensus feature's unique id.
    """

    def __init__(self):
        super().__init__()

    @staticmethod
    def _iter_rows(cmap, row_fun):
        """Chain the row tuples that ``row_fun`` yields for every feature in ``cmap``."""
        for feature in cmap:
            yield from row_fun(feature)

    def get_intensity_df(self):
        """Build a DataFrame of sub-feature intensities, indexed by feature id.

        Label-free experiments (``getExperimentType() == "label-free"``) give
        one row per consensus feature and one float column per input file.
        Otherwise the frame has one row per (consensus feature, file), one
        float column per channel label (named ``"intensity"`` when there is
        only a single label) and a trailing ``file`` column.

        Cells without a matching sub-feature are filled with 0.

        Returns:
            pandas.DataFrame indexed by ``id``.
        """
        # `defaultdict` is not imported anywhere in the visible notebook, so
        # bring it into scope here; without it the labelled branch fails.
        from collections import defaultdict

        labelfree = self.getExperimentType() == "label-free"
        filemeta = self.getColumnHeaders()  # type: dict[int, ColumnHeader]

        # Walk the headers in ascending map index and de-duplicate in
        # first-seen order. Going through set() made the column order depend
        # on string hashing, i.e. it could change between kernel restarts.
        headers = [filemeta[map_idx] for map_idx in sorted(filemeta)]
        labels = list(dict.fromkeys(header.label for header in headers))
        files = list(dict.fromkeys(header.filename for header in headers))
        label_to_idx = {label: pos for pos, label in enumerate(labels)}
        file_to_idx = {filename: pos for pos, filename in enumerate(files)}

        if labelfree:
            # Specialized for label-free, which has to have only one channel:
            # files go wide, exactly one row per consensus feature.
            def extract_wide_row(f: ConsensusFeature):
                row = [0.] * len(files)
                for fh in f.getFeatureList():
                    header = filemeta[fh.getMapIndex()]
                    row[file_to_idx[header.filename]] = fh.getIntensity()
                yield tuple([f.getUniqueId()] + row)

            dtypes = [('id', np.dtype('uint64'))] + [(filename, 'f') for filename in files]
            intyarr = np.fromiter(iter=self._iter_rows(self, extract_wide_row),
                                  dtype=dtypes, count=self.size())
            return pd.DataFrame(intyarr).set_index('id')

        # Labelled: channels go wide, files go long.
        def extract_long_rows(f: ConsensusFeature):
            filerows = defaultdict(lambda: [0] * len(labels))
            for fh in f.getFeatureList():
                header = filemeta[fh.getMapIndex()]
                filerows[header.filename][label_to_idx[header.label]] = fh.getIntensity()
            uniqueid = f.getUniqueId()
            for filename, row in filerows.items():
                yield tuple([uniqueid] + row + [filename])

        # Column names only; label_to_idx above still uses the real labels.
        channel_columns = ["intensity"] if len(labels) == 1 else labels
        dtypes = ([('id', np.dtype('uint64'))]
                  + [(column, 'f') for column in channel_columns]
                  + [('file', 'U300')])

        # No `count` here: a feature yields one row per file it covers, so the
        # row total can differ from self.size(). A fixed count would silently
        # drop surplus rows or raise when there are too few.
        intyarr = np.fromiter(iter=self._iter_rows(self, extract_long_rows), dtype=dtypes)
        return pd.DataFrame(intyarr).set_index('id')

    def get_metadata_df(self):
        """Build a DataFrame of per-feature metadata, indexed by feature id.

        Columns: ``sequence`` (sequence string of the first hit of the first
        peptide identification, when there is one), ``charge``, ``RT``,
        ``mz`` and ``quality``. One row per consensus feature.

        Returns:
            pandas.DataFrame indexed by ``id``.
        """
        def extract_meta_data(f: ConsensusFeature):
            pep = f.getPeptideIdentifications()  # type: list[PeptideIdentification]
            hits = pep[0].getHits() if len(pep) != 0 else []
            # NOTE(review): features without a hit pass None into a 'U200'
            # field; presumably NumPy stores that as the text 'None' - confirm.
            sequence = hits[0].getSequence().toString() if len(hits) != 0 else None
            # TODO what else
            yield f.getUniqueId(), sequence, f.getCharge(), f.getRT(), f.getMZ(), f.getQuality()

        mddtypes = [('id', np.dtype('uint64')), ('sequence', 'U200'), ('charge', 'f'), ('RT', 'f'), ('mz', 'f'),
                    ('quality', 'f')]
        mdarr = np.fromiter(iter=self._iter_rows(self, extract_meta_data), dtype=mddtypes, count=self.size())
        return pd.DataFrame(mdarr).set_index('id')

In [5]:
# Path of the consensusXML file to load (relative to the notebook's working directory).
input_consensus= "results/consensus/filtered_pyopenms.consensusXML"
# Load the file into the DataFrame-capable ConsensusMap subclass defined above.
cmap = ConsensusMapDF()
ConsensusXMLFile().load(input_consensus, cmap)
# Show the intensity table as a quick check of what was loaded.
display(cmap.get_intensity_df())


Unnamed: 0_level_0,results/interim/precursorcorrected_GermicidinB.mzML,results/interim/precursorcorrected_Epemicins.mzML,results/interim/precursorcorrected_GermicidinA.mzML
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,8.466705e+06,0.000000e+00,6556507.0
0,0.000000e+00,8.264692e+05,0.0
0,0.000000e+00,5.479869e+06,0.0
0,0.000000e+00,5.667535e+06,0.0
0,0.000000e+00,2.320436e+06,0.0
...,...,...,...
0,8.501905e+07,0.000000e+00,0.0
0,1.287144e+10,0.000000e+00,0.0
0,1.005677e+06,0.000000e+00,0.0
0,4.475200e+07,0.000000e+00,0.0


In [11]:
# Read the column headers (per-file metadata) and write them back unchanged.
file_descriptions = cmap.getColumnHeaders()
cmap.setColumnHeaders(file_descriptions)

# get intensities as a DataFrame
intensities = cmap.get_intensity_df()

# get meta data as DataFrame
meta_data = cmap.get_metadata_df()[['RT', 'mz', "charge"]]

# Both frames are built by iterating over the same map, so their rows line up
# by position. Join them positionally rather than on the 'id' index: the ids
# are not unique in this data (all 0 in the table shown above), and aligning
# on a non-unique index only works while both indexes are exactly identical.
result = pd.concat(
    [meta_data.reset_index(drop=True), intensities.reset_index(drop=True)],
    axis=1,
)

# store as tsv file (without the positional index)
result.to_csv('results/consensus/Filtered_Consensus.tsv', sep='\t', index=False)
result
        

Unnamed: 0,RT,mz,charge,results/interim/precursorcorrected_GermicidinB.mzML,results/interim/precursorcorrected_Epemicins.mzML,results/interim/precursorcorrected_GermicidinA.mzML
0,538.329712,338.341888,1.0,8.466705e+06,0.000000e+00,6556507.0
1,751.013062,906.672852,1.0,0.000000e+00,8.264692e+05,0.0
2,214.965652,374.228729,1.0,0.000000e+00,5.479869e+06,0.0
3,412.114746,384.295776,1.0,0.000000e+00,5.667535e+06,0.0
4,452.031586,425.311188,1.0,0.000000e+00,2.320436e+06,0.0
...,...,...,...,...,...,...
1039,-1274.846924,150.951096,1.0,8.501905e+07,0.000000e+00,0.0
1040,-97.074303,183.101624,1.0,1.287144e+10,0.000000e+00,0.0
1041,444.768036,235.169296,1.0,1.005677e+06,0.000000e+00,0.0
1042,-1274.846924,223.943649,1.0,4.475200e+07,0.000000e+00,0.0
