In [1]:
from pyopenms import *
import pandas as pd
import numpy as np

Determination of memory status is not supported on this 
 platform, measuring for memoryleaks will never fail


In [3]:
class ConsensusMapDF(ConsensusMap):

    def __init__(self):
        super().__init__()

    def get_intensity_df(self):
        labelfree = self.getExperimentType() == "label-free"
        filemeta = self.getColumnHeaders()  # type: dict[int, ColumnHeader]
        labels = list(set([header.label for header in
                           filemeta.values()]))  # TODO could be more efficient. Do we require same channels in all files?
        files = list(set([header.filename for header in filemeta.values()]))
        label_to_idx = {k: v for v, k in enumerate(labels)}
        file_to_idx = {k: v for v, k in enumerate(files)}

        def gen(cmap: ConsensusMap, fun):
            for f in cmap:
                yield from fun(f)

        if not labelfree:
            # TODO write two functions for LF and labelled. One has only one channel, the other has only one file per CF
            def extractRowBlocksChannelWideFileLong(f: ConsensusFeature):
                subfeatures = f.getFeatureList()  # type: list[FeatureHandle]
                filerows = defaultdict(lambda: [0] * len(labels))  # TODO use numpy array?
                for fh in subfeatures:
                    header = filemeta[fh.getMapIndex()]
                    row = filerows[header.filename]
                    row[label_to_idx[header.label]] = fh.getIntensity()
                return (f.getUniqueId(), filerows)

            def extractRowsChannelWideFileLong(f: ConsensusFeature):
                uniqueid, rowdict = extractRowBlocksChannelWideFileLong(f)
                for file, row in rowdict.items():
                    row.append(file)
                    yield tuple([uniqueid] + row)

            if len(labels) == 1:
                labels[0] = "intensity"
            dtypes = [('id', np.dtype('uint64'))] + list(zip(labels, ['f'] * len(labels)))
            dtypes.append(('file', 'U300'))
            # For TMT we know that every feature can only be from one file, since feature = PSM
            #cnt = 0
            #for f in self:
            #    cnt += f.size()

            intyarr = np.fromiter(iter=gen(self, extractRowsChannelWideFileLong), dtype=dtypes, count=self.size())
            return pd.DataFrame(intyarr).set_index('id')
        else:
            # Specialized for LabelFree which has to have only one channel
            def extractRowBlocksChannelLongFileWideLF(f: ConsensusFeature):
                subfeatures = f.getFeatureList()  # type: list[FeatureHandle]
                row = [0.] * len(files)  # TODO use numpy array?
                for fh in subfeatures:
                    header = filemeta[fh.getMapIndex()]
                    row[file_to_idx[header.filename]] = fh.getIntensity()
                yield tuple([f.getUniqueId()] + row)

            dtypes = [('id', np.dtype('uint64'))] + list(zip(files, ['f'] * len(files)))
            # cnt = self.size()*len(files) # TODO for this to work, we would need to fill with NAs for CFs that do not go over all files
            cnt = self.size()

            intyarr = np.fromiter(iter=gen(self, extractRowBlocksChannelLongFileWideLF), dtype=dtypes, count=cnt)
            return pd.DataFrame(intyarr).set_index('id')

    def get_metadata_df(self):
        def gen(cmap: ConsensusMap, fun):
            for f in cmap:
                yield from fun(f)

        def extractMetaData(f: ConsensusFeature):
            # subfeatures = f.getFeatureList()  # type: list[FeatureHandle]
            pep = f.getPeptideIdentifications()  # type: list[PeptideIdentification]
            if len(pep) != 0:
                hits = pep[0].getHits()
                if len(hits) != 0:
                    besthit = hits[0]  # type: PeptideHit
                    # TODO what else
                    yield f.getUniqueId(), besthit.getSequence().toString(), f.getCharge(), f.getRT(), f.getMZ(), f.getQuality(), f.getWidth()
                else:
                    yield f.getUniqueId(), None, f.getCharge(), f.getRT(), f.getMZ(), f.getQuality(), f.getWidth()
            else:
                yield f.getUniqueId(), None, f.getCharge(), f.getRT(), f.getMZ(), f.getQuality(), f.getWidth()

        cnt = self.size()

        mddtypes = [('id', np.dtype('uint64')), ('sequence', 'U200'), ('charge', 'f'), ('RT', 'f'), ('mz', 'f'),
                    ('quality', 'f'), ('width', 'f')]
        mdarr = np.fromiter(iter=gen(self, extractMetaData), dtype=mddtypes, count=cnt)
        return pd.DataFrame(mdarr).set_index('id').drop(columns="sequence")

In [4]:
input_consensus= "results/GNPSexport/interim/filtered.consensusXML"
cmap = ConsensusMap()
ConsensusXMLFile().load(input_consensus, cmap)
new_map= ConsensusMap(cmap)
new_map.clear(False)
for f in cmap:
    if f.getPeptideIdentifications() !=[]:
        new_map.push_back(f)
        
Consensus_file= os.path.join("results", "", "GNPSexport", "", "interim", "",'filtered_pyopenms' + ".consensusXML")
ConsensusXMLFile().store(Consensus_file, new_map)

file_descriptions = new_map.getColumnHeaders()

consensus_map= ConsensusMapDF()
for f in new_map:
    consensus_map.push_back(f)
    # get intensities as a DataFrame
    consensus_map.setColumnHeaders(file_descriptions)

    intensities = consensus_map.get_intensity_df()

    # get meta data as DataFrame
    meta_data = consensus_map.get_metadata_df()

    # you can concatenate these two for a "result" DataFrame
    result = pd.concat([meta_data, intensities], axis=1)

    # if you don't need labeled index, remove it (and/or save with index = False)
    result.reset_index(drop=True, inplace=True)

    idx = 0
    new_col = "CONSENSUS"  # can be a list, a Series, an array or a scalar   
    result.insert(loc=idx, column='#CONSENSUS', value=new_col)   
    result= result.rename(columns= {"charge": "charge_cf", "RT": "rt_cf", "mz": "mz_cf", "quality": "quality_cf", "width": "width_cf"})

    # store as tsv file
    result.to_csv('results/GNPSexport/FeatureQuantificationTable.txt', sep = '\t', index = False)

ConsensusXMLFile::store():  found 6688 invalid unique ids


In [5]:
filemeta= new_map.getColumnHeaders()
mapIDs = [k for k in filemeta.keys()]
filename= []
size=[]
label= []
for header in filemeta.values():
    files= header.filename
    sizes= header.size
    labels= header.label
    filename.append(files)
    size.append(sizes)
    label.append(labels)

dict = {'id': mapIDs, 'filename': filename,'label': label,'size': size}
DF= pd.DataFrame(dict)
idx = 0
new_col = "MAP"  # can be a list, a Series, an array or a scalar   
DF.insert(loc=idx, column='#MAP', value=new_col)   
DF

Unnamed: 0,#MAP,id,filename,label,size
0,MAP,0,results/interim/PCpeak_20210826_UMETAB219_POS_...,,2776
1,MAP,1,results/interim/PCpeak_20210826_UMETAB219_POS_...,,1253
2,MAP,2,results/interim/PCpeak_20210826_UMETAB219_POS_...,,1345
3,MAP,3,results/interim/PCpeak_20210826_UMETAB219_POS_...,,1353
4,MAP,4,results/interim/PCpeak_20210826_UMETAB219_POS_...,,3290
5,MAP,5,results/interim/PCpeak_20210826_UMETAB219_POS_...,,2979
6,MAP,6,results/interim/PCpeak_20210826_UMETAB219_POS_...,,3070
7,MAP,7,results/interim/PCpeak_20210826_UMETAB219_POS_...,,3044
8,MAP,8,results/interim/PCpeak_20210826_UMETAB219_POS_...,,1351
9,MAP,9,results/interim/PCpeak_20210826_UMETAB219_POS_...,,1026


In [74]:
Fquant= pd.read_csv('results/GNPSexport/FeatureQuantificationTable.txt', sep = '\t')
Fquant

Unnamed: 0,#CONSENSUS,charge_cf,rt_cf,mz_cf,quality_cf,width_cf,results/interim/PCpeak_20211006_UMETAB222_POS_41_DNPM_Plate-11_NBC_00843_blank.mzML,results/interim/PCpeak_PCpeak_20210827_UMETAB219_POS_FPY12_Plate-2_MDNAWGS14_rep1.mzML,results/interim/PCpeak_20210827_UMETAB219_POS_FPY12_Plate-2_MDNAWGS14_rep3.mzML,results/interim/PCpeak_20210827_UMETAB219_POS_FPY12_Plate-2_MDNAWGS14_blank.mzML,...,results/interim/PCpeak_PCpeak_20210827_UMETAB219_POS_ISP2_Plate-2_MDNAWGS14_blank.mzML,results/interim/PCpeak_PCpeak_20210827_UMETAB219_POS_DNPM_Plate-2_MDNAWGS14_rep1.mzML,results/interim/PCpeak_20211006_UMETAB222_POS_6_FPY12_Plate-11_NBC_00843_rep1.mzML,results/interim/PCpeak_20210826_UMETAB219_POS_FPY12_Plate-1_MDNAWGS11_rep1.mzML,results/interim/PCpeak_20210827_UMETAB219_POS_FPY12_Plate-2_MDNAWGS14_rep2.mzML,results/interim/PCpeak_20210826_UMETAB219_POS_DNPM_Plate-1_MDNAWGS11_rep2.mzML,results/interim/PCpeak_20210827_UMETAB219_POS_ISP2_Plate-2_MDNAWGS14_rep1.mzML,results/interim/PCpeak_20210826_UMETAB219_POS_ISP2_Plate-1_MDNAWGS11_rep3.mzML,results/interim/PCpeak_20210826_UMETAB219_POS_FPY12_Plate-1_MDNAWGS11_blank.mzML,results/interim/PCpeak_20210826_UMETAB219_POS_FPY12_Plate-1_MDNAWGS11_rep2.mzML
0,CONSENSUS,1.0,296.58182,393.22324,0.000446,0.0,1300701.0,5242951.0,4781326.0,8919085.0,...,3228996.0,9957446.0,17951070.0,22427880.0,6956869.0,13423620.0,2087019.0,3217179.0,30551950.0,23460470.0
1,CONSENSUS,1.0,141.10956,379.20752,0.000301,0.0,904083.3,2025389.0,2618161.0,2824778.0,...,3579418.0,6640277.0,3621176.0,8378248.0,2312600.0,9434231.0,4913860.0,3032641.0,12793220.0,8881861.0
2,CONSENSUS,1.0,79.80309,254.16121,0.003344,0.0,111351800.0,16124980.0,7042243.0,12672740.0,...,13681260.0,79692450.0,45399110.0,49578170.0,7442163.0,137001200.0,10477380.0,33497830.0,48087800.0,55878080.0
3,CONSENSUS,1.0,315.84265,407.23886,0.000433,0.0,848262.6,2733415.0,4937243.0,6982376.0,...,3818414.0,15664220.0,5336683.0,15748400.0,2697249.0,19377870.0,1094928.0,2050646.0,19822080.0,15444460.0
4,CONSENSUS,1.0,151.21921,328.13907,0.001450,0.0,6652861.0,8776934.0,10517510.0,19129270.0,...,42571260.0,8819608.0,26788750.0,50182540.0,9347569.0,53082510.0,13239720.0,12750610.0,55692720.0,48305970.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6682,CONSENSUS,1.0,144.46127,450.25598,0.000009,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,339178.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6683,CONSENSUS,1.0,364.67010,380.21814,0.000024,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,823081.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6684,CONSENSUS,1.0,283.77112,482.27225,0.000082,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6685,CONSENSUS,1.0,343.01074,399.22382,0.000128,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
cols= Fquant.columns
for col in cols:
    for i, path in enumerate(filename):
        if path== col:
            name= DF["id"][i]
            Fquant.rename(columns={col: name}, inplace=True)

df_flt= Fquant.filter(regex=fr"[0-9]").sort_index(axis=1)
cols_flt= df_flt.columns
for col_flt in cols_flt:
    for col in cols:
        if col == col_flt:
            col_new = "intensity_"+ str(col)
            Fquant.rename(columns={col: col_new}, inplace=True)


intensity_0
intensity_1
intensity_2
intensity_3
intensity_4
intensity_5
intensity_6
intensity_7
intensity_8
intensity_9
intensity_10
intensity_11
intensity_12
intensity_13
intensity_14
intensity_15
intensity_16
intensity_17
intensity_18
intensity_19
intensity_20
intensity_21
intensity_22
intensity_23
intensity_24
intensity_25
intensity_26
intensity_27
intensity_28
intensity_29
intensity_30
intensity_31
intensity_32
intensity_33
intensity_34
intensity_35
intensity_36
intensity_37
intensity_38
intensity_39
intensity_40
intensity_41
intensity_42
intensity_43
intensity_44
intensity_45
intensity_46
intensity_47
     #CONSENSUS  charge_cf      rt_cf      mz_cf  quality_cf  width_cf  \
0     CONSENSUS        1.0  296.58182  393.22324    0.000446       0.0   
1     CONSENSUS        1.0  141.10956  379.20752    0.000301       0.0   
2     CONSENSUS        1.0   79.80309  254.16121    0.003344       0.0   
3     CONSENSUS        1.0  315.84265  407.23886    0.000433       0.0   
4     CONSENSUS 

In [106]:
Fquant= Fquant.sort_index(axis=1)
cols = Fquant.columns
preordered = ["#CONSENSUS", "charge_cf", "rt_cf", "mz_cf", "quality_cf", "width_cf"]
new_cols = preordered + [c for c in Fquant.columns if c not in preordered]
new_df = Fquant.reindex(columns=new_cols)
new_df.to_csv('results/GNPSexport/FeatureQuantificationTable.txt', sep = '\t', index = False)
print(new_df)

     #CONSENSUS  charge_cf      rt_cf      mz_cf  quality_cf  width_cf  \
0     CONSENSUS        1.0  296.58182  393.22324    0.000446       0.0   
1     CONSENSUS        1.0  141.10956  379.20752    0.000301       0.0   
2     CONSENSUS        1.0   79.80309  254.16121    0.003344       0.0   
3     CONSENSUS        1.0  315.84265  407.23886    0.000433       0.0   
4     CONSENSUS        1.0  151.21921  328.13907    0.001450       0.0   
...         ...        ...        ...        ...         ...       ...   
6682  CONSENSUS        1.0  144.46127  450.25598    0.000009       0.0   
6683  CONSENSUS        1.0  364.67010  380.21814    0.000024       0.0   
6684  CONSENSUS        1.0  283.77112  482.27225    0.000082       0.0   
6685  CONSENSUS        1.0  343.01074  399.22382    0.000128       0.0   
6686  CONSENSUS        1.0  245.58907  361.20798    0.000035       0.0   

      intensity_0  intensity_1  intensity_10  intensity_11  ...  intensity_43  \
0       7443680.0   13535850.0