In [1]:
import numpy as np
import pandas as pd

Define a function that reads an MGF file into a list of spectrum dictionaries

In [2]:
def load_mgf(fname):
    '''Read an MGF file into a list of spectrum dictionaries.

    Only the content between "BEGIN IONS" and "END IONS" is parsed. Each
    returned dictionary contains:

    * 'ms_data' - pandas DataFrame with the columns 'm/z' and 'Intensity'
    * 'PEPMASS' - list of floats, [m/z] or [m/z, intensity]
    * 'CHARGE' - signed int ("2+" -> 2, "3-" -> -3, "2" -> 2)
    * 'TITLE', 'RTINSECONDS', 'SCANS' - raw strings, when present in the file

    'PEPMASS' and 'CHARGE' are left untouched when they are absent (or, for
    'CHARGE', empty) instead of raising. Spectra without fragment peaks are
    skipped.

    Parameters
    ----------
    fname : str or path-like
        Path of the MGF file.

    Returns
    -------
    list of dict
        One dictionary per spectrum, in file order.
    '''
    FIELDS = ('TITLE=', 'RTINSECONDS=', 'PEPMASS=', 'CHARGE=', 'SCANS=')

    def format_precursor(spectrum):
        '''Convert the raw PEPMASS / CHARGE strings to numbers, in place.'''
        # split() without arguments accepts tabs and repeated blanks and
        # also covers the case when there is no precursor intensity
        if 'PEPMASS' in spectrum:
            spectrum['PEPMASS'] = [
                float(x) for x in spectrum['PEPMASS'].split()
            ]

        # The polarity may or may not be given after the digits: "2+", "3-" etc.
        charge = spectrum.get('CHARGE', '').strip()
        if charge:
            polarityMultiplier = -1 if charge[-1] == '-' else 1
            if not charge[-1].isnumeric():
                spectrum['CHARGE'] = polarityMultiplier * int(charge[:-1])
            else:
                spectrum['CHARGE'] = int(charge)

    def ms_data_to_df(spectrum):
        '''Replace the list of [m/z, intensity] pairs with a DataFrame, in place.'''
        spectrum['ms_data'] = pd.DataFrame(
            spectrum['ms_data'],
            columns = ('m/z', 'Intensity')
        )

    spectraList = []
    # None while the parser is outside of a BEGIN IONS / END IONS block
    spectrum = None
    with open(fname, 'r') as fh:
        for rawLine in fh:
            line = rawLine.strip()
            if not line:
                continue

            if spectrum is not None and line[0].isnumeric():
                # Peak line: keep m/z and intensity, ignore an optional
                # third (fragment charge) column
                spectrum['ms_data'].append(
                    [ float(x) for x in line.split()[:2] ]
                )
            elif 'BEGIN IONS' in line:
                spectrum = {'ms_data': []}
            elif 'END IONS' in line:
                # Do not add the spectrum to the list if it doesn't contain
                # fragment masses; a stray END IONS is ignored
                if spectrum is not None and len(spectrum['ms_data']) > 0:
                    ms_data_to_df(spectrum)
                    format_precursor(spectrum)
                    spectraList.append(spectrum)
                spectrum = None
            elif spectrum is not None:
                for fieldName in FIELDS:
                    # Match the key at the start of the line only, so that a
                    # value mentioning another key (e.g. "PEPMASS=" inside a
                    # TITLE) cannot overwrite that field
                    if line.startswith(fieldName):
                        spectrum[fieldName[:-1]] = line[len(fieldName):]
                        break

    return spectraList

In [3]:
# MGF file to analyse; the relative path is resolved against the working directory
fname = 'Yeast_1000spectra.mgf'

In [4]:
%%timeit -r 20
# Benchmark: time parsing of the whole file, repeated 20 times
load_mgf(fname)

290 ms ± 7.45 ms per loop (mean ± std. dev. of 20 runs, 1 loop each)


In [5]:
# Parse the file once and keep the spectra for the cells below
res = load_mgf(fname)
# Number of spectra that were loaded
len(res)

1000

In [6]:
# Inspect one parsed spectrum (the dictionary at index 500)
res[500]

{'ms_data':            m/z   Intensity
 0   172.973053  262.676544
 1   175.068008  117.924858
 2   189.044006  170.185852
 3   201.158005  120.846565
 4   221.101013  163.982086
 ..         ...         ...
 95  848.857056   65.001671
 96  849.547058  639.374023
 97  850.563049  558.017273
 98  876.655090  116.027573
 99  919.621033   64.954643
 
 [100 rows x 2 columns],
 'TITLE': 'Fusion_180828_07.13836.13836.2 File:"Fusion_180828_07.raw", NativeID:"controllerType=0 controllerNumber=1 scan=13836"',
 'RTINSECONDS': '2463.272073',
 'PEPMASS': [525.756286621094, 4285863.743134],
 'CHARGE': 2}

In [7]:
# Fragment peak table ('m/z' and 'Intensity' columns) of the same spectrum
res[500]['ms_data']

Unnamed: 0,m/z,Intensity
0,172.973053,262.676544
1,175.068008,117.924858
2,189.044006,170.185852
3,201.158005,120.846565
4,221.101013,163.982086
...,...,...
95,848.857056,65.001671
96,849.547058,639.374023
97,850.563049,558.017273
98,876.655090,116.027573


Let's now check each spectrum for known mass differences

Monoisotopic masses of amino acids:

In [8]:
# Monoisotopic residue masses in Da, keyed by residue label.
# The insertion order matters: downstream code refers to entries by position.
AA_DELTAS = {
    'G': 57.02147,
    'A': 71.03712,
    'S': 87.03203,
    'P': 97.05277,
    'V': 99.06842,
    'T': 101.04768,
    'Ccam': 160.03065,   # presumably carbamidomethylated Cys - TODO confirm
    'Cmes': 148.996912,  # presumably methylthio-modified Cys - TODO confirm
    'I/L': 113.08407,    # Ile and Leu are isobaric, hence one entry
    'N': 114.04293,
    'D': 115.02695,
    'Q': 128.05858,
    'K': 128.09497,
    'E': 129.0426,
    'M': 131.04049,
    'Mox': 147.0354,     # presumably oxidised Met - TODO confirm
    'H': 137.05891,
    'F': 147.06842,
    'R': 156.10112,
    'Y': 163.06333,
    'W': 186.07932,
}

In [9]:
#Flatten the values from the dictionary
singleResDeltas = np.array(
    list( AA_DELTAS.values() ), dtype = 'float64'
)
print(singleResDeltas.dtype)
#Add doubly-charged and triply-charged mass Deltas (simply divide by 2 and 3)
singleResDeltas = np.concatenate(
    (
        singleResDeltas,
        singleResDeltas / 2,
        singleResDeltas / 3
    )
)
print(singleResDeltas.shape)
singleResDeltas[:5]

float64
(63,)


array([57.02147, 71.03712, 87.03203, 97.05277, 99.06842])

Now take the spectra one-by-one, find pairwise mass differences and match them to the list.<br>
* Calculate pairwise differences between the fragment m/z values within each spectrum
* Subtract the experimental mass Deltas from the theoretical
* Calculate relative difference
* Select the cases with the relative difference lower than threshold (matches)
* Summarize and report the matches

In [10]:
def find_matches(spectra, masses_to_match, rel_tolerance = 1e-5, float_arr_type = 'float64'):
    '''Match pairwise fragment m/z differences against a library of mass deltas.

    For every spectrum all pairwise differences between its 'm/z' values are
    calculated; the differences above the smallest library value (lowered by
    the tolerance) are kept and compared with every library value. A pair is
    a match when

        |theoretical - experimental| / mean(theoretical, experimental)

    does not exceed rel_tolerance.

    Parameters
    ----------
    spectra : iterable of dict
        Each item must provide s['ms_data']['m/z'] as a pandas Series.
    masses_to_match : array-like of float
        Library of theoretical mass deltas; must not be empty.
    rel_tolerance : float
        Maximal accepted relative error.
    float_arr_type : str or numpy dtype
        dtype of the 'Rel_error' column.

    Returns
    -------
    pandas.DataFrame
        One row per match with the columns
        * 'Spectrum_idx' (uint32) - position of the spectrum in spectra
        * 'Exp_idx' (uint32) - position within the flattened array of the
          retained pairwise differences of that spectrum; it is NOT a peak index
        * 'Library_idx' (uint32) - position within masses_to_match
        * 'Rel_error' (float_arr_type) - relative error of the match
    '''
    masses_to_match = np.asarray(masses_to_match)

    # Output columns with their dtypes; the dtypes are enforced once at the
    # end, so they do not depend on whether any match was found
    columns = (
        ('Spectrum_idx', 'uint32'),
        ('Exp_idx', 'uint32'),
        ('Library_idx', 'uint32'),
        ('Rel_error', float_arr_type),
    )
    # Per-spectrum result chunks, concatenated once after the loop instead of
    # re-copying the growing arrays for every spectrum
    chunks = {name: [] for name, _ in columns}

    # Calculate the minimal value in the list for matching
    # and offset it by the matching tolerance
    minTheoVal = masses_to_match.min() * (1 - rel_tolerance)

    for idx, s in enumerate(spectra):
        mzValues = s['ms_data']['m/z'].to_numpy()
        # Pairwise differences between experimental values
        expDeltas = np.subtract.outer(mzValues, mzValues)
        # Disregard deltas that are smaller than the lowest theoretical value;
        # the boolean mask also flattens the matrix to one dimension
        expDeltas = expDeltas[expDeltas > minTheoVal]
        # Relative differences between theoretical (rows) and experimental
        # (columns) values
        relDeltasArr = np.divide(
            # Absolute values of the differences between masses
            np.abs(np.subtract.outer(masses_to_match, expDeltas)),
            # Means between the masses
            np.add.outer(masses_to_match, expDeltas) / 2
        )
        libInds, expInds = np.nonzero(relDeltasArr <= rel_tolerance)
        if libInds.size > 0:
            chunks['Spectrum_idx'].append(np.full(libInds.size, idx))
            chunks['Exp_idx'].append(expInds)
            chunks['Library_idx'].append(libInds)
            chunks['Rel_error'].append(relDeltasArr[libInds, expInds])

    resDict = {
        name: (
            np.concatenate(chunks[name]).astype(dtype)
            if chunks[name] else np.array([], dtype=dtype)
        )
        for name, dtype in columns
    }
    return pd.DataFrame(resDict)

In [11]:
%%timeit -r 5
# Benchmark: time matching of all loaded spectra against the library, repeated 5 times
find_matches(res, singleResDeltas, 1e-5)

2.49 s ± 135 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)


In [12]:
# Match all loaded spectra against the library with a relative tolerance of 1e-5
matches = find_matches(res, singleResDeltas, rel_tolerance = 1e-5)
matches

Unnamed: 0,Spectrum_idx,Exp_idx,Library_idx,Rel_error
0,0,2545,16,8.098704e-07
1,1,2552,0,8.814185e-06
2,1,512,10,5.042295e-07
3,1,2289,14,4.011729e-06
4,1,2552,30,8.901871e-06
...,...,...,...,...
1419,994,2060,44,8.570371e-06
1420,997,1829,19,3.892958e-06
1421,997,1075,52,7.985117e-06
1422,999,198,28,6.310218e-06


In [13]:
# Show only the matches found in the spectrum at index 1
matches[ matches['Spectrum_idx'] == 1 ]

Unnamed: 0,Spectrum_idx,Exp_idx,Library_idx,Rel_error
1,1,2552,0,8.814185e-06
2,1,512,10,5.042295e-07
3,1,2289,14,4.011729e-06
4,1,2552,30,8.901871e-06
5,1,431,33,7.617814e-06
6,1,2399,48,8.897006e-06
