In [1]:
using DataFrames
using BenchmarkTools

Implement the logic for reading mgf files.<br>
Try to reproduce the Python function as close as possible

In [2]:
function loadmgf(fname)
    FIELDS = ["TITLE=", "RTINSECONDS=", "PEPMASS=", "CHARGE=", "SCANS="]
    
    function format_precursor!(spectrum)
        if occursin( " ", spectrum["PEPMASS"] )
            spectrum["PEPMASS"] = map( x -> tryparse(Float64, x), split( spectrum["PEPMASS"] , " " ) )
        else
            spectrum["PEPMASS"] = [ tryparse(Float64, spectrum["PEPMASS"] ) ]
        end
        
        local polarity_multiplier = 1
        if spectrum["CHARGE"][end] == '-'
            polarity_multiplier = -1
        end
        
        if isnumeric( spectrum["CHARGE"][end] ) == false
            spectrum["CHARGE"] = polarity_multiplier * tryparse(Int32, spectrum["CHARGE"][1:end-1] )
        else
            spectrum["CHARGE"] = tryparse(Int32, spectrum["CHARGE"])
        end
        return true
    end
    
    function msdata_to_df!(spectrum)
        spectrum["ms_data"] = DataFrame( spectrum["ms_data"] )
        return true
    end
    
    local spectrum
    spectra_array = []
    open(fname) do fh
        state = false
        for line::String in eachline(fh)
            
            if length(line) > 0 && isnumeric( line[1] ) && state == true
                num_line = map( x -> tryparse(Float64, x), split(line, " ") )
                push!(
                    spectrum["ms_data"]["m/z"],
                    num_line[1]
                )
                push!(
                    spectrum["ms_data"]["Intensity"],
                    num_line[2]
                )
            elseif occursin("BEGIN IONS", line)
                spectrum = Dict{String,Any}( [ "ms_data" => Dict(
                            [
                                "m/z"=> Array{Float64}(undef, 0),
                                "Intensity"=> Array{Float64}(undef, 0)
                            ]
                        ) ] )
                state = true
                
            elseif occursin("END IONS", line)
                #println(spectrum)
                if length( spectrum["ms_data"] ) > 0
                    format_precursor!(spectrum)
                    msdata_to_df!(spectrum)
                    push!(spectra_array, spectrum)
                end
                state = false
            else
                for field_name in FIELDS
                    if occursin(field_name, line) && state == true
                        spectrum[ field_name[1:end-1] ] = split(line, field_name)[2]
                    end
                end
            end
        end
    end
    spectra_array
end

loadmgf (generic function with 1 method)

In [3]:
fname = "Yeast_1000spectra.mgf"

"Yeast_1000spectra.mgf"

In [4]:
res = loadmgf(fname)
length(res)

1000

In [5]:
res[501]

Dict{String,Any} with 5 entries:
  "TITLE"       => "Fusion_180828_07.13836.13836.2 File:\"Fusion_180828_07.raw\…
  "RTINSECONDS" => "2463.272073"
  "CHARGE"      => 2
  "ms_data"     => [1m100×2 DataFrame[0m…
  "PEPMASS"     => [525.756, 4.28586e6]

In [6]:
res[501]["ms_data"]

Unnamed: 0_level_0,Intensity,m/z
Unnamed: 0_level_1,Float64,Float64
1,262.677,172.973
2,117.925,175.068
3,170.186,189.044
4,120.847,201.158
5,163.982,221.101
6,66.9147,221.977
7,131.333,249.024
8,117.201,249.966
9,107.119,259.098
10,116.108,274.28


In [8]:
@benchmark loadmgf(fname)

BenchmarkTools.Trial: 
  memory estimate:  48.80 MiB
  allocs estimate:  911621
  --------------
  minimum time:     108.247 ms (6.60% GC)
  median time:      112.938 ms (5.28% GC)
  mean time:        114.375 ms (5.78% GC)
  maximum time:     126.844 ms (4.72% GC)
  --------------
  samples:          44
  evals/sample:     1

Let's now check each spectrum for known mass differences

Monoisotopic masses of amino acids:

In [9]:
AA_DELTAS = Dict(
    "G"=> 57.02147, "A"=> 71.03712, "S"=> 87.03203, "P"=> 97.05277, "V"=> 99.06842,
    "T"=> 101.04768, "Ccam"=> 160.03065, "Cmes"=> 148.996912, "I/L"=> 113.08407,
    "N"=> 114.04293, "D"=> 115.02695, "Q"=> 128.05858, "K"=> 128.09497, "E"=> 129.0426,
    "M"=> 131.04049, "Mox"=> 147.0354, "H"=> 137.05891, "F"=> 147.06842, "R"=> 156.10112,
    "Y"=> 163.06333, "W"=> 186.07932
)

Dict{String,Float64} with 21 entries:
  "Q"    => 128.059
  "Mox"  => 147.035
  "T"    => 101.048
  "P"    => 97.0528
  "W"    => 186.079
  "V"    => 99.0684
  "M"    => 131.04
  "I/L"  => 113.084
  "N"    => 114.043
  "H"    => 137.059
  "A"    => 71.0371
  "D"    => 115.027
  "G"    => 57.0215
  "E"    => 129.043
  "Ccam" => 160.031
  "Y"    => 163.063
  "S"    => 87.032
  "Cmes" => 148.997
  "K"    => 128.095
  "R"    => 156.101
  "F"    => 147.068

In [10]:
#Flatten the values from the dictionary
single_res_Δ = collect( values(AA_DELTAS) )
println( typeof(single_res_Δ) )
#Add doubly-charged and triply-charged mass Deltas (simply divide by 2 and 3)
single_res_Δ = vcat( single_res_Δ / 3, single_res_Δ / 2, single_res_Δ )
println( length( single_res_Δ ) )
single_res_Δ[1:5]

Array{Float64,1}
63


5-element Array{Float64,1}:
 42.686193333333335
 49.0118
 33.68256
 32.350923333333334
 62.02644

Now take the spectra one-by-one, find pairwise mass differences and match them to the list.<br>
Reproduce the logic from the Python function as closely as possible:<br>
* Calculate pairwise absolute differences between the 
* Subtract the experimental mass Deltas from the theoretical
* Calculate relative difference
* Select the cases with the relative difference lower than threshold (matches)
* Summarize and report the matches

In [11]:
function matches(spectra, masses_to_match, rel_tolerance)
    res_dict = Dict([
        ( "Spectrum_idx", Vector{Int64}() ),
        ( "Exp_idx", Vector{Int64}() ),
        ( "Library_idx", Vector{Int64}() ),
        ( "Rel_error", Vector{Float64}() )
    ])
    min_theo_val = minimum(masses_to_match) * (1 - rel_tolerance)
   for (idx, s) in enumerate(spectra)
        exp_Δs = filter(
            x -> x > min_theo_val,
            s["ms_data"][!, "m/z"] .- s["ms_data"][!, "m/z"]'
        )
        rel_deltas_matrix = map(
            abs,
            (masses_to_match .- exp_Δs')
        ) * 2 ./ (masses_to_match .+ exp_Δs')
        matching_inds = findall(x -> isless(x, rel_tolerance), rel_deltas_matrix)
        num_matches = size(matching_inds)[1]
        if num_matches > 0
            append!( res_dict["Spectrum_idx"], fill(idx, num_matches) )
            append!( res_dict["Library_idx"], map(x -> x[1], matching_inds) )
            append!( res_dict["Exp_idx"], map(x -> x[2], matching_inds) )
            append!( res_dict["Rel_error"], rel_deltas_matrix[matching_inds] )
        end
    end
    DataFrame(res_dict)
end

matches (generic function with 1 method)

In [12]:
m = matches(res, single_res_Δ, 1e-5)
size(m)

(1424, 4)

In [13]:
@benchmark matches(res, single_res_Δ, 1e-5) samples=5

BenchmarkTools.Trial: 
  memory estimate:  5.84 GiB
  allocs estimate:  34240
  --------------
  minimum time:     1.604 s (8.30% GC)
  median time:      1.787 s (7.45% GC)
  mean time:        1.734 s (7.71% GC)
  maximum time:     1.812 s (7.14% GC)
  --------------
  samples:          3
  evals/sample:     1

Use of the dot macro to fuse several matrix operations into one

In [14]:
function matchesfuse(spectra, masses_to_match, rel_tolerance)
    res_dict = Dict([
        ( "Spectrum_idx", Vector{Int64}() ),
        ( "Exp_idx", Vector{Int64}() ),
        ( "Library_idx", Vector{Int64}() ),
        ( "Rel_error", Vector{Float64}() )
    ])
    min_theo_val = minimum(masses_to_match) * (1 - rel_tolerance)
   for (idx, s) in enumerate(spectra)
        exp_Δs = filter(
            x -> x > min_theo_val,
            s["ms_data"][!, "m/z"] .- s["ms_data"][!, "m/z"]'
        )
        #Fuse the vectorized operations using the dot macro
        rel_deltas_matrix = @. map(abs, (masses_to_match - exp_Δs') ) * 2 / (masses_to_match + exp_Δs')
        matching_inds = findall(x -> isless(x, rel_tolerance), rel_deltas_matrix)
        num_matches = size(matching_inds)[1]
        if num_matches > 0
            append!( res_dict["Spectrum_idx"], fill(idx, num_matches) )
            append!( res_dict["Library_idx"], map(x -> x[1], matching_inds) )
            append!( res_dict["Exp_idx"], map(x -> x[2], matching_inds) )
            append!( res_dict["Rel_error"], rel_deltas_matrix[matching_inds] )
        end
    end
    DataFrame(res_dict)
end

matchesfuse (generic function with 1 method)

In [15]:
m = matchesfuse(res, single_res_Δ, 1e-5)
size(m)

(1424, 4)

In [16]:
m[ m.Spectrum_idx .== 2 , :]

Unnamed: 0_level_0,Exp_idx,Library_idx,Rel_error,Spectrum_idx
Unnamed: 0_level_1,Int64,Int64,Float64,Int64
1,693,54,5.0423e-07,2
2,1368,40,7.61781e-06,2
3,2200,49,4.01173e-06,2
4,4006,15,8.89701e-06,2
5,4085,30,8.90187e-06,2
6,4085,55,8.81418e-06,2


In [17]:
@benchmark matchesfuse(res, single_res_Δ, 1e-5) samples=5

BenchmarkTools.Trial: 
  memory estimate:  1.54 GiB
  allocs estimate:  34246
  --------------
  minimum time:     1.036 s (3.13% GC)
  median time:      1.078 s (3.21% GC)
  mean time:        1.071 s (3.31% GC)
  maximum time:     1.087 s (3.72% GC)
  --------------
  samples:          5
  evals/sample:     1