Skip to content

Commit

Permalink
fasta2speclib: Improved algorithm for variable modification assignment:
Browse files Browse the repository at this point in the history
- Fixes issue with duplicate entries
- Reduces combinatorial explosion by setting maximum of variable modifications per peptide, instead of maximum of modifiable sites per peptide.

MSP writing fixes for DIA-NN compatibility:
-  Write m/z error of 0.0 for each predicted peak into peak annotation string
- Sort modifications in MSP `Mods` field
- Use `RetentionTime` instead of `RTINSECONDS` in comments field
  • Loading branch information
RalfG committed Jan 26, 2023
1 parent b2a3d0c commit 6f41d44
Show file tree
Hide file tree
Showing 2 changed files with 117 additions and 82 deletions.
194 changes: 114 additions & 80 deletions fasta2speclib/fasta2speclib.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,21 +23,21 @@
import json
import logging
import multiprocessing
from itertools import combinations
from itertools import product
from math import ceil

# Third party libraries
import numpy as np
import pandas as pd
from pyteomics import mass
from pyteomics.parser import cleave, expasy_rules
from Bio import SeqIO
from pyteomics.parser import cleave

from ms2pip.ms2pip_tools import spectrum_output
from ms2pip.ms2pip_tools.get_elude_predictions import get_elude_predictions

# MS2PIP
from ms2pip.ms2pipC import MS2PIP
from ms2pip.retention_time import RetentionTime
from ms2pip.ms2pip_tools import spectrum_output
from ms2pip.ms2pip_tools.get_elude_predictions import get_elude_predictions


def ArgParse():
Expand Down Expand Up @@ -78,7 +78,10 @@ def get_params():
params = json.load(config_file)

params.update(
{"fasta_filename": args.fasta_filename, "log_level": logging.INFO,}
{
"fasta_filename": args.fasta_filename,
"log_level": logging.INFO,
}
)

if args.output_filename:
Expand All @@ -96,6 +99,7 @@ def get_params():

def prot_to_peprec(protein):
"""Cleave protein and return pd.DataFrame with valid peptides."""

def validate_peptide(peptide, min_length, max_length):
"""Validate peptide by length and amino acids."""
peplen = len(peptide)
Expand Down Expand Up @@ -142,80 +146,112 @@ def get_protein_list(df):
return df


def get_modifications_by_target(modifications):
"""Restructure modifications configuration to options per side chain or terminus."""
mods_sidechain = {}
mods_nterm = {}

for mod in modifications:
if mod["fixed"]:
continue
elif mod["n_term"]:
if mod["amino_acid"]:
if mod["amino_acid"] in mods_nterm:
mods_nterm[mod["amino_acid"]].append(mod["name"])
else:
mods_nterm[mod["amino_acid"]] = [mod["name"]]
else:
if "any" in mods_nterm:
mods_nterm["any"].append(mod["name"])
else:
mods_nterm["any"] = [mod["name"]]
elif mod["amino_acid"]:
if mod["amino_acid"] in mods_sidechain:
mods_sidechain[mod["amino_acid"]].append(mod["name"])
else:
mods_sidechain[mod["amino_acid"]] = [mod["name"]]

for aa, mods in mods_sidechain.items():
mods.append(None)
if "any" in mods_nterm:
mods_nterm["any"].append(None)
else:
mods_nterm["any"] = [None]

return mods_sidechain, mods_nterm


def get_modification_versions(
peptide, modifications, mods_sidechain, mods_nterm, max_mods=3
):
"""Get MS²PIP modification strings for all potential versions."""
possibilities_by_site = dict()

# Assign possible modifications per residue (side chains)
for pos, aa in enumerate(peptide):
if aa in mods_sidechain:
possibilities_by_site[pos + 1] = mods_sidechain[aa]

# Assign possible modifications for N terminus
if mods_nterm:
possibilities_by_site[0] = []
for site, mods in mods_nterm.items():
if site == "any":
possibilities_by_site[0].extend(mods)
elif peptide[0] == site:
possibilities_by_site[0].extend(mods)

# Override with fixed modifications
for mod in modifications:
if not mod["fixed"]:
continue
if mod["n_term"]:
if (mod["amino_acid"] and peptide[0] == mod["amino_acid"]) or not mod[
"amino_acid"
]:
possibilities_by_site[0] = [mod["name"]]
elif mod["amino_acid"]:
for pos, aa in enumerate(peptide):
if aa == mod["amino_acid"]:
possibilities_by_site[pos] = [mod["name"]]

# Get all possible combinations of modifications for all sites
mod_permutations = list(product(*possibilities_by_site.values()))
mod_positions = possibilities_by_site.keys()

# Get MS²PIP modifications strings for each combination
mod_strings = []
for perm in mod_permutations:
mods = sorted(zip(mod_positions, perm)) # Zip permutations with positions
mods = "|".join(
f"{m[0]}|{m[1]}" for m in mods if m[1]
) # Make str for modified sites
mod_strings.append(mods)

# Filter by max modified sites (avoiding combinatorial explosion)
mod_strings = list(
filter(lambda x: (x.count("|") + 1) / 2 <= max_mods, mod_strings)
)

return mod_strings


def add_mods(tup):
"""
See fasta2speclib_config.md for more information.
"""
_, row = tup
params = get_params()
mod_versions = [dict()]

# First add all fixed modifications
for mod in params["modifications"]:
if mod["fixed"]:
if not mod["n_term"] and mod["amino_acid"]:
mod_versions[0].update(
{
i: mod["name"]
for i, aa in enumerate(row["peptide"])
if aa == mod["amino_acid"]
}
)
elif mod["n_term"]:
if mod["amino_acid"]:
if row["peptide"][0] == mod["amino_acid"]:
mod_versions[0]["N"] = mod["name"]
else:
mod_versions[0]["N"] = mod["name"]

# Continue with variable modifications
for mod in params["modifications"]:
if mod["fixed"]:
continue

# List all positions with specific amino acid, to avoid combinatorial explotion,
# limit to 4 positions
all_pos = [i for i, aa in enumerate(row["peptide"]) if aa == mod["amino_acid"]]
if len(all_pos) > 4:
all_pos = all_pos[:4]
for version in mod_versions:
# For non-position-specific mods:
if not mod["n_term"]:
pos = [p for p in all_pos if p not in version.keys()]
combos = [
x for l in range(1, len(pos) + 1) for x in combinations(pos, l)
]
for combo in combos:
new_version = version.copy()
for pos in combo:
new_version[pos] = mod["name"]
mod_versions.append(new_version)

# For N-term mods and N-term is not yet modified:
elif mod["n_term"] and "N" not in version.keys():
# N-term with specific first AA:
if mod["amino_acid"]:
if row["peptide"][0] == mod["amino_acid"]:
new_version = version.copy()
new_version["N"] = mod["name"]
mod_versions.append(new_version)
# N-term without specific first AA:
else:
new_version = version.copy()
new_version["N"] = mod["name"]
mod_versions.append(new_version)
# TODO: Do not hardcode max_mods
# TODO: Do not include fixed modifications in max_mods
mods_sidechain, mods_nterm = get_modifications_by_target(params["modifications"])
mod_versions = get_modification_versions(
row["peptide"], params["modifications"], mods_sidechain, mods_nterm, max_mods=3
)

df_out = pd.DataFrame(columns=row.index)
df_out["modifications"] = [
"|".join(
"{}|{}".format(0, value) if key == "N" else "{}|{}".format(key + 1, value)
for key, value in version.items()
)
for version in mod_versions
]
df_out["modifications"] = [
"-" if not mods else mods for mods in df_out["modifications"]
]
df_out["modifications"] = ["-" if not mods else mods for mods in mod_versions]
df_out["spec_id"] = [
"{}_{:03d}".format(row["spec_id"], i) for i in range(len(mod_versions))
]
Expand Down Expand Up @@ -326,7 +362,9 @@ def run_batches(peprec, decoy=False):
"frag_error": 0.02,
# Modify fasta2speclib modifications dict to MS2PIP params PTMs entry
"ptm": [
"{},{},opt,{}".format(mods["name"], mods["mass_shift"], mods["amino_acid"])
"{},{},opt,{}".format(
mods["name"], mods["mass_shift"], mods["amino_acid"]
)
if not mods["n_term"]
else "{},{},opt,N-term".format(mods["name"], mods["mass_shift"])
for mods in params["modifications"]
Expand Down Expand Up @@ -367,7 +405,7 @@ def run_batches(peprec, decoy=False):
with multiprocessing.Pool(params["num_cpu"]) as p:
peprec_mods = pd.concat(
[peprec_mods] + p.map(add_mods, peprec_batch.iterrows()),
ignore_index=True
ignore_index=True,
)
peprec_batch = peprec_mods

Expand Down Expand Up @@ -486,9 +524,8 @@ def main():
logging.info("Cleaving proteins, adding peptides to peprec")
with multiprocessing.Pool(params["num_cpu"]) as p:
peprec = pd.concat(
[peprec] + p.map(
prot_to_peprec, SeqIO.parse(params["fasta_filename"], "fasta")
),
[peprec]
+ p.map(prot_to_peprec, SeqIO.parse(params["fasta_filename"], "fasta")),
ignore_index=True,
)

Expand Down Expand Up @@ -519,9 +556,6 @@ def main():

run_batches(peprec, decoy=False)

# For testing
# peprec_nonmod = pd.read_hdf('data/uniprot_proteome_yeast_head_nonexpanded.peprec.hdf', key='table')

if params["decoy"]:
logging.info("Reversing sequences for decoy peptides")
peprec_decoy = create_decoy_peprec(peprec_nonmod, move_mods=False)
Expand All @@ -530,7 +564,7 @@ def main():
logging.info("Predicting spectra for decoy peptides")
run_batches(peprec_decoy, decoy=True)

logging.info("Fasta2SpecLib is ready!")
logging.info("fasta2speclib is ready!")


if __name__ == "__main__":
Expand Down
5 changes: 3 additions & 2 deletions ms2pip/ms2pip_tools/spectrum_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ def _get_peak_string(
all_peaks.append(
(
peak[1],
f'{peak[1]:.6f}{sep}{intensity_type(peak[2])}{sep}"{ion_type.lower()}{peak[0]}"',
f'{peak[1]:.6f}{sep}{intensity_type(peak[2])}{sep}"{ion_type.lower()}{peak[0]}/0.0"',
)
)
else:
Expand All @@ -258,6 +258,7 @@ def _get_msp_modifications(self, sequence, modifications):
mods = modifications.split("|")
mods = [(int(mods[i]), mods[i + 1]) for i in range(0, len(mods), 2)]
mods = [(x, y) if x == 0 else (x - 1, y) for (x, y) in mods]
mods = sorted(mods)
mods = [(str(x), sequence[x], y) for (x, y) in mods]
msp_modifications = "/".join([",".join(list(x)) for x in mods])
msp_modifications = f"{len(mods)}/{msp_modifications}"
Expand Down Expand Up @@ -356,7 +357,7 @@ def write_msp(self, file_object):

if self.has_rt:
rt = self.peprec_dict[spec_id]["rt"]
comment_line += f" RTINSECONDS={rt}"
comment_line += f" RetentionTime={rt}"

comment_line += f' MS2PIP_ID="{spec_id}"'

Expand Down

0 comments on commit 6f41d44

Please sign in to comment.