diff --git a/metaboblend/algorithms.py b/metaboblend/algorithms.py
new file mode 100644
index 0000000..624b00a
--- /dev/null
+++ b/metaboblend/algorithms.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright © 2019-2020 Ralf Weber
+#
+# This file is part of MetaboBlend.
+#
+# MetaboBlend is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# MetaboBlend is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with MetaboBlend. If not, see <https://www.gnu.org/licenses/>.
+#
+
+import numpy
+
+
+def find_path(mass_list, sum_matrix, n, mass, max_subset_length, path=None):
+    """
+    Recursive solution for backtracking through the dynamic programming boolean matrix. All possible subsets are
+    found: each yielded list is the current ``path`` extended with a subset of the first ``n`` entries of
+    ``mass_list`` that sums to ``mass``. The path is never grown beyond ``max_subset_length`` entries.
+
+    :param mass_list: A list of masses from which to identify subsets.
+
+    :param sum_matrix: The dynamic programming boolean matrix, indexed as ``sum_matrix[n][mass]``; a true cell means
+        the remaining mass can still be reached using the first ``n`` masses.
+
+    :param n: The number of leading entries of mass_list still available to this call (the size of mass_list for the
+        outermost call).
+
+    :param mass: The target mass of the sum of the substructures that is still to be accounted for.
+
+    :param max_subset_length: The maximum length of subsets to return. Allows the recursive backtracking algorithm to
+        terminate early in many cases, significantly improving runtime.
+
+    :param path: List for keeping track of the current subset. Leave as None to start from an empty subset; a new
+        list is then created for the call, so no state is shared between separate generators.
+
+    :return: Generator of sorted lists, each containing the masses of one valid subset.
+    """
+
+    # a mutable default list would be shared by every call and stays dirty if a generator is abandoned part-way
+    if path is None:
+        path = []
+
+    # base case - the path has generated a correct solution
+    if mass == 0:
+        yield sorted(path)
+        return
+
+    # stop running when we overshoot the mass
+    if mass < 0:
+        return
+
+    # can we sum up to the target value using the remaining masses? if not, prune this branch
+    if not sum_matrix[n][mass]:
+        return
+
+    # solutions that leave out mass_list[n - 1]
+    yield from find_path(mass_list, sum_matrix, n - 1, mass, max_subset_length, path)
+
+    # solutions that include mass_list[n - 1], provided the subset is still allowed to grow
+    if len(path) < max_subset_length:
+        path.append(mass_list[n - 1])
+
+        try:
+            yield from find_path(mass_list, sum_matrix, n - 1, mass - mass_list[n - 1], max_subset_length, path)
+        finally:
+            # undo the append even when the consumer closes the generator early
+            path.pop()
+
+
+def subset_sum(mass_list, mass, max_subset_length=3):
+    """
+    Dynamic programming implementation of subset sum. Note that, whilst this algorithm is pseudo-polynomial, the
+    backtracking algorithm for obtaining all possible subsets has exponential complexity and so remains unsuitable
+    for large input values. This does, however, tend to perform a lot better than non-sum_matrix implementations, as
+    we're no longer doing sums multiple times and we've cut down the operations performed during the exponential
+    portion of the method.
+
+    The boolean matrix has ``len(mass_list) + 1`` rows and ``mass + 1`` columns, and masses are used directly as
+    column indices, so ``mass`` and the entries of ``mass_list`` must be non-negative integers.
+
+    :param mass_list: A list of masses from which to identify subsets.
+
+    :param mass: The target mass of the sum of the substructures.
+
+    :param max_subset_length: The maximum length of subsets to return. Allows the recursive backtracking algorithm to
+        terminate early in many cases, significantly improving runtime.
+
+    :return: Generator of lists, each containing the masses of one valid subset.
+    """
+
+    n = len(mass_list)
+
+    # initialise dynamic programming array; zeros() gives every cell a defined value (False), so the first row
+    # already records that empty subsets do not have non-zero sums
+    sum_matrix = numpy.zeros((n + 1, mass + 1), dtype=bool)
+
+    # subsets can always equal 0
+    sum_matrix[:, 0] = True
+
+    # fill in the remaining boolean matrix, one row per entry of mass_list
+    for i, current_mass in enumerate(mass_list):
+        for j in range(mass + 1):
+            if j >= current_mass:
+                sum_matrix[i + 1][j] = sum_matrix[i][j] or sum_matrix[i][j - current_mass]
+            else:
+                sum_matrix[i + 1][j] = sum_matrix[i][j]
+
+    # backtrack through the matrix recursively to obtain all solutions; the explicit new list keeps the backtracking
+    # state private to this generator
+    return find_path(mass_list, sum_matrix, n, mass, max_subset_length, [])
diff --git a/metaboblend/auxiliary.py b/metaboblend/auxiliary.py
index 52717b8..4408224 100644
--- a/metaboblend/auxiliary.py
+++ b/metaboblend/auxiliary.py
@@ -20,8 +20,8 @@
#
import itertools
-import networkx as nx
import pylab as plt
+import networkx as nx
def calculate_complete_multipartite_graphs(max_atoms_available, max_n_substructures):
diff --git a/metaboblend/build_structures.py b/metaboblend/build_structures.py
index 1fa4539..16987dc 100644
--- a/metaboblend/build_structures.py
+++ b/metaboblend/build_structures.py
@@ -20,103 +20,21 @@
#
import os
-import multiprocessing
import copy
+import numpy
import itertools
-from functools import partial
+import multiprocessing
import networkx as nx
-import numpy
-import sqlite3
-import csv
+from functools import partial
from operator import itemgetter
from typing import Sequence, Dict, Union
from rdkit import Chem
-from .databases import SubstructureDb, get_elements, calculate_exact_mass
-
-
-def find_path(mass_list, sum_matrix, n, mass, max_subset_length, path=[]):
- """
- Recursive solution for backtracking through the dynamic programming boolean matrix. All possible subsets are found
-
- :param mass_list: A list of masses from which to identify subsets.
-
- :param mass: The target mass of the sum of the substructures.
-
- :param sum_matrix: The dynamic programming boolean matrix.
-
- :param n: The size of mass_list.
-
- :param max_subset_length: The maximum length of subsets to return. Allows the recursive backtracking algorithm to
- terminate early in many cases, significantly improving runtime.
-
- :param path: List for keeping track of the current subset.
-
- :return: Generates of lists containing the masses of valid subsets.
- """
-
- # base case - the path has generated a correct solution
- if mass == 0:
- yield sorted(path)
- return
-
- # stop running when we overshoot the mass
- elif mass < 0:
- return
-
- # can we sum up to the target value using the remaining masses? recursive call
- elif sum_matrix[n][mass]:
- yield from find_path(mass_list, sum_matrix, n - 1, mass, max_subset_length, path)
-
- if len(path) < max_subset_length:
- path.append(mass_list[n-1])
-
- yield from find_path(mass_list, sum_matrix, n - 1, mass - mass_list[n - 1], max_subset_length, path)
- path.pop()
-
-
-def subset_sum(mass_list, mass, max_subset_length=3):
- """
- Dynamic programming implementation of subset sum. Note that, whilst this algorithm is pseudo-polynomial, the
- backtracking algorithm for obtaining all possible subsets has exponential complexity and so remains unsuitable
- for large input values. This does, however, tend to perform a lot better than non-sum_matrix implementations, as
- we're no longer doing sums multiple times and we've cut down the operations performed during the exponential portion
- of the method.
-
- :param mass_list: A list of masses from which to identify subsets.
-
- :param mass: The target mass of the sum of the substructures.
-
- :param max_subset_length: The maximum length of subsets to return. Allows the recursive backtracking algorithm to
- terminate early in many cases, significantly improving runtime.
-
- :return: Generates of lists containing the masses of valid subsets.
- """
-
- n = len(mass_list)
-
- # initialise dynamic programming array
- sum_matrix = numpy.ndarray([n + 1, mass + 1], bool)
-
- # subsets can always equal 0
- for i in range(n+1):
- sum_matrix[i][0] = True
-
- # empty subsets do not have non-zero sums
- for i in range(mass):
- sum_matrix[0][i + 1] = False
-
- # fill in the remaining boolean matrix
- for i in range(n):
- for j in range(mass+1):
- if j >= mass_list[i]:
- sum_matrix[i + 1][j] = sum_matrix[i][j] or sum_matrix[i][j - mass_list[i]]
- else:
- sum_matrix[i + 1][j] = sum_matrix[i][j]
-
- # backtrack through the matrix recursively to obtain all solutions
- return find_path(mass_list, sum_matrix, n, mass, max_subset_length)
+from .results import ResultsDb
+from .parse import parse_ms_data
+from .algorithms import subset_sum
+from .databases import SubstructureDb
def combine_mfs(precise_mass_grp, db, table_name, accuracy):
@@ -236,6 +154,9 @@ def add_bonds(mols, edges, atoms_available, bond_types, bond_enthalpies):
* **2.0** Double
+ :param bond_enthalpies: Dictionary of bond enthalpies, as generated by
+ :py:meth:`metaboblend.build_structures.get_bond_enthalpies`.
+
:return: If unsuccessful, returns None, else returns an :py:meth:`rdkit.Chem.EditableMol` object containing
the substructures combined into a final single molecule.
"""
@@ -275,11 +196,12 @@ def add_bonds(mols, edges, atoms_available, bond_types, bond_enthalpies):
bt_start.remove(bond_matches[0])
bt_end.remove(bond_matches[0])
- try:
+ try: # try forming the specified bond
mol_edit.AddBond(edge[0], edge[1], rdkit_bond_types[bond_matches[0]])
except KeyError:
return None, None # unknown bond type
+ # calculate bond dissociation energy of "formed" bonds for the structure
try:
total_bde += bond_enthalpies[bond_matches[0]][mols.GetAtomWithIdx(edge[0]).GetSymbol()][mols.GetAtomWithIdx(edge[1]).GetSymbol()]
except (SyntaxError, TypeError):
@@ -288,285 +210,7 @@ def add_bonds(mols, edges, atoms_available, bond_types, bond_enthalpies):
return mol_edit, total_bde
-class ResultsDb:
- """
- Methods for interacting with the SQLITE3 results database, as created by
- :py:meth:`metaboblend.build_structures.annotate_msn`.
-
- :param path_results: Directory to which results will be written.
- """
-
- def __init__(self, path_results, msn=True):
- """Constructor method."""
-
- self.path_results = path_results
- self.path_results_db = os.path.join(self.path_results, "metaboblend_results.sqlite")
- self.msn = msn
-
- self.conn = None
- self.cursor = None
-
- self.substructure_combo_id = 0
-
- def connect(self):
- """Connects to the results database."""
-
- self.conn = sqlite3.connect(self.path_results_db)
- self.cursor = self.conn.cursor()
-
- def create_results_db(self):
- """Generates a new results database."""
-
- if os.path.exists(self.path_results_db):
- os.remove(self.path_results_db)
-
- self.connect()
-
- self.cursor.execute("""CREATE TABLE queries (
- ms_id_num INTEGER PRIMARY KEY,
- ms_id TEXT,
- exact_mass NUMERIC,
- C INTEGER,
- H INTEGER,
- N INTEGER,
- O INTEGER,
- P INTEGER,
- S INTEGER,
- ppm INTEGER,
- ha_min INTEGER,
- ha_max INTEGER,
- max_atoms_available INTEGER,
- max_degree INTEGER,
- max_n_substructures INTEGER,
- hydrogenation_allowance INTEGER,
- isomeric_smiles INTEGER)""")
-
- if self.msn:
- self.cursor.execute("""CREATE TABLE spectra (
- ms_id_num INTEGER,
- fragment_id INTEGER,
- neutral_mass NUMERIC,
- PRIMARY KEY (ms_id_num, fragment_id))""")
-
- self.cursor.execute("""CREATE TABLE structures (
- ms_id_num INTEGER,
- structure_smiles TEXT,
- frequency INTEGER,
- PRIMARY KEY (ms_id_num, structure_smiles))""")
-
- self.cursor.execute("""CREATE TABLE substructures (
- substructure_combo_id INTEGER,
- substructure_position_id INTEGER,
- ms_id_num INTEGER,
- structure_smiles TEXT,
- fragment_id INTEGER,
- substructure_smiles TEXT,
- bde INTEGER,
- PRIMARY KEY (substructure_combo_id, substructure_position_id))""")
-
- self.cursor.execute("""CREATE TABLE results (
- ms_id_num INTEGER,
- fragment_id INTEGER,
- structure_smiles TEXT,
- bde INTEGER,
- PRIMARY KEY(ms_id_num, fragment_id, structure_smiles))""")
-
- self.conn.commit()
-
- def add_ms(self, msn_data, ms_id, ms_id_num, parameters):
- """
- Add entries to the `queries` and `spectra` tables.
-
- :param msn_data: Dictionary in the form
- `msn_data[id] = {mf: [C, H, N, O, P, S], exact_mass: float, fragment_masses: []}`. id represents a unique
- identifier for a given spectral tree or fragmentation spectrum, mf is a list of integers referring to the
- molecular formula of the structure of interest, exact_mass is the mass of this molecular formula to >=4d.p.
- and fragment_masses are neutral fragment masses generated by this structure used to inform candidate
- scoring. See :py:meth:`metaboblend.build_structures.annotate_msn`.
-
- :param ms_id: Unique identifier for the annotation of a single metabolite.
-
- :param ms_id_num: Unique numeric identifier for the annotation of a single metaoblite.
-
- :param parameters: List of parameters, in the form: [ppm, ha_min, ha_max, max_atoms_available, max_degree,
- max_n_substructures, hydrogenation_allowance, isomeric_smiles]. See
- :py:meth:`metaboblend.build_structures.annotate_msn`.
- """
-
- for i, parameter in enumerate(parameters):
- if parameter is None:
- parameters[i] = "NULL"
- elif isinstance(parameter, bool):
- parameters[i] = int(parameter)
-
- self.cursor.execute("""INSERT INTO queries (
- ms_id,
- ms_id_num,
- exact_mass,
- C, H, N, O, P, S,
- ppm,
- ha_min,
- ha_max,
- max_atoms_available,
- max_degree,
- max_n_substructures,
- hydrogenation_allowance,
- isomeric_smiles
- ) VALUES ('{}', {}, {}, '{}', '{}', '{}', '{}', '{}', '{}', {})""".format(
- ms_id,
- ms_id_num,
- msn_data[ms_id]["exact_mass"],
- msn_data[ms_id]["mf"][0], msn_data[ms_id]["mf"][1],
- msn_data[ms_id]["mf"][2], msn_data[ms_id]["mf"][3],
- msn_data[ms_id]["mf"][4], msn_data[ms_id]["mf"][5],
- ", ".join([str(p) for p in parameters])
- ))
-
- self.conn.commit()
-
- def add_results(self, ms_id_num, smi_dict, fragment_mass=None, fragment_id=None, retain_substructures=False):
- """
- Record which smiles were generated for a given fragment mass.
-
- :param ms_id_num: Unique identifier for the annotation of a single metabolite.
-
- :param smi_dict: The fragment and substructure smiles generated by the annotation of a single peak for a single
- metabolite.
-
- :param fragment_mass: The neutral fragment mass that has been annotated.
-
- :param fragment_id: The unique identifier for the fragment mass that has been annotated.
-
- :param retain_substructures: If True, record substructures in the results DB.
- """
-
- if self.msn:
- self.cursor.execute("""INSERT OR IGNORE INTO spectra (
- ms_id_num,
- fragment_id,
- neutral_mass
- ) VALUES ('{}', {}, {})""".format(
- ms_id_num,
- fragment_id,
- fragment_mass
- ))
- else:
- fragment_id = "NULL"
-
- for structure_smiles in smi_dict.keys():
-
- self.cursor.execute("""INSERT OR IGNORE INTO results (
- ms_id_num,
- fragment_id,
- structure_smiles,
- bde
- ) VALUES ({}, {}, '{}', {})""".format(
- ms_id_num,
- fragment_id,
- structure_smiles,
- min(smi_dict[structure_smiles]["bdes"])
- ))
-
- if retain_substructures:
- for i in range(len(smi_dict[structure_smiles]["substructures"])): # for each combination
-
- for j, substructure in enumerate(smi_dict[structure_smiles]["substructures"][i]):
-
- self.cursor.execute("""INSERT INTO substructures (
- substructure_combo_id,
- substructure_position_id,
- ms_id_num,
- fragment_id,
- structure_smiles,
- substructure_smiles,
- bde
- ) VALUES ({}, {}, {}, {}, '{}', '{}', {})""".format(
- self.substructure_combo_id,
- j,
- ms_id_num,
- fragment_id,
- structure_smiles,
- substructure,
- smi_dict[structure_smiles]["bdes"][i]
- ))
-
- self.substructure_combo_id += 1
-
- self.conn.commit()
-
- def calculate_frequencies(self, ms_id_num):
- """
- Calculates structure frequencies in the SQLite DB.
-
- :param ms_id_num: Unique identifier for the annotation of a single metabolite.
- """
-
- self.cursor.execute("""INSERT INTO structures (ms_id_num, structure_smiles, frequency)
- SELECT ms_id_num, structure_smiles, COUNT(*)
- FROM results
- WHERE ms_id_num = {}
- GROUP BY structure_smiles""".format(ms_id_num))
-
- def get_structures(self, ms_id_num):
- """
- Gets smiles of generated structures. In the case of the MSn annotation workflow, also gets structure
- frequencies.
-
- :param ms_id_num: Unique identifier for the annotation of a single metabolite.
-
- :return: In the case of simple structure generation, returns a set of smiles strings for output structures.
- For the MSn annotation workflow, returns a dictionary with smiles as keys and the number of peaks for which
- the smiles were generated as values.
- """
-
- if self.msn:
- msn_str = ", frequency"
- else:
- msn_str = ""
-
- self.cursor.execute("""SELECT structure_smiles{} FROM structures
- WHERE ms_id_num = {}
- """.format(msn_str, ms_id_num))
-
- if self.msn:
- return [t for t in self.cursor.fetchall()]
- else:
- return [item for t in self.cursor.fetchall() for item in t]
-
- def generate_csv_output(self):
- """
- Generate CSV file output for i) queries and tool parameters and ii) structures generated.
- """
-
- with open(os.path.join(self.path_results, "metaboblend_queries.csv"), "w", newline="") as results_file, \
- open(os.path.join(self.path_results, "metaboblend_structures.csv"), "w", newline="") as ms_file:
-
- results_writer = csv.writer(results_file, delimiter=",")
- ms_writer = csv.writer(ms_file, delimiter=",")
-
- results_writer.writerow(["ms_id", "exact_mass", "C", "H", "N", "O", "P", "S", "ppm", "ha_min", "ha_max",
- "max_atoms_available", "max_degree", "max_n_substructures",
- "hydrogenation_allowance", "isomeric_smiles"])
-
- self.cursor.execute("SELECT * FROM queries")
-
- for query in self.cursor.fetchall():
- results_writer.writerow(query)
-
- ms_writer.writerow(["ms_id", "smiles", "frequency", "exact_mass", "C", "H", "N", "O", "P", "S"])
-
- self.cursor.execute("SELECT * FROM structures")
-
- for structure in self.cursor.fetchall():
- ms_writer.writerow(structure)
-
- def close(self):
- """Close the connection to the SQLITE3 database."""
-
- self.conn.close()
-
-
-def annotate_msn(msn_data: Dict[str, Dict[str, Union[int, list]]],
+def annotate_msn(msn_data: Union[str, os.PathLike, Dict[str, Dict[str, Union[int, list]]]],
path_substructure_db: Union[str, bytes, os.PathLike] = os.path.realpath(os.getcwd()),
path_out: Union[str, bytes, os.PathLike] = "",
ppm: int = 5,
@@ -590,11 +234,18 @@ def annotate_msn(msn_data: Dict[str, Dict[str, Union[int, list]]],
text format. For the generation of structures without MSn data, see
:py:meth:`metaboblend.build_structures.generate_structures`.
- :param msn_data: Dictionary in the form
- `msn_data[id] = {mf: [C, H, N, O, P, S], exact_mass: float, fragment_masses=[]}`. id represents a unique
- identifier for a given spectral tree or fragmentation spectrum, mf is a list of integers referring to the
- molecular formula of the structure of interest, exact_mass is the mass of this molecular formula to >=4d.p.
- and fragment_masses are neutral fragment masses generated by this structure used to inform candidate scoring.
+ :param msn_data: Either a dictionary or the path to an MSP file. MSP files are parsed by
+ :py:meth:`metaboblend.parse.parse_ms_data` before being converted into a dictionary. If a dictionary is
+ provided, it must contain one item per fragmentation spectrum; the keys of the dictionary should be a unique ID
+ for the query and the corresponding value must itself be a dictionary, containing the following:
+
+ - "exact_mass": `float` (neutral mass of query) OR "precursor_mz": `float` (mz of precursor ion)
+ - "mf": `[C, H, N, O, P, S]` (a list of 6 integers)
+ - "neutral_fragment_masses": `[float, float, ...]` (list of neutral fragment masses) OR "fragment_mzs":
+ `[float, float, ...]` (list of fragment mzs)
+ - "precursor_type": `str` (e.g. "[M+H]+", required for calculating neutral masses from ion mzs)
+
+ The dictionary or MSP path is fed to :py:meth:`metaboblend.parse.parse_ms_data`.
:param path_substructure_db: The path to the SQLite 3 substructure database, as generated by
:py:meth:`metaboblend.databases.SubstructureDb`.
@@ -671,22 +322,25 @@ def annotate_msn(msn_data: Dict[str, Dict[str, Union[int, list]]],
max_degree=max_degree,
max_atoms_available=max_atoms_available,
minimum_frequency=minimum_frequency,
- max_mass=round(max([msn_data[ms_id]["exact_mass"] for ms_id in msn_data.keys()]))
+ max_mass=None
)
- for i, ms_id in enumerate(msn_data.keys()):
+ for i, ms in enumerate(parse_ms_data(msn_data)):
+
+ if ms is None:
+ continue
- results_db.add_ms(msn_data, ms_id, i,
+ results_db.add_ms(msn_data, ms["ms_id"], i,
[ppm, ha_min, ha_max, max_atoms_available, max_degree, max_n_substructures, hydrogenation_allowance, isomeric_smiles])
- for j, fragment_mass in enumerate(msn_data[ms_id]["fragment_masses"]):
+ for j, fragment_mass in enumerate(ms["neutral_fragment_masses"]):
for k in range(0 - hydrogenation_allowance, hydrogenation_allowance + 1):
hydrogenated_fragment_mass = fragment_mass + (k * 1.007825) # consider re-arrangements
smi_dict = build(
- mf=msn_data[ms_id]["mf"],
- exact_mass=msn_data[ms_id]["exact_mass"],
+ mf=ms["mf"],
+ exact_mass=ms["exact_mass"],
max_n_substructures=max_n_substructures,
path_connectivity_db=path_connectivity_db,
path_substructure_db=path_substructure_db,
@@ -705,7 +359,7 @@ def annotate_msn(msn_data: Dict[str, Dict[str, Union[int, list]]],
results_db.calculate_frequencies(i)
if yield_smis:
- yield {ms_id: results_db.get_structures(i)}
+ yield {ms["ms_id"]: results_db.get_structures(i)}
if write_csv_output:
results_db.generate_csv_output()
@@ -714,7 +368,7 @@ def annotate_msn(msn_data: Dict[str, Dict[str, Union[int, list]]],
results_db.close()
-def generate_structures(ms_data: Dict[str, Dict[str, Union[int, None]]],
+def generate_structures(ms_data: Union[str, os.PathLike, Dict[str, Dict[str, Union[int, None]]]],
path_substructure_db: Union[str, bytes, os.PathLike],
path_out: Union[str, bytes, os.PathLike] = os.path.realpath(os.getcwd()),
ha_min: Union[int, None] = 2,
@@ -736,11 +390,17 @@ def generate_structures(ms_data: Dict[str, Dict[str, Union[int, None]]],
text format. For the generation of structures from MSn data, see
:py:meth:`metaboblend.build_structures.annotate_msn`.
- :param ms_data: Dictionary in the form ms_data[id] =
- `{mf: [C, H, N, O, P, S], exact_mass: float, prescribed_mass=int}`. id represents a unique identifier for
- a given test, mf is a list of integers referring to molecular formula of the structure of interest,
- exact_mass is the mass of this structure to >=4d.p. and prescribed_mass is the neutral mass of a substructure
- used to limit structures generated.
+ :param ms_data: A dictionary that must contain one item per fragmentation spectrum; the keys of the dictionary
+ should be a unique ID for the query and the corresponding value must itself be a dictionary, containing the
+ following:
+
+ - "exact_mass": `float` (neutral mass of query) OR "precursor_mz": `float` (mz of precursor ion)
+ - "mf": `[C, H, N, O, P, S]` (a list of 6 integers)
+ - "precursor_type": `str` (e.g. "[M+H]+", required for calculating neutral masses from ion mzs)
+ - (optional) "prescribed_mass": `float` (neutral mass of substructure).
+
+ The dictionary or MSP path is fed to :py:meth:`metaboblend.parse.parse_ms_data`. A single neutral substructure
+ mass may be provided ("prescribed_mass") to guide the structure generation process.
:param path_substructure_db: The path to the SQLite 3 substructure database, as generated by
:py:meth:`metaboblend.databases.SubstructureDb`.
@@ -788,6 +448,8 @@ def generate_structures(ms_data: Dict[str, Dict[str, Union[int, None]]],
:param write_csv_output: Whether to extract results from the SQLite3 database for deposition in CSV files.
+ :param retain_substructures: Whether to record the substructures used to generate final structures.
+
:return: For each input molecule, yields unique SMILEs strings (unless `yield_smis = False`).
"""
@@ -809,26 +471,26 @@ def generate_structures(ms_data: Dict[str, Dict[str, Union[int, None]]],
max_mass=round(max([ms_data[ms_id]["exact_mass"] for ms_id in ms_data.keys()]))
)
- for i, ms_id in enumerate(ms_data.keys()):
+ for i, ms in enumerate(parse_ms_data(ms_data, False)):
- results_db.add_ms(ms_data, ms_id, i,
+ results_db.add_ms(ms_data, ms["ms_id"], i,
[None, ha_min, ha_max, max_atoms_available, max_degree, max_n_substructures, None, isomeric_smiles])
ppm = None
try:
- if ms_data[ms_id]["prescribed_masses"] is not None:
+ if ms["prescribed_mass"] is not None:
ppm = 0
except KeyError:
- ms_data[ms_id]["prescribed_masses"] = None
+ ms["prescribed_mass"] = None
smi_dict = build(
- mf=ms_data[ms_id]["mf"],
- exact_mass=ms_data[ms_id]["exact_mass"],
+ mf=ms["mf"],
+ exact_mass=ms["exact_mass"],
max_n_substructures=max_n_substructures,
path_connectivity_db=path_connectivity_db,
path_substructure_db=path_substructure_db,
- prescribed_mass=ms_data[ms_id]["prescribed_masses"],
+ prescribed_mass=ms["prescribed_mass"],
ppm=ppm,
table_name=table_name,
ncpus=ncpus,
@@ -837,13 +499,13 @@ def generate_structures(ms_data: Dict[str, Dict[str, Union[int, None]]],
retain_substructures=retain_substructures
)
- results_db.add_results(i, smi_dict, ms_data[ms_id]["prescribed_masses"])
+ results_db.add_results(i, smi_dict, ms["prescribed_mass"])
smi_dict = None
results_db.calculate_frequencies(i)
if yield_smis:
- yield {ms_id: results_db.get_structures(i)}
+ yield {ms["ms_id"]: results_db.get_structures(i)}
if write_csv_output:
results_db.generate_csv_output()
@@ -1040,15 +702,21 @@ def gen_subs_table(db, ha_min, ha_max, max_degree, max_atoms_available, max_mass
ha_max_statement = """
AND heavy_atoms <= %s""" % str(ha_max)
+ if max_mass is None:
+ max_mass_statment = ""
+ else:
+ max_mass_statment = """
+ AND exact_mass__1 < %s""" % str(max_mass)
+
db.cursor.execute("""CREATE TABLE {} AS
- SELECT * FROM substructures WHERE
- atoms_available <= {} AND
- valence <= {} AND
- exact_mass__1 < {}{}{}{}
+ SELECT *
+ FROM substructures
+ WHERE atoms_available <= {}
+ AND valence <= {}{}{}{}{}
""".format(table_name,
max_atoms_available,
max_degree,
- max_mass,
+ max_mass_statment,
freq_statement,
ha_min_statement,
ha_max_statement))
@@ -1136,6 +804,11 @@ def substructure_combination_build(substructure_subset, configs_iso, prescribed_
:param isomeric_smiles: True/False, should output smiles be written with isomeric information?
+ :param bond_enthalpies: Dictionary of bond enthalpies, as generated by
+ :py:meth:`metaboblend.build_structures.get_bond_enthalpies`.
+
+ :param retain_substructures: Whether to record the substructures used to generate final structures.
+
:return: List of smiles representing molecules generated (and the substructures used to generate them).
"""
diff --git a/metaboblend/databases.py b/metaboblend/databases.py
index 03d24a8..2c31d12 100644
--- a/metaboblend/databases.py
+++ b/metaboblend/databases.py
@@ -21,13 +21,10 @@
import io
import os
-import sys
-import subprocess
+import pickle
import sqlite3
import tempfile
-import pickle
-from collections import OrderedDict
-import xml.etree.ElementTree as ElementTree
+import subprocess
import networkx as nx
from typing import Sequence, Dict, Union
@@ -35,97 +32,10 @@
from rdkit.Chem import Recap
from rdkit.Chem import BRICS
+from .parse import parse_xml
from .auxiliary import calculate_complete_multipartite_graphs, graph_to_ri, graph_info, sort_subgraphs
-def reformat_xml(source, encoding="utf8"):
- """
- Reformats HMDB xml files to be compatible with :py:meth:`metaboblend.databases.parse_xml`; some such files do not
- contain a `` header.
-
- :param source: Path to file to be reformatted.
-
- :param encoding: Encoding of source file.
-
- :return: Source file destination.
- """
-
- with io.open(source, "r", encoding=encoding) as xml:
- xml_contents = xml.readlines()
- if "hmdb" in xml_contents[1]:
- return source
-
- xml_contents.insert(1, " \n")
-
- with io.open(source, "w", encoding=encoding) as xml:
- xml_contents = "".join(xml_contents)
- xml.write(xml_contents)
- xml.write("")
-
- return source
-
-
-def parse_xml(source, encoding="utf8", reformat=False):
- """
- Parses the contents of HMDB xml files to to extract information for the generation of substructures.
-
- :param source: Source file destination.
-
- :param encoding: Encoding of source file.
-
- :param reformat: Whether to apply :py:meth:`metaboblend.databases.reformat_xml` to the XML file. Is required for
- XML files recording single metabolites.
-
- * **True** Add a `` header to the XML file before parsing.
-
- * **False** Parse the XML file as it is (recommended if header is present).
-
- :return: The XML file converted to a dictionary.
- """
-
- if reformat:
- reformat_xml(source, encoding)
-
- with io.open(source, "r", encoding=encoding) as inp:
- record_out = OrderedDict()
-
- inp.readline()
- inp.readline()
-
- xml_record = ""
- path = []
-
- for line in inp:
- xml_record += line
- if line == "\n" or line == "\n":
-
- if sys.version_info[0] == 3:
- inp = io.StringIO(xml_record)
- else:
- inp = io.BytesIO(xml_record.encode('utf-8').strip())
-
- for event, elem in ElementTree.iterparse(inp, events=("start", "end")):
- if event == 'end':
- path.pop()
-
- if event == 'start':
- path.append(elem.tag)
- if elem.text is not None:
- if elem.text.replace(" ", "") != "\n":
-
- path_elem = ".".join(map(str, path[1:]))
- if path_elem in record_out:
- if type(record_out[path_elem]) != list:
- record_out[path_elem] = [record_out[path_elem]]
- record_out[path_elem].append(elem.text)
- else:
- record_out[path_elem] = elem.text
-
- xml_record = ""
- yield record_out
- record_out = OrderedDict()
-
-
class SubstructureDb:
"""
Methods for interacting with the SQLITE3 substructure and connectivity databases. Provides a connection to the
diff --git a/metaboblend/parse.py b/metaboblend/parse.py
new file mode 100644
index 0000000..5495a58
--- /dev/null
+++ b/metaboblend/parse.py
@@ -0,0 +1,351 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright © 2019-2020 Ralf Weber
+#
+# This file is part of MetaboBlend.
+#
+# MetaboBlend is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# MetaboBlend is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with MetaboBlend. If not, see <https://www.gnu.org/licenses/>.
+#
+
+import io
+import re
+import sys
+import copy
+import warnings
+from collections import OrderedDict
+import xml.etree.ElementTree as ElementTree
+
+
+def parse_ms_data(ms_data, msn=True):
+ """
+ Parse raw data provided by user and yield formatted input data. Decides what type of data has been provided
+ (i.e. whether a dictionary has been given vs path to MSP file; if a dictionary, checks whether neutral masses
+ need to be calculated from precursor ions).
+
+ :param ms_data: Dictionary containing input data or path to an MSP file.
+
+ :param msn: If True, formats the data for use by :py:meth:`metaboblend.build_structures.annotate_msn`; else, formats
+ input data for use by :py:meth:`metaboblend.build_structures.generate_structures`. Only relevant if a
+ dictionary has been provided.
+
+ :return: Yields a dictionary for use by build functions to generate structures.
+ """
+
+ if isinstance(ms_data, dict):
+ for i, ms_id in enumerate(ms_data.keys()):
+
+ ms_data[ms_id]["ms_id"] = ms_id
+
+ # check if user has provided a neutralised mass or ionised mz values
+ if "neutral_fragment_masses" in ms_data[ms_id].keys() and "exact_mass" in ms_data[ms_id].keys():
+ which = "none"
+
+ elif "exact_mass" in ms_data[ms_id].keys():
+ if msn:
+ which = "fragments"
+ else:
+ which = "none"
+
+ elif "neutral_fragment_masses" in ms_data[ms_id].keys() or not msn:
+ which = "precursor"
+
+ else:
+ which = "both"
+
+ yield precursor_ions_to_neutral_masses(ms_data[ms_id], which)
+
+ else:
+ yield from parse_msp(ms_data)
+
+
+def precursor_ion_to_neutral_mass(mass, precursor_type):
+ """
+ Convert precursor ion to predicted neutral mass for substructure searching.
+
+ :param mass: Charged mass to be neutralised.
+
+ :param precursor_type: Type of precursor ion.
+
+ :return: Neutral mass.
+ """
+
+ # conversions
+ precursor_dict = {"[M+H]+": 1.007276,
+ "[M+Na]+": 22.989221,
+ "[M+K]+": 38.963158,
+ "[M-H]-": -1.007276,
+ "[M+Cl]-": 34.969401,
+ "[M+Na-2H]-": 20.974668,
+ "[M+K-2H]-": 36.948605,
+ "[M+Hac-H]-": 59.013853}
+
+ return mass - precursor_dict[precursor_type]
+
+
+def precursor_ions_to_neutral_masses(ms_dict, which="both"):
+ """
+ Convert precursor ion and fragment ions to neutral.
+
+ :param ms_dict: Dictionary used by build functions to generate structures. Converts the precursor ion mass and/or
+ the fragment ions to their respective neutral masses.
+
+ :param which: Whether to convert the precursor ion ("precursor"), the fragment ions ("fragments") or both ("both")
+ to their respective neutral masses. If which is "none", returns the original dictionary.
+
+ :return: Returns `ms_dict` with additional items corresponding to neutralised masses.
+ """
+
+ if which == "precursor" or which == "both":
+ ms_dict["exact_mass"] = precursor_ion_to_neutral_mass(ms_dict["precursor_mz"],
+ ms_dict["precursor_type"])
+
+ if which == "fragments" or which == "both":
+
+ ms_dict["neutral_fragment_masses"] = []
+
+ for fragment_ion_mass in ms_dict["fragment_mzs"]:
+ ms_dict["neutral_fragment_masses"].append(precursor_ion_to_neutral_mass(fragment_ion_mass,
+ ms_dict["precursor_type"]))
+
+ return ms_dict
+
+
+def parse_msp(msp_path):
+ """
+ Parse msp files and yield data for each compound. Accepts MSP files in MoNa or MassBank format. We expect that
+ the following are provided in the MSP:
+
+ - A unique accession ID.
+ - The molecular formula of the compound.
+ - The precursor mz representing the mass of the charged precursor ion.
+ - Fragment mzs representing masses of charged fragment ions.
+ - The type of precursor, e.g. "[M+H]+".
+
+ Code adapted from `msp2db` (https://github.com/computational-metabolomics/msp2db/blob/master/msp2db/parse.py).
+
+ :param msp_path: Path of an MSP file to be converted into a dictionary.
+
+ :return: Dictionary in a form useable by :py:meth:`metaboblend.build_structures.annotate_msn` and
+ :py:meth:`metaboblend.build_structures.generate_structures`.
+ """
+
+ meta_parse = get_msp_regex()
+ reached_spectra = False
+
+ empty_dict = {"ms_id": None, "mf": None, "precursor_mz": None, "precursor_type": None, "fragment_mzs": []}
+ entry_dict = copy.deepcopy(empty_dict)
+
+ with open(msp_path, "r") as msp_file:
+
+ for line in msp_file:
+
+ line = re.sub('^(.{2}\\$)', "", line) # remove "XX$" from line start in massbank files
+
+ if reached_spectra:
+ if line in ["\n", "\r\n", "//\n", "//\r\n", "", "//"]: # reached end of spectra
+
+ yield reformat_msp_input(entry_dict) # completed entry ready for sending to build
+
+ entry_dict = copy.deepcopy(empty_dict)
+ reached_spectra = False
+
+ else: # add peak
+ entry_dict["fragment_mzs"].append(float(line.split()[0]))
+
+ else:
+ for meta_type in meta_parse.keys():
+ for meta_re in meta_parse[meta_type]:
+
+ re_query = re.search(meta_re, line, re.IGNORECASE)
+
+ if re_query: # TODO: walrus
+ entry_dict[meta_type] = re_query.group(1).strip()
+
+ if re.match("^Num Peaks(.*)$", line, re.IGNORECASE) or re.match("^PEAK:(.*)", line, re.IGNORECASE):
+ reached_spectra = True # reached line prior to spectra
+
+ if entry_dict != empty_dict:
+ yield reformat_msp_input(entry_dict)
+
+
+def reformat_msp_input(entry_dict):
+ """
+ Reformat input for use by build functions.
+
+ :param entry_dict: Dictionary containing MSn information extracted from an MSP file (by
+ :py:meth:`metaboblend.parse.parse_msp`). The dictionary must contain the following:
+
+ - ms_id - a unique accession number
+ - mf - the molecular formula of the compound (in the format "CXHXNXOXPXSX")
+ - precursor_mz - mz representing the mass of the charged precursor ion
+ - precursor_type - the type of precursor ion (e.g. "[M+H]+")
+ - fragment_mzs - mz(s) representing the mass of charged fragment ions
+
+ :return: If the correct inputs were not provided in the MSP (and, hence, were not available in `entry_dict`),
+ returns None (and generates a warning with i) the accession (if available) and ii) the variable that was not
+ able to be extracted from the MSP). Else, returns the same dictionary after reformatting the molecular formula,
+ using :py:meth:`metaboblend.parse.mc_to_list`, and converting the precursor ions to their corresponding
+ neutral masses.
+ """
+
+ if entry_dict["mf"] is not None: # convert from C5H6... to [5, 6, ...]
+ entry_dict["mf"] = mc_to_list(entry_dict["mf"])
+
+ for key in ["ms_id", "mf", "precursor_mz", "precursor_type"]: # required for the tool to function
+ if entry_dict[key] is None:
+ if key == "ms_id":
+ warnings.warn("Entry ignored from MSP file due to lack of accession in MSP file")
+ else:
+ warnings.warn("Entry " + entry_dict["ms_id"] + " removed due to lack of valid " + key + " in MSP file")
+ return None
+
+ entry_dict["precursor_mz"] = float(entry_dict["precursor_mz"])
+
+ if len(entry_dict["fragment_mzs"]) == 0: # require a spectra to annotate
+ warnings.warn("No fragments were identified for " + entry_dict["ms_id"] + " in MSP file")
+ return None
+
+ return precursor_ions_to_neutral_masses(entry_dict)
+
+
+def mc_to_list(mc):
+ """
+ Convert molecular formula string to list format.
+
+ :param mc: Molecular formula (in the format "C1H2N3O4P5S6")
+
+ :return: Molecular formula (in the format `[1, 2, 3, 4, 5, 6]`)
+ """
+
+ if isinstance(mc, list):
+ return mc
+
+ mc_list = [0, 0, 0, 0, 0, 0]
+ element_positions = {"C": 0, "H": 1, "N": 2, "O": 3, "P": 4, "S": 5}
+
+ # separates out the formula into [letter, number, letter, number, ...]
+ mc = re.findall(r"[A-Z][a-z]*|\d+", re.sub("[A-Z][a-z]*(?![\da-z])", r"\g<0>1", mc))
+
+ for i, substring in enumerate(mc):
+
+ if i % 2 == 0: # in case of letter
+ try:
+ element_position = element_positions[substring]
+ except KeyError: # element not in C, H, N, O, P, S
+ return None
+
+ else: # record number following the letter
+ mc_list[element_position] = int(substring)
+
+ return mc_list
+
+
+def get_msp_regex():
+ """ Dictionary of regular expressions for parsing msp metadata. """
+
+ meta_parse = {"ms_id": ["^accession(?:=|:)(.*)$", "^DB#(?:=|:)(.*)$", "^ACCESSION:(.*)$"], # use accession as ms_id
+ "mf": ["^molecular formula(?:=|:)(.*)$", "^formula:(.*)$"],
+ "precursor_type": ["^precursor.*type(?:=|:)(.*)$", "^adduct(?:=|:)(.*)$", "^MS\$FOCUSED_ION:\s+PRECURSOR_TYPE\s+(.*)$"],
+ "precursor_mz": ["^precursor m/z(?:=|:)\s*(\d*[.,]?\d*)$", "^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$", "^MS\$FOCUSED_ION:\s+PRECURSOR_M/Z\s+(\d*[.,]?\d*)$"]}
+
+ return meta_parse
+
+
+def reformat_xml(source, encoding="utf8"):
+ """
+ Reformats HMDB xml files to be compatible with :py:meth:`metaboblend.parse.parse_xml`; some such files do not
+ contain a `<hmdb>` header.
+
+ :param source: Path to file to be reformatted.
+
+ :param encoding: Encoding of source file.
+
+ :return: Source file destination.
+ """
+
+ with io.open(source, "r", encoding=encoding) as xml:
+ xml_contents = xml.readlines()
+ if "hmdb" in xml_contents[1]:
+ return source
+
+ xml_contents.insert(1, " \n")
+
+ with io.open(source, "w", encoding=encoding) as xml:
+ xml_contents = "".join(xml_contents)
+ xml.write(xml_contents)
+ xml.write("")
+
+ return source
+
+
+def parse_xml(source, encoding="utf8", reformat=False):
+ """
+ Parses the contents of HMDB xml files to extract information for the generation of substructures.
+
+ :param source: Source file destination.
+
+ :param encoding: Encoding of source file.
+
+ :param reformat: Whether to apply :py:meth:`metaboblend.parse.reformat_xml` to the XML file. Is required for
+ XML files recording single metabolites.
+
+ * **True** Add a `<hmdb>` header to the XML file before parsing.
+
+ * **False** Parse the XML file as it is (recommended if header is present).
+
+ :return: The XML file converted to a dictionary.
+ """
+
+ if reformat:
+ reformat_xml(source, encoding)
+
+ with io.open(source, "r", encoding=encoding) as inp:
+ record_out = OrderedDict()
+
+ inp.readline()
+ inp.readline()
+
+ xml_record = ""
+ path = []
+
+ for line in inp:
+ xml_record += line
+ if line == "\n" or line == "\n":
+
+ if sys.version_info[0] == 3:
+ inp = io.StringIO(xml_record)
+ else:
+ inp = io.BytesIO(xml_record.encode('utf-8').strip())
+
+ for event, elem in ElementTree.iterparse(inp, events=("start", "end")):
+ if event == 'end':
+ path.pop()
+
+ if event == 'start':
+ path.append(elem.tag)
+ if elem.text is not None:
+ if elem.text.replace(" ", "") != "\n":
+
+ path_elem = ".".join(map(str, path[1:]))
+ if path_elem in record_out:
+ if type(record_out[path_elem]) != list:
+ record_out[path_elem] = [record_out[path_elem]]
+ record_out[path_elem].append(elem.text)
+ else:
+ record_out[path_elem] = elem.text
+
+ xml_record = ""
+ yield record_out
+ record_out = OrderedDict()
diff --git a/metaboblend/results.py b/metaboblend/results.py
new file mode 100644
index 0000000..898443f
--- /dev/null
+++ b/metaboblend/results.py
@@ -0,0 +1,299 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright © 2019-2020 Ralf Weber
+#
+# This file is part of MetaboBlend.
+#
+# MetaboBlend is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# MetaboBlend is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with MetaboBlend. If not, see <https://www.gnu.org/licenses/>.
+#
+
+import os
+import csv
+import sqlite3
+
+
+class ResultsDb:
+ """
+ Methods for interacting with the SQLITE3 results database, as created by
+ :py:meth:`metaboblend.build_structures.annotate_msn`.
+
+ :param path_results: Directory to which results will be written.
+ """
+
+ def __init__(self, path_results, msn=True):
+ """Constructor method."""
+
+ self.path_results = path_results
+ self.path_results_db = os.path.join(self.path_results, "metaboblend_results.sqlite")
+ self.msn = msn
+
+ self.conn = None
+ self.cursor = None
+
+ self.substructure_combo_id = 0
+
+ def connect(self):
+ """ Connects to the results database. """
+
+ self.conn = sqlite3.connect(self.path_results_db)
+ self.cursor = self.conn.cursor()
+
+ def create_results_db(self):
+ """ Generates a new results database. """
+
+ if os.path.exists(self.path_results_db):
+ os.remove(self.path_results_db)
+
+ self.connect()
+
+ self.cursor.execute("""CREATE TABLE queries (
+ ms_id_num INTEGER PRIMARY KEY,
+ ms_id TEXT,
+ exact_mass NUMERIC,
+ C INTEGER,
+ H INTEGER,
+ N INTEGER,
+ O INTEGER,
+ P INTEGER,
+ S INTEGER,
+ ppm INTEGER,
+ ha_min INTEGER,
+ ha_max INTEGER,
+ max_atoms_available INTEGER,
+ max_degree INTEGER,
+ max_n_substructures INTEGER,
+ hydrogenation_allowance INTEGER,
+ isomeric_smiles INTEGER)""")
+
+ if self.msn:
+ self.cursor.execute("""CREATE TABLE spectra (
+ ms_id_num INTEGER,
+ fragment_id INTEGER,
+ neutral_mass NUMERIC,
+ PRIMARY KEY (ms_id_num, fragment_id))""")
+
+ self.cursor.execute("""CREATE TABLE structures (
+ ms_id_num INTEGER,
+ structure_smiles TEXT,
+ frequency INTEGER,
+ PRIMARY KEY (ms_id_num, structure_smiles))""")
+
+ self.cursor.execute("""CREATE TABLE substructures (
+ substructure_combo_id INTEGER,
+ substructure_position_id INTEGER,
+ ms_id_num INTEGER,
+ structure_smiles TEXT,
+ fragment_id INTEGER,
+ substructure_smiles TEXT,
+ bde INTEGER,
+ PRIMARY KEY (substructure_combo_id, substructure_position_id))""")
+
+ self.cursor.execute("""CREATE TABLE results (
+ ms_id_num INTEGER,
+ fragment_id INTEGER,
+ structure_smiles TEXT,
+ bde INTEGER,
+ PRIMARY KEY(ms_id_num, fragment_id, structure_smiles))""")
+
+ self.conn.commit()
+
+ def add_ms(self, msn_data, ms_id, ms_id_num, parameters):
+ """
+ Add an entry to the `queries` table.
+
+ :param msn_data: Dictionary in the form
+ `msn_data[id] = {mf: [C, H, N, O, P, S], exact_mass: float, neutral_fragment_masses: []}`. id is a unique
+ identifier for a given spectral tree or fragmentation spectrum, mf is a list of integers referring to the
+ molecular formula of the structure of interest, exact_mass is the mass of this molecular formula to >=4d.p.
+ and neutral_fragment_masses are the neutral fragment masses generated by this structure, used to inform
+ candidate scoring. See :py:meth:`metaboblend.build_structures.annotate_msn`.
+
+ :param ms_id: Unique identifier for the annotation of a single metabolite.
+
+ :param ms_id_num: Unique numeric identifier for the annotation of a single metabolite.
+
+ :param parameters: List of parameters, in the form: [ppm, ha_min, ha_max, max_atoms_available, max_degree,
+ max_n_substructures, hydrogenation_allowance, isomeric_smiles]. See
+ :py:meth:`metaboblend.build_structures.annotate_msn`.
+ """
+
+ for i, parameter in enumerate(parameters):
+ if parameter is None:
+ parameters[i] = "NULL"
+ elif isinstance(parameter, bool):
+ parameters[i] = int(parameter)
+
+ self.cursor.execute("""INSERT INTO queries (
+ ms_id,
+ ms_id_num,
+ exact_mass,
+ C, H, N, O, P, S,
+ ppm,
+ ha_min,
+ ha_max,
+ max_atoms_available,
+ max_degree,
+ max_n_substructures,
+ hydrogenation_allowance,
+ isomeric_smiles
+ ) VALUES ('{}', {}, {}, '{}', '{}', '{}', '{}', '{}', '{}', {})""".format(
+ ms_id,
+ ms_id_num,
+ msn_data[ms_id]["exact_mass"],
+ msn_data[ms_id]["mf"][0], msn_data[ms_id]["mf"][1],
+ msn_data[ms_id]["mf"][2], msn_data[ms_id]["mf"][3],
+ msn_data[ms_id]["mf"][4], msn_data[ms_id]["mf"][5],
+ ", ".join([str(p) for p in parameters])
+ ))
+
+ self.conn.commit()
+
+ def add_results(self, ms_id_num, smi_dict, fragment_mass=None, fragment_id=None, retain_substructures=False):
+ """
+ Record which smiles were generated for a given fragment mass.
+
+ :param ms_id_num: Unique identifier for the annotation of a single metabolite.
+
+ :param smi_dict: The fragment and substructure smiles generated by the annotation of a single peak for a single
+ metabolite.
+
+ :param fragment_mass: The neutral fragment mass that has been annotated.
+
+ :param fragment_id: The unique identifier for the fragment mass that has been annotated.
+
+ :param retain_substructures: If True, record substructures in the results DB.
+ """
+
+ if self.msn:
+ self.cursor.execute("""INSERT OR IGNORE INTO spectra (
+ ms_id_num,
+ fragment_id,
+ neutral_mass
+ ) VALUES ('{}', {}, {})""".format(
+ ms_id_num,
+ fragment_id,
+ fragment_mass
+ ))
+ else:
+ fragment_id = "NULL"
+
+ for structure_smiles in smi_dict.keys():
+
+ self.cursor.execute("""INSERT OR IGNORE INTO results (
+ ms_id_num,
+ fragment_id,
+ structure_smiles,
+ bde
+ ) VALUES ({}, {}, '{}', {})""".format(
+ ms_id_num,
+ fragment_id,
+ structure_smiles,
+ min(smi_dict[structure_smiles]["bdes"])
+ ))
+
+ if retain_substructures:
+ for i in range(len(smi_dict[structure_smiles]["substructures"])): # for each combination
+
+ for j, substructure in enumerate(smi_dict[structure_smiles]["substructures"][i]):
+ self.cursor.execute("""INSERT INTO substructures (
+ substructure_combo_id,
+ substructure_position_id,
+ ms_id_num,
+ fragment_id,
+ structure_smiles,
+ substructure_smiles,
+ bde
+ ) VALUES ({}, {}, {}, {}, '{}', '{}', {})""".format(
+ self.substructure_combo_id,
+ j,
+ ms_id_num,
+ fragment_id,
+ structure_smiles,
+ substructure,
+ smi_dict[structure_smiles]["bdes"][i]
+ ))
+
+ self.substructure_combo_id += 1
+
+ self.conn.commit()
+
+ def calculate_frequencies(self, ms_id_num):
+ """
+ Calculates structure frequencies in the SQLite DB.
+
+ :param ms_id_num: Unique identifier for the annotation of a single metabolite.
+ """
+
+ self.cursor.execute("""INSERT INTO structures (ms_id_num, structure_smiles, frequency)
+ SELECT ms_id_num, structure_smiles, COUNT(*)
+ FROM results
+ WHERE ms_id_num = {}
+ GROUP BY structure_smiles""".format(ms_id_num))
+
+ def get_structures(self, ms_id_num):
+ """
+ Gets smiles of generated structures. In the case of the MSn annotation workflow, also gets structure
+ frequencies.
+
+ :param ms_id_num: Unique identifier for the annotation of a single metabolite.
+
+ :return: In the case of simple structure generation, returns a list of smiles strings for output structures.
+ For the MSn annotation workflow, returns a list of `(smiles, frequency)` tuples, where frequency is the
+ number of peaks for which the smiles were generated.
+ """
+
+ if self.msn:
+ msn_str = ", frequency"
+ else:
+ msn_str = ""
+
+ self.cursor.execute("""SELECT structure_smiles{} FROM structures
+ WHERE ms_id_num = {}
+ """.format(msn_str, ms_id_num))
+
+ if self.msn:
+ return [t for t in self.cursor.fetchall()]
+ else:
+ return [item for t in self.cursor.fetchall() for item in t]
+
+ def generate_csv_output(self):
+ """ Generate CSV file output for i) queries and tool parameters and ii) structures generated. """
+
+ with open(os.path.join(self.path_results, "metaboblend_queries.csv"), "w", newline="") as results_file, \
+ open(os.path.join(self.path_results, "metaboblend_structures.csv"), "w", newline="") as ms_file:
+
+ results_writer = csv.writer(results_file, delimiter=",")
+ ms_writer = csv.writer(ms_file, delimiter=",")
+
+ results_writer.writerow(["ms_id", "exact_mass", "C", "H", "N", "O", "P", "S", "ppm", "ha_min", "ha_max",
+ "max_atoms_available", "max_degree", "max_n_substructures",
+ "hydrogenation_allowance", "isomeric_smiles"])
+
+ self.cursor.execute("SELECT * FROM queries")
+
+ for query in self.cursor.fetchall():
+ results_writer.writerow(query)
+
+ ms_writer.writerow(["ms_id", "smiles", "frequency", "exact_mass", "C", "H", "N", "O", "P", "S"])
+
+ self.cursor.execute("SELECT * FROM structures")
+
+ for structure in self.cursor.fetchall():
+ ms_writer.writerow(structure)
+
+ def close(self):
+ """ Close the connection to the SQLITE3 database. """
+
+ self.conn.close()
diff --git a/tests/test_build_structures.py b/tests/test_build_structures.py
index e74f81d..32a67c7 100644
--- a/tests/test_build_structures.py
+++ b/tests/test_build_structures.py
@@ -202,7 +202,7 @@ def test_generate_structures(self): # tests vs build
ms_data = {record_dict["HMDB_ID"]: {"mf": [record_dict["C"], record_dict["H"], record_dict["N"],
record_dict["O"], record_dict["P"], record_dict["S"]],
"exact_mass": record_dict["exact_mass"],
- "prescribed_masses": fragments[i]}}
+ "prescribed_mass": fragments[i]}}
# test prescribed building
returned_smis = list(
@@ -280,7 +280,7 @@ def test_annotate_msn(self): # tests vs build_msn
ms_data = {record_dict["HMDB_ID"]: {"mf": [record_dict["C"], record_dict["H"], record_dict["N"],
record_dict["O"], record_dict["P"], record_dict["S"]],
"exact_mass": record_dict["exact_mass"],
- "fragment_masses": fragments}}
+ "neutral_fragment_masses": fragments}}
# test standard building
returned_smis = list(annotate_msn(
@@ -307,7 +307,7 @@ def test_annotate_msn(self): # tests vs build_msn
ms_data[record_dict["HMDB_ID"]] = {"mf": [record_dict["C"], record_dict["H"], record_dict["N"],
record_dict["O"], record_dict["P"], record_dict["S"]],
"exact_mass": record_dict["exact_mass"],
- "fragment_masses": fragments}
+ "neutral_fragment_masses": fragments}
os.mkdir(self.to_test_results("annotate_multi"))
@@ -338,7 +338,7 @@ def test_results_db(self):
ms_data[record_dict["HMDB_ID"]] = {"mf": [record_dict["C"], record_dict["H"], record_dict["N"],
record_dict["O"], record_dict["P"], record_dict["S"]],
"exact_mass": record_dict["exact_mass"],
- "fragment_masses": fragments}
+ "neutral_fragment_masses": fragments}
os.mkdir(self.to_test_results("test_results_db"))
diff --git a/tests/test_data/massbank_msp.txt b/tests/test_data/massbank_msp.txt
new file mode 100644
index 0000000..cdfeb00
--- /dev/null
+++ b/tests/test_data/massbank_msp.txt
@@ -0,0 +1,87 @@
+ACCESSION: UO000002
+RECORD_TITLE: 2,3-di-O-Phytanyl-sn-glycerol-1-phosphoserine; EI-B; MS
+DATE: 2016.01.19 (Created 2009.05.29, modified 2011.05.06)
+AUTHORS: Hiroyuki Morii, Department of Chemistry, University of Occupational and Environmental Health
+LICENSE: CC BY-SA
+PUBLICATION: Morii, H., Nishihara, M., Ohga, M., and Koga, Y. 1986. A diphytanyl ether analog of phosphatidylserine from a methanogenic bacterium, Methanobrevibacter arboriphilus. J Lipid Res. 27: 724-730.
+COMMENT: Ammonium salt of the compound was analyzed
+COMMENT: [Analytical] Ionizing Curr 300 uA, Chamber Temp 250 C, Accel Volt 3KV, Ion Multi 1.0 KV,
+CH$NAME: 2,3-di-O-Phytanyl-sn-glycerol-1-phosphoserine
+CH$NAME: archaetidylserine
+CH$COMPOUND_CLASS: Glycerophospholipids; Glycerophosphoserines; Dialkylglycerophosphoserines
+CH$FORMULA: C46H94NO8P
+CH$EXACT_MASS: 819.67171
+CH$SMILES: C(CCC(C)C)C(C)CCCC(CCCC(CCOCC(OCCC(CCCC(C)CCCC(C)CCCC(C)C)C)COP(O)(=O)OCC(C(O)=O)N)C)C
+CH$IUPAC: InChI=1S/C46H94NO8P/c1-36(2)17-11-19-38(5)21-13-23-40(7)25-15-27-42(9)29-31-52-33-44(34-54-56(50,51)55-35-45(47)46(48)49)53-32-30-43(10)28-16-26-41(8)24-14-22-39(6)20-12-18-37(3)4/h36-45H,11-35,47H2,1-10H3,(H,48,49)(H,50,51)/t38-,39-,40-,41-,42-,43-,44-,45-/m1/s1
+CH$LINK: CAS 105662-26-8
+CH$LINK: LIPIDBANK EEL3026
+AC$INSTRUMENT: JMS DX-300/JMS-3500 data system, Japan Electron Optics Laboratory, Japan
+AC$INSTRUMENT_TYPE: EI-B
+AC$MASS_SPECTROMETRY: MS_TYPE MS
+AC$MASS_SPECTROMETRY: ION_MODE POSITIVE
+AC$MASS_SPECTROMETRY: IONIZATION_POTENTIAL 30 eV
+AC$MASS_SPECTROMETRY: SCANNING 5 Sec
+AC$MASS_SPECTROMETRY: SOURCE_TEMPERATURE 320 C
+MS$FOCUSED_ION: ION_TYPE [M]+*
+PK$SPLASH: splash10-05gi-9611001000-5e7663b41cf47681770e
+PK$NUM_PEAK: 58
+PK$PEAK: m/z int. rel.int.
+ 36.0 48.935 69
+ 43.0 155.642 221
+ 55.0 174.853 248
+ 56.0 241.397 343
+ 57.0 685.069 973
+ 69.0 429.724 610
+ 70.0 442.816 629
+ 71.0 703.562 999
+ 74.0 153.368 218
+ 81.0 183.718 261
+ 82.0 116.090 165
+ 83.0 432.501 614
+ 84.0 189.394 269
+ 85.0 524.513 745
+ 95.0 102.128 145
+ 96.0 179.720 255
+ 97.0 449.439 638
+ 98.0 118.287 168
+ 99.0 376.928 535
+ 111.0 429.907 610
+ 112.0 168.627 239
+ 113.0 308.186 438
+ 123.0 298.191 423
+ 124.0 233.188 331
+ 125.0 504.097 716
+ 126.0 362.310 514
+ 127.0 300.450 427
+ 139.0 123.338 175
+ 140.0 233.340 331
+ 141.0 235.767 335
+ 153.0 107.469 153
+ 155.0 163.241 232
+ 169.0 130.189 185
+ 182.0 94.773 135
+ 183.0 160.234 228
+ 196.0 132.555 188
+ 197.0 163.622 232
+ 211.0 75.654 107
+ 278.0 326.649 464
+ 279.0 220.386 313
+ 280.0 227.649 323
+ 281.0 143.572 204
+ 296.0 158.358 225
+ 297.0 60.502 86
+ 309.0 35.370 50
+ 325.0 279.819 397
+ 326.0 71.000 101
+ 340.0 133.760 190
+ 341.0 60.486 86
+ 343.0 243.579 346
+ 344.0 60.028 85
+ 354.0 51.468 73
+ 373.0 132.555 188
+ 374.0 42.069 60
+ 383.0 63.767 91
+ 634.0 450.446 640
+ 635.0 212.497 302
+ 636.0 49.622 70
+//
diff --git a/tests/test_data/mona_msp.msp b/tests/test_data/mona_msp.msp
new file mode 100644
index 0000000..b1f6d21
--- /dev/null
+++ b/tests/test_data/mona_msp.msp
@@ -0,0 +1,580 @@
+Name: Sulfaclozine
+Synon: 4-amino-N-(6-chloropyrazin-2-yl)benzenesulfonamide
+SYNON: $:00in-source
+DB#: AU100601
+InChIKey: QKLPUVXBJHRFQZ-UHFFFAOYSA-N
+Precursor_type: [M+H]+
+Spectrum_type: MS2
+PrecursorMZ: 285.0208
+Instrument_type: LC-ESI-QTOF
+Instrument: Bruker maXis Impact
+Ion_mode: P
+Collision_energy: Ramp 21.1-31.6 eV
+Formula: C10H9ClN4O2S
+MW: 284
+ExactMass: 284.013474208
+Comments: "accession=AU100601" "author=Nikiforos Alygizakis, Anna Bletsou, Nikolaos Thomaidis, University of Athens" "license=CC BY" "copyright=Copyright (C) 2015 Department of Chemistry, University of Athens" "exact mass=284.0135" "instrument=Bruker maXis Impact" "instrument type=LC-ESI-QTOF" "ms level=MS2" "ionization=ESI" "fragmentation mode=CID" "collision energy=Ramp 21.1-31.6 eV" "resolution=35000" "column=Acclaim RSLC C18 2.2um, 2.1x100mm, Thermo" "flow gradient=99/1 at 0-1 min, 61/39 at 3 min, 0.1/99.9 at 14-16 min, 99/1 at 16.1-20 min" "flow rate=200 uL/min at 0-3 min, 400 uL/min at 14 min, 480 uL/min at 16-19 min, 200 uL/min at 19.1-20 min" "retention time=4.6 min" "solvent a=water with 0.01% formic acid and 5mM ammonium formate" "solvent b=90:10 methanol:water with 0.01% formic acid and 5mM ammonium formate" "precursor m/z=285.0208" "precursor type=[M+H]+" "ionization mode=positive" "mass accuracy=0.17469602228006656" "mass error=4.9792000027082395E-5" "SMILES=c1cc(ccc1N)S(=O)(=O)Nc2cncc(n2)Cl" "cas=102-65-8" "pubchem cid=66890" "chemspider=60252" "InChI=InChI=1S/C10H9ClN4O2S/c11-9-5-13-6-10(14-9)15-18(16,17)8-3-1-7(12)2-4-8/h1-6H,12H2,(H,14,15)" "InChIKey=QKLPUVXBJHRFQZ-UHFFFAOYSA-N" "molecular formula=C10H9ClN4O2S" "total exact mass=284.013474208" "SMILES=C=1C=C(C=CC1N)S(N=C2C=NC=C(Cl)N2)(=O)=O"
+Num Peaks: 27
+53.0389 0.594951
+54.0333 0.566811
+55.0178 0.522592
+60.0552 0.542692
+65.0382 3.822962
+66.0423 0.506512
+68.049 7.963499
+78.0333 0.727609
+79.0177 1.057244
+92.0498 7.702203
+93.0532 0.731629
+96.0443 0.623091
+108.0457 12.172375
+109.0483 1.181862
+110.0609 4.904325
+120.0562 3.095353
+130.0172 5.656054
+132.0138 1.515517
+156.0118 100.000000
+157.015 8.884065
+158.008 3.891301
+174.0228 0.751729
+184.0757 0.619071
+191.9647 0.590931
+219.0438 0.723589
+285.0221 3.694324
+287.0184 0.840167
+
+
+Name: Sulfachlorpyridazine
+Synon: 4-amino-N-(6-chloropyridazin-3-yl)benzenesulfonamide
+SYNON: $:00in-source
+DB#: AU100701
+InChIKey: XOXHILFPRYWFOD-UHFFFAOYSA-N
+Precursor_type: [M+H]+
+Spectrum_type: MS2
+PrecursorMZ: 285.0208
+Instrument_type: LC-ESI-QTOF
+Instrument: Bruker maXis Impact
+Ion_mode: P
+Collision_energy: Ramp 21.1-31.6 eV
+Formula: C10H9ClN4O2S
+MW: 284
+ExactMass: 284.013474208
+Comments: "accession=AU100701" "author=Nikiforos Alygizakis, Anna Bletsou, Nikolaos Thomaidis, University of Athens" "license=CC BY" "copyright=Copyright (C) 2015 Department of Chemistry, University of Athens" "exact mass=284.0135" "instrument=Bruker maXis Impact" "instrument type=LC-ESI-QTOF" "ms level=MS2" "ionization=ESI" "fragmentation mode=CID" "collision energy=Ramp 21.1-31.6 eV" "resolution=35000" "column=Acclaim RSLC C18 2.2um, 2.1x100mm, Thermo" "flow gradient=99/1 at 0-1 min, 61/39 at 3 min, 0.1/99.9 at 14-16 min, 99/1 at 16.1-20 min" "flow rate=200 uL/min at 0-3 min, 400 uL/min at 14 min, 480 uL/min at 16-19 min, 200 uL/min at 19.1-20 min" "retention time=4.6 min" "solvent a=water with 0.01% formic acid and 5mM ammonium formate" "solvent b=90:10 methanol:water with 0.01% formic acid and 5mM ammonium formate" "precursor m/z=285.0208" "precursor type=[M+H]+" "ionization mode=positive" "mass accuracy=0.17469602228006656" "mass error=4.9792000027082395E-5" "SMILES=c1cc(ccc1N)S(=O)(=O)Nc2ccc(nn2)Cl" "cas=80-32-0" "pubchem cid=6634" "chemspider=6382" "InChI=InChI=1S/C10H9ClN4O2S/c11-9-5-6-10(14-13-9)15-18(16,17)8-3-1-7(12)2-4-8/h1-6H,12H2,(H,14,15)" "InChIKey=XOXHILFPRYWFOD-UHFFFAOYSA-N" "molecular formula=C10H9ClN4O2S" "total exact mass=284.013474208" "SMILES=C=1C=C(C=CC1N)S(NC=2C=CC(Cl)=NN2)(=O)=O"
+Num Peaks: 27
+53.0389 0.594951
+54.0333 0.566811
+55.0178 0.522592
+60.0552 0.542692
+65.0382 3.822962
+66.0423 0.506512
+68.049 7.963499
+78.0333 0.727609
+79.0177 1.057244
+92.0498 7.702203
+93.0532 0.731629
+96.0443 0.623091
+108.0457 12.172375
+109.0483 1.181862
+110.0609 4.904325
+120.0562 3.095353
+130.0172 5.656054
+132.0138 1.515517
+156.0118 100.000000
+157.015 8.884065
+158.008 3.891301
+174.0228 0.751729
+184.0757 0.619071
+191.9647 0.590931
+219.0438 0.723589
+285.0221 3.694324
+287.0184 0.840167
+
+
+Name: Sulfadimidine
+Synon: 4-amino-N-(4,6-dimethylpyrimidin-2-yl)benzenesulfonamide
+SYNON: $:00in-source
+DB#: AU100801
+InChIKey: ASWVTGNCAZCNNR-UHFFFAOYSA-N
+Precursor_type: [M+H]+
+Spectrum_type: MS2
+PrecursorMZ: 279.091
+Instrument_type: LC-ESI-QTOF
+Instrument: Bruker maXis Impact
+Ion_mode: P
+Collision_energy: Ramp 20.8-31.3 eV
+Formula: C12H14N4O2S
+MW: 278
+ExactMass: 278.08374668799996
+Comments: "accession=AU100801" "author=Nikiforos Alygizakis, Anna Bletsou, Nikolaos Thomaidis, University of Athens" "license=CC BY" "copyright=Copyright (C) 2015 Department of Chemistry, University of Athens" "exact mass=278.0837" "instrument=Bruker maXis Impact" "instrument type=LC-ESI-QTOF" "ms level=MS2" "ionization=ESI" "fragmentation mode=CID" "collision energy=Ramp 20.8-31.3 eV" "resolution=35000" "column=Acclaim RSLC C18 2.2um, 2.1x100mm, Thermo" "flow gradient=99/1 at 0-1 min, 61/39 at 3 min, 0.1/99.9 at 14-16 min, 99/1 at 16.1-20 min" "flow rate=200 uL/min at 0-3 min, 400 uL/min at 14 min, 480 uL/min at 16-19 min, 200 uL/min at 19.1-20 min" "retention time=4.4 min" "solvent a=water with 0.01% formic acid and 5mM ammonium formate" "solvent b=90:10 methanol:water with 0.01% formic acid and 5mM ammonium formate" "precursor m/z=279.091" "precursor type=[M+H]+" "ionization mode=positive" "mass accuracy=0.08129248146496051" "mass error=-2.2687999944537296E-5" "SMILES=Cc1cc(nc(n1)NS(=O)(=O)c2ccc(cc2)N)C" "cas=57-68-1" "kegg=C19530" "pubchem cid=5327" "chemspider=5136" "InChI=InChI=1S/C12H14N4O2S/c1-8-7-9(2)15-12(14-8)16-19(17,18)11-5-3-10(13)4-6-11/h3-7H,13H2,1-2H3,(H,14,15,16)" "InChIKey=ASWVTGNCAZCNNR-UHFFFAOYSA-N" "molecular formula=C12H14N4O2S" "total exact mass=278.08374668799996" "SMILES=CC1=CC(C)=NC(=N1)NS(C2=CC=C(C=C2)N)(=O)=O"
+Num Peaks: 46
+53.0379 0.894101
+54.0335 0.661867
+55.0176 0.598003
+65.0379 8.717487
+68.0491 13.013818
+69.0329 1.640153
+78.0334 1.477589
+79.0178 2.261379
+80.0489 1.431143
+81.0444 1.950766
+82.0284 0.606712
+92.0499 30.585230
+93.0558 2.844868
+94.0647 1.686600
+95.0608 3.027752
+96.0443 1.300511
+108.0461 33.946818
+109.0497 2.360079
+110.0616 6.107757
+111.0651 0.519624
+120.0565 1.962378
+122.0716 6.078727
+123.0794 2.246865
+124.0872 71.211681
+125.0905 6.398049
+126.0663 17.911054
+127.0697 0.595100
+156.0117 82.855318
+157.0148 5.739085
+158.0072 1.544357
+174.0224 1.106015
+186.0334 11.263353
+187.0368 0.775081
+188.0128 1.637250
+188.0291 0.534138
+204.0445 100.000000
+205.0473 6.972829
+206.0406 3.358686
+213.1141 18.259405
+214.1167 2.241059
+215.0927 3.071296
+215.1291 1.320831
+279.0925 61.483976
+280.0953 8.438806
+281.0725 7.837901
+282.0742 1.222132
+
+
+Name: Sulfamethazine
+Synon: 4-amino-N-(4,6-dimethylpyrimidin-2-yl)benzenesulfonamide
+SYNON: $:00in-source
+DB#: AU100802
+InChIKey: ASWVTGNCAZCNNR-UHFFFAOYSA-N
+Precursor_type: [M+H]+
+Spectrum_type: MS2
+PrecursorMZ: 279.091
+Instrument_type: LC-ESI-QTOF
+Instrument: Bruker maXis Impact
+Ion_mode: P
+Collision_energy: 20 eV
+Formula: C12H14N4O2S
+MW: 278
+ExactMass: 278.08374668799996
+Comments: "accession=AU100802" "author=Nikiforos Alygizakis, Nikolaos Thomaidis, University of Athens" "license=CC BY-SA" "copyright=Copyright (C) 2015 Department of Chemistry, University of Athens" "exact mass=278.0837467" "instrument=Bruker maXis Impact" "instrument type=LC-ESI-QTOF" "ms level=MS2" "ionization=ESI" "fragmentation mode=CID" "collision energy=20 eV" "resolution=35000" "column=Acclaim RSLC C18 2.2um, 2.1x100mm, Thermo" "flow gradient=99/1 at 0-1 min, 61/39 at 3 min, 0.1/99.9 at 14-16 min, 99/1 at 16.1-20 min" "flow rate=200 uL/min at 0-3 min, 400 uL/min at 14 min, 480 uL/min at 16-19 min, 200 uL/min at 19.1-20 min" "retention time=4.1 min" "solvent a=90:10 water:methanol with 0.01% formic acid and 5mM ammonium formate" "solvent b=methanol with 0.01% formic acid and 5mM ammonium formate" "precursor m/z=279.091" "precursor type=[M+H]+" "ionization mode=positive" "mass accuracy=0.08129248146496051" "mass error=-2.2687999944537296E-5" "SMILES=CC1=CC(C)=NC(NS(=O)(=O)C2=CC=C(N)C=C2)=N1" "cas=57-68-1" "chebi=102265" "kegg=D02436" "pubchem cid=5327" "chemspider=5136" "InChI=InChI=1S/C12H14N4O2S/c1-8-7-9(2)15-12(14-8)16-19(17,18)11-5-3-10(13)4-6-11/h3-7H,13H2,1-2H3,(H,14,15,16)" "InChIKey=ASWVTGNCAZCNNR-UHFFFAOYSA-N" "molecular formula=C12H14N4O2S" "total exact mass=278.08374668799996" "SMILES=CC1=CC(C)=NC(=N1)NS(C2=CC=C(C=C2)N)(=O)=O"
+Num Peaks: 16
+122.0703 0.766124
+124.0861 36.693459
+125.0892 1.930893
+149.0227 0.828453
+156.0104 53.249536
+157.0129 2.999571
+158.0061 1.778967
+174.0209 0.627183
+186.0321 22.621444
+187.0346 1.719235
+188.0285 0.646661
+204.0431 100.000000
+213.1128 8.749399
+214.1159 1.407591
+215.1281 0.658348
+279.0909 80.894937
+
+
+Name: Sulfamethazine
+Synon: 4-amino-N-(4,6-dimethylpyrimidin-2-yl)benzenesulfonamide
+SYNON: $:00in-source
+DB#: AU100803
+InChIKey: ASWVTGNCAZCNNR-UHFFFAOYSA-N
+Precursor_type: [M+H]+
+Spectrum_type: MS2
+PrecursorMZ: 279.091
+Instrument_type: LC-ESI-QTOF
+Instrument: Bruker maXis Impact
+Ion_mode: P
+Collision_energy: 30 eV
+Formula: C12H14N4O2S
+MW: 278
+ExactMass: 278.083746688
+Comments: "accession=AU100803" "author=Nikiforos Alygizakis, Nikolaos Thomaidis, University of Athens" "license=CC BY-SA" "copyright=Copyright (C) 2015 Department of Chemistry, University of Athens" "exact mass=278.0837467" "instrument=Bruker maXis Impact" "instrument type=LC-ESI-QTOF" "ms level=MS2" "ionization=ESI" "fragmentation mode=CID" "collision energy=30 eV" "resolution=35000" "column=Acclaim RSLC C18 2.2um, 2.1x100mm, Thermo" "flow gradient=99/1 at 0-1 min, 61/39 at 3 min, 0.1/99.9 at 14-16 min, 99/1 at 16.1-20 min" "flow rate=200 uL/min at 0-3 min, 400 uL/min at 14 min, 480 uL/min at 16-19 min, 200 uL/min at 19.1-20 min" "retention time=4.2 min" "solvent a=90:10 water:methanol with 0.01% formic acid and 5mM ammonium formate" "solvent b=methanol with 0.01% formic acid and 5mM ammonium formate" "precursor m/z=279.091" "precursor type=[M+H]+" "ionization mode=positive" "mass accuracy=0.08129248166863394" "mass error=-2.2688000001380715E-5" "SMILES=CC1=CC(C)=NC(NS(=O)(=O)C2=CC=C(N)C=C2)=N1" "cas=57-68-1" "chebi=102265" "kegg=D02436" "pubchem cid=5327" "chemspider=5136" "InChI=InChI=1S/C12H14N4O2S/c1-8-7-9(2)15-12(14-8)16-19(17,18)11-5-3-10(13)4-6-11/h3-7H,13H2,1-2H3,(H,14,15,16)" "InChIKey=ASWVTGNCAZCNNR-UHFFFAOYSA-N" "molecular formula=C12H14N4O2S" "total exact mass=278.083746688" "SMILES=CC1=CC(C)=NC(=N1)NS(C2=CC=C(C=C2)N)(=O)=O"
+Num Peaks: 17
+108.0441 1.285794
+122.0704 6.630847
+123.0781 2.170942
+124.0861 100.000000
+125.0889 6.093546
+149.0221 1.388285
+156.0106 50.043481
+158.0064 1.615007
+186.0323 15.118951
+187.0355 1.323064
+196.0858 1.220573
+204.0429 70.964035
+205.0455 4.931983
+213.1123 22.610100
+214.1155 3.003292
+215.1283 0.804398
+279.0903 3.580968
+
+
+Name: Sulfamethazine
+Synon: 4-amino-N-(4,6-dimethylpyrimidin-2-yl)benzenesulfonamide
+SYNON: $:00in-source
+DB#: AU100804
+InChIKey: ASWVTGNCAZCNNR-UHFFFAOYSA-N
+Precursor_type: [M+H]+
+Spectrum_type: MS2
+PrecursorMZ: 279.091
+Instrument_type: LC-ESI-QTOF
+Instrument: Bruker maXis Impact
+Ion_mode: P
+Collision_energy: 40 eV
+Formula: C12H14N4O2S
+MW: 278
+ExactMass: 278.083746688
+Comments: "accession=AU100804" "author=Nikiforos Alygizakis, Nikolaos Thomaidis, University of Athens" "license=CC BY-SA" "copyright=Copyright (C) 2015 Department of Chemistry, University of Athens" "exact mass=278.0837467" "instrument=Bruker maXis Impact" "instrument type=LC-ESI-QTOF" "ms level=MS2" "ionization=ESI" "fragmentation mode=CID" "collision energy=40 eV" "resolution=35000" "column=Acclaim RSLC C18 2.2um, 2.1x100mm, Thermo" "flow gradient=99/1 at 0-1 min, 61/39 at 3 min, 0.1/99.9 at 14-16 min, 99/1 at 16.1-20 min" "flow rate=200 uL/min at 0-3 min, 400 uL/min at 14 min, 480 uL/min at 16-19 min, 200 uL/min at 19.1-20 min" "retention time=4.1 min" "solvent a=90:10 water:methanol with 0.01% formic acid and 5mM ammonium formate" "solvent b=methanol with 0.01% formic acid and 5mM ammonium formate" "precursor m/z=279.091" "precursor type=[M+H]+" "ionization mode=positive" "mass accuracy=0.08129248166863394" "mass error=-2.2688000001380715E-5" "SMILES=CC1=CC(C)=NC(NS(=O)(=O)C2=CC=C(N)C=C2)=N1" "cas=57-68-1" "chebi=102265" "kegg=D02436" "pubchem cid=5327" "chemspider=5136" "InChI=InChI=1S/C12H14N4O2S/c1-8-7-9(2)15-12(14-8)16-19(17,18)11-5-3-10(13)4-6-11/h3-7H,13H2,1-2H3,(H,14,15,16)" "InChIKey=ASWVTGNCAZCNNR-UHFFFAOYSA-N" "molecular formula=C12H14N4O2S" "total exact mass=278.083746688" "SMILES=CC1=CC(C)=NC(=N1)NS(C2=CC=C(C=C2)N)(=O)=O"
+Num Peaks: 22
+108.0445 1.153673
+122.0702 5.323878
+123.0772 2.202467
+124.0862 100.000000
+125.089 6.847126
+134.0701 0.714179
+149.0224 1.747990
+154.0624 0.644259
+155.0685 0.624282
+156.0104 10.373071
+157.0126 0.933926
+172.0852 0.564351
+186.0324 3.845578
+196.0852 5.209010
+197.0903 1.378415
+198.0888 2.362283
+204.0427 15.422264
+205.0463 0.869001
+206.0375 0.759127
+212.1036 0.659242
+213.1121 18.109174
+214.1152 2.577036
+
+
+Name: Sulfamethazine
+Synon: 4-amino-N-(4,6-dimethylpyrimidin-2-yl)benzenesulfonamide
+SYNON: $:00in-source
+DB#: AU100805
+InChIKey: ASWVTGNCAZCNNR-UHFFFAOYSA-N
+Precursor_type: [M+H]+
+Spectrum_type: MS2
+PrecursorMZ: 279.091
+Instrument_type: LC-ESI-QTOF
+Instrument: Bruker maXis Impact
+Ion_mode: P
+Collision_energy: 50 eV
+Formula: C12H14N4O2S
+MW: 278
+ExactMass: 278.083746688
+Comments: "accession=AU100805" "author=Nikiforos Alygizakis, Nikolaos Thomaidis, University of Athens" "license=CC BY-SA" "copyright=Copyright (C) 2015 Department of Chemistry, University of Athens" "exact mass=278.0837467" "instrument=Bruker maXis Impact" "instrument type=LC-ESI-QTOF" "ms level=MS2" "ionization=ESI" "fragmentation mode=CID" "collision energy=50 eV" "resolution=35000" "column=Acclaim RSLC C18 2.2um, 2.1x100mm, Thermo" "flow gradient=99/1 at 0-1 min, 61/39 at 3 min, 0.1/99.9 at 14-16 min, 99/1 at 16.1-20 min" "flow rate=200 uL/min at 0-3 min, 400 uL/min at 14 min, 480 uL/min at 16-19 min, 200 uL/min at 19.1-20 min" "retention time=4.2 min" "solvent a=90:10 water:methanol with 0.01% formic acid and 5mM ammonium formate" "solvent b=methanol with 0.01% formic acid and 5mM ammonium formate" "precursor m/z=279.091" "precursor type=[M+H]+" "ionization mode=positive" "mass accuracy=0.08129248166863394" "mass error=-2.2688000001380715E-5" "SMILES=CC1=CC(C)=NC(NS(=O)(=O)C2=CC=C(N)C=C2)=N1" "cas=57-68-1" "chebi=102265" "kegg=D02436" "pubchem cid=5327" "chemspider=5136" "InChI=InChI=1S/C12H14N4O2S/c1-8-7-9(2)15-12(14-8)16-19(17,18)11-5-3-10(13)4-6-11/h3-7H,13H2,1-2H3,(H,14,15,16)" "InChIKey=ASWVTGNCAZCNNR-UHFFFAOYSA-N" "molecular formula=C12H14N4O2S" "total exact mass=278.083746688" "SMILES=CC1=CC(C)=NC(=N1)NS(C2=CC=C(C=C2)N)(=O)=O"
+Num Peaks: 24
+108.0453 1.770916
+122.0703 2.803951
+123.078 2.792598
+124.0859 100.000000
+125.0891 7.901010
+149.0231 1.623340
+154.0639 2.111477
+155.0605 2.463390
+155.0714 2.690430
+156.01 2.713134
+169.0745 1.475763
+171.0781 1.555228
+172.0869 1.271427
+181.0634 0.930866
+186.1022 1.033034
+195.0786 1.555228
+196.0859 7.628562
+197.0856 3.871041
+198.0886 5.903054
+199.0904 0.998978
+204.0438 2.622318
+212.1048 2.327165
+213.1122 9.342718
+214.1153 1.725508
+
+
+Name: Sulfamethazine
+Synon: 4-amino-N-(4,6-dimethylpyrimidin-2-yl)benzenesulfonamide
+SYNON: $:00in-source
+DB#: AU100806
+InChIKey: ASWVTGNCAZCNNR-UHFFFAOYSA-N
+Precursor_type: [M+H]+
+Spectrum_type: MS2
+PrecursorMZ: 279.091
+Instrument_type: LC-ESI-QTOF
+Instrument: Bruker maXis Impact
+Ion_mode: P
+Collision_energy: 10 eV
+Formula: C12H14N4O2S
+MW: 278
+ExactMass: 278.08374668799996
+Comments: "accession=AU100806" "author=Nikiforos Alygizakis, Nikolaos Thomaidis, University of Athens" "license=CC BY-SA" "copyright=Copyright (C) 2015 Department of Chemistry, University of Athens" "exact mass=278.0837467" "instrument=Bruker maXis Impact" "instrument type=LC-ESI-QTOF" "ms level=MS2" "ionization=ESI" "fragmentation mode=CID" "collision energy=10 eV" "resolution=35000" "column=Acclaim RSLC C18 2.2um, 2.1x100mm, Thermo" "flow gradient=99/1 at 0-1 min, 61/39 at 3 min, 0.1/99.9 at 14-16 min, 99/1 at 16.1-20 min" "flow rate=200 uL/min at 0-3 min, 400 uL/min at 14 min, 480 uL/min at 16-19 min, 200 uL/min at 19.1-20 min" "retention time=4.2 min" "solvent a=90:10 water:methanol with 0.01% formic acid and 5mM ammonium formate" "solvent b=methanol with 0.01% formic acid and 5mM ammonium formate" "precursor m/z=279.091" "precursor type=[M+H]+" "ionization mode=positive" "mass accuracy=0.08129248146496051" "mass error=-2.2687999944537296E-5" "SMILES=CC1=CC(C)=NC(NS(=O)(=O)C2=CC=C(N)C=C2)=N1" "cas=57-68-1" "chebi=102265" "kegg=D02436" "pubchem cid=5327" "chemspider=5136" "InChI=InChI=1S/C12H14N4O2S/c1-8-7-9(2)15-12(14-8)16-19(17,18)11-5-3-10(13)4-6-11/h3-7H,13H2,1-2H3,(H,14,15,16)" "InChIKey=ASWVTGNCAZCNNR-UHFFFAOYSA-N" "molecular formula=C12H14N4O2S" "total exact mass=278.08374668799996" "SMILES=CC1=CC(C)=NC(=N1)NS(C2=CC=C(C=C2)N)(=O)=O"
+Num Peaks: 4
+124.086 0.740586
+156.0098 1.123942
+186.0319 0.831793
+279.0908 100.000000
+
+
+Name: Sulfadimethoxine
+Synon: 4-amino-n-(2,6-dimethoxypyrimidin-4-yl)benzenesulfonamide
+SYNON: $:00in-source
+DB#: AU100902
+InChIKey: ZZORFUFYDOWNEF-UHFFFAOYSA-N
+Precursor_type: [M+H]+
+Spectrum_type: MS2
+PrecursorMZ: 311.0809
+Instrument_type: LC-ESI-QTOF
+Instrument: Bruker maXis Impact
+Ion_mode: P
+Collision_energy: 20 eV
+Formula: C12H14N4O4S
+MW: 310
+ExactMass: 310.07357592799997
+Comments: "accession=AU100902" "author=Nikiforos Alygizakis, Anna Bletsou, Nikolaos Thomaidis, University of Athens" "license=CC BY-SA" "copyright=Copyright (C) 2015 Department of Chemistry, University of Athens" "exact mass=310.0735759" "instrument=Bruker maXis Impact" "instrument type=LC-ESI-QTOF" "ms level=MS2" "ionization=ESI" "fragmentation mode=CID" "collision energy=20 eV" "resolution=35000" "column=Acclaim RSLC C18 2.2um, 2.1x100mm, Thermo" "flow gradient=99/1 at 0-1 min, 61/39 at 3 min, 0.1/99.9 at 14-16 min, 99/1 at 16.1-20 min" "flow rate=200 uL/min at 0-3 min, 400 uL/min at 14 min, 480 uL/min at 16-19 min, 200 uL/min at 19.1-20 min" "retention time=4.6 min" "solvent a=water with 0.01% formic acid and 5mM ammonium formate" "solvent b=90:10 methanol:water with 0.01% formic acid and 5mM ammonium formate" "precursor m/z=311.0809" "precursor type=[M+H]+" "ionization mode=positive" "mass accuracy=0.15453214911901536" "mass error=4.8072000026877504E-5" "SMILES=COc1cc(nc(n1)OC)NS(=O)(=O)c2ccc(cc2)N" "cas=122-11-2" "chebi=32161" "pubchem=5323" "chemspider=5132" "InChI=InChI=1S/C12H14N4O4S/c1-19-11-7-10(14-12(15-11)20-2)16-21(17,18)9-5-3-8(13)4-6-9/h3-7H,13H2,1-2H3,(H,14,15,16)" "InChIKey=ZZORFUFYDOWNEF-UHFFFAOYSA-N" "molecular formula=C12H14N4O4S" "total exact mass=310.07357592799997" "SMILES=COC=1C=C(N=C(N1)OC)NS(C2=CC=C(C=C2)N)(=O)=O"
+Num Peaks: 15
+140.0447 6.249276
+141.0515 0.699085
+154.0604 5.781932
+155.0683 3.398864
+156.0107 100.000000
+156.0763 16.893901
+157.0134 4.171334
+157.0794 0.857441
+158.0069 2.371480
+218.0242 0.965586
+245.1032 9.010853
+246.1061 0.834267
+311.0811 75.335059
+312.0835 7.145340
+313.0796 2.301958
+
+
+Name: Sulfadimethoxine
+Synon: 4-amino-n-(2,6-dimethoxypyrimidin-4-yl)benzenesulfonamide
+SYNON: $:00in-source
+DB#: AU100903
+InChIKey: ZZORFUFYDOWNEF-UHFFFAOYSA-N
+Precursor_type: [M+H]+
+Spectrum_type: MS2
+PrecursorMZ: 311.0809
+Instrument_type: LC-ESI-QTOF
+Instrument: Bruker maXis Impact
+Ion_mode: P
+Collision_energy: 30 eV
+Formula: C12H14N4O4S
+MW: 310
+ExactMass: 310.073575928
+Comments: "accession=AU100903" "author=Nikiforos Alygizakis, Anna Bletsou, Nikolaos Thomaidis, University of Athens" "license=CC BY-SA" "copyright=Copyright (C) 2015 Department of Chemistry, University of Athens" "exact mass=310.0735759" "instrument=Bruker maXis Impact" "instrument type=LC-ESI-QTOF" "ms level=MS2" "ionization=ESI" "fragmentation mode=CID" "collision energy=30 eV" "resolution=35000" "column=Acclaim RSLC C18 2.2um, 2.1x100mm, Thermo" "flow gradient=99/1 at 0-1 min, 61/39 at 3 min, 0.1/99.9 at 14-16 min, 99/1 at 16.1-20 min" "flow rate=200 uL/min at 0-3 min, 400 uL/min at 14 min, 480 uL/min at 16-19 min, 200 uL/min at 19.1-20 min" "retention time=4.6 min" "solvent a=water with 0.01% formic acid and 5mM ammonium formate" "solvent b=90:10 methanol:water with 0.01% formic acid and 5mM ammonium formate" "precursor m/z=311.0809" "precursor type=[M+H]+" "ionization mode=positive" "mass accuracy=0.15453214893628664" "mass error=4.8071999970034085E-5" "SMILES=COc1cc(nc(n1)OC)NS(=O)(=O)c2ccc(cc2)N" "cas=122-11-2" "chebi=32161" "pubchem=5323" "chemspider=5132" "InChI=InChI=1S/C12H14N4O4S/c1-19-11-7-10(14-12(15-11)20-2)16-21(17,18)9-5-3-8(13)4-6-9/h3-7H,13H2,1-2H3,(H,14,15,16)" "InChIKey=ZZORFUFYDOWNEF-UHFFFAOYSA-N" "molecular formula=C12H14N4O4S" "total exact mass=310.073575928" "SMILES=COC=1C=C(N=C(N1)OC)NS(C2=CC=C(C=C2)N)(=O)=O"
+Num Peaks: 21
+108.0448 1.310092
+124.0204 1.354502
+126.0659 3.563895
+127.0504 0.843788
+138.0294 1.576552
+141.0517 10.458532
+154.0604 60.575108
+155.0672 5.484623
+156.0105 100.000000
+156.0762 63.495059
+157.0131 4.540913
+157.0798 3.452870
+158.0071 2.320417
+201.0772 2.720107
+212.069 3.896969
+218.0235 0.843788
+230.0808 9.270567
+231.0843 1.232375
+245.1039 10.447430
+246.107 1.176862
+311.0829 3.819252
+
+
+Name: Sulfadimethoxine
+Synon: 4-amino-n-(2,6-dimethoxypyrimidin-4-yl)benzenesulfonamide
+SYNON: $:00in-source
+DB#: AU100904
+InChIKey: ZZORFUFYDOWNEF-UHFFFAOYSA-N
+Precursor_type: [M+H]+
+Spectrum_type: MS2
+PrecursorMZ: 311.0809
+Instrument_type: LC-ESI-QTOF
+Instrument: Bruker maXis Impact
+Ion_mode: P
+Collision_energy: 40 eV
+Formula: C12H14N4O4S
+MW: 310
+ExactMass: 310.073575928
+Comments: "accession=AU100904" "author=Nikiforos Alygizakis, Anna Bletsou, Nikolaos Thomaidis, University of Athens" "license=CC BY-SA" "copyright=Copyright (C) 2015 Department of Chemistry, University of Athens" "exact mass=310.0735759" "instrument=Bruker maXis Impact" "instrument type=LC-ESI-QTOF" "ms level=MS2" "ionization=ESI" "fragmentation mode=CID" "collision energy=40 eV" "resolution=35000" "column=Acclaim RSLC C18 2.2um, 2.1x100mm, Thermo" "flow gradient=99/1 at 0-1 min, 61/39 at 3 min, 0.1/99.9 at 14-16 min, 99/1 at 16.1-20 min" "flow rate=200 uL/min at 0-3 min, 400 uL/min at 14 min, 480 uL/min at 16-19 min, 200 uL/min at 19.1-20 min" "retention time=4.7 min" "solvent a=water with 0.01% formic acid and 5mM ammonium formate" "solvent b=90:10 methanol:water with 0.01% formic acid and 5mM ammonium formate" "precursor m/z=311.0809" "precursor type=[M+H]+" "ionization mode=positive" "mass accuracy=0.15453214893628664" "mass error=4.8071999970034085E-5" "SMILES=COc1cc(nc(n1)OC)NS(=O)(=O)c2ccc(cc2)N" "cas=122-11-2" "chebi=32161" "pubchem=5323" "chemspider=5132" "InChI=InChI=1S/C12H14N4O4S/c1-19-11-7-10(14-12(15-11)20-2)16-21(17,18)9-5-3-8(13)4-6-9/h3-7H,13H2,1-2H3,(H,14,15,16)" "InChIKey=ZZORFUFYDOWNEF-UHFFFAOYSA-N" "molecular formula=C12H14N4O4S" "total exact mass=310.073575928" "SMILES=COC=1C=C(N=C(N1)OC)NS(C2=CC=C(C=C2)N)(=O)=O"
+Num Peaks: 27
+112.0515 2.118270
+123.0436 2.184466
+124.0205 1.897617
+124.0508 2.162401
+126.0666 5.803177
+127.0502 2.030009
+132.0558 1.963813
+138.0295 4.898500
+140.045 77.780229
+141.0524 38.989409
+142.058 2.449250
+154.0604 100.000000
+155.0634 5.383936
+156.0104 20.101500
+156.0407 4.236540
+156.0761 54.744042
+157.0639 1.809356
+160.049 1.985878
+178.0597 3.420124
+201.077 8.274492
+202.0789 1.787290
+212.0697 15.114740
+213.0728 2.581642
+229.0713 2.206531
+230.0797 6.421006
+231.0852 1.919682
+245.1026 1.919682
+
+
+Name: Sulfadimethoxine
+Synon: 4-amino-n-(2,6-dimethoxypyrimidin-4-yl)benzenesulfonamide
+SYNON: $:00in-source
+DB#: AU100905
+InChIKey: ZZORFUFYDOWNEF-UHFFFAOYSA-N
+Precursor_type: [M+H]+
+Spectrum_type: MS2
+PrecursorMZ: 311.0809
+Instrument_type: LC-ESI-QTOF
+Instrument: Bruker maXis Impact
+Ion_mode: P
+Collision_energy: 50 eV
+Formula: C12H14N4O4S
+MW: 310
+ExactMass: 310.07357592799997
+Comments: "accession=AU100905" "author=Nikiforos Alygizakis, Anna Bletsou, Nikolaos Thomaidis, University of Athens" "license=CC BY-SA" "copyright=Copyright (C) 2015 Department of Chemistry, University of Athens" "exact mass=310.0735759" "instrument=Bruker maXis Impact" "instrument type=LC-ESI-QTOF" "ms level=MS2" "ionization=ESI" "fragmentation mode=CID" "collision energy=50 eV" "resolution=35000" "column=Acclaim RSLC C18 2.2um, 2.1x100mm, Thermo" "flow gradient=99/1 at 0-1 min, 61/39 at 3 min, 0.1/99.9 at 14-16 min, 99/1 at 16.1-20 min" "flow rate=200 uL/min at 0-3 min, 400 uL/min at 14 min, 480 uL/min at 16-19 min, 200 uL/min at 19.1-20 min" "retention time=4.7 min" "solvent a=water with 0.01% formic acid and 5mM ammonium formate" "solvent b=90:10 methanol:water with 0.01% formic acid and 5mM ammonium formate" "precursor m/z=311.0809" "precursor type=[M+H]+" "ionization mode=positive" "mass accuracy=0.15453214911901536" "mass error=4.8072000026877504E-5" "SMILES=COc1cc(nc(n1)OC)NS(=O)(=O)c2ccc(cc2)N" "cas=122-11-2" "chebi=32161" "pubchem=5323" "chemspider=5132" "InChI=InChI=1S/C12H14N4O4S/c1-19-11-7-10(14-12(15-11)20-2)16-21(17,18)9-5-3-8(13)4-6-9/h3-7H,13H2,1-2H3,(H,14,15,16)" "InChIKey=ZZORFUFYDOWNEF-UHFFFAOYSA-N" "molecular formula=C12H14N4O4S" "total exact mass=310.07357592799997" "SMILES=COC=1C=C(N=C(N1)OC)NS(C2=CC=C(C=C2)N)(=O)=O"
+Num Peaks: 22
+112.051 5.243790
+123.0427 7.773689
+124.0502 6.439742
+126.0287 5.841766
+126.0666 6.255750
+127.0491 3.955842
+132.0559 9.521619
+133.0628 5.105796
+138.0293 10.579577
+140.045 45.768169
+141.0521 46.780129
+142.0539 3.817847
+154.0606 100.000000
+156.0102 3.679853
+156.0405 5.243790
+156.0769 17.157314
+157.0629 5.887764
+160.0507 3.909844
+178.0613 9.429623
+184.0741 4.737810
+201.0768 9.015639
+212.0705 7.589696
+
+
+Name: Sulfadoxine
+Synon: 4-amino-N-(5,6-dimethoxypyrimidin-4-yl)benzenesulfonamide
+SYNON: $:00in-source
+DB#: AU101001
+InChIKey: PJSFRIWCGOHTNF-UHFFFAOYSA-N
+Precursor_type: [M+H]+
+Spectrum_type: MS2
+PrecursorMZ: 311.0809
+Instrument_type: LC-ESI-QTOF
+Instrument: Bruker maXis Impact
+Ion_mode: P
+Collision_energy: Ramp 21.8-32.7 eV
+Formula: C12H14N4O4S
+MW: 310
+ExactMass: 310.07357592799997
+Comments: "accession=AU101001" "author=Nikiforos Alygizakis, Anna Bletsou, Nikolaos Thomaidis, University of Athens" "license=CC BY" "copyright=Copyright (C) 2015 Department of Chemistry, University of Athens" "exact mass=310.0736" "instrument=Bruker maXis Impact" "instrument type=LC-ESI-QTOF" "ms level=MS2" "ionization=ESI" "fragmentation mode=CID" "collision energy=Ramp 21.8-32.7 eV" "resolution=35000" "column=Acclaim RSLC C18 2.2um, 2.1x100mm, Thermo" "flow gradient=99/1 at 0-1 min, 61/39 at 3 min, 0.1/99.9 at 14-16 min, 99/1 at 16.1-20 min" "flow rate=200 uL/min at 0-3 min, 400 uL/min at 14 min, 480 uL/min at 16-19 min, 200 uL/min at 19.1-20 min" "retention time=4.8 min" "solvent a=water with 0.01% formic acid and 5mM ammonium formate" "solvent b=90:10 methanol:water with 0.01% formic acid and 5mM ammonium formate" "precursor m/z=311.0809" "precursor type=[M+H]+" "ionization mode=positive" "mass accuracy=0.15453214911901536" "mass error=4.8072000026877504E-5" "SMILES=COc1c(ncnc1OC)NS(=O)(=O)c2ccc(cc2)N" "cas=2447-57-6" "kegg=C07630" "pubchem cid=17134" "chemspider=16218" "InChI=InChI=1S/C12H14N4O4S/c1-19-10-11(14-7-15-12(10)20-2)16-21(17,18)9-5-3-8(13)4-6-9/h3-7H,13H2,1-2H3,(H,14,15,16)" "InChIKey=PJSFRIWCGOHTNF-UHFFFAOYSA-N" "molecular formula=C12H14N4O4S" "total exact mass=310.07357592799997" "SMILES=COC1=C(N=CN=C1OC)NS(C2=CC=C(C=C2)N)(=O)=O"
+Num Peaks: 42
+53.0386 0.535490
+54.0339 0.505437
+65.0381 7.755041
+68.0491 10.088247
+69.0329 1.049123
+78.0332 1.038195
+79.0179 1.721217
+80.0363 0.707612
+80.0493 1.446642
+92.0498 46.272062
+93.0559 3.808535
+96.0447 1.331894
+108.0463 57.395771
+109.049 3.816731
+109.0643 0.531392
+110.0614 7.111633
+113.0359 0.703513
+120.0568 1.860554
+124.0215 1.529971
+124.0512 0.572373
+126.0665 2.939730
+138.0301 0.707612
+140.0457 34.351948
+141.0528 5.744222
+154.0615 32.562428
+155.0682 7.798754
+156.0118 100.000000
+156.0771 40.377575
+157.0147 7.961314
+157.0796 2.486203
+158.0078 3.766188
+201.0773 1.349653
+212.0697 2.576362
+213.0752 0.527294
+218.0236 1.945249
+230.0808 5.531119
+231.085 0.811431
+245.1045 18.128791
+246.1073 2.479373
+311.0829 49.986340
+312.0854 8.491339
+313.0812 2.222556
+
+
+Name: Sulfadiazine
+Synon: 4-amino-n-pyrimidin-2-ylbenzenesulfonamide
+SYNON: $:00in-source
+DB#: AU101101
+InChIKey: SEEPANYCNGTZFQ-UHFFFAOYSA-N
+Precursor_type: [M+H]+
+Spectrum_type: MS2
+PrecursorMZ: 251.0597
+Instrument_type: LC-ESI-QTOF
+Instrument: Bruker maXis Impact
+Ion_mode: P
+Collision_energy: 10 eV
+Formula: C10H10N4O2S
+MW: 250
+ExactMass: 250.05244656
+Comments: "accession=AU101101" "author=Nikiforos Alygizakis, Anna Bletsou, Nikolaos Thomaidis, University of Athens" "license=CC BY-SA" "copyright=Copyright (C) 2015 Department of Chemistry, University of Athens" "exact mass=250.0524466" "instrument=Bruker maXis Impact" "instrument type=LC-ESI-QTOF" "ms level=MS2" "ionization=ESI" "fragmentation mode=CID" "collision energy=10 eV" "resolution=35000" "column=Acclaim RSLC C18 2.2um, 2.1x100mm, Thermo" "flow gradient=99/1 at 0-1 min, 61/39 at 3 min, 0.1/99.9 at 14-16 min, 99/1 at 16.1-20 min" "flow rate=200 uL/min at 0-3 min, 400 uL/min at 14 min, 480 uL/min at 16-19 min, 200 uL/min at 19.1-20 min" "retention time=3.3 min" "solvent a=water with 0.01% formic acid and 5mM ammonium formate" "solvent b=90:10 methanol:water with 0.01% formic acid and 5mM ammonium formate" "precursor m/z=251.0597" "precursor type=[M+H]+" "ionization mode=positive" "mass accuracy=0.08985910518808851" "mass error=-2.2559999990789947E-5" "SMILES=c1cnc(nc1)NS(=O)(=O)c2ccc(cc2)N" "cas=141582-64-1" "chebi=9328" "kegg=C07658" "pubchem=5215" "chemspider=5026" "InChI=InChI=1S/C10H10N4O2S/c11-8-2-4-9(5-3-8)17(15,16)14-10-12-6-1-7-13-10/h1-7H,11H2,(H,12,13,14)" "InChIKey=SEEPANYCNGTZFQ-UHFFFAOYSA-N" "molecular formula=C10H10N4O2S" "total exact mass=250.05244656" "SMILES=C1=CN=C(N=C1)NS(C2=CC=C(C=C2)N)(=O)=O"
+Num Peaks: 6
+156.0106 9.361897
+174.0199 0.724251
+176.012 0.693756
+251.0596 100.000000
+252.0616 7.867653
+253.0565 2.729283
\ No newline at end of file
diff --git a/tests/test_databases.py b/tests/test_databases.py
index daae038..20c7ff7 100644
--- a/tests/test_databases.py
+++ b/tests/test_databases.py
@@ -25,6 +25,7 @@
import shutil
import pickle
from metaboblend.databases import *
+from metaboblend.parse import reformat_xml
class DatabasesTestCase(unittest.TestCase):
diff --git a/tests/test_isomorphism_database.py b/tests/test_isomorphism_database.py
index 96ff705..6ad11a4 100644
--- a/tests/test_isomorphism_database.py
+++ b/tests/test_isomorphism_database.py
@@ -21,6 +21,7 @@
import os
+import sys
import unittest
import shutil
import tempfile
@@ -45,46 +46,50 @@ def setUpClass(cls):
shutil.copytree(os.path.join(os.path.dirname(os.path.realpath(__file__)), "test_data"),
cls.to_test_results("test_data"))
+ def test_create_connectivity_database(self):
+
pkg_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
+
if sys.platform == "win32" or sys.platform == "win64": # TODO: add RI as dependency
- cls.path_ri = os.path.join(pkg_path, "tools", "RI_win", "RI3.6-release", "ri36")
+ self.path_ri = os.path.join(pkg_path, "tools", "RI_win", "RI3.6-release", "ri36")
- elif sys.platform == "darwin":
- cls.path_ri = os.path.join(pkg_path, "tools", "RI_mac", "RI3.6-release", "ri36")
+ else:
- elif sys.platform == "linux2":
- if "bb" in "socket.gethostname":
- cls.path_ri = os.path.join(pkg_path, "tools", "RI_unix", "RI3.6-release", "ri36")
- else:
- cls.path_ri = os.path.join(pkg_path, "tools", "RI_bb", "RI3.6-release", "ri36")
+ if sys.platform == "darwin":
+ self.path_ri = os.path.join(pkg_path, "tools", "RI_mac", "RI3.6-release", "ri36")
- elif sys.platform == "linux":
- cls.path_ri = os.path.join(pkg_path, "tools", "RI_unix", "RI3.6-release", "ri36")
+ elif sys.platform == "linux2":
+ if "bb" in "socket.gethostname":
+ self.path_ri = os.path.join(pkg_path, "tools", "RI_unix", "RI3.6-release", "ri36")
+ else:
+ self.path_ri = os.path.join(pkg_path, "tools", "RI_bb", "RI3.6-release", "ri36")
- create_connectivity_database(cls.to_test_results("connectivity.sqlite"),
- 3, # sizes
- [1, 2], # boxes
- cls.path_ri
- )
+ elif sys.platform == "linux":
+ self.path_ri = os.path.join(pkg_path, "tools", "RI_unix", "RI3.6-release", "ri36")
- def test_create_connectivity_database(self):
- ref_db = sqlite3.connect(self.to_test_data("connectivity.sqlite"))
- ref_db_cursor = ref_db.cursor()
- ref_db_cursor.execute("SELECT * FROM subgraphs")
+ create_connectivity_database(self.to_test_results("connectivity.sqlite"),
+ 3, # sizes
+ [1, 2], # boxes
+ self.path_ri
+ )
+
+ ref_db = sqlite3.connect(self.to_test_data("connectivity.sqlite"))
+ ref_db_cursor = ref_db.cursor()
+ ref_db_cursor.execute("SELECT * FROM subgraphs")
- test_db = sqlite3.connect(self.to_test_results("connectivity.sqlite"))
- test_db_cursor = test_db.cursor()
- test_db_cursor.execute("SELECT * FROM subgraphs")
+ test_db = sqlite3.connect(self.to_test_results("connectivity.sqlite"))
+ test_db_cursor = test_db.cursor()
+ test_db_cursor.execute("SELECT * FROM subgraphs")
- ref_rows = {}
- for row in ref_db_cursor.fetchall():
- ref_rows[row[0]] = row
+ test_rows = {}
+ for row in test_db_cursor.fetchall():
+ test_rows[row[0]] = row
- for row in test_db_cursor.fetchall():
- self.assertEqual(row, ref_rows[row[0]])
+ for row in ref_db_cursor.fetchall():
+ self.assertEqual(row, test_rows[row[0]])
- ref_db.close()
- test_db.close()
+ ref_db.close()
+ test_db.close()
if __name__ == '__main__':
diff --git a/tests/test_parse.py b/tests/test_parse.py
new file mode 100644
index 0000000..513fb68
--- /dev/null
+++ b/tests/test_parse.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright © 2019-2020 Ralf Weber
+#
+# This file is part of MetaboBlend.
+#
+# MetaboBlend is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# MetaboBlend is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with MetaboBlend. If not, see <https://www.gnu.org/licenses/>.
+#
+
+
+import os
+import copy
+import shutil
+import tempfile
+import unittest
+from metaboblend.parse import *
+
+
+class IsomorphDbTestCase(unittest.TestCase):
+    """Tests for ``metaboblend.parse``. NOTE(review): the class name does not mention parsing - rename?"""
+
+    temp_results_dir = None  # tempfile.TemporaryDirectory, created in setUpClass
+
+    @classmethod
+    def to_test_results(cls, *args):
+        """Join ``args`` onto the path of the temporary results directory."""
+        return os.path.join(os.path.dirname(os.path.realpath(__file__)), cls.temp_results_dir.name, *args)
+
+    @classmethod
+    def to_test_data(cls, *args):
+        """Join ``args`` onto the ``test_data`` copy inside the temporary results directory."""
+        return os.path.join(os.path.dirname(os.path.realpath(__file__)), cls.temp_results_dir.name, "test_data", *args)
+
+    @classmethod
+    def setUpClass(cls):
+        """Copy ``test_data`` into a fresh temporary directory and set the AU101101 reference values."""
+        cls.temp_results_dir = tempfile.TemporaryDirectory(dir=os.path.dirname(os.path.realpath(__file__)))
+
+        shutil.copytree(os.path.join(os.path.dirname(os.path.realpath(__file__)), "test_data"),
+                        cls.to_test_results("test_data"))
+
+        cls.neutral_fragment_masses = [155.00332400000002, 173.01262400000002, 175.004724,
+                                       250.052324, 251.054324, 252.049224]
+        cls.exact_mass = 250.052424
+        cls.mf = [10, 10, 4, 2, 0, 1]
+        cls.precursor_mz = 251.0597
+        cls.fragment_mzs = [156.0106, 174.0199, 176.012, 251.0596, 252.0616, 253.0565]
+
+    @classmethod
+    def tearDownClass(cls):
+        """Delete the temporary directory explicitly rather than leaving it to the object's finalizer."""
+        cls.temp_results_dir.cleanup()
+
+    def _ms_dict(self, **extra):
+        """Return the AU101101 input record, extended or overridden by the ``extra`` items."""
+        return dict({"ms_id": "AU101101", "mf": self.mf, "precursor_mz": self.precursor_mz,
+                     "fragment_mzs": self.fragment_mzs, "precursor_type": "[M+H]+"}, **extra)
+
+    def _parse_one(self, ms_id, ms_dict, *args):
+        """Return the first ``parse_ms_data`` result for a deep copy of ``ms_dict`` keyed by ``ms_id``."""
+        return list(parse_ms_data({ms_id: copy.deepcopy(ms_dict)}, *args))[0]
+
+    def test_parse_msp(self):
+        """``parse_msp`` yields ``None`` for the first two records; the last record is AU101101."""
+        for i, ms in enumerate(parse_msp(self.to_test_data("mona_msp.msp"))):
+            if i < 2:
+                self.assertIsNone(ms)
+            else:
+                self.assertIsNotNone(ms)
+        # after the loop ``ms`` holds the last record yielded for the file
+        self.assertEqual(ms, self._ms_dict(exact_mass=self.exact_mass,
+                                           neutral_fragment_masses=self.neutral_fragment_masses))
+
+        self.assertIsNone(list(parse_msp(self.to_test_data("massbank_msp.txt")))[0])
+
+        # parse_ms_data must give the same output as parse_msp when it is handed the path of an msp file
+        for parse_msp_dict, parse_ms_dict in zip(parse_msp(self.to_test_data("mona_msp.msp")),
+                                                 parse_ms_data(self.to_test_data("mona_msp.msp"))):
+            self.assertEqual(parse_msp_dict, parse_ms_dict)
+
+    def test_parse_ms_data(self):
+        """``parse_ms_data`` adds the masses that are missing and keeps the ones that were provided."""
+        # exact mass and neutral fragment masses should not be overwritten by parse_ms_data
+        full_ms_dict = self._ms_dict(exact_mass="abcd", neutral_fragment_masses=["a", "b", "c", "d"])
+        self.assertEqual(self._parse_one("AU101101", full_ms_dict), full_ms_dict)
+
+        # a provided exact mass is kept; the expected ms_id is the key the record was passed under
+        exact_mass_ms_dict = self._ms_dict(exact_mass="abc")
+        parsed_exact_mass_ms_dict = self._parse_one("test", exact_mass_ms_dict)
+        exact_mass_ms_dict["ms_id"] = "test"
+        exact_mass_ms_dict["neutral_fragment_masses"] = self.neutral_fragment_masses
+        self.assertEqual(parsed_exact_mass_ms_dict, exact_mass_ms_dict)
+
+        # neutral fragment masses should not be overwritten by parse_ms_data
+        neutral_fragment_masses_ms_dict = self._ms_dict(neutral_fragment_masses=["a", "b", "c", "d"])
+        parsed_neutral_fragment_masses_ms_dict = self._parse_one("AU101101", neutral_fragment_masses_ms_dict)
+        neutral_fragment_masses_ms_dict["exact_mass"] = self.exact_mass
+        self.assertEqual(parsed_neutral_fragment_masses_ms_dict, neutral_fragment_masses_ms_dict)
+
+        uncalculated_ms_dict = self._ms_dict()
+        parsed_uncalculated_ms_dict = self._parse_one("AU101101", uncalculated_ms_dict)
+        uncalculated_ms_dict["exact_mass"] = self.exact_mass
+        uncalculated_ms_dict["neutral_fragment_masses"] = self.neutral_fragment_masses
+        self.assertEqual(parsed_uncalculated_ms_dict, uncalculated_ms_dict)
+
+        # test with msn=False
+        generate_structures_dict = {"ms_id": "AU101101", "mf": self.mf, "precursor_mz": self.precursor_mz,
+                                    "prescribed_mass": "m", "precursor_type": "[M+H]+"}
+        parsed_generate_structures_dict = self._parse_one("AU101101", generate_structures_dict, False)
+        generate_structures_dict["exact_mass"] = self.exact_mass
+        self.assertEqual(parsed_generate_structures_dict, generate_structures_dict)
+
+        # test with exact mass provided
+        generate_structures_dict["exact_mass"] = "a"
+        parsed_generate_structures_dict = self._parse_one("AU101101", generate_structures_dict, False)
+        self.assertEqual(parsed_generate_structures_dict, generate_structures_dict)
+
+    def test_precursor_ions_to_neutral_masses(self):
+        """Check every ``which`` option for the [M+H]+ and the [M-H]- precursor type."""
+        ms_dict = self._ms_dict()
+        for which in ["both", "fragments", "precursor", "none"]:
+            processed_ms_dict = precursor_ions_to_neutral_masses(copy.deepcopy(ms_dict), which)
+            if which in ["both", "fragments"]:
+                self.assertEqual(processed_ms_dict["neutral_fragment_masses"], self.neutral_fragment_masses)
+            if which in ["both", "precursor"]:
+                self.assertEqual(processed_ms_dict["exact_mass"], self.exact_mass)
+
+        ms_dict["precursor_type"] = "[M-H]-"
+        # for [M-H]- the expected neutral masses are the m/z values plus 1.007276
+        neutral_fragment_masses = [mz + 1.007276 for mz in self.fragment_mzs]
+        for which in ["both", "fragments", "precursor", "none"]:
+            processed_ms_dict = precursor_ions_to_neutral_masses(copy.deepcopy(ms_dict), which)
+            if which in ["both", "fragments"]:
+                self.assertEqual(processed_ms_dict["neutral_fragment_masses"], neutral_fragment_masses)
+            if which in ["both", "precursor"]:
+                self.assertEqual(processed_ms_dict["exact_mass"], self.precursor_mz + 1.007276)
+
+    def test_reformat_msp_input(self):
+        """A complete record is reformatted; a missing precursor m/z or empty fragment list must warn."""
+        unformatted_msp_dict = {'ms_id': 'AU101101', 'mf': 'C10H10N4O2S', 'precursor_mz': '251.0597',
+                                'fragment_mzs': self.fragment_mzs, 'precursor_type': '[M+H]+'}
+        formatted_msp_dict = self._ms_dict(exact_mass=self.exact_mass,
+                                           neutral_fragment_masses=self.neutral_fragment_masses)
+        self.assertEqual(reformat_msp_input(unformatted_msp_dict), formatted_msp_dict)
+
+        # the call has to run inside assertWarns; passing its return value would execute it too early
+        unformatted_msp_dict["precursor_mz"] = None
+        with self.assertWarns(UserWarning):
+            reformat_msp_input(unformatted_msp_dict)
+
+        unformatted_msp_dict["precursor_mz"] = self.precursor_mz
+        unformatted_msp_dict["fragment_mzs"] = []
+        with self.assertWarns(UserWarning):
+            reformat_msp_input(unformatted_msp_dict)
+
+    def test_mc_to_list(self):
+        """``mc_to_list`` converts formula strings to count lists; ``None`` is expected for C10H9ClN4O2S."""
+        mc_lists = [[12, 14, 4, 4, 0, 1], [10, 10, 4, 2, 0, 1], [46, 94, 1, 8, 1, 0], [46, 94, 1, 8, 1, 0], None]
+        word_formulae = ["C12H14N4O4S", "C10H10N4O2S", "C46H94NO8P", "C46H94NO8P1", "C10H9ClN4O2S"]
+        for word_formula, mc_list in zip(word_formulae, mc_lists):
+            self.assertEqual(mc_to_list(word_formula), mc_list)
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/tests/test_suite_auxiliary.py b/tests/test_suite_auxiliary.py
index c44171d..cc211e2 100644
--- a/tests/test_suite_auxiliary.py
+++ b/tests/test_suite_auxiliary.py
@@ -27,6 +27,7 @@
from pathlib import Path
from . import test_auxiliary
+from . import test_parse
sys.path.insert(0, str(Path(__file__).parent.parent.resolve()))
@@ -35,6 +36,7 @@
suite = unittest.TestSuite()
suite.addTest(unittest.findTestCases(test_auxiliary))
+ suite.addTest(unittest.findTestCases(test_parse))
report = os.path.join(os.path.abspath(os.path.join(__file__, os.pardir)), 'results', 'results_test_suite_auxiliary')
runTestSuite(suite, report, title='Process Test Suite Report', verbosity=2)