diff --git a/metaboblend/algorithms.py b/metaboblend/algorithms.py new file mode 100644 index 0000000..624b00a --- /dev/null +++ b/metaboblend/algorithms.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright © 2019-2020 Ralf Weber +# +# This file is part of MetaboBlend. +# +# MetaboBlend is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# MetaboBlend is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with MetaboBlend. If not, see <https://www.gnu.org/licenses/>. +# + +import numpy + + +def find_path(mass_list, sum_matrix, n, mass, max_subset_length, path=[]): + """ + Recursive solution for backtracking through the dynamic programming boolean matrix. All possible subsets are found. + + :param mass_list: A list of masses from which to identify subsets. + + :param mass: The target mass of the sum of the substructures. + + :param sum_matrix: The dynamic programming boolean matrix. + + :param n: The size of mass_list. + + :param max_subset_length: The maximum length of subsets to return. Allows the recursive backtracking algorithm to + terminate early in many cases, significantly improving runtime. + + :param path: List for keeping track of the current subset. + + :return: Generator of lists containing the masses of valid subsets. + """ + + # base case - the path has generated a correct solution + if mass == 0: + yield sorted(path) + return + + # stop running when we overshoot the mass + elif mass < 0: + return + + # can we sum up to the target value using the remaining masses? 
recursive call + elif sum_matrix[n][mass]: + yield from find_path(mass_list, sum_matrix, n - 1, mass, max_subset_length, path) + + if len(path) < max_subset_length: + path.append(mass_list[n-1]) + + yield from find_path(mass_list, sum_matrix, n - 1, mass - mass_list[n - 1], max_subset_length, path) + path.pop() + + +def subset_sum(mass_list, mass, max_subset_length=3): + """ + Dynamic programming implementation of subset sum. Note that, whilst this algorithm is pseudo-polynomial, the + backtracking algorithm for obtaining all possible subsets has exponential complexity and so remains unsuitable + for large input values. This does, however, tend to perform a lot better than non-sum_matrix implementations, as + we're no longer doing sums multiple times and we've cut down the operations performed during the exponential portion + of the method. + + :param mass_list: A list of masses from which to identify subsets. + + :param mass: The target mass of the sum of the substructures. + + :param max_subset_length: The maximum length of subsets to return. Allows the recursive backtracking algorithm to + terminate early in many cases, significantly improving runtime. + + :return: Generator of lists containing the masses of valid subsets. 
+ """ + + n = len(mass_list) + + # initialise dynamic programming array + sum_matrix = numpy.ndarray([n + 1, mass + 1], bool) + + # subsets can always equal 0 + for i in range(n+1): + sum_matrix[i][0] = True + + # empty subsets do not have non-zero sums + for i in range(mass): + sum_matrix[0][i + 1] = False + + # fill in the remaining boolean matrix + for i in range(n): + for j in range(mass+1): + if j >= mass_list[i]: + sum_matrix[i + 1][j] = sum_matrix[i][j] or sum_matrix[i][j - mass_list[i]] + else: + sum_matrix[i + 1][j] = sum_matrix[i][j] + + # backtrack through the matrix recursively to obtain all solutions + return find_path(mass_list, sum_matrix, n, mass, max_subset_length) diff --git a/metaboblend/auxiliary.py b/metaboblend/auxiliary.py index 52717b8..4408224 100644 --- a/metaboblend/auxiliary.py +++ b/metaboblend/auxiliary.py @@ -20,8 +20,8 @@ # import itertools -import networkx as nx import pylab as plt +import networkx as nx def calculate_complete_multipartite_graphs(max_atoms_available, max_n_substructures): diff --git a/metaboblend/build_structures.py b/metaboblend/build_structures.py index 1fa4539..16987dc 100644 --- a/metaboblend/build_structures.py +++ b/metaboblend/build_structures.py @@ -20,103 +20,21 @@ # import os -import multiprocessing import copy +import numpy import itertools -from functools import partial +import multiprocessing import networkx as nx -import numpy -import sqlite3 -import csv +from functools import partial from operator import itemgetter from typing import Sequence, Dict, Union from rdkit import Chem -from .databases import SubstructureDb, get_elements, calculate_exact_mass - - -def find_path(mass_list, sum_matrix, n, mass, max_subset_length, path=[]): - """ - Recursive solution for backtracking through the dynamic programming boolean matrix. All possible subsets are found - - :param mass_list: A list of masses from which to identify subsets. - - :param mass: The target mass of the sum of the substructures. 
- - :param sum_matrix: The dynamic programming boolean matrix. - - :param n: The size of mass_list. - - :param max_subset_length: The maximum length of subsets to return. Allows the recursive backtracking algorithm to - terminate early in many cases, significantly improving runtime. - - :param path: List for keeping track of the current subset. - - :return: Generates of lists containing the masses of valid subsets. - """ - - # base case - the path has generated a correct solution - if mass == 0: - yield sorted(path) - return - - # stop running when we overshoot the mass - elif mass < 0: - return - - # can we sum up to the target value using the remaining masses? recursive call - elif sum_matrix[n][mass]: - yield from find_path(mass_list, sum_matrix, n - 1, mass, max_subset_length, path) - - if len(path) < max_subset_length: - path.append(mass_list[n-1]) - - yield from find_path(mass_list, sum_matrix, n - 1, mass - mass_list[n - 1], max_subset_length, path) - path.pop() - - -def subset_sum(mass_list, mass, max_subset_length=3): - """ - Dynamic programming implementation of subset sum. Note that, whilst this algorithm is pseudo-polynomial, the - backtracking algorithm for obtaining all possible subsets has exponential complexity and so remains unsuitable - for large input values. This does, however, tend to perform a lot better than non-sum_matrix implementations, as - we're no longer doing sums multiple times and we've cut down the operations performed during the exponential portion - of the method. - - :param mass_list: A list of masses from which to identify subsets. - - :param mass: The target mass of the sum of the substructures. - - :param max_subset_length: The maximum length of subsets to return. Allows the recursive backtracking algorithm to - terminate early in many cases, significantly improving runtime. - - :return: Generates of lists containing the masses of valid subsets. 
- """ - - n = len(mass_list) - - # initialise dynamic programming array - sum_matrix = numpy.ndarray([n + 1, mass + 1], bool) - - # subsets can always equal 0 - for i in range(n+1): - sum_matrix[i][0] = True - - # empty subsets do not have non-zero sums - for i in range(mass): - sum_matrix[0][i + 1] = False - - # fill in the remaining boolean matrix - for i in range(n): - for j in range(mass+1): - if j >= mass_list[i]: - sum_matrix[i + 1][j] = sum_matrix[i][j] or sum_matrix[i][j - mass_list[i]] - else: - sum_matrix[i + 1][j] = sum_matrix[i][j] - - # backtrack through the matrix recursively to obtain all solutions - return find_path(mass_list, sum_matrix, n, mass, max_subset_length) +from .results import ResultsDb +from .parse import parse_ms_data +from .algorithms import subset_sum +from .databases import SubstructureDb def combine_mfs(precise_mass_grp, db, table_name, accuracy): @@ -236,6 +154,9 @@ def add_bonds(mols, edges, atoms_available, bond_types, bond_enthalpies): * **2.0** Double + :param bond_enthalpies: Dictionary of bond enthalpies, as generated by + :py:meth:`metaboblend.build_structures.get_bond_enthalpies`. + :return: If unsuccessful, returns None, else returns an :py:meth:`rdkit.Chem.EditableMol` object containing the substructures combined into a final single molecule. 
""" @@ -275,11 +196,12 @@ def add_bonds(mols, edges, atoms_available, bond_types, bond_enthalpies): bt_start.remove(bond_matches[0]) bt_end.remove(bond_matches[0]) - try: + try: # try forming the specified bond mol_edit.AddBond(edge[0], edge[1], rdkit_bond_types[bond_matches[0]]) except KeyError: return None, None # unknown bond type + # calculate bond dissociation energy of "formed" bonds for the structure try: total_bde += bond_enthalpies[bond_matches[0]][mols.GetAtomWithIdx(edge[0]).GetSymbol()][mols.GetAtomWithIdx(edge[1]).GetSymbol()] except (SyntaxError, TypeError): @@ -288,285 +210,7 @@ def add_bonds(mols, edges, atoms_available, bond_types, bond_enthalpies): return mol_edit, total_bde -class ResultsDb: - """ - Methods for interacting with the SQLITE3 results database, as created by - :py:meth:`metaboblend.build_structures.annotate_msn`. - - :param path_results: Directory to which results will be written. - """ - - def __init__(self, path_results, msn=True): - """Constructor method.""" - - self.path_results = path_results - self.path_results_db = os.path.join(self.path_results, "metaboblend_results.sqlite") - self.msn = msn - - self.conn = None - self.cursor = None - - self.substructure_combo_id = 0 - - def connect(self): - """Connects to the results database.""" - - self.conn = sqlite3.connect(self.path_results_db) - self.cursor = self.conn.cursor() - - def create_results_db(self): - """Generates a new results database.""" - - if os.path.exists(self.path_results_db): - os.remove(self.path_results_db) - - self.connect() - - self.cursor.execute("""CREATE TABLE queries ( - ms_id_num INTEGER PRIMARY KEY, - ms_id TEXT, - exact_mass NUMERIC, - C INTEGER, - H INTEGER, - N INTEGER, - O INTEGER, - P INTEGER, - S INTEGER, - ppm INTEGER, - ha_min INTEGER, - ha_max INTEGER, - max_atoms_available INTEGER, - max_degree INTEGER, - max_n_substructures INTEGER, - hydrogenation_allowance INTEGER, - isomeric_smiles INTEGER)""") - - if self.msn: - self.cursor.execute("""CREATE 
TABLE spectra ( - ms_id_num INTEGER, - fragment_id INTEGER, - neutral_mass NUMERIC, - PRIMARY KEY (ms_id_num, fragment_id))""") - - self.cursor.execute("""CREATE TABLE structures ( - ms_id_num INTEGER, - structure_smiles TEXT, - frequency INTEGER, - PRIMARY KEY (ms_id_num, structure_smiles))""") - - self.cursor.execute("""CREATE TABLE substructures ( - substructure_combo_id INTEGER, - substructure_position_id INTEGER, - ms_id_num INTEGER, - structure_smiles TEXT, - fragment_id INTEGER, - substructure_smiles TEXT, - bde INTEGER, - PRIMARY KEY (substructure_combo_id, substructure_position_id))""") - - self.cursor.execute("""CREATE TABLE results ( - ms_id_num INTEGER, - fragment_id INTEGER, - structure_smiles TEXT, - bde INTEGER, - PRIMARY KEY(ms_id_num, fragment_id, structure_smiles))""") - - self.conn.commit() - - def add_ms(self, msn_data, ms_id, ms_id_num, parameters): - """ - Add entries to the `queries` and `spectra` tables. - - :param msn_data: Dictionary in the form - `msn_data[id] = {mf: [C, H, N, O, P, S], exact_mass: float, fragment_masses: []}`. id represents a unique - identifier for a given spectral tree or fragmentation spectrum, mf is a list of integers referring to the - molecular formula of the structure of interest, exact_mass is the mass of this molecular formula to >=4d.p. - and fragment_masses are neutral fragment masses generated by this structure used to inform candidate - scoring. See :py:meth:`metaboblend.build_structures.annotate_msn`. - - :param ms_id: Unique identifier for the annotation of a single metabolite. - - :param ms_id_num: Unique numeric identifier for the annotation of a single metaoblite. - - :param parameters: List of parameters, in the form: [ppm, ha_min, ha_max, max_atoms_available, max_degree, - max_n_substructures, hydrogenation_allowance, isomeric_smiles]. See - :py:meth:`metaboblend.build_structures.annotate_msn`. 
- """ - - for i, parameter in enumerate(parameters): - if parameter is None: - parameters[i] = "NULL" - elif isinstance(parameter, bool): - parameters[i] = int(parameter) - - self.cursor.execute("""INSERT INTO queries ( - ms_id, - ms_id_num, - exact_mass, - C, H, N, O, P, S, - ppm, - ha_min, - ha_max, - max_atoms_available, - max_degree, - max_n_substructures, - hydrogenation_allowance, - isomeric_smiles - ) VALUES ('{}', {}, {}, '{}', '{}', '{}', '{}', '{}', '{}', {})""".format( - ms_id, - ms_id_num, - msn_data[ms_id]["exact_mass"], - msn_data[ms_id]["mf"][0], msn_data[ms_id]["mf"][1], - msn_data[ms_id]["mf"][2], msn_data[ms_id]["mf"][3], - msn_data[ms_id]["mf"][4], msn_data[ms_id]["mf"][5], - ", ".join([str(p) for p in parameters]) - )) - - self.conn.commit() - - def add_results(self, ms_id_num, smi_dict, fragment_mass=None, fragment_id=None, retain_substructures=False): - """ - Record which smiles were generated for a given fragment mass. - - :param ms_id_num: Unique identifier for the annotation of a single metabolite. - - :param smi_dict: The fragment and substructure smiles generated by the annotation of a single peak for a single - metabolite. - - :param fragment_mass: The neutral fragment mass that has been annotated. - - :param fragment_id: The unique identifier for the fragment mass that has been annotated. - - :param retain_substructures: If True, record substructures in the results DB. 
- """ - - if self.msn: - self.cursor.execute("""INSERT OR IGNORE INTO spectra ( - ms_id_num, - fragment_id, - neutral_mass - ) VALUES ('{}', {}, {})""".format( - ms_id_num, - fragment_id, - fragment_mass - )) - else: - fragment_id = "NULL" - - for structure_smiles in smi_dict.keys(): - - self.cursor.execute("""INSERT OR IGNORE INTO results ( - ms_id_num, - fragment_id, - structure_smiles, - bde - ) VALUES ({}, {}, '{}', {})""".format( - ms_id_num, - fragment_id, - structure_smiles, - min(smi_dict[structure_smiles]["bdes"]) - )) - - if retain_substructures: - for i in range(len(smi_dict[structure_smiles]["substructures"])): # for each combination - - for j, substructure in enumerate(smi_dict[structure_smiles]["substructures"][i]): - - self.cursor.execute("""INSERT INTO substructures ( - substructure_combo_id, - substructure_position_id, - ms_id_num, - fragment_id, - structure_smiles, - substructure_smiles, - bde - ) VALUES ({}, {}, {}, {}, '{}', '{}', {})""".format( - self.substructure_combo_id, - j, - ms_id_num, - fragment_id, - structure_smiles, - substructure, - smi_dict[structure_smiles]["bdes"][i] - )) - - self.substructure_combo_id += 1 - - self.conn.commit() - - def calculate_frequencies(self, ms_id_num): - """ - Calculates structure frequencies in the SQLite DB. - - :param ms_id_num: Unique identifier for the annotation of a single metabolite. - """ - - self.cursor.execute("""INSERT INTO structures (ms_id_num, structure_smiles, frequency) - SELECT ms_id_num, structure_smiles, COUNT(*) - FROM results - WHERE ms_id_num = {} - GROUP BY structure_smiles""".format(ms_id_num)) - - def get_structures(self, ms_id_num): - """ - Gets smiles of generated structures. In the case of the MSn annotation workflow, also gets structure - frequencies. - - :param ms_id_num: Unique identifier for the annotation of a single metabolite. - - :return: In the case of simple structure generation, returns a set of smiles strings for output structures. 
- For the MSn annotation workflow, returns a dictionary with smiles as keys and the number of peaks for which - the smiles were generated as values. - """ - - if self.msn: - msn_str = ", frequency" - else: - msn_str = "" - - self.cursor.execute("""SELECT structure_smiles{} FROM structures - WHERE ms_id_num = {} - """.format(msn_str, ms_id_num)) - - if self.msn: - return [t for t in self.cursor.fetchall()] - else: - return [item for t in self.cursor.fetchall() for item in t] - - def generate_csv_output(self): - """ - Generate CSV file output for i) queries and tool parameters and ii) structures generated. - """ - - with open(os.path.join(self.path_results, "metaboblend_queries.csv"), "w", newline="") as results_file, \ - open(os.path.join(self.path_results, "metaboblend_structures.csv"), "w", newline="") as ms_file: - - results_writer = csv.writer(results_file, delimiter=",") - ms_writer = csv.writer(ms_file, delimiter=",") - - results_writer.writerow(["ms_id", "exact_mass", "C", "H", "N", "O", "P", "S", "ppm", "ha_min", "ha_max", - "max_atoms_available", "max_degree", "max_n_substructures", - "hydrogenation_allowance", "isomeric_smiles"]) - - self.cursor.execute("SELECT * FROM queries") - - for query in self.cursor.fetchall(): - results_writer.writerow(query) - - ms_writer.writerow(["ms_id", "smiles", "frequency", "exact_mass", "C", "H", "N", "O", "P", "S"]) - - self.cursor.execute("SELECT * FROM structures") - - for structure in self.cursor.fetchall(): - ms_writer.writerow(structure) - - def close(self): - """Close the connection to the SQLITE3 database.""" - - self.conn.close() - - -def annotate_msn(msn_data: Dict[str, Dict[str, Union[int, list]]], +def annotate_msn(msn_data: Union[str, os.PathLike, Dict[str, Dict[str, Union[int, list]]]], path_substructure_db: Union[str, bytes, os.PathLike] = os.path.realpath(os.getcwd()), path_out: Union[str, bytes, os.PathLike] = "", ppm: int = 5, @@ -590,11 +234,18 @@ def annotate_msn(msn_data: Dict[str, Dict[str, Union[int, 
list]]], text format. For the generation of structures without MSn data, see :py:meth:`metaboblend.build_structures.generate_structures`. - :param msn_data: Dictionary in the form - `msn_data[id] = {mf: [C, H, N, O, P, S], exact_mass: float, fragment_masses=[]}`. id represents a unique - identifier for a given spectral tree or fragmentation spectrum, mf is a list of integers referring to the - molecular formula of the structure of interest, exact_mass is the mass of this molecular formula to >=4d.p. - and fragment_masses are neutral fragment masses generated by this structure used to inform candidate scoring. + :param msn_data: Either a dictionary or the path to an MSP file. MSP files are parsed by + :py:meth:`metaboblend.parse.parse_ms_data` before being converted into a dictionary. If a dictionary is + provided, it must contain one item per fragmentation spectrum; the keys of the dictionary should be a unique ID + for the query and the corresponding value must itself be a dictionary, containing the following: + + - "exact_mass": `float` (neutral mass of query) OR "precursor_mz": `float` (mz of precursor ion) + - "mf": `[C, H, N, O, P, S]` (a list of 6 integers) + - "neutral_fragment_masses": `[float, float, ...]` (list of neutral fragment masses) OR "fragment_mzs": + `[float, float, ...]` (list of fragment mzs) + - "precursor_type": `str` (e.g. "[M+H]+", required for calculating neutral masses from ion mzs) + + The dictionary or MSP path is fed to :py:meth:`metaboblend.parse.parse_ms_data`. :param path_substructure_db: The path to the SQLite 3 substructure database, as generated by :py:meth:`metaboblend.databases.SubstructureDb`. 
@@ -671,22 +322,25 @@ def annotate_msn(msn_data: Dict[str, Dict[str, Union[int, list]]], max_degree=max_degree, max_atoms_available=max_atoms_available, minimum_frequency=minimum_frequency, - max_mass=round(max([msn_data[ms_id]["exact_mass"] for ms_id in msn_data.keys()])) + max_mass=None ) - for i, ms_id in enumerate(msn_data.keys()): + for i, ms in enumerate(parse_ms_data(msn_data)): + + if ms is None: + continue - results_db.add_ms(msn_data, ms_id, i, + results_db.add_ms(msn_data, ms["ms_id"], i, [ppm, ha_min, ha_max, max_atoms_available, max_degree, max_n_substructures, hydrogenation_allowance, isomeric_smiles]) - for j, fragment_mass in enumerate(msn_data[ms_id]["fragment_masses"]): + for j, fragment_mass in enumerate(ms["neutral_fragment_masses"]): for k in range(0 - hydrogenation_allowance, hydrogenation_allowance + 1): hydrogenated_fragment_mass = fragment_mass + (k * 1.007825) # consider re-arrangements smi_dict = build( - mf=msn_data[ms_id]["mf"], - exact_mass=msn_data[ms_id]["exact_mass"], + mf=ms["mf"], + exact_mass=ms["exact_mass"], max_n_substructures=max_n_substructures, path_connectivity_db=path_connectivity_db, path_substructure_db=path_substructure_db, @@ -705,7 +359,7 @@ def annotate_msn(msn_data: Dict[str, Dict[str, Union[int, list]]], results_db.calculate_frequencies(i) if yield_smis: - yield {ms_id: results_db.get_structures(i)} + yield {ms["ms_id"]: results_db.get_structures(i)} if write_csv_output: results_db.generate_csv_output() @@ -714,7 +368,7 @@ def annotate_msn(msn_data: Dict[str, Dict[str, Union[int, list]]], results_db.close() -def generate_structures(ms_data: Dict[str, Dict[str, Union[int, None]]], +def generate_structures(ms_data: Union[str, os.PathLike, Dict[str, Dict[str, Union[int, None]]]], path_substructure_db: Union[str, bytes, os.PathLike], path_out: Union[str, bytes, os.PathLike] = os.path.realpath(os.getcwd()), ha_min: Union[int, None] = 2, @@ -736,11 +390,17 @@ def generate_structures(ms_data: Dict[str, Dict[str, 
Union[int, None]]], text format. For the generation of structures from MSn data, see :py:meth:`metaboblend.build_structures.annotate_msn`. - :param ms_data: Dictionary in the form ms_data[id] = - `{mf: [C, H, N, O, P, S], exact_mass: float, prescribed_mass=int}`. id represents a unique identifier for - a given test, mf is a list of integers referring to molecular formula of the structure of interest, - exact_mass is the mass of this structure to >=4d.p. and prescribed_mass is the neutral mass of a substructure - used to limit structures generated. + :param ms_data: A dictionary that must contain one item per fragmentation spectrum; the keys of the dictionary + should be a unique ID for the query and the corresponding value must itself be a dictionary, containing the + following: + + - "exact_mass": `float` (neutral mass of query) OR "precursor_mz": `float` (mz of precursor ion) + - "mf": `[C, H, N, O, P, S]` (a list of 6 integers) + - "precursor_type": `str` (e.g. "[M+H]+", required for calculating neutral masses from ion mzs) + - (optional) "prescribed_mass": `float` (neutral mass of substructure). + + The dictionary or MSP path is fed to :py:meth:`metaboblend.parse.parse_ms_data`. A single neutral substructure + mass may be provided ("prescribed_mass") to guide the structure generation process. :param path_substructure_db: The path to the SQLite 3 substructure database, as generated by :py:meth:`metaboblend.databases.SubstructureDb`. @@ -788,6 +448,8 @@ def generate_structures(ms_data: Dict[str, Dict[str, Union[int, None]]], :param write_csv_output: Whether to extract results from the SQLite3 database for deposition in CSV files. + :param retain_substructures: Whether to record the substructures used to generate final structures. + :return: For each input molecule, yields unique SMILEs strings (unless `yield_smis = False`). 
""" @@ -809,26 +471,26 @@ def generate_structures(ms_data: Dict[str, Dict[str, Union[int, None]]], max_mass=round(max([ms_data[ms_id]["exact_mass"] for ms_id in ms_data.keys()])) ) - for i, ms_id in enumerate(ms_data.keys()): + for i, ms in enumerate(parse_ms_data(ms_data, False)): - results_db.add_ms(ms_data, ms_id, i, + results_db.add_ms(ms_data, ms["ms_id"], i, [None, ha_min, ha_max, max_atoms_available, max_degree, max_n_substructures, None, isomeric_smiles]) ppm = None try: - if ms_data[ms_id]["prescribed_masses"] is not None: + if ms["prescribed_mass"] is not None: ppm = 0 except KeyError: - ms_data[ms_id]["prescribed_masses"] = None + ms["prescribed_mass"] = None smi_dict = build( - mf=ms_data[ms_id]["mf"], - exact_mass=ms_data[ms_id]["exact_mass"], + mf=ms["mf"], + exact_mass=ms["exact_mass"], max_n_substructures=max_n_substructures, path_connectivity_db=path_connectivity_db, path_substructure_db=path_substructure_db, - prescribed_mass=ms_data[ms_id]["prescribed_masses"], + prescribed_mass=ms["prescribed_mass"], ppm=ppm, table_name=table_name, ncpus=ncpus, @@ -837,13 +499,13 @@ def generate_structures(ms_data: Dict[str, Dict[str, Union[int, None]]], retain_substructures=retain_substructures ) - results_db.add_results(i, smi_dict, ms_data[ms_id]["prescribed_masses"]) + results_db.add_results(i, smi_dict, ms["prescribed_mass"]) smi_dict = None results_db.calculate_frequencies(i) if yield_smis: - yield {ms_id: results_db.get_structures(i)} + yield {ms["ms_id"]: results_db.get_structures(i)} if write_csv_output: results_db.generate_csv_output() @@ -1040,15 +702,21 @@ def gen_subs_table(db, ha_min, ha_max, max_degree, max_atoms_available, max_mass ha_max_statement = """ AND heavy_atoms <= %s""" % str(ha_max) + if max_mass is None: + max_mass_statment = "" + else: + max_mass_statment = """ + AND exact_mass__1 < %s""" % str(max_mass) + db.cursor.execute("""CREATE TABLE {} AS - SELECT * FROM substructures WHERE - atoms_available <= {} AND - valence <= {} AND - 
exact_mass__1 < {}{}{}{} + SELECT * + FROM substructures + WHERE atoms_available <= {} + AND valence <= {}{}{}{}{} """.format(table_name, max_atoms_available, max_degree, - max_mass, + max_mass_statment, freq_statement, ha_min_statement, ha_max_statement)) @@ -1136,6 +804,11 @@ def substructure_combination_build(substructure_subset, configs_iso, prescribed_ :param isomeric_smiles: True/False, should output smiles be written with isomeric information? + :param bond_enthalpies: Dictionary of bond enthalpies, as generated by + :py:meth:`metaboblend.build_structures.get_bond_enthalpies`. + + :param retain_substructures: Whether to record the substructures used to generate final structures. + :return: List of smiles representing molecules generated (and the substructures used to generate them). """ diff --git a/metaboblend/databases.py b/metaboblend/databases.py index 03d24a8..2c31d12 100644 --- a/metaboblend/databases.py +++ b/metaboblend/databases.py @@ -21,13 +21,10 @@ import io import os -import sys -import subprocess +import pickle import sqlite3 import tempfile -import pickle -from collections import OrderedDict -import xml.etree.ElementTree as ElementTree +import subprocess import networkx as nx from typing import Sequence, Dict, Union @@ -35,97 +32,10 @@ from rdkit.Chem import Recap from rdkit.Chem import BRICS +from .parse import parse_xml from .auxiliary import calculate_complete_multipartite_graphs, graph_to_ri, graph_info, sort_subgraphs -def reformat_xml(source, encoding="utf8"): - """ - Reformats HMDB xml files to be compatible with :py:meth:`metaboblend.databases.parse_xml`; some such files do not - contain a `` header. - - :param source: Path to file to be reformatted. - - :param encoding: Encoding of source file. - - :return: Source file destination. 
- """ - - with io.open(source, "r", encoding=encoding) as xml: - xml_contents = xml.readlines() - if "hmdb" in xml_contents[1]: - return source - - xml_contents.insert(1, " \n") - - with io.open(source, "w", encoding=encoding) as xml: - xml_contents = "".join(xml_contents) - xml.write(xml_contents) - xml.write("") - - return source - - -def parse_xml(source, encoding="utf8", reformat=False): - """ - Parses the contents of HMDB xml files to to extract information for the generation of substructures. - - :param source: Source file destination. - - :param encoding: Encoding of source file. - - :param reformat: Whether to apply :py:meth:`metaboblend.databases.reformat_xml` to the XML file. Is required for - XML files recording single metabolites. - - * **True** Add a `` header to the XML file before parsing. - - * **False** Parse the XML file as it is (recommended if header is present). - - :return: The XML file converted to a dictionary. - """ - - if reformat: - reformat_xml(source, encoding) - - with io.open(source, "r", encoding=encoding) as inp: - record_out = OrderedDict() - - inp.readline() - inp.readline() - - xml_record = "" - path = [] - - for line in inp: - xml_record += line - if line == "\n" or line == "\n": - - if sys.version_info[0] == 3: - inp = io.StringIO(xml_record) - else: - inp = io.BytesIO(xml_record.encode('utf-8').strip()) - - for event, elem in ElementTree.iterparse(inp, events=("start", "end")): - if event == 'end': - path.pop() - - if event == 'start': - path.append(elem.tag) - if elem.text is not None: - if elem.text.replace(" ", "") != "\n": - - path_elem = ".".join(map(str, path[1:])) - if path_elem in record_out: - if type(record_out[path_elem]) != list: - record_out[path_elem] = [record_out[path_elem]] - record_out[path_elem].append(elem.text) - else: - record_out[path_elem] = elem.text - - xml_record = "" - yield record_out - record_out = OrderedDict() - - class SubstructureDb: """ Methods for interacting with the SQLITE3 substructure 
and connectivity databases. Provides a connection to the diff --git a/metaboblend/parse.py b/metaboblend/parse.py new file mode 100644 index 0000000..5495a58 --- /dev/null +++ b/metaboblend/parse.py @@ -0,0 +1,351 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright © 2019-2020 Ralf Weber +# +# This file is part of MetaboBlend. +# +# MetaboBlend is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# MetaboBlend is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with MetaboBlend. If not, see <https://www.gnu.org/licenses/>. +# + +import io +import re +import sys +import copy +import warnings +from collections import OrderedDict +import xml.etree.ElementTree as ElementTree + + +def parse_ms_data(ms_data, msn=True): + """ + Parse raw data provided by user and yield formatted input data. Decides what type of data has been provided + (i.e. whether a dictionary has been given vs path to MSP file; if a dictionary, checks whether neutral masses + need to be calculated from precursor ions). + + :param ms_data: Dictionary containing input data or path to an MSP file. + + :param msn: If True, formats the data for use by :py:meth:`metaboblend.build_structures.annotate_msn`; else, formats + input data for use by :py:meth:`metaboblend.build_structures.generate_structures`. Only relevant if a + dictionary has been provided. + + :return: Yields a dictionary for use by build functions to generate structures. 
+ """ + + if isinstance(ms_data, dict): + for i, ms_id in enumerate(ms_data.keys()): + + ms_data[ms_id]["ms_id"] = ms_id + + # check if user has provided a neutralised mass or ionised mz values + if "neutral_fragment_masses" in ms_data[ms_id].keys() and "exact_mass" in ms_data[ms_id].keys(): + which = "none" + + elif "exact_mass" in ms_data[ms_id].keys(): + if msn: + which = "fragments" + else: + which = "none" + + elif "neutral_fragment_masses" in ms_data[ms_id].keys() or not msn: + which = "precursor" + + else: + which = "both" + + yield precursor_ions_to_neutral_masses(ms_data[ms_id], which) + + else: + yield from parse_msp(ms_data) + + +def precursor_ion_to_neutral_mass(mass, precursor_type): + """ + Convert precursor ion to predicted neutral mass for substructure searching. + + :param mass: Charged mass to be neutralised. + + :param precursor_type: Type of precursor ion. + + :return: Neutral mass. + """ + + # conversions + precursor_dict = {"[M+H]+": 1.007276, + "[M+Na]+": 22.989221, + "[M+K]+": 38.963158, + "[M-H]-": -1.007276, + "[M+Cl]-": 34.969401, + "[M+Na-2H]-": 20.974668, + "[M+K-2H]-": 36.948605, + "[M+Hac-H]-": 59.013853} + + return mass - precursor_dict[precursor_type] + + +def precursor_ions_to_neutral_masses(ms_dict, which="both"): + """ + Convert precursor ion and fragment ions to neutral. + + :param ms_dict: Dictionary used by build functions to generate structures. Converts the precursor ion mass and/or + the fragment ions to their respective neutral masses. + + :param which: Whether to convert the precursor ion ("precursor"), the fragment ions ("fragments") or both ("both") + to their respective neutral masses. If which is "none", returns the original dictionary. + + :return: Returns `ms_dict` with additional items corresponding to neutralised masses. 
+ """ + + if which == "precursor" or which == "both": + ms_dict["exact_mass"] = precursor_ion_to_neutral_mass(ms_dict["precursor_mz"], + ms_dict["precursor_type"]) + + if which == "fragments" or which == "both": + + ms_dict["neutral_fragment_masses"] = [] + + for fragment_ion_mass in ms_dict["fragment_mzs"]: + ms_dict["neutral_fragment_masses"].append(precursor_ion_to_neutral_mass(fragment_ion_mass, + ms_dict["precursor_type"])) + + return ms_dict + + +def parse_msp(msp_path): + """ + Parse msp files and yield data for each compound. Accepts MSP files in MoNa or MassBank format. We expect that + the following are provided in the MSP: + + - A unique accession ID. + - The molecular formula of the compound. + - The precursor mz representing the mass of the charged precursor ion. + - Fragment mzs representing masses of charged fragment ions. + - The type of precursor, e.g. "[M+H]+". + + Code adapted from `msp2db` (https://github.com/computational-metabolomics/msp2db/blob/master/msp2db/parse.py). + + :param msp_path: Path of an MSP file to be converted into a dictionary. + + :return: Dictionary in a form useable by :py:meth:`metaboblend.build_structures.annotate_msn` and + :py:meth:`metaboblend.build_structures.generate_structures`. 
+ """ + + meta_parse = get_msp_regex() + reached_spectra = False + + empty_dict = {"ms_id": None, "mf": None, "precursor_mz": None, "precursor_type": None, "fragment_mzs": []} + entry_dict = copy.deepcopy(empty_dict) + + with open(msp_path, "r") as msp_file: + + for line in msp_file: + + line = re.sub('^(.{2}\\$)', "", line) # remove "XX$" from line start in massbank files + + if reached_spectra: + if line in ["\n", "\r\n", "//\n", "//\r\n", "", "//"]: # reached end of spectra + + yield reformat_msp_input(entry_dict) # completed entry ready for sending to build + + entry_dict = copy.deepcopy(empty_dict) + reached_spectra = False + + else: # add peak + entry_dict["fragment_mzs"].append(float(line.split()[0])) + + else: + for meta_type in meta_parse.keys(): + for meta_re in meta_parse[meta_type]: + + re_query = re.search(meta_re, line, re.IGNORECASE) + + if re_query: # TODO: walrus + entry_dict[meta_type] = re_query.group(1).strip() + + if re.match("^Num Peaks(.*)$", line, re.IGNORECASE) or re.match("^PEAK:(.*)", line, re.IGNORECASE): + reached_spectra = True # reached line prior to spectra + + if entry_dict != empty_dict: + yield reformat_msp_input(entry_dict) + + +def reformat_msp_input(entry_dict): + """ + Reformat input for use by build functions. + + :param entry_dict: Dictionary containing MSn information extracted from an MSP file (by + :py:meth:`metaboblend.parse.parse_msp`. The dictionary must contain the following: + + - ms_id - a unique accession number + - mf - the molecular formula of the compound (in the format "CXHXNXOXPXSX") + - precursor_mz - mz representing the mass of the charged precursor ion + - precursor_type - the type of precursor ion (e.g. 
"[M+H]+") + - fragment_mzs - mz(s) representing the mass of charged fragment ions + + :return: If the correct inputs were not provided in the MSP (and, hence, were not available in `entry_dict`), + returns None (and generates a warning with i) the accession (if available) and ii) the variable that was not + able to be extracted from the MSP). Else, returns the same dictionary after reformatting the molecular formula, + using :py:meth:`metaboblend.parse.mc_to_list`, and converting the precursor ions to their corresponding + neutral masses. + """ + + if entry_dict["mf"] is not None: # convert from C5H6... to [5, 6, ...] + entry_dict["mf"] = mc_to_list(entry_dict["mf"]) + + for key in ["ms_id", "mf", "precursor_mz", "precursor_type"]: # required for the tool to function + if entry_dict[key] is None: + if key == "ms_id": + warnings.warn("Entry ignored from MSP file due to lack of accession in MSP file") + else: + warnings.warn("Entry " + entry_dict["ms_id"] + " removed due to lack of valid " + key + " in MSP file") + return None + + entry_dict["precursor_mz"] = float(entry_dict["precursor_mz"]) + + if len(entry_dict["fragment_mzs"]) == 0: # require a spectra to annotate + warnings.warn("No fragments were identified for " + entry_dict["ms_id"] + " in MSP file") + return None + + return precursor_ions_to_neutral_masses(entry_dict) + + +def mc_to_list(mc): + """ + Convert molecular formula string to list format. + + :param mc: Molecular formula (in the format "C1H2N3O4P5S6") + + :return: Molecular formula (in the format `[1, 2, 3, 4, 5, 6]`) + """ + + if isinstance(mc, list): + return mc + + mc_list = [0, 0, 0, 0, 0, 0] + element_positions = {"C": 0, "H": 1, "N": 2, "O": 3, "P": 4, "S": 5} + + # seperates out the formula into [letter, number, letter, number, ...] 
+ mc = re.findall(r"[A-Z][a-z]*|\d+", re.sub("[A-Z][a-z]*(?![\da-z])", r"\g<0>1", mc)) + + for i, substring in enumerate(mc): + + if i % 2 == 0: # in case of letter + try: + element_position = element_positions[substring] + except KeyError: # element not in C, H, N, O, P, S + return None + + else: # record number following the letter + mc_list[element_position] = int(substring) + + return mc_list + + +def get_msp_regex(): + """ Dictionary of regular expressions for parsing msp metadata. """ + + meta_parse = {"ms_id": ["^accession(?:=|:)(.*)$", "^DB#(?:=|:)(.*)$", "^ACCESSION:(.*)$"], # use accession as ms_id + "mf": ["^molecular formula(?:=|:)(.*)$", "^formula:(.*)$"], + "precursor_type": ["^precursor.*type(?:=|:)(.*)$", "^adduct(?:=|:)(.*)$", "^MS\$FOCUSED_ION:\s+PRECURSOR_TYPE\s+(.*)$"], + "precursor_mz": ["^precursor m/z(?:=|:)\s*(\d*[.,]?\d*)$", "^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$", "^MS\$FOCUSED_ION:\s+PRECURSOR_M/Z\s+(\d*[.,]?\d*)$"]} + + return meta_parse + + +def reformat_xml(source, encoding="utf8"): + """ + Reformats HMDB xml files to be compatible with :py:meth:`metaboblend.databases.parse_xml`; some such files do not + contain a `` header. + + :param source: Path to file to be reformatted. + + :param encoding: Encoding of source file. + + :return: Source file destination. + """ + + with io.open(source, "r", encoding=encoding) as xml: + xml_contents = xml.readlines() + if "hmdb" in xml_contents[1]: + return source + + xml_contents.insert(1, " \n") + + with io.open(source, "w", encoding=encoding) as xml: + xml_contents = "".join(xml_contents) + xml.write(xml_contents) + xml.write("") + + return source + + +def parse_xml(source, encoding="utf8", reformat=False): + """ + Parses the contents of HMDB xml files to to extract information for the generation of substructures. + + :param source: Source file destination. + + :param encoding: Encoding of source file. 
+ + :param reformat: Whether to apply :py:meth:`metaboblend.databases.reformat_xml` to the XML file. Is required for + XML files recording single metabolites. + + * **True** Add a `` header to the XML file before parsing. + + * **False** Parse the XML file as it is (recommended if header is present). + + :return: The XML file converted to a dictionary. + """ + + if reformat: + reformat_xml(source, encoding) + + with io.open(source, "r", encoding=encoding) as inp: + record_out = OrderedDict() + + inp.readline() + inp.readline() + + xml_record = "" + path = [] + + for line in inp: + xml_record += line + if line == "\n" or line == "\n": + + if sys.version_info[0] == 3: + inp = io.StringIO(xml_record) + else: + inp = io.BytesIO(xml_record.encode('utf-8').strip()) + + for event, elem in ElementTree.iterparse(inp, events=("start", "end")): + if event == 'end': + path.pop() + + if event == 'start': + path.append(elem.tag) + if elem.text is not None: + if elem.text.replace(" ", "") != "\n": + + path_elem = ".".join(map(str, path[1:])) + if path_elem in record_out: + if type(record_out[path_elem]) != list: + record_out[path_elem] = [record_out[path_elem]] + record_out[path_elem].append(elem.text) + else: + record_out[path_elem] = elem.text + + xml_record = "" + yield record_out + record_out = OrderedDict() diff --git a/metaboblend/results.py b/metaboblend/results.py new file mode 100644 index 0000000..898443f --- /dev/null +++ b/metaboblend/results.py @@ -0,0 +1,299 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright © 2019-2020 Ralf Weber +# +# This file is part of MetaboBlend. +# +# MetaboBlend is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
#
# MetaboBlend is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with MetaboBlend.  If not, see <https://www.gnu.org/licenses/>.
#

import os
import csv
import sqlite3


class ResultsDb:
    """
    Methods for interacting with the SQLITE3 results database, as created by
    :py:meth:`metaboblend.build_structures.annotate_msn`.

    :param path_results: Directory to which results will be written.

    :param msn: If True, fragment spectra are recorded in the `spectra` table and
        :py:meth:`ResultsDb.get_structures` also returns structure frequencies.
    """

    def __init__(self, path_results, msn=True):
        """Constructor method."""

        self.path_results = path_results
        self.path_results_db = os.path.join(self.path_results, "metaboblend_results.sqlite")
        self.msn = msn

        self.conn = None
        self.cursor = None

        # running identifier for combinations of substructures written by add_results
        self.substructure_combo_id = 0

    @staticmethod
    def _to_sql_value(value):
        """ Convert a value to a type that can be bound to a SQL placeholder. """

        if isinstance(value, bool):
            return int(value)

        if value is None or isinstance(value, (int, float, str, bytes)):
            return value

        item = getattr(value, "item", None)
        if callable(item):  # scalar wrappers exposing item(), e.g. numpy scalars
            return item()

        return str(value)  # mirrors the string formatting that was previously used to build the statements

    def connect(self):
        """ Connects to the results database. """

        self.conn = sqlite3.connect(self.path_results_db)
        self.cursor = self.conn.cursor()

    def create_results_db(self):
        """ Generates a new results database; an existing database at the same location is deleted. """

        if os.path.exists(self.path_results_db):
            os.remove(self.path_results_db)

        self.connect()

        self.cursor.execute("""CREATE TABLE queries (
                                   ms_id_num INTEGER PRIMARY KEY,
                                   ms_id TEXT,
                                   exact_mass NUMERIC,
                                   C INTEGER,
                                   H INTEGER,
                                   N INTEGER,
                                   O INTEGER,
                                   P INTEGER,
                                   S INTEGER,
                                   ppm INTEGER,
                                   ha_min INTEGER,
                                   ha_max INTEGER,
                                   max_atoms_available INTEGER,
                                   max_degree INTEGER,
                                   max_n_substructures INTEGER,
                                   hydrogenation_allowance INTEGER,
                                   isomeric_smiles INTEGER)""")

        if self.msn:
            self.cursor.execute("""CREATE TABLE spectra (
                                       ms_id_num INTEGER,
                                       fragment_id INTEGER,
                                       neutral_mass NUMERIC,
                                       PRIMARY KEY (ms_id_num, fragment_id))""")

        self.cursor.execute("""CREATE TABLE structures (
                                   ms_id_num INTEGER,
                                   structure_smiles TEXT,
                                   frequency INTEGER,
                                   PRIMARY KEY (ms_id_num, structure_smiles))""")

        self.cursor.execute("""CREATE TABLE substructures (
                                   substructure_combo_id INTEGER,
                                   substructure_position_id INTEGER,
                                   ms_id_num INTEGER,
                                   structure_smiles TEXT,
                                   fragment_id INTEGER,
                                   substructure_smiles TEXT,
                                   bde INTEGER,
                                   PRIMARY KEY (substructure_combo_id, substructure_position_id))""")

        self.cursor.execute("""CREATE TABLE results (
                                   ms_id_num INTEGER,
                                   fragment_id INTEGER,
                                   structure_smiles TEXT,
                                   bde INTEGER,
                                   PRIMARY KEY(ms_id_num, fragment_id, structure_smiles))""")

        self.conn.commit()

    def add_ms(self, msn_data, ms_id, ms_id_num, parameters):
        """
        Add an entry to the `queries` table.

        :param msn_data: Dictionary in the form
            `msn_data[id] = {mf: [C, H, N, O, P, S], exact_mass: float, ...}`. id represents a unique
            identifier for a given spectral tree or fragmentation spectrum, mf is a list of integers referring to the
            molecular formula of the structure of interest and exact_mass is the mass of this molecular formula.
            Only the "mf" and "exact_mass" items of `msn_data[ms_id]` are read here. See
            :py:meth:`metaboblend.build_structures.annotate_msn`.

        :param ms_id: Unique identifier for the annotation of a single metabolite.

        :param ms_id_num: Unique numeric identifier for the annotation of a single metabolite.

        :param parameters: List of parameters, in the form: [ppm, ha_min, ha_max, max_atoms_available, max_degree,
            max_n_substructures, hydrogenation_allowance, isomeric_smiles]. None is stored as NULL and booleans as
            0/1. The list itself is not modified. See :py:meth:`metaboblend.build_structures.annotate_msn`.
        """

        mf = msn_data[ms_id]["mf"]

        values = [ms_id, ms_id_num, msn_data[ms_id]["exact_mass"]]
        values.extend(mf[i] for i in range(6))  # C, H, N, O, P, S
        values.extend(parameters)

        # placeholders rather than string formatting, so that identifiers containing quotes are stored verbatim
        self.cursor.execute("""INSERT INTO queries (
                                   ms_id,
                                   ms_id_num,
                                   exact_mass,
                                   C, H, N, O, P, S,
                                   ppm,
                                   ha_min,
                                   ha_max,
                                   max_atoms_available,
                                   max_degree,
                                   max_n_substructures,
                                   hydrogenation_allowance,
                                   isomeric_smiles
                               ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
                            [self._to_sql_value(value) for value in values])

        self.conn.commit()

    def add_results(self, ms_id_num, smi_dict, fragment_mass=None, fragment_id=None, retain_substructures=False):
        """
        Record which smiles were generated for a given fragment mass.

        :param ms_id_num: Unique identifier for the annotation of a single metabolite.

        :param smi_dict: The fragment and substructure smiles generated by the annotation of a single peak for a single
            metabolite. Keys are structure smiles; each value is a dictionary with the items "bdes" (one value per
            combination of substructures, the minimum is recorded in the `results` table) and "substructures" (one
            sequence of substructure smiles per combination).

        :param fragment_mass: The neutral fragment mass that has been annotated.

        :param fragment_id: The unique identifier for the fragment mass that has been annotated. Ignored (stored as
            NULL) if the database was not set up for MSn data.

        :param retain_substructures: If True, record substructures in the results DB.
        """

        to_sql = self._to_sql_value

        if self.msn:
            self.cursor.execute("""INSERT OR IGNORE INTO spectra (
                                       ms_id_num,
                                       fragment_id,
                                       neutral_mass
                                   ) VALUES (?, ?, ?)""",
                                (to_sql(ms_id_num), to_sql(fragment_id), to_sql(fragment_mass)))
        else:
            fragment_id = None

        for structure_smiles, structure in smi_dict.items():

            self.cursor.execute("""INSERT OR IGNORE INTO results (
                                       ms_id_num,
                                       fragment_id,
                                       structure_smiles,
                                       bde
                                   ) VALUES (?, ?, ?, ?)""",
                                (to_sql(ms_id_num), to_sql(fragment_id), structure_smiles,
                                 to_sql(min(structure["bdes"]))))

            if not retain_substructures:
                continue

            for i, combination in enumerate(structure["substructures"]):  # for each combination
                bde = to_sql(structure["bdes"][i])

                self.cursor.executemany("""INSERT INTO substructures (
                                               substructure_combo_id,
                                               substructure_position_id,
                                               ms_id_num,
                                               fragment_id,
                                               structure_smiles,
                                               substructure_smiles,
                                               bde
                                           ) VALUES (?, ?, ?, ?, ?, ?, ?)""",
                                        [(self.substructure_combo_id, j, to_sql(ms_id_num), to_sql(fragment_id),
                                          structure_smiles, substructure, bde)
                                         for j, substructure in enumerate(combination)])

                self.substructure_combo_id += 1

        self.conn.commit()

    def calculate_frequencies(self, ms_id_num):
        """
        Calculates structure frequencies in the SQLite DB, i.e. fills the `structures` table with the number of rows
        of the `results` table in which each structure occurs.

        :param ms_id_num: Unique identifier for the annotation of a single metabolite.
        """

        self.cursor.execute("""INSERT INTO structures (ms_id_num, structure_smiles, frequency)
                                   SELECT ms_id_num, structure_smiles, COUNT(*)
                                       FROM results
                                       WHERE ms_id_num = ?
                                   GROUP BY structure_smiles""", (self._to_sql_value(ms_id_num),))

        # without a commit the frequencies of the final query are discarded when the connection is closed
        self.conn.commit()

    def get_structures(self, ms_id_num):
        """
        Gets smiles of generated structures. In the case of the MSn annotation workflow, also gets structure
        frequencies.

        :param ms_id_num: Unique identifier for the annotation of a single metabolite.

        :return: In the case of simple structure generation, returns a list of smiles strings for output structures.
            For the MSn annotation workflow, returns a list of `(smiles, frequency)` tuples, where the frequency
            is the value written by :py:meth:`ResultsDb.calculate_frequencies`.
        """

        columns = "structure_smiles, frequency" if self.msn else "structure_smiles"

        self.cursor.execute("SELECT {} FROM structures WHERE ms_id_num = ?".format(columns),
                            (self._to_sql_value(ms_id_num),))

        rows = self.cursor.fetchall()

        if self.msn:
            return rows

        return [smiles for row in rows for smiles in row]

    def generate_csv_output(self):
        """
        Generate CSV file output for i) queries and tool parameters (`metaboblend_queries.csv`) and ii) structures
        generated (`metaboblend_structures.csv`). Both files are written to the results directory.
        """

        with open(os.path.join(self.path_results, "metaboblend_queries.csv"), "w", newline="") as results_file, \
             open(os.path.join(self.path_results, "metaboblend_structures.csv"), "w", newline="") as ms_file:

            results_writer = csv.writer(results_file, delimiter=",")
            ms_writer = csv.writer(ms_file, delimiter=",")

            results_writer.writerow(["ms_id", "exact_mass", "C", "H", "N", "O", "P", "S", "ppm", "ha_min", "ha_max",
                                     "max_atoms_available", "max_degree", "max_n_substructures",
                                     "hydrogenation_allowance", "isomeric_smiles"])

            # explicit columns: "SELECT *" also returns ms_id_num, which shifts every value by one column
            # relative to the header
            self.cursor.execute("""SELECT ms_id, exact_mass, C, H, N, O, P, S, ppm, ha_min, ha_max,
                                          max_atoms_available, max_degree, max_n_substructures,
                                          hydrogenation_allowance, isomeric_smiles
                                       FROM queries""")

            results_writer.writerows(self.cursor.fetchall())

            ms_writer.writerow(["ms_id", "smiles", "frequency", "exact_mass", "C", "H", "N", "O", "P", "S"])

            # the structures table only holds the numeric identifier, the smiles and the frequency; the remaining
            # columns announced by the header are taken from the query for which the structure was generated
            self.cursor.execute("""SELECT queries.ms_id, structures.structure_smiles, structures.frequency,
                                          queries.exact_mass, queries.C, queries.H, queries.N, queries.O,
                                          queries.P, queries.S
                                       FROM structures
                                       LEFT JOIN queries ON queries.ms_id_num = structures.ms_id_num""")

            ms_writer.writerows(self.cursor.fetchall())

    def close(self):
        """ Close the connection to the SQLITE3 database; does nothing if there is no open connection. """

        if self.conn is None:
            return

        self.conn.close()
        self.conn = None
        self.cursor = None
2,3-di-O-Phytanyl-sn-glycerol-1-phosphoserine; EI-B; MS +DATE: 2016.01.19 (Created 2009.05.29, modified 2011.05.06) +AUTHORS: Hiroyuki Morii, Department of Chemistry, University of Occupational and Environmental Health +LICENSE: CC BY-SA +PUBLICATION: Morii, H., Nishihara, M., Ohga, M., and Koga, Y. 1986. A diphytanyl ether analog of phosphatidylserine from a methanogenic bacterium, Methanobrevibacter arboriphilus. J Lipid Res. 27: 724-730. +COMMENT: Ammonium salt of the compound was analyzed +COMMENT: [Analytical] Ionizing Curr 300 uA, Chamber Temp 250 C, Accel Volt 3KV, Ion Multi 1.0 KV, +CH$NAME: 2,3-di-O-Phytanyl-sn-glycerol-1-phosphoserine +CH$NAME: archaetidylserine +CH$COMPOUND_CLASS: Glycerophospholipids; Glycerophosphoserines; Dialkylglycerophosphoserines +CH$FORMULA: C46H94NO8P +CH$EXACT_MASS: 819.67171 +CH$SMILES: C(CCC(C)C)C(C)CCCC(CCCC(CCOCC(OCCC(CCCC(C)CCCC(C)CCCC(C)C)C)COP(O)(=O)OCC(C(O)=O)N)C)C +CH$IUPAC: InChI=1S/C46H94NO8P/c1-36(2)17-11-19-38(5)21-13-23-40(7)25-15-27-42(9)29-31-52-33-44(34-54-56(50,51)55-35-45(47)46(48)49)53-32-30-43(10)28-16-26-41(8)24-14-22-39(6)20-12-18-37(3)4/h36-45H,11-35,47H2,1-10H3,(H,48,49)(H,50,51)/t38-,39-,40-,41-,42-,43-,44-,45-/m1/s1 +CH$LINK: CAS 105662-26-8 +CH$LINK: LIPIDBANK EEL3026 +AC$INSTRUMENT: JMS DX-300/JMS-3500 data system, Japan Electron Optics Laboratory, Japan +AC$INSTRUMENT_TYPE: EI-B +AC$MASS_SPECTROMETRY: MS_TYPE MS +AC$MASS_SPECTROMETRY: ION_MODE POSITIVE +AC$MASS_SPECTROMETRY: IONIZATION_POTENTIAL 30 eV +AC$MASS_SPECTROMETRY: SCANNING 5 Sec +AC$MASS_SPECTROMETRY: SOURCE_TEMPERATURE 320 C +MS$FOCUSED_ION: ION_TYPE [M]+* +PK$SPLASH: splash10-05gi-9611001000-5e7663b41cf47681770e +PK$NUM_PEAK: 58 +PK$PEAK: m/z int. rel.int. 
+ 36.0 48.935 69 + 43.0 155.642 221 + 55.0 174.853 248 + 56.0 241.397 343 + 57.0 685.069 973 + 69.0 429.724 610 + 70.0 442.816 629 + 71.0 703.562 999 + 74.0 153.368 218 + 81.0 183.718 261 + 82.0 116.090 165 + 83.0 432.501 614 + 84.0 189.394 269 + 85.0 524.513 745 + 95.0 102.128 145 + 96.0 179.720 255 + 97.0 449.439 638 + 98.0 118.287 168 + 99.0 376.928 535 + 111.0 429.907 610 + 112.0 168.627 239 + 113.0 308.186 438 + 123.0 298.191 423 + 124.0 233.188 331 + 125.0 504.097 716 + 126.0 362.310 514 + 127.0 300.450 427 + 139.0 123.338 175 + 140.0 233.340 331 + 141.0 235.767 335 + 153.0 107.469 153 + 155.0 163.241 232 + 169.0 130.189 185 + 182.0 94.773 135 + 183.0 160.234 228 + 196.0 132.555 188 + 197.0 163.622 232 + 211.0 75.654 107 + 278.0 326.649 464 + 279.0 220.386 313 + 280.0 227.649 323 + 281.0 143.572 204 + 296.0 158.358 225 + 297.0 60.502 86 + 309.0 35.370 50 + 325.0 279.819 397 + 326.0 71.000 101 + 340.0 133.760 190 + 341.0 60.486 86 + 343.0 243.579 346 + 344.0 60.028 85 + 354.0 51.468 73 + 373.0 132.555 188 + 374.0 42.069 60 + 383.0 63.767 91 + 634.0 450.446 640 + 635.0 212.497 302 + 636.0 49.622 70 +// diff --git a/tests/test_data/mona_msp.msp b/tests/test_data/mona_msp.msp new file mode 100644 index 0000000..b1f6d21 --- /dev/null +++ b/tests/test_data/mona_msp.msp @@ -0,0 +1,580 @@ +Name: Sulfaclozine +Synon: 4-amino-N-(6-chloropyrazin-2-yl)benzenesulfonamide +SYNON: $:00in-source +DB#: AU100601 +InChIKey: QKLPUVXBJHRFQZ-UHFFFAOYSA-N +Precursor_type: [M+H]+ +Spectrum_type: MS2 +PrecursorMZ: 285.0208 +Instrument_type: LC-ESI-QTOF +Instrument: Bruker maXis Impact +Ion_mode: P +Collision_energy: Ramp 21.1-31.6 eV +Formula: C10H9ClN4O2S +MW: 284 +ExactMass: 284.013474208 +Comments: "accession=AU100601" "author=Nikiforos Alygizakis, Anna Bletsou, Nikolaos Thomaidis, University of Athens" "license=CC BY" "copyright=Copyright (C) 2015 Department of Chemistry, University of Athens" "exact mass=284.0135" "instrument=Bruker maXis Impact" "instrument type=LC-ESI-QTOF" 
"ms level=MS2" "ionization=ESI" "fragmentation mode=CID" "collision energy=Ramp 21.1-31.6 eV" "resolution=35000" "column=Acclaim RSLC C18 2.2um, 2.1x100mm, Thermo" "flow gradient=99/1 at 0-1 min, 61/39 at 3 min, 0.1/99.9 at 14-16 min, 99/1 at 16.1-20 min" "flow rate=200 uL/min at 0-3 min, 400 uL/min at 14 min, 480 uL/min at 16-19 min, 200 uL/min at 19.1-20 min" "retention time=4.6 min" "solvent a=water with 0.01% formic acid and 5mM ammonium formate" "solvent b=90:10 methanol:water with 0.01% formic acid and 5mM ammonium formate" "precursor m/z=285.0208" "precursor type=[M+H]+" "ionization mode=positive" "mass accuracy=0.17469602228006656" "mass error=4.9792000027082395E-5" "SMILES=c1cc(ccc1N)S(=O)(=O)Nc2cncc(n2)Cl" "cas=102-65-8" "pubchem cid=66890" "chemspider=60252" "InChI=InChI=1S/C10H9ClN4O2S/c11-9-5-13-6-10(14-9)15-18(16,17)8-3-1-7(12)2-4-8/h1-6H,12H2,(H,14,15)" "InChIKey=QKLPUVXBJHRFQZ-UHFFFAOYSA-N" "molecular formula=C10H9ClN4O2S" "total exact mass=284.013474208" "SMILES=C=1C=C(C=CC1N)S(N=C2C=NC=C(Cl)N2)(=O)=O" +Num Peaks: 27 +53.0389 0.594951 +54.0333 0.566811 +55.0178 0.522592 +60.0552 0.542692 +65.0382 3.822962 +66.0423 0.506512 +68.049 7.963499 +78.0333 0.727609 +79.0177 1.057244 +92.0498 7.702203 +93.0532 0.731629 +96.0443 0.623091 +108.0457 12.172375 +109.0483 1.181862 +110.0609 4.904325 +120.0562 3.095353 +130.0172 5.656054 +132.0138 1.515517 +156.0118 100.000000 +157.015 8.884065 +158.008 3.891301 +174.0228 0.751729 +184.0757 0.619071 +191.9647 0.590931 +219.0438 0.723589 +285.0221 3.694324 +287.0184 0.840167 + + +Name: Sulfachlorpyridazine +Synon: 4-amino-N-(6-chloropyridazin-3-yl)benzenesulfonamide +SYNON: $:00in-source +DB#: AU100701 +InChIKey: XOXHILFPRYWFOD-UHFFFAOYSA-N +Precursor_type: [M+H]+ +Spectrum_type: MS2 +PrecursorMZ: 285.0208 +Instrument_type: LC-ESI-QTOF +Instrument: Bruker maXis Impact +Ion_mode: P +Collision_energy: Ramp 21.1-31.6 eV +Formula: C10H9ClN4O2S +MW: 284 +ExactMass: 284.013474208 +Comments: "accession=AU100701" 
"author=Nikiforos Alygizakis, Anna Bletsou, Nikolaos Thomaidis, University of Athens" "license=CC BY" "copyright=Copyright (C) 2015 Department of Chemistry, University of Athens" "exact mass=284.0135" "instrument=Bruker maXis Impact" "instrument type=LC-ESI-QTOF" "ms level=MS2" "ionization=ESI" "fragmentation mode=CID" "collision energy=Ramp 21.1-31.6 eV" "resolution=35000" "column=Acclaim RSLC C18 2.2um, 2.1x100mm, Thermo" "flow gradient=99/1 at 0-1 min, 61/39 at 3 min, 0.1/99.9 at 14-16 min, 99/1 at 16.1-20 min" "flow rate=200 uL/min at 0-3 min, 400 uL/min at 14 min, 480 uL/min at 16-19 min, 200 uL/min at 19.1-20 min" "retention time=4.6 min" "solvent a=water with 0.01% formic acid and 5mM ammonium formate" "solvent b=90:10 methanol:water with 0.01% formic acid and 5mM ammonium formate" "precursor m/z=285.0208" "precursor type=[M+H]+" "ionization mode=positive" "mass accuracy=0.17469602228006656" "mass error=4.9792000027082395E-5" "SMILES=c1cc(ccc1N)S(=O)(=O)Nc2ccc(nn2)Cl" "cas=80-32-0" "pubchem cid=6634" "chemspider=6382" "InChI=InChI=1S/C10H9ClN4O2S/c11-9-5-6-10(14-13-9)15-18(16,17)8-3-1-7(12)2-4-8/h1-6H,12H2,(H,14,15)" "InChIKey=XOXHILFPRYWFOD-UHFFFAOYSA-N" "molecular formula=C10H9ClN4O2S" "total exact mass=284.013474208" "SMILES=C=1C=C(C=CC1N)S(NC=2C=CC(Cl)=NN2)(=O)=O" +Num Peaks: 27 +53.0389 0.594951 +54.0333 0.566811 +55.0178 0.522592 +60.0552 0.542692 +65.0382 3.822962 +66.0423 0.506512 +68.049 7.963499 +78.0333 0.727609 +79.0177 1.057244 +92.0498 7.702203 +93.0532 0.731629 +96.0443 0.623091 +108.0457 12.172375 +109.0483 1.181862 +110.0609 4.904325 +120.0562 3.095353 +130.0172 5.656054 +132.0138 1.515517 +156.0118 100.000000 +157.015 8.884065 +158.008 3.891301 +174.0228 0.751729 +184.0757 0.619071 +191.9647 0.590931 +219.0438 0.723589 +285.0221 3.694324 +287.0184 0.840167 + + +Name: Sulfadimidine +Synon: 4-amino-N-(4,6-dimethylpyrimidin-2-yl)benzenesulfonamide +SYNON: $:00in-source +DB#: AU100801 +InChIKey: ASWVTGNCAZCNNR-UHFFFAOYSA-N +Precursor_type: 
[M+H]+ +Spectrum_type: MS2 +PrecursorMZ: 279.091 +Instrument_type: LC-ESI-QTOF +Instrument: Bruker maXis Impact +Ion_mode: P +Collision_energy: Ramp 20.8-31.3 eV +Formula: C12H14N4O2S +MW: 278 +ExactMass: 278.08374668799996 +Comments: "accession=AU100801" "author=Nikiforos Alygizakis, Anna Bletsou, Nikolaos Thomaidis, University of Athens" "license=CC BY" "copyright=Copyright (C) 2015 Department of Chemistry, University of Athens" "exact mass=278.0837" "instrument=Bruker maXis Impact" "instrument type=LC-ESI-QTOF" "ms level=MS2" "ionization=ESI" "fragmentation mode=CID" "collision energy=Ramp 20.8-31.3 eV" "resolution=35000" "column=Acclaim RSLC C18 2.2um, 2.1x100mm, Thermo" "flow gradient=99/1 at 0-1 min, 61/39 at 3 min, 0.1/99.9 at 14-16 min, 99/1 at 16.1-20 min" "flow rate=200 uL/min at 0-3 min, 400 uL/min at 14 min, 480 uL/min at 16-19 min, 200 uL/min at 19.1-20 min" "retention time=4.4 min" "solvent a=water with 0.01% formic acid and 5mM ammonium formate" "solvent b=90:10 methanol:water with 0.01% formic acid and 5mM ammonium formate" "precursor m/z=279.091" "precursor type=[M+H]+" "ionization mode=positive" "mass accuracy=0.08129248146496051" "mass error=-2.2687999944537296E-5" "SMILES=Cc1cc(nc(n1)NS(=O)(=O)c2ccc(cc2)N)C" "cas=57-68-1" "kegg=C19530" "pubchem cid=5327" "chemspider=5136" "InChI=InChI=1S/C12H14N4O2S/c1-8-7-9(2)15-12(14-8)16-19(17,18)11-5-3-10(13)4-6-11/h3-7H,13H2,1-2H3,(H,14,15,16)" "InChIKey=ASWVTGNCAZCNNR-UHFFFAOYSA-N" "molecular formula=C12H14N4O2S" "total exact mass=278.08374668799996" "SMILES=CC1=CC(C)=NC(=N1)NS(C2=CC=C(C=C2)N)(=O)=O" +Num Peaks: 46 +53.0379 0.894101 +54.0335 0.661867 +55.0176 0.598003 +65.0379 8.717487 +68.0491 13.013818 +69.0329 1.640153 +78.0334 1.477589 +79.0178 2.261379 +80.0489 1.431143 +81.0444 1.950766 +82.0284 0.606712 +92.0499 30.585230 +93.0558 2.844868 +94.0647 1.686600 +95.0608 3.027752 +96.0443 1.300511 +108.0461 33.946818 +109.0497 2.360079 +110.0616 6.107757 +111.0651 0.519624 +120.0565 1.962378 +122.0716 
6.078727 +123.0794 2.246865 +124.0872 71.211681 +125.0905 6.398049 +126.0663 17.911054 +127.0697 0.595100 +156.0117 82.855318 +157.0148 5.739085 +158.0072 1.544357 +174.0224 1.106015 +186.0334 11.263353 +187.0368 0.775081 +188.0128 1.637250 +188.0291 0.534138 +204.0445 100.000000 +205.0473 6.972829 +206.0406 3.358686 +213.1141 18.259405 +214.1167 2.241059 +215.0927 3.071296 +215.1291 1.320831 +279.0925 61.483976 +280.0953 8.438806 +281.0725 7.837901 +282.0742 1.222132 + + +Name: Sulfamethazine +Synon: 4-amino-N-(4,6-dimethylpyrimidin-2-yl)benzenesulfonamide +SYNON: $:00in-source +DB#: AU100802 +InChIKey: ASWVTGNCAZCNNR-UHFFFAOYSA-N +Precursor_type: [M+H]+ +Spectrum_type: MS2 +PrecursorMZ: 279.091 +Instrument_type: LC-ESI-QTOF +Instrument: Bruker maXis Impact +Ion_mode: P +Collision_energy: 20 eV +Formula: C12H14N4O2S +MW: 278 +ExactMass: 278.08374668799996 +Comments: "accession=AU100802" "author=Nikiforos Alygizakis, Nikolaos Thomaidis, University of Athens" "license=CC BY-SA" "copyright=Copyright (C) 2015 Department of Chemistry, University of Athens" "exact mass=278.0837467" "instrument=Bruker maXis Impact" "instrument type=LC-ESI-QTOF" "ms level=MS2" "ionization=ESI" "fragmentation mode=CID" "collision energy=20 eV" "resolution=35000" "column=Acclaim RSLC C18 2.2um, 2.1x100mm, Thermo" "flow gradient=99/1 at 0-1 min, 61/39 at 3 min, 0.1/99.9 at 14-16 min, 99/1 at 16.1-20 min" "flow rate=200 uL/min at 0-3 min, 400 uL/min at 14 min, 480 uL/min at 16-19 min, 200 uL/min at 19.1-20 min" "retention time=4.1 min" "solvent a=90:10 water:methanol with 0.01% formic acid and 5mM ammonium formate" "solvent b=methanol with 0.01% formic acid and 5mM ammonium formate" "precursor m/z=279.091" "precursor type=[M+H]+" "ionization mode=positive" "mass accuracy=0.08129248146496051" "mass error=-2.2687999944537296E-5" "SMILES=CC1=CC(C)=NC(NS(=O)(=O)C2=CC=C(N)C=C2)=N1" "cas=57-68-1" "chebi=102265" "kegg=D02436" "pubchem cid=5327" "chemspider=5136" 
"InChI=InChI=1S/C12H14N4O2S/c1-8-7-9(2)15-12(14-8)16-19(17,18)11-5-3-10(13)4-6-11/h3-7H,13H2,1-2H3,(H,14,15,16)" "InChIKey=ASWVTGNCAZCNNR-UHFFFAOYSA-N" "molecular formula=C12H14N4O2S" "total exact mass=278.08374668799996" "SMILES=CC1=CC(C)=NC(=N1)NS(C2=CC=C(C=C2)N)(=O)=O" +Num Peaks: 16 +122.0703 0.766124 +124.0861 36.693459 +125.0892 1.930893 +149.0227 0.828453 +156.0104 53.249536 +157.0129 2.999571 +158.0061 1.778967 +174.0209 0.627183 +186.0321 22.621444 +187.0346 1.719235 +188.0285 0.646661 +204.0431 100.000000 +213.1128 8.749399 +214.1159 1.407591 +215.1281 0.658348 +279.0909 80.894937 + + +Name: Sulfamethazine +Synon: 4-amino-N-(4,6-dimethylpyrimidin-2-yl)benzenesulfonamide +SYNON: $:00in-source +DB#: AU100803 +InChIKey: ASWVTGNCAZCNNR-UHFFFAOYSA-N +Precursor_type: [M+H]+ +Spectrum_type: MS2 +PrecursorMZ: 279.091 +Instrument_type: LC-ESI-QTOF +Instrument: Bruker maXis Impact +Ion_mode: P +Collision_energy: 30 eV +Formula: C12H14N4O2S +MW: 278 +ExactMass: 278.083746688 +Comments: "accession=AU100803" "author=Nikiforos Alygizakis, Nikolaos Thomaidis, University of Athens" "license=CC BY-SA" "copyright=Copyright (C) 2015 Department of Chemistry, University of Athens" "exact mass=278.0837467" "instrument=Bruker maXis Impact" "instrument type=LC-ESI-QTOF" "ms level=MS2" "ionization=ESI" "fragmentation mode=CID" "collision energy=30 eV" "resolution=35000" "column=Acclaim RSLC C18 2.2um, 2.1x100mm, Thermo" "flow gradient=99/1 at 0-1 min, 61/39 at 3 min, 0.1/99.9 at 14-16 min, 99/1 at 16.1-20 min" "flow rate=200 uL/min at 0-3 min, 400 uL/min at 14 min, 480 uL/min at 16-19 min, 200 uL/min at 19.1-20 min" "retention time=4.2 min" "solvent a=90:10 water:methanol with 0.01% formic acid and 5mM ammonium formate" "solvent b=methanol with 0.01% formic acid and 5mM ammonium formate" "precursor m/z=279.091" "precursor type=[M+H]+" "ionization mode=positive" "mass accuracy=0.08129248166863394" "mass error=-2.2688000001380715E-5" 
"SMILES=CC1=CC(C)=NC(NS(=O)(=O)C2=CC=C(N)C=C2)=N1" "cas=57-68-1" "chebi=102265" "kegg=D02436" "pubchem cid=5327" "chemspider=5136" "InChI=InChI=1S/C12H14N4O2S/c1-8-7-9(2)15-12(14-8)16-19(17,18)11-5-3-10(13)4-6-11/h3-7H,13H2,1-2H3,(H,14,15,16)" "InChIKey=ASWVTGNCAZCNNR-UHFFFAOYSA-N" "molecular formula=C12H14N4O2S" "total exact mass=278.083746688" "SMILES=CC1=CC(C)=NC(=N1)NS(C2=CC=C(C=C2)N)(=O)=O" +Num Peaks: 17 +108.0441 1.285794 +122.0704 6.630847 +123.0781 2.170942 +124.0861 100.000000 +125.0889 6.093546 +149.0221 1.388285 +156.0106 50.043481 +158.0064 1.615007 +186.0323 15.118951 +187.0355 1.323064 +196.0858 1.220573 +204.0429 70.964035 +205.0455 4.931983 +213.1123 22.610100 +214.1155 3.003292 +215.1283 0.804398 +279.0903 3.580968 + + +Name: Sulfamethazine +Synon: 4-amino-N-(4,6-dimethylpyrimidin-2-yl)benzenesulfonamide +SYNON: $:00in-source +DB#: AU100804 +InChIKey: ASWVTGNCAZCNNR-UHFFFAOYSA-N +Precursor_type: [M+H]+ +Spectrum_type: MS2 +PrecursorMZ: 279.091 +Instrument_type: LC-ESI-QTOF +Instrument: Bruker maXis Impact +Ion_mode: P +Collision_energy: 40 eV +Formula: C12H14N4O2S +MW: 278 +ExactMass: 278.083746688 +Comments: "accession=AU100804" "author=Nikiforos Alygizakis, Nikolaos Thomaidis, University of Athens" "license=CC BY-SA" "copyright=Copyright (C) 2015 Department of Chemistry, University of Athens" "exact mass=278.0837467" "instrument=Bruker maXis Impact" "instrument type=LC-ESI-QTOF" "ms level=MS2" "ionization=ESI" "fragmentation mode=CID" "collision energy=40 eV" "resolution=35000" "column=Acclaim RSLC C18 2.2um, 2.1x100mm, Thermo" "flow gradient=99/1 at 0-1 min, 61/39 at 3 min, 0.1/99.9 at 14-16 min, 99/1 at 16.1-20 min" "flow rate=200 uL/min at 0-3 min, 400 uL/min at 14 min, 480 uL/min at 16-19 min, 200 uL/min at 19.1-20 min" "retention time=4.1 min" "solvent a=90:10 water:methanol with 0.01% formic acid and 5mM ammonium formate" "solvent b=methanol with 0.01% formic acid and 5mM ammonium formate" "precursor m/z=279.091" "precursor type=[M+H]+" 
"ionization mode=positive" "mass accuracy=0.08129248166863394" "mass error=-2.2688000001380715E-5" "SMILES=CC1=CC(C)=NC(NS(=O)(=O)C2=CC=C(N)C=C2)=N1" "cas=57-68-1" "chebi=102265" "kegg=D02436" "pubchem cid=5327" "chemspider=5136" "InChI=InChI=1S/C12H14N4O2S/c1-8-7-9(2)15-12(14-8)16-19(17,18)11-5-3-10(13)4-6-11/h3-7H,13H2,1-2H3,(H,14,15,16)" "InChIKey=ASWVTGNCAZCNNR-UHFFFAOYSA-N" "molecular formula=C12H14N4O2S" "total exact mass=278.083746688" "SMILES=CC1=CC(C)=NC(=N1)NS(C2=CC=C(C=C2)N)(=O)=O" +Num Peaks: 22 +108.0445 1.153673 +122.0702 5.323878 +123.0772 2.202467 +124.0862 100.000000 +125.089 6.847126 +134.0701 0.714179 +149.0224 1.747990 +154.0624 0.644259 +155.0685 0.624282 +156.0104 10.373071 +157.0126 0.933926 +172.0852 0.564351 +186.0324 3.845578 +196.0852 5.209010 +197.0903 1.378415 +198.0888 2.362283 +204.0427 15.422264 +205.0463 0.869001 +206.0375 0.759127 +212.1036 0.659242 +213.1121 18.109174 +214.1152 2.577036 + + +Name: Sulfamethazine +Synon: 4-amino-N-(4,6-dimethylpyrimidin-2-yl)benzenesulfonamide +SYNON: $:00in-source +DB#: AU100805 +InChIKey: ASWVTGNCAZCNNR-UHFFFAOYSA-N +Precursor_type: [M+H]+ +Spectrum_type: MS2 +PrecursorMZ: 279.091 +Instrument_type: LC-ESI-QTOF +Instrument: Bruker maXis Impact +Ion_mode: P +Collision_energy: 50 eV +Formula: C12H14N4O2S +MW: 278 +ExactMass: 278.083746688 +Comments: "accession=AU100805" "author=Nikiforos Alygizakis, Nikolaos Thomaidis, University of Athens" "license=CC BY-SA" "copyright=Copyright (C) 2015 Department of Chemistry, University of Athens" "exact mass=278.0837467" "instrument=Bruker maXis Impact" "instrument type=LC-ESI-QTOF" "ms level=MS2" "ionization=ESI" "fragmentation mode=CID" "collision energy=50 eV" "resolution=35000" "column=Acclaim RSLC C18 2.2um, 2.1x100mm, Thermo" "flow gradient=99/1 at 0-1 min, 61/39 at 3 min, 0.1/99.9 at 14-16 min, 99/1 at 16.1-20 min" "flow rate=200 uL/min at 0-3 min, 400 uL/min at 14 min, 480 uL/min at 16-19 min, 200 uL/min at 19.1-20 min" "retention time=4.2 min" "solvent 
a=90:10 water:methanol with 0.01% formic acid and 5mM ammonium formate" "solvent b=methanol with 0.01% formic acid and 5mM ammonium formate" "precursor m/z=279.091" "precursor type=[M+H]+" "ionization mode=positive" "mass accuracy=0.08129248166863394" "mass error=-2.2688000001380715E-5" "SMILES=CC1=CC(C)=NC(NS(=O)(=O)C2=CC=C(N)C=C2)=N1" "cas=57-68-1" "chebi=102265" "kegg=D02436" "pubchem cid=5327" "chemspider=5136" "InChI=InChI=1S/C12H14N4O2S/c1-8-7-9(2)15-12(14-8)16-19(17,18)11-5-3-10(13)4-6-11/h3-7H,13H2,1-2H3,(H,14,15,16)" "InChIKey=ASWVTGNCAZCNNR-UHFFFAOYSA-N" "molecular formula=C12H14N4O2S" "total exact mass=278.083746688" "SMILES=CC1=CC(C)=NC(=N1)NS(C2=CC=C(C=C2)N)(=O)=O" +Num Peaks: 24 +108.0453 1.770916 +122.0703 2.803951 +123.078 2.792598 +124.0859 100.000000 +125.0891 7.901010 +149.0231 1.623340 +154.0639 2.111477 +155.0605 2.463390 +155.0714 2.690430 +156.01 2.713134 +169.0745 1.475763 +171.0781 1.555228 +172.0869 1.271427 +181.0634 0.930866 +186.1022 1.033034 +195.0786 1.555228 +196.0859 7.628562 +197.0856 3.871041 +198.0886 5.903054 +199.0904 0.998978 +204.0438 2.622318 +212.1048 2.327165 +213.1122 9.342718 +214.1153 1.725508 + + +Name: Sulfamethazine +Synon: 4-amino-N-(4,6-dimethylpyrimidin-2-yl)benzenesulfonamide +SYNON: $:00in-source +DB#: AU100806 +InChIKey: ASWVTGNCAZCNNR-UHFFFAOYSA-N +Precursor_type: [M+H]+ +Spectrum_type: MS2 +PrecursorMZ: 279.091 +Instrument_type: LC-ESI-QTOF +Instrument: Bruker maXis Impact +Ion_mode: P +Collision_energy: 10 eV +Formula: C12H14N4O2S +MW: 278 +ExactMass: 278.08374668799996 +Comments: "accession=AU100806" "author=Nikiforos Alygizakis, Nikolaos Thomaidis, University of Athens" "license=CC BY-SA" "copyright=Copyright (C) 2015 Department of Chemistry, University of Athens" "exact mass=278.0837467" "instrument=Bruker maXis Impact" "instrument type=LC-ESI-QTOF" "ms level=MS2" "ionization=ESI" "fragmentation mode=CID" "collision energy=10 eV" "resolution=35000" "column=Acclaim RSLC C18 2.2um, 2.1x100mm, Thermo" "flow 
gradient=99/1 at 0-1 min, 61/39 at 3 min, 0.1/99.9 at 14-16 min, 99/1 at 16.1-20 min" "flow rate=200 uL/min at 0-3 min, 400 uL/min at 14 min, 480 uL/min at 16-19 min, 200 uL/min at 19.1-20 min" "retention time=4.2 min" "solvent a=90:10 water:methanol with 0.01% formic acid and 5mM ammonium formate" "solvent b=methanol with 0.01% formic acid and 5mM ammonium formate" "precursor m/z=279.091" "precursor type=[M+H]+" "ionization mode=positive" "mass accuracy=0.08129248146496051" "mass error=-2.2687999944537296E-5" "SMILES=CC1=CC(C)=NC(NS(=O)(=O)C2=CC=C(N)C=C2)=N1" "cas=57-68-1" "chebi=102265" "kegg=D02436" "pubchem cid=5327" "chemspider=5136" "InChI=InChI=1S/C12H14N4O2S/c1-8-7-9(2)15-12(14-8)16-19(17,18)11-5-3-10(13)4-6-11/h3-7H,13H2,1-2H3,(H,14,15,16)" "InChIKey=ASWVTGNCAZCNNR-UHFFFAOYSA-N" "molecular formula=C12H14N4O2S" "total exact mass=278.08374668799996" "SMILES=CC1=CC(C)=NC(=N1)NS(C2=CC=C(C=C2)N)(=O)=O" +Num Peaks: 4 +124.086 0.740586 +156.0098 1.123942 +186.0319 0.831793 +279.0908 100.000000 + + +Name: Sulfadimethoxine +Synon: 4-amino-n-(2,6-dimethoxypyrimidin-4-yl)benzenesulfonamide +SYNON: $:00in-source +DB#: AU100902 +InChIKey: ZZORFUFYDOWNEF-UHFFFAOYSA-N +Precursor_type: [M+H]+ +Spectrum_type: MS2 +PrecursorMZ: 311.0809 +Instrument_type: LC-ESI-QTOF +Instrument: Bruker maXis Impact +Ion_mode: P +Collision_energy: 20 eV +Formula: C12H14N4O4S +MW: 310 +ExactMass: 310.07357592799997 +Comments: "accession=AU100902" "author=Nikiforos Alygizakis, Anna Bletsou, Nikolaos Thomaidis, University of Athens" "license=CC BY-SA" "copyright=Copyright (C) 2015 Department of Chemistry, University of Athens" "exact mass=310.0735759" "instrument=Bruker maXis Impact" "instrument type=LC-ESI-QTOF" "ms level=MS2" "ionization=ESI" "fragmentation mode=CID" "collision energy=20 eV" "resolution=35000" "column=Acclaim RSLC C18 2.2um, 2.1x100mm, Thermo" "flow gradient=99/1 at 0-1 min, 61/39 at 3 min, 0.1/99.9 at 14-16 min, 99/1 at 16.1-20 min" "flow rate=200 uL/min at 0-3 min, 400 
uL/min at 14 min, 480 uL/min at 16-19 min, 200 uL/min at 19.1-20 min" "retention time=4.6 min" "solvent a=water with 0.01% formic acid and 5mM ammonium formate" "solvent b=90:10 methanol:water with 0.01% formic acid and 5mM ammonium formate" "precursor m/z=311.0809" "precursor type=[M+H]+" "ionization mode=positive" "mass accuracy=0.15453214911901536" "mass error=4.8072000026877504E-5" "SMILES=COc1cc(nc(n1)OC)NS(=O)(=O)c2ccc(cc2)N" "cas=122-11-2" "chebi=32161" "pubchem=5323" "chemspider=5132" "InChI=InChI=1S/C12H14N4O4S/c1-19-11-7-10(14-12(15-11)20-2)16-21(17,18)9-5-3-8(13)4-6-9/h3-7H,13H2,1-2H3,(H,14,15,16)" "InChIKey=ZZORFUFYDOWNEF-UHFFFAOYSA-N" "molecular formula=C12H14N4O4S" "total exact mass=310.07357592799997" "SMILES=COC=1C=C(N=C(N1)OC)NS(C2=CC=C(C=C2)N)(=O)=O" +Num Peaks: 15 +140.0447 6.249276 +141.0515 0.699085 +154.0604 5.781932 +155.0683 3.398864 +156.0107 100.000000 +156.0763 16.893901 +157.0134 4.171334 +157.0794 0.857441 +158.0069 2.371480 +218.0242 0.965586 +245.1032 9.010853 +246.1061 0.834267 +311.0811 75.335059 +312.0835 7.145340 +313.0796 2.301958 + + +Name: Sulfadimethoxine +Synon: 4-amino-n-(2,6-dimethoxypyrimidin-4-yl)benzenesulfonamide +SYNON: $:00in-source +DB#: AU100903 +InChIKey: ZZORFUFYDOWNEF-UHFFFAOYSA-N +Precursor_type: [M+H]+ +Spectrum_type: MS2 +PrecursorMZ: 311.0809 +Instrument_type: LC-ESI-QTOF +Instrument: Bruker maXis Impact +Ion_mode: P +Collision_energy: 30 eV +Formula: C12H14N4O4S +MW: 310 +ExactMass: 310.073575928 +Comments: "accession=AU100903" "author=Nikiforos Alygizakis, Anna Bletsou, Nikolaos Thomaidis, University of Athens" "license=CC BY-SA" "copyright=Copyright (C) 2015 Department of Chemistry, University of Athens" "exact mass=310.0735759" "instrument=Bruker maXis Impact" "instrument type=LC-ESI-QTOF" "ms level=MS2" "ionization=ESI" "fragmentation mode=CID" "collision energy=30 eV" "resolution=35000" "column=Acclaim RSLC C18 2.2um, 2.1x100mm, Thermo" "flow gradient=99/1 at 0-1 min, 61/39 at 3 min, 0.1/99.9 at 14-16 
min, 99/1 at 16.1-20 min" "flow rate=200 uL/min at 0-3 min, 400 uL/min at 14 min, 480 uL/min at 16-19 min, 200 uL/min at 19.1-20 min" "retention time=4.6 min" "solvent a=water with 0.01% formic acid and 5mM ammonium formate" "solvent b=90:10 methanol:water with 0.01% formic acid and 5mM ammonium formate" "precursor m/z=311.0809" "precursor type=[M+H]+" "ionization mode=positive" "mass accuracy=0.15453214893628664" "mass error=4.8071999970034085E-5" "SMILES=COc1cc(nc(n1)OC)NS(=O)(=O)c2ccc(cc2)N" "cas=122-11-2" "chebi=32161" "pubchem=5323" "chemspider=5132" "InChI=InChI=1S/C12H14N4O4S/c1-19-11-7-10(14-12(15-11)20-2)16-21(17,18)9-5-3-8(13)4-6-9/h3-7H,13H2,1-2H3,(H,14,15,16)" "InChIKey=ZZORFUFYDOWNEF-UHFFFAOYSA-N" "molecular formula=C12H14N4O4S" "total exact mass=310.073575928" "SMILES=COC=1C=C(N=C(N1)OC)NS(C2=CC=C(C=C2)N)(=O)=O" +Num Peaks: 21 +108.0448 1.310092 +124.0204 1.354502 +126.0659 3.563895 +127.0504 0.843788 +138.0294 1.576552 +141.0517 10.458532 +154.0604 60.575108 +155.0672 5.484623 +156.0105 100.000000 +156.0762 63.495059 +157.0131 4.540913 +157.0798 3.452870 +158.0071 2.320417 +201.0772 2.720107 +212.069 3.896969 +218.0235 0.843788 +230.0808 9.270567 +231.0843 1.232375 +245.1039 10.447430 +246.107 1.176862 +311.0829 3.819252 + + +Name: Sulfadimethoxine +Synon: 4-amino-n-(2,6-dimethoxypyrimidin-4-yl)benzenesulfonamide +SYNON: $:00in-source +DB#: AU100904 +InChIKey: ZZORFUFYDOWNEF-UHFFFAOYSA-N +Precursor_type: [M+H]+ +Spectrum_type: MS2 +PrecursorMZ: 311.0809 +Instrument_type: LC-ESI-QTOF +Instrument: Bruker maXis Impact +Ion_mode: P +Collision_energy: 40 eV +Formula: C12H14N4O4S +MW: 310 +ExactMass: 310.073575928 +Comments: "accession=AU100904" "author=Nikiforos Alygizakis, Anna Bletsou, Nikolaos Thomaidis, University of Athens" "license=CC BY-SA" "copyright=Copyright (C) 2015 Department of Chemistry, University of Athens" "exact mass=310.0735759" "instrument=Bruker maXis Impact" "instrument type=LC-ESI-QTOF" "ms level=MS2" "ionization=ESI" "fragmentation 
mode=CID" "collision energy=40 eV" "resolution=35000" "column=Acclaim RSLC C18 2.2um, 2.1x100mm, Thermo" "flow gradient=99/1 at 0-1 min, 61/39 at 3 min, 0.1/99.9 at 14-16 min, 99/1 at 16.1-20 min" "flow rate=200 uL/min at 0-3 min, 400 uL/min at 14 min, 480 uL/min at 16-19 min, 200 uL/min at 19.1-20 min" "retention time=4.7 min" "solvent a=water with 0.01% formic acid and 5mM ammonium formate" "solvent b=90:10 methanol:water with 0.01% formic acid and 5mM ammonium formate" "precursor m/z=311.0809" "precursor type=[M+H]+" "ionization mode=positive" "mass accuracy=0.15453214893628664" "mass error=4.8071999970034085E-5" "SMILES=COc1cc(nc(n1)OC)NS(=O)(=O)c2ccc(cc2)N" "cas=122-11-2" "chebi=32161" "pubchem=5323" "chemspider=5132" "InChI=InChI=1S/C12H14N4O4S/c1-19-11-7-10(14-12(15-11)20-2)16-21(17,18)9-5-3-8(13)4-6-9/h3-7H,13H2,1-2H3,(H,14,15,16)" "InChIKey=ZZORFUFYDOWNEF-UHFFFAOYSA-N" "molecular formula=C12H14N4O4S" "total exact mass=310.073575928" "SMILES=COC=1C=C(N=C(N1)OC)NS(C2=CC=C(C=C2)N)(=O)=O" +Num Peaks: 27 +112.0515 2.118270 +123.0436 2.184466 +124.0205 1.897617 +124.0508 2.162401 +126.0666 5.803177 +127.0502 2.030009 +132.0558 1.963813 +138.0295 4.898500 +140.045 77.780229 +141.0524 38.989409 +142.058 2.449250 +154.0604 100.000000 +155.0634 5.383936 +156.0104 20.101500 +156.0407 4.236540 +156.0761 54.744042 +157.0639 1.809356 +160.049 1.985878 +178.0597 3.420124 +201.077 8.274492 +202.0789 1.787290 +212.0697 15.114740 +213.0728 2.581642 +229.0713 2.206531 +230.0797 6.421006 +231.0852 1.919682 +245.1026 1.919682 + + +Name: Sulfadimethoxine +Synon: 4-amino-n-(2,6-dimethoxypyrimidin-4-yl)benzenesulfonamide +SYNON: $:00in-source +DB#: AU100905 +InChIKey: ZZORFUFYDOWNEF-UHFFFAOYSA-N +Precursor_type: [M+H]+ +Spectrum_type: MS2 +PrecursorMZ: 311.0809 +Instrument_type: LC-ESI-QTOF +Instrument: Bruker maXis Impact +Ion_mode: P +Collision_energy: 50 eV +Formula: C12H14N4O4S +MW: 310 +ExactMass: 310.07357592799997 +Comments: "accession=AU100905" "author=Nikiforos 
Alygizakis, Anna Bletsou, Nikolaos Thomaidis, University of Athens" "license=CC BY-SA" "copyright=Copyright (C) 2015 Department of Chemistry, University of Athens" "exact mass=310.0735759" "instrument=Bruker maXis Impact" "instrument type=LC-ESI-QTOF" "ms level=MS2" "ionization=ESI" "fragmentation mode=CID" "collision energy=50 eV" "resolution=35000" "column=Acclaim RSLC C18 2.2um, 2.1x100mm, Thermo" "flow gradient=99/1 at 0-1 min, 61/39 at 3 min, 0.1/99.9 at 14-16 min, 99/1 at 16.1-20 min" "flow rate=200 uL/min at 0-3 min, 400 uL/min at 14 min, 480 uL/min at 16-19 min, 200 uL/min at 19.1-20 min" "retention time=4.7 min" "solvent a=water with 0.01% formic acid and 5mM ammonium formate" "solvent b=90:10 methanol:water with 0.01% formic acid and 5mM ammonium formate" "precursor m/z=311.0809" "precursor type=[M+H]+" "ionization mode=positive" "mass accuracy=0.15453214911901536" "mass error=4.8072000026877504E-5" "SMILES=COc1cc(nc(n1)OC)NS(=O)(=O)c2ccc(cc2)N" "cas=122-11-2" "chebi=32161" "pubchem=5323" "chemspider=5132" "InChI=InChI=1S/C12H14N4O4S/c1-19-11-7-10(14-12(15-11)20-2)16-21(17,18)9-5-3-8(13)4-6-9/h3-7H,13H2,1-2H3,(H,14,15,16)" "InChIKey=ZZORFUFYDOWNEF-UHFFFAOYSA-N" "molecular formula=C12H14N4O4S" "total exact mass=310.07357592799997" "SMILES=COC=1C=C(N=C(N1)OC)NS(C2=CC=C(C=C2)N)(=O)=O" +Num Peaks: 22 +112.051 5.243790 +123.0427 7.773689 +124.0502 6.439742 +126.0287 5.841766 +126.0666 6.255750 +127.0491 3.955842 +132.0559 9.521619 +133.0628 5.105796 +138.0293 10.579577 +140.045 45.768169 +141.0521 46.780129 +142.0539 3.817847 +154.0606 100.000000 +156.0102 3.679853 +156.0405 5.243790 +156.0769 17.157314 +157.0629 5.887764 +160.0507 3.909844 +178.0613 9.429623 +184.0741 4.737810 +201.0768 9.015639 +212.0705 7.589696 + + +Name: Sulfadoxine +Synon: 4-amino-N-(5,6-dimethoxypyrimidin-4-yl)benzenesulfonamide +SYNON: $:00in-source +DB#: AU101001 +InChIKey: PJSFRIWCGOHTNF-UHFFFAOYSA-N +Precursor_type: [M+H]+ +Spectrum_type: MS2 +PrecursorMZ: 311.0809 +Instrument_type: 
LC-ESI-QTOF +Instrument: Bruker maXis Impact +Ion_mode: P +Collision_energy: Ramp 21.8-32.7 eV +Formula: C12H14N4O4S +MW: 310 +ExactMass: 310.07357592799997 +Comments: "accession=AU101001" "author=Nikiforos Alygizakis, Anna Bletsou, Nikolaos Thomaidis, University of Athens" "license=CC BY" "copyright=Copyright (C) 2015 Department of Chemistry, University of Athens" "exact mass=310.0736" "instrument=Bruker maXis Impact" "instrument type=LC-ESI-QTOF" "ms level=MS2" "ionization=ESI" "fragmentation mode=CID" "collision energy=Ramp 21.8-32.7 eV" "resolution=35000" "column=Acclaim RSLC C18 2.2um, 2.1x100mm, Thermo" "flow gradient=99/1 at 0-1 min, 61/39 at 3 min, 0.1/99.9 at 14-16 min, 99/1 at 16.1-20 min" "flow rate=200 uL/min at 0-3 min, 400 uL/min at 14 min, 480 uL/min at 16-19 min, 200 uL/min at 19.1-20 min" "retention time=4.8 min" "solvent a=water with 0.01% formic acid and 5mM ammonium formate" "solvent b=90:10 methanol:water with 0.01% formic acid and 5mM ammonium formate" "precursor m/z=311.0809" "precursor type=[M+H]+" "ionization mode=positive" "mass accuracy=0.15453214911901536" "mass error=4.8072000026877504E-5" "SMILES=COc1c(ncnc1OC)NS(=O)(=O)c2ccc(cc2)N" "cas=2447-57-6" "kegg=C07630" "pubchem cid=17134" "chemspider=16218" "InChI=InChI=1S/C12H14N4O4S/c1-19-10-11(14-7-15-12(10)20-2)16-21(17,18)9-5-3-8(13)4-6-9/h3-7H,13H2,1-2H3,(H,14,15,16)" "InChIKey=PJSFRIWCGOHTNF-UHFFFAOYSA-N" "molecular formula=C12H14N4O4S" "total exact mass=310.07357592799997" "SMILES=COC1=C(N=CN=C1OC)NS(C2=CC=C(C=C2)N)(=O)=O" +Num Peaks: 42 +53.0386 0.535490 +54.0339 0.505437 +65.0381 7.755041 +68.0491 10.088247 +69.0329 1.049123 +78.0332 1.038195 +79.0179 1.721217 +80.0363 0.707612 +80.0493 1.446642 +92.0498 46.272062 +93.0559 3.808535 +96.0447 1.331894 +108.0463 57.395771 +109.049 3.816731 +109.0643 0.531392 +110.0614 7.111633 +113.0359 0.703513 +120.0568 1.860554 +124.0215 1.529971 +124.0512 0.572373 +126.0665 2.939730 +138.0301 0.707612 +140.0457 34.351948 +141.0528 5.744222 
+154.0615 32.562428 +155.0682 7.798754 +156.0118 100.000000 +156.0771 40.377575 +157.0147 7.961314 +157.0796 2.486203 +158.0078 3.766188 +201.0773 1.349653 +212.0697 2.576362 +213.0752 0.527294 +218.0236 1.945249 +230.0808 5.531119 +231.085 0.811431 +245.1045 18.128791 +246.1073 2.479373 +311.0829 49.986340 +312.0854 8.491339 +313.0812 2.222556 + + +Name: Sulfadiazine +Synon: 4-amino-n-pyrimidin-2-ylbenzenesulfonamide +SYNON: $:00in-source +DB#: AU101101 +InChIKey: SEEPANYCNGTZFQ-UHFFFAOYSA-N +Precursor_type: [M+H]+ +Spectrum_type: MS2 +PrecursorMZ: 251.0597 +Instrument_type: LC-ESI-QTOF +Instrument: Bruker maXis Impact +Ion_mode: P +Collision_energy: 10 eV +Formula: C10H10N4O2S +MW: 250 +ExactMass: 250.05244656 +Comments: "accession=AU101101" "author=Nikiforos Alygizakis, Anna Bletsou, Nikolaos Thomaidis, University of Athens" "license=CC BY-SA" "copyright=Copyright (C) 2015 Department of Chemistry, University of Athens" "exact mass=250.0524466" "instrument=Bruker maXis Impact" "instrument type=LC-ESI-QTOF" "ms level=MS2" "ionization=ESI" "fragmentation mode=CID" "collision energy=10 eV" "resolution=35000" "column=Acclaim RSLC C18 2.2um, 2.1x100mm, Thermo" "flow gradient=99/1 at 0-1 min, 61/39 at 3 min, 0.1/99.9 at 14-16 min, 99/1 at 16.1-20 min" "flow rate=200 uL/min at 0-3 min, 400 uL/min at 14 min, 480 uL/min at 16-19 min, 200 uL/min at 19.1-20 min" "retention time=3.3 min" "solvent a=water with 0.01% formic acid and 5mM ammonium formate" "solvent b=90:10 methanol:water with 0.01% formic acid and 5mM ammonium formate" "precursor m/z=251.0597" "precursor type=[M+H]+" "ionization mode=positive" "mass accuracy=0.08985910518808851" "mass error=-2.2559999990789947E-5" "SMILES=c1cnc(nc1)NS(=O)(=O)c2ccc(cc2)N" "cas=141582-64-1" "chebi=9328" "kegg=C07658" "pubchem=5215" "chemspider=5026" "InChI=InChI=1S/C10H10N4O2S/c11-8-2-4-9(5-3-8)17(15,16)14-10-12-6-1-7-13-10/h1-7H,11H2,(H,12,13,14)" "InChIKey=SEEPANYCNGTZFQ-UHFFFAOYSA-N" "molecular formula=C10H10N4O2S" "total exact 
mass=250.05244656" "SMILES=C1=CN=C(N=C1)NS(C2=CC=C(C=C2)N)(=O)=O" +Num Peaks: 6 +156.0106 9.361897 +174.0199 0.724251 +176.012 0.693756 +251.0596 100.000000 +252.0616 7.867653 +253.0565 2.729283 \ No newline at end of file diff --git a/tests/test_databases.py b/tests/test_databases.py index daae038..20c7ff7 100644 --- a/tests/test_databases.py +++ b/tests/test_databases.py @@ -25,6 +25,7 @@ import shutil import pickle from metaboblend.databases import * +from metaboblend.parse import reformat_xml class DatabasesTestCase(unittest.TestCase): diff --git a/tests/test_isomorphism_database.py b/tests/test_isomorphism_database.py index 96ff705..6ad11a4 100644 --- a/tests/test_isomorphism_database.py +++ b/tests/test_isomorphism_database.py @@ -21,6 +21,7 @@ import os +import sys import unittest import shutil import tempfile @@ -45,46 +46,50 @@ def setUpClass(cls): shutil.copytree(os.path.join(os.path.dirname(os.path.realpath(__file__)), "test_data"), cls.to_test_results("test_data")) + def test_create_connectivity_database(self): + pkg_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + if sys.platform == "win32" or sys.platform == "win64": # TODO: add RI as dependency - cls.path_ri = os.path.join(pkg_path, "tools", "RI_win", "RI3.6-release", "ri36") + self.path_ri = os.path.join(pkg_path, "tools", "RI_win", "RI3.6-release", "ri36") - elif sys.platform == "darwin": - cls.path_ri = os.path.join(pkg_path, "tools", "RI_mac", "RI3.6-release", "ri36") + else: - elif sys.platform == "linux2": - if "bb" in "socket.gethostname": - cls.path_ri = os.path.join(pkg_path, "tools", "RI_unix", "RI3.6-release", "ri36") - else: - cls.path_ri = os.path.join(pkg_path, "tools", "RI_bb", "RI3.6-release", "ri36") + if sys.platform == "darwin": + self.path_ri = os.path.join(pkg_path, "tools", "RI_mac", "RI3.6-release", "ri36") - elif sys.platform == "linux": - cls.path_ri = os.path.join(pkg_path, "tools", "RI_unix", "RI3.6-release", "ri36") + elif sys.platform == "linux2": + 
if "bb" in "socket.gethostname": + self.path_ri = os.path.join(pkg_path, "tools", "RI_unix", "RI3.6-release", "ri36") + else: + self.path_ri = os.path.join(pkg_path, "tools", "RI_bb", "RI3.6-release", "ri36") - create_connectivity_database(cls.to_test_results("connectivity.sqlite"), - 3, # sizes - [1, 2], # boxes - cls.path_ri - ) + elif sys.platform == "linux": + self.path_ri = os.path.join(pkg_path, "tools", "RI_unix", "RI3.6-release", "ri36") - def test_create_connectivity_database(self): - ref_db = sqlite3.connect(self.to_test_data("connectivity.sqlite")) - ref_db_cursor = ref_db.cursor() - ref_db_cursor.execute("SELECT * FROM subgraphs") + create_connectivity_database(self.to_test_results("connectivity.sqlite"), + 3, # sizes + [1, 2], # boxes + self.path_ri + ) + + ref_db = sqlite3.connect(self.to_test_data("connectivity.sqlite")) + ref_db_cursor = ref_db.cursor() + ref_db_cursor.execute("SELECT * FROM subgraphs") - test_db = sqlite3.connect(self.to_test_results("connectivity.sqlite")) - test_db_cursor = test_db.cursor() - test_db_cursor.execute("SELECT * FROM subgraphs") + test_db = sqlite3.connect(self.to_test_results("connectivity.sqlite")) + test_db_cursor = test_db.cursor() + test_db_cursor.execute("SELECT * FROM subgraphs") - ref_rows = {} - for row in ref_db_cursor.fetchall(): - ref_rows[row[0]] = row + test_rows = {} + for row in test_db_cursor.fetchall(): + test_rows[row[0]] = row - for row in test_db_cursor.fetchall(): - self.assertEqual(row, ref_rows[row[0]]) + for row in ref_db_cursor.fetchall(): + self.assertEqual(row, test_rows[row[0]]) - ref_db.close() - test_db.close() + ref_db.close() + test_db.close() if __name__ == '__main__': diff --git a/tests/test_parse.py b/tests/test_parse.py new file mode 100644 index 0000000..513fb68 --- /dev/null +++ b/tests/test_parse.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright © 2019-2020 Ralf Weber +# +# This file is part of MetaboBlend. 
+# +# MetaboBlend is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# MetaboBlend is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with MetaboBlend. If not, see . +# + + +import os +import copy +import shutil +import tempfile +import unittest +from metaboblend.parse import * + + +class IsomorphDbTestCase(unittest.TestCase): + temp_results_dir = None + + @classmethod + def to_test_results(cls, *args): + return os.path.join(os.path.dirname(os.path.realpath(__file__)), cls.temp_results_dir.name, *args) + + @classmethod + def to_test_data(cls, *args): + return os.path.join(os.path.dirname(os.path.realpath(__file__)), cls.temp_results_dir.name, "test_data", *args) + + @classmethod + def setUpClass(cls): + cls.temp_results_dir = tempfile.TemporaryDirectory(dir=os.path.dirname(os.path.realpath(__file__))) + + shutil.copytree(os.path.join(os.path.dirname(os.path.realpath(__file__)), "test_data"), + cls.to_test_results("test_data")) + + cls.neutral_fragment_masses = [155.00332400000002, 173.01262400000002, 175.004724, + 250.052324, 251.054324, 252.049224] + cls.exact_mass = 250.052424 + cls.mf = [10, 10, 4, 2, 0, 1] + cls.precursor_mz = 251.0597 + cls.fragment_mzs = [156.0106, 174.0199, 176.012, 251.0596, 252.0616, 253.0565] + + def test_parse_msp(self): + for i, ms in enumerate(parse_msp(self.to_test_data("mona_msp.msp"))): + + if i < 2: + self.assertEqual(ms, None) + else: + self.assertNotEqual(ms, None) + + self.assertEqual(ms, {"ms_id": "AU101101", "mf": self.mf, "precursor_mz": self.precursor_mz, + "fragment_mzs": 
self.fragment_mzs, "precursor_type": "[M+H]+", + "exact_mass": self.exact_mass, "neutral_fragment_masses": self.neutral_fragment_masses}) + + self.assertEqual(list(parse_msp(self.to_test_data("massbank_msp.txt")))[0], None) + + # ensure that parse_msp provides same output as parse_ms_data when providing an msn file + for parse_msp_dict, parse_ms_dict in zip(parse_msp(self.to_test_data("mona_msp.msp")), + parse_ms_data(self.to_test_data("mona_msp.msp"))): + + self.assertEqual(parse_msp_dict, parse_ms_dict) + + def test_parse_ms_data(self): + + # exact mass and neutral fragment masses should not be overwritten by parse_ms_data + full_ms_dict = {"ms_id": "AU101101", "mf": self.mf, "precursor_mz": self.precursor_mz, + "fragment_mzs": self.fragment_mzs, "precursor_type": "[M+H]+", "exact_mass": "abcd", + "neutral_fragment_masses": ["a", "b", "c", "d"]} + + self.assertEqual(list(parse_ms_data({"AU101101": copy.deepcopy(full_ms_dict)}))[0], full_ms_dict) + + # if exact mass is present should not be overwritten by parse_ms_data + exact_mass_ms_dict = {"ms_id": "AU101101", "mf": self.mf, "precursor_mz": self.precursor_mz, + "fragment_mzs": self.fragment_mzs, "precursor_type": "[M+H]+", "exact_mass": "abc"} + + parsed_exact_mass_ms_dict = list(parse_ms_data({"test": copy.deepcopy(exact_mass_ms_dict)}))[0] + exact_mass_ms_dict["ms_id"] = "test" + exact_mass_ms_dict["neutral_fragment_masses"] = self.neutral_fragment_masses + self.assertEqual(parsed_exact_mass_ms_dict, exact_mass_ms_dict) + + # neutral fragment masses should not be overwritten by parse_ms_data + neutral_fragment_masses_ms_dict = {"ms_id": "AU101101", "mf": self.mf, "precursor_mz": self.precursor_mz, + "precursor_type": "[M+H]+", "fragment_mzs": self.fragment_mzs, + "neutral_fragment_masses": ["a", "b", "c", "d"]} + + parsed_neutral_fragment_masses_ms_dict = list(parse_ms_data({"AU101101": copy.deepcopy(neutral_fragment_masses_ms_dict)}))[0] + neutral_fragment_masses_ms_dict["exact_mass"] = self.exact_mass + 
self.assertEqual(parsed_neutral_fragment_masses_ms_dict, neutral_fragment_masses_ms_dict) + + uncalculated_ms_dict = {"ms_id": "AU101101", "mf": self.mf, "precursor_mz": self.precursor_mz, + "fragment_mzs": self.fragment_mzs, "precursor_type": "[M+H]+"} + parsed_uncalculated_ms_dict = list(parse_ms_data({"AU101101": copy.deepcopy(uncalculated_ms_dict)}))[0] + uncalculated_ms_dict["exact_mass"] = self.exact_mass + uncalculated_ms_dict["neutral_fragment_masses"] = self.neutral_fragment_masses + self.assertEqual(parsed_uncalculated_ms_dict, uncalculated_ms_dict) + + # test with msn=False + generate_structures_dict = {"ms_id": "AU101101", "mf": self.mf, "precursor_mz": self.precursor_mz, + "prescribed_mass": "m", "precursor_type": "[M+H]+"} + parsed_generate_structures_dict = list(parse_ms_data({"AU101101": copy.deepcopy(generate_structures_dict)}, False))[0] + generate_structures_dict["exact_mass"] = self.exact_mass + self.assertEqual(parsed_generate_structures_dict, generate_structures_dict) + + # test with exact mass provided + generate_structures_dict["exact_mass"] = "a" + parsed_generate_structures_dict = list(parse_ms_data({"AU101101": copy.deepcopy(generate_structures_dict)}, False))[0] + self.assertEqual(parsed_generate_structures_dict, generate_structures_dict) + + def test_precursor_ions_to_neutral_masses(self): + + ms_dict = {"ms_id": "AU101101", "mf": self.mf, "precursor_mz": self.precursor_mz, + "fragment_mzs": self.fragment_mzs, "precursor_type": "[M+H]+"} + + for which in ["both", "fragments", "precursor", "none"]: + processed_ms_dict = precursor_ions_to_neutral_masses(copy.deepcopy(ms_dict), which) + + if which in ["both", "fragments"]: + self.assertEqual(processed_ms_dict["neutral_fragment_masses"], self.neutral_fragment_masses) + + if which in ["both", "precursor"]: + self.assertEqual(processed_ms_dict["exact_mass"], self.exact_mass) + + ms_dict["precursor_type"] = "[M-H]-" + + for which in ["both", "fragments", "precursor", "none"]: + 
processed_ms_dict = precursor_ions_to_neutral_masses(copy.deepcopy(ms_dict), which) + + if which in ["both", "fragments"]: + neutral_fragment_masses = [nfm + 1.007276 for nfm in self.fragment_mzs] + self.assertEqual(processed_ms_dict["neutral_fragment_masses"], neutral_fragment_masses) + + if which in ["both", "precursor"]: + self.assertEqual(processed_ms_dict["exact_mass"], self.precursor_mz + 1.007276) + + def test_reformat_msp_input(self): + + unformatted_msp_dict = {'ms_id': 'AU101101', 'mf': 'C10H10N4O2S', 'precursor_mz': '251.0597', + 'fragment_mzs': self.fragment_mzs, + 'precursor_type': '[M+H]+'} + + formatted_msp_dict = {'ms_id': 'AU101101', 'mf': self.mf, 'precursor_mz': self.precursor_mz, + 'fragment_mzs': self.fragment_mzs, 'precursor_type': '[M+H]+', + 'exact_mass': self.exact_mass, + 'neutral_fragment_masses': self.neutral_fragment_masses} + + self.assertEqual(reformat_msp_input(unformatted_msp_dict), formatted_msp_dict) + + unformatted_msp_dict["precursor_mz"] = None + self.assertWarns(UserWarning, reformat_msp_input(unformatted_msp_dict)) + + unformatted_msp_dict["precursor_mz"] = self.precursor_mz + unformatted_msp_dict["fragment_mzs"] = [] + self.assertWarns(UserWarning, reformat_msp_input(unformatted_msp_dict)) + + def test_mc_to_list(self): + + mc_lists = [[12, 14, 4, 4, 0, 1], [10, 10, 4, 2, 0, 1], [46, 94, 1, 8, 1, 0], [46, 94, 1, 8, 1, 0], None] + + for i, word_formula in enumerate(["C12H14N4O4S", "C10H10N4O2S", "C46H94NO8P", "C46H94NO8P1", "C10H9ClN4O2S"]): + self.assertEqual(mc_to_list(word_formula), mc_lists[i]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_suite_auxiliary.py b/tests/test_suite_auxiliary.py index c44171d..cc211e2 100644 --- a/tests/test_suite_auxiliary.py +++ b/tests/test_suite_auxiliary.py @@ -27,6 +27,7 @@ from pathlib import Path from . import test_auxiliary +from . 
import test_parse sys.path.insert(0, str(Path(__file__).parent.parent.resolve())) @@ -35,6 +36,7 @@ suite = unittest.TestSuite() suite.addTest(unittest.findTestCases(test_auxiliary)) + suite.addTest(unittest.findTestCases(test_parse)) report = os.path.join(os.path.abspath(os.path.join(__file__, os.pardir)), 'results', 'results_test_suite_auxiliary') runTestSuite(suite, report, title='Process Test Suite Report', verbosity=2)