From 10b9ac8c7de9e7f2c04d6a25777209abb350a0cc Mon Sep 17 00:00:00 2001 From: Nicola Soranzo Date: Fri, 29 Sep 2017 20:37:15 +0100 Subject: [PATCH 1/2] Stricter checks for SDF sniffing I had a bunch of protein FASTA files being sniffed as 'sdf'. Also fix https://github.com/galaxyproject/galaxy/issues/4558 . --- lib/galaxy/datatypes/molecules.py | 58 ++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 20 deletions(-) diff --git a/lib/galaxy/datatypes/molecules.py b/lib/galaxy/datatypes/molecules.py index bc65b2a7c07f..62c7e154efc3 100644 --- a/lib/galaxy/datatypes/molecules.py +++ b/lib/galaxy/datatypes/molecules.py @@ -92,6 +92,10 @@ def sniff(self, filename): """ Try to guess if the file is a SDF2 file. + An SDF file can contain multiple molecules. + Each molecule must contain a line equal to 'M END' followed later on by + a final line equal to '$$$$'. + >>> from galaxy.datatypes.sniff import get_test_fname >>> fname = get_test_fname('drugbank_drugs.sdf') >>> SDF().sniff(fname) @@ -101,17 +105,27 @@ def sniff(self, filename): >>> SDF().sniff(fname) False """ - counter = count_special_lines("^M\s*END", filename) + count_special_lines("^\$\$\$\$", filename) - if counter > 0 and counter % 2 == 0: - return True - else: - return False + m_end_found = False + limit = 500 + idx = 0 + with open(filename) as in_file: + for line in in_file: + line = line.rstrip('\n\r') + if not m_end_found: + if line == 'M END': + m_end_found = True + elif line == '$$$$': + return True + idx += 1 + if idx == limit: + break + return False def set_meta(self, dataset, **kwd): """ Set the number of molecules in dataset. """ - dataset.metadata.number_of_molecules = count_special_lines("^\$\$\$\$", dataset.file_name) + dataset.metadata.number_of_molecules = count_special_lines("^\$\$\$\$$", dataset.file_name) def split(cls, input_datasets, subdir_generator_function, split_params): """ @@ -180,10 +194,17 @@ def sniff(self, filename): >>> MOL2().sniff(fname) False """ - if count_special_lines("@MOLECULE", filename) > 0: - return True - else: - return False + limit = 60 + idx = 0 + with open(filename) as in_file: + for line in in_file: + line = line.rstrip('\n\r') + if line == '@MOLECULE': + return True + idx += 1 + if idx == limit: + break + return False def set_meta(self, dataset, **kwd): """ @@ -716,16 +737,13 @@ def sniff(self, filename): >>> CML().sniff(fname) True """ - handle = open(filename) - line = handle.readline() - if line.strip() != '': - handle.close() - return False - line = handle.readline() - if line.strip().find('http://www.xml-cml.org/schema') == -1: - handle.close() - return False - handle.close() + with open(filename) as handle: + line = handle.readline() + if line.strip() != '': + return False + line = handle.readline() + if line.strip().find('http://www.xml-cml.org/schema') == -1: + return False return True def split(cls, input_datasets, subdir_generator_function, split_params): From e3307c5be96d479dc7731fbfefefbb1bc6854fa5 Mon Sep 17 00:00:00 2001 From: Nicola Soranzo Date: Sun, 1 Oct 2017 01:30:44 +0100 Subject: [PATCH 2/2] More precise SDF sniffer First check 4th line: if this is conforming, scan up to 10000 lines. Add 2 test files downloaded from: https://github.com/rdkit/rdkit/tree/master/Code/GraphMol/FileParsers/test_data --- lib/galaxy/datatypes/molecules.py | 29 +++++++--- lib/galaxy/datatypes/test/chebi_57262.v3k.mol | 55 +++++++++++++++++++ lib/galaxy/datatypes/test/github88.v3k.sdf | 32 +++++++++++ 3 files changed, 108 insertions(+), 8 deletions(-) create mode 100644 lib/galaxy/datatypes/test/chebi_57262.v3k.mol create mode 100644 lib/galaxy/datatypes/test/github88.v3k.sdf diff --git a/lib/galaxy/datatypes/molecules.py b/lib/galaxy/datatypes/molecules.py index 62c7e154efc3..21cc3b7e7c63 100644 --- a/lib/galaxy/datatypes/molecules.py +++ b/lib/galaxy/datatypes/molecules.py @@ -55,7 +55,7 @@ def count_lines(filename, non_empty=False): class GenericMolFile(data.Text): """ - abstract class for most of the molecule files + Abstract class for most of the molecule files. """ MetadataElement(name="number_of_molecules", default=0, desc="Number of molecules", readonly=True, visible=True, optional=True, no_value=0) @@ -92,31 +92,44 @@ def sniff(self, filename): """ Try to guess if the file is a SDF2 file. - An SDF file can contain multiple molecules. - Each molecule must contain a line equal to 'M END' followed later on by - a final line equal to '$$$$'. + An SDfile (structure-data file) can contain multiple compounds. + + Each compound starts with a block in V2000 or V3000 molfile format, + which ends with a line equal to 'M END'. + This is followed by a non-structural data block, which ends with a line + equal to '$$$$'. >>> from galaxy.datatypes.sniff import get_test_fname >>> fname = get_test_fname('drugbank_drugs.sdf') >>> SDF().sniff(fname) True - >>> fname = get_test_fname('drugbank_drugs.cml') + >>> fname = get_test_fname('github88.v3k.sdf') + >>> SDF().sniff(fname) + True + + >>> fname = get_test_fname('chebi_57262.v3k.mol') >>> SDF().sniff(fname) False """ m_end_found = False - limit = 500 + limit = 10000 idx = 0 with open(filename) as in_file: for line in in_file: + idx += 1 line = line.rstrip('\n\r') - if not m_end_found: + if idx < 4: + continue + elif idx == 4: + if len(line) != 39 or not(line.endswith(' V2000') or + line.endswith(' V3000')): + return False + elif not m_end_found: if line == 'M END': m_end_found = True elif line == '$$$$': return True - idx += 1 if idx == limit: break return False diff --git a/lib/galaxy/datatypes/test/chebi_57262.v3k.mol b/lib/galaxy/datatypes/test/chebi_57262.v3k.mol new file mode 100644 index 000000000000..210b6c49b387 --- /dev/null +++ b/lib/galaxy/datatypes/test/chebi_57262.v3k.mol @@ -0,0 +1,55 @@ +CHEBI:57262 + RDKit 3D + + 0 0 0 0 0 0 0 0 0 0999 V3000 +M V30 BEGIN CTAB +M V30 COUNTS 22 21 0 0 0 +M V30 BEGIN ATOM +M V30 1 P 18.2567 -5.6825 0 0 +M V30 2 O 18.0395 -6.6275 0 0 CHG=-1 +M V30 3 O 18.255 -4.7415 0 0 +M V30 4 O 18.9453 -6.0633 0 0 +M V30 5 O 17.4245 -5.9341 0 0 +M V30 6 C 15.2886 -5.4953 0 0 +M V30 7 C 15.9989 -5.9147 0 0 +M V30 8 C 16.7174 -5.5091 0 0 +M V30 9 O 14.5702 -5.9009 0 0 +M V30 10 C 13.8556 -5.4886 0 0 +M V30 11 O 13.8553 -4.6636 0 0 +M V30 12 O 15.9909 -6.7396 0 0 +M V30 13 O 21.0888 -4.8258 0 0 +M V30 14 C 21.0888 -5.6508 0 0 +M V30 15 C 20.3743 -6.0633 0 0 +M V30 16 N 20.3743 -6.8883 0 0 CHG=1 +M V30 17 C 19.6598 -5.6508 0 0 +M V30 18 O 21.8033 -6.0633 0 0 CHG=-1 +M V30 19 R 13.1411 -5.9011 0 0 MASS=1 +M V30 20 C 15.2764 -7.1521 0 0 +M V30 21 O 15.2764 -7.9771 0 0 +M V30 22 R 14.562 -6.7396 0 0 MASS=2 +M V30 END ATOM +M V30 BEGIN BOND +M V30 1 1 1 2 +M V30 2 2 1 3 +M V30 3 1 1 4 +M V30 4 1 6 9 +M V30 5 1 7 6 CFG=3 +M V30 6 1 8 7 +M V30 7 1 5 8 +M V30 8 1 10 9 +M V30 9 2 10 11 +M V30 10 1 7 12 +M V30 11 1 1 5 +M V30 12 1 15 14 CFG=1 +M V30 13 2 14 13 +M V30 14 1 15 16 +M V30 15 1 15 17 +M V30 16 1 17 4 +M V30 17 1 18 14 +M V30 18 1 10 19 +M V30 19 1 12 20 +M V30 20 2 20 21 +M V30 21 1 20 22 +M V30 END BOND +M V30 END CTAB +M END diff --git a/lib/galaxy/datatypes/test/github88.v3k.sdf b/lib/galaxy/datatypes/test/github88.v3k.sdf new file mode 100644 index 000000000000..5005395628ff --- /dev/null +++ b/lib/galaxy/datatypes/test/github88.v3k.sdf @@ -0,0 +1,32 @@ + + Marvin 06030906502D + + 0 0 0 0 0 999 V3000 +M V30 BEGIN CTAB +M V30 COUNTS 8 8 0 0 0 +M V30 BEGIN ATOM +M V30 1 C 0.7476 -1.4581 0 0 +M V30 2 C -0.7885 -1.4581 0 0 +M V30 3 C 1.6627 -2.6967 0 0 +M V30 4 O 1.6627 -0.2009 0 0 +M V30 5 O -1.5659 -0.1115 0 0 +M V30 6 O -1.5659 -2.786 0 0 +M V30 7 C 3.1431 -2.2244 0 0 +M V30 8 C 3.1431 -0.6844 0 0 +M V30 END ATOM +M V30 BEGIN BOND +M V30 1 1 1 2 +M V30 2 1 1 3 +M V30 3 1 1 4 +M V30 4 1 2 5 +M V30 5 2 2 6 +M V30 6 1 3 7 +M V30 7 1 4 8 +M V30 8 1 7 8 +M V30 END BOND +M V30 END CTAB +M END +> +4 + +$$$$