diff --git a/lib/galaxy/datatypes/molecules.py b/lib/galaxy/datatypes/molecules.py index bc65b2a7c07f..21cc3b7e7c63 100644 --- a/lib/galaxy/datatypes/molecules.py +++ b/lib/galaxy/datatypes/molecules.py @@ -55,7 +55,7 @@ def count_lines(filename, non_empty=False): class GenericMolFile(data.Text): """ - abstract class for most of the molecule files + Abstract class for most of the molecule files. """ MetadataElement(name="number_of_molecules", default=0, desc="Number of molecules", readonly=True, visible=True, optional=True, no_value=0) @@ -92,26 +92,53 @@ def sniff(self, filename): """ Try to guess if the file is a SDF2 file. + An SDfile (structure-data file) can contain multiple compounds. + + Each compound starts with a block in V2000 or V3000 molfile format, + which ends with a line equal to 'M END'. + This is followed by a non-structural data block, which ends with a line + equal to '$$$$'. + >>> from galaxy.datatypes.sniff import get_test_fname >>> fname = get_test_fname('drugbank_drugs.sdf') >>> SDF().sniff(fname) True - >>> fname = get_test_fname('drugbank_drugs.cml') + >>> fname = get_test_fname('github88.v3k.sdf') + >>> SDF().sniff(fname) + True + + >>> fname = get_test_fname('chebi_57262.v3k.mol') >>> SDF().sniff(fname) False """ - counter = count_special_lines("^M\s*END", filename) + count_special_lines("^\$\$\$\$", filename) - if counter > 0 and counter % 2 == 0: - return True - else: - return False + m_end_found = False + limit = 10000 + idx = 0 + with open(filename) as in_file: + for line in in_file: + idx += 1 + line = line.rstrip('\n\r') + if idx < 4: + continue + elif idx == 4: + if len(line) != 39 or not(line.endswith(' V2000') or + line.endswith(' V3000')): + return False + elif not m_end_found: + if line == 'M END': + m_end_found = True + elif line == '$$$$': + return True + if idx == limit: + break + return False def set_meta(self, dataset, **kwd): """ Set the number of molecules in dataset. """ - dataset.metadata.number_of_molecules = count_special_lines("^\$\$\$\$", dataset.file_name) + dataset.metadata.number_of_molecules = count_special_lines("^\$\$\$\$$", dataset.file_name) def split(cls, input_datasets, subdir_generator_function, split_params): """ @@ -180,10 +207,17 @@ def sniff(self, filename): >>> MOL2().sniff(fname) False """ - if count_special_lines("@MOLECULE", filename) > 0: - return True - else: - return False + limit = 60 + idx = 0 + with open(filename) as in_file: + for line in in_file: + line = line.rstrip('\n\r') + if line == '@MOLECULE': + return True + idx += 1 + if idx == limit: + break + return False def set_meta(self, dataset, **kwd): """ @@ -716,16 +750,13 @@ def sniff(self, filename): >>> CML().sniff(fname) True """ - handle = open(filename) - line = handle.readline() - if line.strip() != '': - handle.close() - return False - line = handle.readline() - if line.strip().find('http://www.xml-cml.org/schema') == -1: - handle.close() - return False - handle.close() + with open(filename) as handle: + line = handle.readline() + if line.strip() != '': + return False + line = handle.readline() + if line.strip().find('http://www.xml-cml.org/schema') == -1: + return False return True def split(cls, input_datasets, subdir_generator_function, split_params): diff --git a/lib/galaxy/datatypes/test/chebi_57262.v3k.mol b/lib/galaxy/datatypes/test/chebi_57262.v3k.mol new file mode 100644 index 000000000000..210b6c49b387 --- /dev/null +++ b/lib/galaxy/datatypes/test/chebi_57262.v3k.mol @@ -0,0 +1,55 @@ +CHEBI:57262 + RDKit 3D + + 0 0 0 0 0 0 0 0 0 0999 V3000 +M V30 BEGIN CTAB +M V30 COUNTS 22 21 0 0 0 +M V30 BEGIN ATOM +M V30 1 P 18.2567 -5.6825 0 0 +M V30 2 O 18.0395 -6.6275 0 0 CHG=-1 +M V30 3 O 18.255 -4.7415 0 0 +M V30 4 O 18.9453 -6.0633 0 0 +M V30 5 O 17.4245 -5.9341 0 0 +M V30 6 C 15.2886 -5.4953 0 0 +M V30 7 C 15.9989 -5.9147 0 0 +M V30 8 C 16.7174 -5.5091 0 0 +M V30 9 O 14.5702 -5.9009 0 0 +M V30 10 C 13.8556 -5.4886 0 0 +M V30 11 O 13.8553 -4.6636 0 0 +M V30 12 O 15.9909 -6.7396 0 0 +M V30 13 O 21.0888 -4.8258 0 0 +M V30 14 C 21.0888 -5.6508 0 0 +M V30 15 C 20.3743 -6.0633 0 0 +M V30 16 N 20.3743 -6.8883 0 0 CHG=1 +M V30 17 C 19.6598 -5.6508 0 0 +M V30 18 O 21.8033 -6.0633 0 0 CHG=-1 +M V30 19 R 13.1411 -5.9011 0 0 MASS=1 +M V30 20 C 15.2764 -7.1521 0 0 +M V30 21 O 15.2764 -7.9771 0 0 +M V30 22 R 14.562 -6.7396 0 0 MASS=2 +M V30 END ATOM +M V30 BEGIN BOND +M V30 1 1 1 2 +M V30 2 2 1 3 +M V30 3 1 1 4 +M V30 4 1 6 9 +M V30 5 1 7 6 CFG=3 +M V30 6 1 8 7 +M V30 7 1 5 8 +M V30 8 1 10 9 +M V30 9 2 10 11 +M V30 10 1 7 12 +M V30 11 1 1 5 +M V30 12 1 15 14 CFG=1 +M V30 13 2 14 13 +M V30 14 1 15 16 +M V30 15 1 15 17 +M V30 16 1 17 4 +M V30 17 1 18 14 +M V30 18 1 10 19 +M V30 19 1 12 20 +M V30 20 2 20 21 +M V30 21 1 20 22 +M V30 END BOND +M V30 END CTAB +M END diff --git a/lib/galaxy/datatypes/test/github88.v3k.sdf b/lib/galaxy/datatypes/test/github88.v3k.sdf new file mode 100644 index 000000000000..5005395628ff --- /dev/null +++ b/lib/galaxy/datatypes/test/github88.v3k.sdf @@ -0,0 +1,32 @@ + + Marvin 06030906502D + + 0 0 0 0 0 999 V3000 +M V30 BEGIN CTAB +M V30 COUNTS 8 8 0 0 0 +M V30 BEGIN ATOM +M V30 1 C 0.7476 -1.4581 0 0 +M V30 2 C -0.7885 -1.4581 0 0 +M V30 3 C 1.6627 -2.6967 0 0 +M V30 4 O 1.6627 -0.2009 0 0 +M V30 5 O -1.5659 -0.1115 0 0 +M V30 6 O -1.5659 -2.786 0 0 +M V30 7 C 3.1431 -2.2244 0 0 +M V30 8 C 3.1431 -0.6844 0 0 +M V30 END ATOM +M V30 BEGIN BOND +M V30 1 1 1 2 +M V30 2 1 1 3 +M V30 3 1 1 4 +M V30 4 1 2 5 +M V30 5 2 2 6 +M V30 6 1 3 7 +M V30 7 1 4 8 +M V30 8 1 7 8 +M V30 END BOND +M V30 END CTAB +M END +> +4 + +$$$$