Skip to content

Commit

Permalink
Merge pull request #4729 from nsoranzo/fix_molecule_datatypes
Browse files Browse the repository at this point in the history
Stricter checks for SDF sniffing
  • Loading branch information
jmchilton committed Oct 1, 2017
2 parents 73bb4ba + e3307c5 commit 068c61a
Show file tree
Hide file tree
Showing 3 changed files with 140 additions and 22 deletions.
75 changes: 53 additions & 22 deletions lib/galaxy/datatypes/molecules.py
Expand Up @@ -55,7 +55,7 @@ def count_lines(filename, non_empty=False):

class GenericMolFile(data.Text):
"""
abstract class for most of the molecule files
Abstract class for most of the molecule files.
"""
MetadataElement(name="number_of_molecules", default=0, desc="Number of molecules", readonly=True, visible=True, optional=True, no_value=0)

Expand Down Expand Up @@ -92,26 +92,53 @@ def sniff(self, filename):
"""
Try to guess if the file is a SDF2 file.
An SDfile (structure-data file) can contain multiple compounds.
Each compound starts with a block in V2000 or V3000 molfile format,
which ends with a line equal to 'M END'.
This is followed by a non-structural data block, which ends with a line
equal to '$$$$'.
>>> from galaxy.datatypes.sniff import get_test_fname
>>> fname = get_test_fname('drugbank_drugs.sdf')
>>> SDF().sniff(fname)
True
>>> fname = get_test_fname('drugbank_drugs.cml')
>>> fname = get_test_fname('github88.v3k.sdf')
>>> SDF().sniff(fname)
True
>>> fname = get_test_fname('chebi_57262.v3k.mol')
>>> SDF().sniff(fname)
False
"""
counter = count_special_lines("^M\s*END", filename) + count_special_lines("^\$\$\$\$", filename)
if counter > 0 and counter % 2 == 0:
return True
else:
return False
m_end_found = False
limit = 10000
idx = 0
with open(filename) as in_file:
for line in in_file:
idx += 1
line = line.rstrip('\n\r')
if idx < 4:
continue
elif idx == 4:
if len(line) != 39 or not(line.endswith(' V2000') or
line.endswith(' V3000')):
return False
elif not m_end_found:
if line == 'M END':
m_end_found = True
elif line == '$$$$':
return True
if idx == limit:
break
return False

def set_meta(self, dataset, **kwd):
    """
    Set the number of molecules in dataset.

    Each record in an SD file ends with a line equal to '$$$$', so the
    number of such lines is stored as ``number_of_molecules``.

    :param dataset: dataset whose ``file_name`` is scanned and whose
        ``metadata.number_of_molecules`` is updated.
    :param kwd: accepted for interface compatibility; not used here.
    """
    # Raw string: the backslashes belong to the regex, not to Python string
    # escapes. The trailing '$' anchor restricts the match to lines that
    # contain nothing after the four dollar signs.
    # NOTE(review): assumes count_special_lines applies the pattern to each
    # line of the file - confirm against its definition.
    dataset.metadata.number_of_molecules = count_special_lines(r"^\$\$\$\$$", dataset.file_name)

def split(cls, input_datasets, subdir_generator_function, split_params):
"""
Expand Down Expand Up @@ -180,10 +207,17 @@ def sniff(self, filename):
>>> MOL2().sniff(fname)
False
"""
if count_special_lines("@<TRIPOS>MOLECULE", filename) > 0:
return True
else:
return False
limit = 60
idx = 0
with open(filename) as in_file:
for line in in_file:
line = line.rstrip('\n\r')
if line == '@<TRIPOS>MOLECULE':
return True
idx += 1
if idx == limit:
break
return False

def set_meta(self, dataset, **kwd):
"""
Expand Down Expand Up @@ -716,16 +750,13 @@ def sniff(self, filename):
>>> CML().sniff(fname)
True
"""
handle = open(filename)
line = handle.readline()
if line.strip() != '<?xml version="1.0"?>':
handle.close()
return False
line = handle.readline()
if line.strip().find('http://www.xml-cml.org/schema') == -1:
handle.close()
return False
handle.close()
with open(filename) as handle:
line = handle.readline()
if line.strip() != '<?xml version="1.0"?>':
return False
line = handle.readline()
if line.strip().find('http://www.xml-cml.org/schema') == -1:
return False
return True

def split(cls, input_datasets, subdir_generator_function, split_params):
Expand Down
55 changes: 55 additions & 0 deletions lib/galaxy/datatypes/test/chebi_57262.v3k.mol
@@ -0,0 +1,55 @@
CHEBI:57262
RDKit 3D

0 0 0 0 0 0 0 0 0 0999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 22 21 0 0 0
M V30 BEGIN ATOM
M V30 1 P 18.2567 -5.6825 0 0
M V30 2 O 18.0395 -6.6275 0 0 CHG=-1
M V30 3 O 18.255 -4.7415 0 0
M V30 4 O 18.9453 -6.0633 0 0
M V30 5 O 17.4245 -5.9341 0 0
M V30 6 C 15.2886 -5.4953 0 0
M V30 7 C 15.9989 -5.9147 0 0
M V30 8 C 16.7174 -5.5091 0 0
M V30 9 O 14.5702 -5.9009 0 0
M V30 10 C 13.8556 -5.4886 0 0
M V30 11 O 13.8553 -4.6636 0 0
M V30 12 O 15.9909 -6.7396 0 0
M V30 13 O 21.0888 -4.8258 0 0
M V30 14 C 21.0888 -5.6508 0 0
M V30 15 C 20.3743 -6.0633 0 0
M V30 16 N 20.3743 -6.8883 0 0 CHG=1
M V30 17 C 19.6598 -5.6508 0 0
M V30 18 O 21.8033 -6.0633 0 0 CHG=-1
M V30 19 R 13.1411 -5.9011 0 0 MASS=1
M V30 20 C 15.2764 -7.1521 0 0
M V30 21 O 15.2764 -7.9771 0 0
M V30 22 R 14.562 -6.7396 0 0 MASS=2
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 1 1 2
M V30 2 2 1 3
M V30 3 1 1 4
M V30 4 1 6 9
M V30 5 1 7 6 CFG=3
M V30 6 1 8 7
M V30 7 1 5 8
M V30 8 1 10 9
M V30 9 2 10 11
M V30 10 1 7 12
M V30 11 1 1 5
M V30 12 1 15 14 CFG=1
M V30 13 2 14 13
M V30 14 1 15 16
M V30 15 1 15 17
M V30 16 1 17 4
M V30 17 1 18 14
M V30 18 1 10 19
M V30 19 1 12 20
M V30 20 2 20 21
M V30 21 1 20 22
M V30 END BOND
M V30 END CTAB
M END
32 changes: 32 additions & 0 deletions lib/galaxy/datatypes/test/github88.v3k.sdf
@@ -0,0 +1,32 @@

Marvin 06030906502D

0 0 0 0 0 999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 8 8 0 0 0
M V30 BEGIN ATOM
M V30 1 C 0.7476 -1.4581 0 0
M V30 2 C -0.7885 -1.4581 0 0
M V30 3 C 1.6627 -2.6967 0 0
M V30 4 O 1.6627 -0.2009 0 0
M V30 5 O -1.5659 -0.1115 0 0
M V30 6 O -1.5659 -2.786 0 0
M V30 7 C 3.1431 -2.2244 0 0
M V30 8 C 3.1431 -0.6844 0 0
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 1 1 2
M V30 2 1 1 3
M V30 3 1 1 4
M V30 4 1 2 5
M V30 5 2 2 6
M V30 6 1 3 7
M V30 7 1 4 8
M V30 8 1 7 8
M V30 END BOND
M V30 END CTAB
M END
> <prop1>
4

$$$$

0 comments on commit 068c61a

Please sign in to comment.