Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stricter checks for SDF sniffing #4729

Merged
merged 2 commits into from Oct 1, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
75 changes: 53 additions & 22 deletions lib/galaxy/datatypes/molecules.py
Expand Up @@ -55,7 +55,7 @@ def count_lines(filename, non_empty=False):

class GenericMolFile(data.Text):
"""
abstract class for most of the molecule files
Abstract class for most of the molecule files.
"""
MetadataElement(name="number_of_molecules", default=0, desc="Number of molecules", readonly=True, visible=True, optional=True, no_value=0)

Expand Down Expand Up @@ -92,26 +92,53 @@ def sniff(self, filename):
"""
Try to guess if the file is a SDF2 file.

An SDfile (structure-data file) can contain multiple compounds.

Each compound starts with a block in V2000 or V3000 molfile format,
which ends with a line equal to 'M END'.
This is followed by a non-structural data block, which ends with a line
equal to '$$$$'.

>>> from galaxy.datatypes.sniff import get_test_fname
>>> fname = get_test_fname('drugbank_drugs.sdf')
>>> SDF().sniff(fname)
True

>>> fname = get_test_fname('drugbank_drugs.cml')
>>> fname = get_test_fname('github88.v3k.sdf')
>>> SDF().sniff(fname)
True

>>> fname = get_test_fname('chebi_57262.v3k.mol')
>>> SDF().sniff(fname)
False
"""
counter = count_special_lines("^M\s*END", filename) + count_special_lines("^\$\$\$\$", filename)
if counter > 0 and counter % 2 == 0:
return True
else:
return False
m_end_found = False
limit = 10000
idx = 0
with open(filename) as in_file:
for line in in_file:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We really need to stop doing this: iterating over the file line by line can consume unbounded memory on relatively sane user-supplied inputs. I guess there are a lot of instances of it, though, and I'm not finding my previous advice on fixing it. (I believe the right thing to do is to pick a maximum line size and use readlines directly instead of the iterator.)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

xref. #4319

idx += 1
line = line.rstrip('\n\r')
if idx < 4:
continue
elif idx == 4:
if len(line) != 39 or not(line.endswith(' V2000') or
line.endswith(' V3000')):
return False
elif not m_end_found:
if line == 'M END':
m_end_found = True
elif line == '$$$$':
return True
if idx == limit:
break
return False

def set_meta(self, dataset, **kwd):
"""
Set the number of molecules in dataset.
"""
dataset.metadata.number_of_molecules = count_special_lines("^\$\$\$\$", dataset.file_name)
dataset.metadata.number_of_molecules = count_special_lines("^\$\$\$\$$", dataset.file_name)

def split(cls, input_datasets, subdir_generator_function, split_params):
"""
Expand Down Expand Up @@ -180,10 +207,17 @@ def sniff(self, filename):
>>> MOL2().sniff(fname)
False
"""
if count_special_lines("@<TRIPOS>MOLECULE", filename) > 0:
return True
else:
return False
limit = 60
idx = 0
with open(filename) as in_file:
for line in in_file:
line = line.rstrip('\n\r')
if line == '@<TRIPOS>MOLECULE':
return True
idx += 1
if idx == limit:
break
return False

def set_meta(self, dataset, **kwd):
"""
Expand Down Expand Up @@ -716,16 +750,13 @@ def sniff(self, filename):
>>> CML().sniff(fname)
True
"""
handle = open(filename)
line = handle.readline()
if line.strip() != '<?xml version="1.0"?>':
handle.close()
return False
line = handle.readline()
if line.strip().find('http://www.xml-cml.org/schema') == -1:
handle.close()
return False
handle.close()
with open(filename) as handle:
line = handle.readline()
if line.strip() != '<?xml version="1.0"?>':
return False
line = handle.readline()
if line.strip().find('http://www.xml-cml.org/schema') == -1:
return False
return True

def split(cls, input_datasets, subdir_generator_function, split_params):
Expand Down
55 changes: 55 additions & 0 deletions lib/galaxy/datatypes/test/chebi_57262.v3k.mol
@@ -0,0 +1,55 @@
CHEBI:57262
RDKit 3D

0 0 0 0 0 0 0 0 0 0999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 22 21 0 0 0
M V30 BEGIN ATOM
M V30 1 P 18.2567 -5.6825 0 0
M V30 2 O 18.0395 -6.6275 0 0 CHG=-1
M V30 3 O 18.255 -4.7415 0 0
M V30 4 O 18.9453 -6.0633 0 0
M V30 5 O 17.4245 -5.9341 0 0
M V30 6 C 15.2886 -5.4953 0 0
M V30 7 C 15.9989 -5.9147 0 0
M V30 8 C 16.7174 -5.5091 0 0
M V30 9 O 14.5702 -5.9009 0 0
M V30 10 C 13.8556 -5.4886 0 0
M V30 11 O 13.8553 -4.6636 0 0
M V30 12 O 15.9909 -6.7396 0 0
M V30 13 O 21.0888 -4.8258 0 0
M V30 14 C 21.0888 -5.6508 0 0
M V30 15 C 20.3743 -6.0633 0 0
M V30 16 N 20.3743 -6.8883 0 0 CHG=1
M V30 17 C 19.6598 -5.6508 0 0
M V30 18 O 21.8033 -6.0633 0 0 CHG=-1
M V30 19 R 13.1411 -5.9011 0 0 MASS=1
M V30 20 C 15.2764 -7.1521 0 0
M V30 21 O 15.2764 -7.9771 0 0
M V30 22 R 14.562 -6.7396 0 0 MASS=2
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 1 1 2
M V30 2 2 1 3
M V30 3 1 1 4
M V30 4 1 6 9
M V30 5 1 7 6 CFG=3
M V30 6 1 8 7
M V30 7 1 5 8
M V30 8 1 10 9
M V30 9 2 10 11
M V30 10 1 7 12
M V30 11 1 1 5
M V30 12 1 15 14 CFG=1
M V30 13 2 14 13
M V30 14 1 15 16
M V30 15 1 15 17
M V30 16 1 17 4
M V30 17 1 18 14
M V30 18 1 10 19
M V30 19 1 12 20
M V30 20 2 20 21
M V30 21 1 20 22
M V30 END BOND
M V30 END CTAB
M END
32 changes: 32 additions & 0 deletions lib/galaxy/datatypes/test/github88.v3k.sdf
@@ -0,0 +1,32 @@

Marvin 06030906502D

0 0 0 0 0 999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 8 8 0 0 0
M V30 BEGIN ATOM
M V30 1 C 0.7476 -1.4581 0 0
M V30 2 C -0.7885 -1.4581 0 0
M V30 3 C 1.6627 -2.6967 0 0
M V30 4 O 1.6627 -0.2009 0 0
M V30 5 O -1.5659 -0.1115 0 0
M V30 6 O -1.5659 -2.786 0 0
M V30 7 C 3.1431 -2.2244 0 0
M V30 8 C 3.1431 -0.6844 0 0
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 1 1 2
M V30 2 1 1 3
M V30 3 1 1 4
M V30 4 1 2 5
M V30 5 2 2 6
M V30 6 1 3 7
M V30 7 1 4 8
M V30 8 1 7 8
M V30 END BOND
M V30 END CTAB
M END
> <prop1>
4

$$$$