From 10b9ac8c7de9e7f2c04d6a25777209abb350a0cc Mon Sep 17 00:00:00 2001
From: Nicola Soranzo <nicola.soranzo@earlham.ac.uk>
Date: Fri, 29 Sep 2017 20:37:15 +0100
Subject: [PATCH 1/2] Stricter checks for SDF sniffing

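I had a bunch of protein FASTA files being sniffed as 'sdf': the old check
only counted the lines matching '^M\s*END' or '^\$\$\$\$' anywhere in the
file and accepted any non-zero even total (a sequence line starting with
'MEND' matches the first pattern).

The SDF sniffer now requires a line equal to 'M  END' followed later by a
line equal to '$$$$', both within the first 500 lines of the file.
Similarly, the MOL2 sniffer now requires a line equal to
'@<TRIPOS>MOLECULE' within the first 60 lines, and the CML sniffer opens
the file with a context manager.

Also fix https://github.com/galaxyproject/galaxy/issues/4558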
---
 lib/galaxy/datatypes/molecules.py | 58 ++++++++++++++++++++-----------
 1 file changed, 38 insertions(+), 20 deletions(-)
diff --git a/lib/galaxy/datatypes/molecules.py b/lib/galaxy/datatypes/molecules.py
index bc65b2a7c07f..62c7e154efc3 100644
--- a/lib/galaxy/datatypes/molecules.py
+++ b/lib/galaxy/datatypes/molecules.py
@@ -92,6 +92,10 @@ def sniff(self, filename):
         """
         Try to guess if the file is a SDF2 file.
 
+        An SDF file can contain multiple molecules.
+        Each molecule must contain a line equal to 'M  END' followed later on by
+        a final line equal to '$$$$'.
+
         >>> from galaxy.datatypes.sniff import get_test_fname
         >>> fname = get_test_fname('drugbank_drugs.sdf')
         >>> SDF().sniff(fname)
@@ -101,17 +105,27 @@ def sniff(self, filename):
         >>> SDF().sniff(fname)
         False
         """
-        counter = count_special_lines("^M\s*END", filename) + count_special_lines("^\$\$\$\$", filename)
-        if counter > 0 and counter % 2 == 0:
-            return True
-        else:
-            return False
+        m_end_found = False
+        limit = 500
+        idx = 0
+        with open(filename) as in_file:
+            for line in in_file:
+                line = line.rstrip('\n\r')
+                if not m_end_found:
+                    if line == 'M  END':
+                        m_end_found = True
+                elif line == '$$$$':
+                    return True
+                idx += 1
+                if idx == limit:
+                    break
+        return False
 
     def set_meta(self, dataset, **kwd):
         """
         Set the number of molecules in dataset.
         """
-        dataset.metadata.number_of_molecules = count_special_lines("^\$\$\$\$", dataset.file_name)
+        dataset.metadata.number_of_molecules = count_special_lines("^\$\$\$\$$", dataset.file_name)
 
     def split(cls, input_datasets, subdir_generator_function, split_params):
         """
@@ -180,10 +194,17 @@ def sniff(self, filename):
         >>> MOL2().sniff(fname)
         False
         """
-        if count_special_lines("@<TRIPOS>MOLECULE", filename) > 0:
-            return True
-        else:
-            return False
+        limit = 60
+        idx = 0
+        with open(filename) as in_file:
+            for line in in_file:
+                line = line.rstrip('\n\r')
+                if line == '@<TRIPOS>MOLECULE':
+                    return True
+                idx += 1
+                if idx == limit:
+                    break
+        return False
 
     def set_meta(self, dataset, **kwd):
         """
@@ -716,16 +737,13 @@ def sniff(self, filename):
         >>> CML().sniff(fname)
         True
         """
-        handle = open(filename)
-        line = handle.readline()
-        if line.strip() != '<?xml version="1.0"?>':
-            handle.close()
-            return False
-        line = handle.readline()
-        if line.strip().find('http://www.xml-cml.org/schema') == -1:
-            handle.close()
-            return False
-        handle.close()
+        with open(filename) as handle:
+            line = handle.readline()
+            if line.strip() != '<?xml version="1.0"?>':
+                return False
+            line = handle.readline()
+            if line.strip().find('http://www.xml-cml.org/schema') == -1:
+                return False
         return True
 
     def split(cls, input_datasets, subdir_generator_function, split_params):

From e3307c5be96d479dc7731fbfefefbb1bc6854fa5 Mon Sep 17 00:00:00 2001
From: Nicola Soranzo <nicola.soranzo@tgac.ac.uk>
Date: Sun, 1 Oct 2017 01:30:44 +0100
Subject: [PATCH 2/2] More precise SDF sniffer

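First check the 4th line: it must be exactly 39 characters long and end
with ' V2000' or ' V3000', otherwise the file is not sniffed as SDF.
If it conforms, scan up to the first 10000 lines of the file (previously
500) for a line equal to 'M  END' followed later by a line equal to '$$$$'.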
Add 2 test files downloaded from:

https://github.com/rdkit/rdkit/tree/master/Code/GraphMol/FileParsers/test_data
---
 lib/galaxy/datatypes/molecules.py             | 29 +++++++---
 lib/galaxy/datatypes/test/chebi_57262.v3k.mol | 55 +++++++++++++++++++
 lib/galaxy/datatypes/test/github88.v3k.sdf    | 32 +++++++++++
 3 files changed, 108 insertions(+), 8 deletions(-)
 create mode 100644 lib/galaxy/datatypes/test/chebi_57262.v3k.mol
 create mode 100644 lib/galaxy/datatypes/test/github88.v3k.sdf

diff --git a/lib/galaxy/datatypes/molecules.py b/lib/galaxy/datatypes/molecules.py
index 62c7e154efc3..21cc3b7e7c63 100644
--- a/lib/galaxy/datatypes/molecules.py
+++ b/lib/galaxy/datatypes/molecules.py
@@ -55,7 +55,7 @@ def count_lines(filename, non_empty=False):
 
 class GenericMolFile(data.Text):
     """
-        abstract class for most of the molecule files
+    Abstract class for most of the molecule files.
     """
     MetadataElement(name="number_of_molecules", default=0, desc="Number of molecules", readonly=True, visible=True, optional=True, no_value=0)
 
@@ -92,31 +92,44 @@ def sniff(self, filename):
         """
         Try to guess if the file is a SDF2 file.
 
-        An SDF file can contain multiple molecules.
-        Each molecule must contain a line equal to 'M  END' followed later on by
-        a final line equal to '$$$$'.
+        An SDfile (structure-data file) can contain multiple compounds.
+
+        Each compound starts with a block in V2000 or V3000 molfile format,
+        which ends with a line equal to 'M  END'.
+        This is followed by a non-structural data block, which ends with a line
+        equal to '$$$$'.
 
         >>> from galaxy.datatypes.sniff import get_test_fname
         >>> fname = get_test_fname('drugbank_drugs.sdf')
         >>> SDF().sniff(fname)
         True
 
-        >>> fname = get_test_fname('drugbank_drugs.cml')
+        >>> fname = get_test_fname('github88.v3k.sdf')
+        >>> SDF().sniff(fname)
+        True
+
+        >>> fname = get_test_fname('chebi_57262.v3k.mol')
         >>> SDF().sniff(fname)
         False
         """
         m_end_found = False
-        limit = 500
+        limit = 10000
         idx = 0
         with open(filename) as in_file:
             for line in in_file:
+                idx += 1
                 line = line.rstrip('\n\r')
-                if not m_end_found:
+                if idx < 4:
+                    continue
+                elif idx == 4:
+                    if len(line) != 39 or not(line.endswith(' V2000') or
+                            line.endswith(' V3000')):
+                        return False
+                elif not m_end_found:
                     if line == 'M  END':
                         m_end_found = True
                 elif line == '$$$$':
                     return True
-                idx += 1
                 if idx == limit:
                     break
         return False
diff --git a/lib/galaxy/datatypes/test/chebi_57262.v3k.mol b/lib/galaxy/datatypes/test/chebi_57262.v3k.mol
new file mode 100644
index 000000000000..210b6c49b387
--- /dev/null
+++ b/lib/galaxy/datatypes/test/chebi_57262.v3k.mol
@@ -0,0 +1,55 @@
+CHEBI:57262
+     RDKit          3D
+
+  0  0  0  0  0  0  0  0  0  0999 V3000
+M  V30 BEGIN CTAB
+M  V30 COUNTS 22 21 0 0 0
+M  V30 BEGIN ATOM
+M  V30 1 P 18.2567 -5.6825 0 0
+M  V30 2 O 18.0395 -6.6275 0 0 CHG=-1
+M  V30 3 O 18.255 -4.7415 0 0
+M  V30 4 O 18.9453 -6.0633 0 0
+M  V30 5 O 17.4245 -5.9341 0 0
+M  V30 6 C 15.2886 -5.4953 0 0
+M  V30 7 C 15.9989 -5.9147 0 0
+M  V30 8 C 16.7174 -5.5091 0 0
+M  V30 9 O 14.5702 -5.9009 0 0
+M  V30 10 C 13.8556 -5.4886 0 0
+M  V30 11 O 13.8553 -4.6636 0 0
+M  V30 12 O 15.9909 -6.7396 0 0
+M  V30 13 O 21.0888 -4.8258 0 0
+M  V30 14 C 21.0888 -5.6508 0 0
+M  V30 15 C 20.3743 -6.0633 0 0
+M  V30 16 N 20.3743 -6.8883 0 0 CHG=1
+M  V30 17 C 19.6598 -5.6508 0 0
+M  V30 18 O 21.8033 -6.0633 0 0 CHG=-1
+M  V30 19 R 13.1411 -5.9011 0 0 MASS=1
+M  V30 20 C 15.2764 -7.1521 0 0
+M  V30 21 O 15.2764 -7.9771 0 0
+M  V30 22 R 14.562 -6.7396 0 0 MASS=2
+M  V30 END ATOM
+M  V30 BEGIN BOND
+M  V30 1 1 1 2
+M  V30 2 2 1 3
+M  V30 3 1 1 4
+M  V30 4 1 6 9
+M  V30 5 1 7 6 CFG=3
+M  V30 6 1 8 7
+M  V30 7 1 5 8
+M  V30 8 1 10 9
+M  V30 9 2 10 11
+M  V30 10 1 7 12
+M  V30 11 1 1 5
+M  V30 12 1 15 14 CFG=1
+M  V30 13 2 14 13
+M  V30 14 1 15 16
+M  V30 15 1 15 17
+M  V30 16 1 17 4
+M  V30 17 1 18 14
+M  V30 18 1 10 19
+M  V30 19 1 12 20
+M  V30 20 2 20 21
+M  V30 21 1 20 22
+M  V30 END BOND
+M  V30 END CTAB
+M  END
diff --git a/lib/galaxy/datatypes/test/github88.v3k.sdf b/lib/galaxy/datatypes/test/github88.v3k.sdf
new file mode 100644
index 000000000000..5005395628ff
--- /dev/null
+++ b/lib/galaxy/datatypes/test/github88.v3k.sdf
@@ -0,0 +1,32 @@
+
+  Marvin  06030906502D          
+
+  0  0  0     0  0            999 V3000
+M  V30 BEGIN CTAB
+M  V30 COUNTS 8 8 0 0 0
+M  V30 BEGIN ATOM
+M  V30 1 C 0.7476 -1.4581 0 0
+M  V30 2 C -0.7885 -1.4581 0 0
+M  V30 3 C 1.6627 -2.6967 0 0
+M  V30 4 O 1.6627 -0.2009 0 0
+M  V30 5 O -1.5659 -0.1115 0 0
+M  V30 6 O -1.5659 -2.786 0 0
+M  V30 7 C 3.1431 -2.2244 0 0
+M  V30 8 C 3.1431 -0.6844 0 0
+M  V30 END ATOM
+M  V30 BEGIN BOND
+M  V30 1 1 1 2
+M  V30 2 1 1 3
+M  V30 3 1 1 4
+M  V30 4 1 2 5
+M  V30 5 2 2 6
+M  V30 6 1 3 7
+M  V30 7 1 4 8
+M  V30 8 1 7 8
+M  V30 END BOND
+M  V30 END CTAB
+M  END
+>  <prop1>
+4
+
+$$$$