Reduce task digest sensitivity to comments & whitespace

chanzuckerberg · Jul 24, 2020 · d300543 · d300543
1 parent 36f442e
commit d300543
Show file tree

Hide file tree

Showing 2 changed files with 66 additions and 14 deletions.
diff --git a/WDL/runtime/cache.py b/WDL/runtime/cache.py
@@ -5,6 +5,7 @@
 """
 import hashlib
 import json
+import itertools
 import os
 import logging
 from pathlib import Path
@@ -223,9 +224,11 @@ def _describe_task(doc, task: Tree.Task) -> str:
         output_lines.append(f"# {struct_name} :: {structs[struct_name]}")
 
     # excerpt task{} from document
-    output_lines += _excerpt(doc, task.pos)
-
-    # TODO (?): delete non-semantic whitespace, perhaps excise the meta & parameter_meta sections
+    # Possible future improvements:
+    # excise the meta & parameter_meta sections
+    # normalize order of declarations
+    # normalize whitespace within lines (not leading/trailing)
+    output_lines += _excerpt(doc, task.pos, [task.command.pos])
 
     return "\n".join(output_lines).strip()
 
@@ -250,15 +253,38 @@ def _describe_struct_types(task: Tree.Task) -> Dict[str, str]:
     return structs
 
 
-def _excerpt(doc: Tree.Document, pos: Error.SourcePosition) -> List[str]:
+def _excerpt(
+    doc: Tree.Document, pos: Error.SourcePosition, literals: List[Error.SourcePosition]
+) -> List[str]:
     """
-    Excerpt the document's source lines indicated by pos : WDL.SourcePosition
-    TODO (?): delete comments from the source lines
+    Excerpt the source lines of doc spanned by pos, returned as a list of strings. Comments, blank
+    lines, and leading/trailing whitespace are dropped -- except on lines falling within any of
+    the literals positions (e.g. the task command), which are kept verbatim.
     """
+
+    def clean(line: int, column: int = 1, end_column: Optional[int] = None) -> List[str]:
+        # line & column are 1-based; end_column is the 1-based column of the last character kept
+        literal = any(lit.line <= line <= lit.end_line for lit in literals)
+        comment = doc.source_comments[line - 1]
+        if comment and not literal:
+            assert comment.pos.line == line
+            # stop just before the comment's first character, so the comment marker is cut too
+            last = comment.pos.column - 1
+            end_column = last if end_column is None else min(end_column, last)
+        txt = doc.source_lines[line - 1][(column - 1) : end_column]
+        # literal lines keep their whitespace and any comment-like text untouched
+        if literal:
+            return [txt]
+        txt = txt.strip()
+        # returning [] omits lines left empty after removing comments & whitespace
+        return [txt] if txt else []
+
     if pos.end_line == pos.line:
-        return [doc.source_lines[pos.line - 1][(pos.column - 1) : pos.end_column]]
-    return (
-        [doc.source_lines[pos.line - 1][(pos.column - 1) :]]
-        + doc.source_lines[pos.line : (pos.end_line - 1)]
-        + [doc.source_lines[pos.end_line - 1][: pos.end_column]]
+        return clean(pos.line, pos.column, pos.end_column)
+    return list(
+        itertools.chain(
+            clean(pos.line, pos.column),
+            *(clean(line_nr) for line_nr in range(pos.line + 1, pos.end_line)),
+            clean(pos.end_line, 1, pos.end_column),  # last line starts at column 1, not 0
+        )
     )
diff --git a/tests/test_8cache.py b/tests/test_8cache.py
@@ -15,16 +15,22 @@ class TestTaskRunner(unittest.TestCase):
     test_wdl: str = R"""
         version 1.0
         task hello_blank {
-            input {
-                String who
+            # comment1
+            input  {
+                String who     # comment2
                 Array[String]? what
+
                 Map[String,Map[String,String]]? where
-            }
+            }     
+
+
             command <<<
+                # comment3
                 echo "Hello, ~{who}!"
             >>>
             output {
                 Int count = 12
+
             }
         }
         """
@@ -123,6 +129,26 @@ def test_input_digest_sorts_keys(self):
         unordered_digest = CallCache(cfg=self.cfg, logger=self.logger).get_digest_for_inputs(unordered_inputs)
         self.assertEqual(ordered_digest, unordered_digest)
 
+    def test_normalization(self):  # comments & whitespace are dropped, except inside command
+        desc = WDL.runtime.cache._describe_task(self.doc, self.doc.tasks[0])
+        self.assertEqual(desc, R"""
+version 1.0
+task hello_blank {
+input  {
+String who
+Array[String]? what
+Map[String,Map[String,String]]? where
+}
+            command <<<
+                # comment3
+                echo "Hello, ~{who}!"
+            >>>
+output {
+Int count = 12
+}
+}
+        """.strip())
+
     def test_task_input_cache_matches_output(self):
         # run task, check output matches what was stored in run_dir
         cache = CallCache(cfg=self.cfg, logger=self.logger)