Skip to content

Commit

Permalink
Merge 7db1690 into fd8ab70
Browse files Browse the repository at this point in the history
  • Loading branch information
mlin committed Apr 9, 2020
2 parents fd8ab70 + 7db1690 commit 2ab7c4b
Show file tree
Hide file tree
Showing 8 changed files with 144 additions and 64 deletions.
3 changes: 2 additions & 1 deletion WDL/Error.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ class SourcePosition(
"""
Source position attached to AST nodes and exceptions; NamedTuple of ``uri`` the filename/URI
passed to :func:`WDL.load` or a WDL import statement, which may be relative; ``abspath`` the
absolute filename/URI; and int positions ``line`` ``end_line`` ``column`` ``end_column``
absolute filename/URI; and one-based int positions ``line`` ``end_line`` ``column``
``end_column``
"""


Expand Down
35 changes: 33 additions & 2 deletions WDL/Tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -1135,6 +1135,12 @@ def visit(node: SourceNode) -> None:
return self._nodes_by_id[workflow_node_id]


SourceComment = NamedTuple("SourceComment", [("pos", Error.SourcePosition), ("text", str)])
"""
Position and text of a comment. The text includes the ``#`` and any preceding or trailing
spaces/tabs.
"""

DocImport = NamedTuple(
"DocImport",
[
Expand All @@ -1146,7 +1152,8 @@ def visit(node: SourceNode) -> None:
],
)
"""
Represents one imported document, with position of the import statement, import URI, namespace, struct type aliases, and (after typechecking) the ``Document`` object.
Represents one imported document, with position of the import statement, import URI, namespace,
struct type aliases, and (after typechecking) the ``Document`` object.
"""


Expand All @@ -1163,6 +1170,23 @@ class Document(SourceNode):
Original WDL source code text
"""

source_lines: List[str]
"""
:type: List[str]
Original WDL source code text split by newlines. ``SourcePosition`` line numbers are
one-based, so line number ``L`` corresponds to ``source_lines[L-1]``.
"""

source_comments: List[Optional[SourceComment]]
"""
:type: List[Optional[SourceComment]]
Lookup table for source code comments. ``source_comments`` has the same length as
``source_lines``, and each entry is the :class:`WDL.Tree.SourceComment` found on the
corresponding source line, or ``None`` if the line has no comment.
"""

imports: List[DocImport]
"""
:type: List[DocImport]
Expand All @@ -1183,15 +1207,22 @@ def __init__(
struct_typedefs: Dict[str, StructTypeDef],
tasks: List[Task],
workflow: Optional[Workflow],
comments: List[SourceComment],
) -> None:
super().__init__(pos)
self.source_text = source_text
self.imports = imports
self.struct_typedefs = Env.Bindings()
for name, struct_typedef in struct_typedefs.items():
self.struct_typedefs = self.struct_typedefs.bind(name, struct_typedef)
self.tasks = tasks
self.workflow = workflow
self.source_text = source_text
self.source_lines = source_text.split("\n")
self.source_comments = [None for _ in self.source_lines]
for comment in comments:
assert self.source_comments[comment.pos.line - 1] is None
assert self.source_lines[comment.pos.line - 1].endswith(comment.text)
self.source_comments[comment.pos.line - 1] = comment

@property
def children(self) -> Iterable[SourceNode]:
Expand Down
1 change: 1 addition & 0 deletions WDL/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
Document,
WorkflowNode,
WorkflowSection,
SourceComment,
)
from . import runtime

Expand Down
16 changes: 9 additions & 7 deletions WDL/_grammar.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,16 +89,17 @@
CNAME: /[a-zA-Z][a-zA-Z0-9_]*/
COMMENT: "#" /[^\r\n]*/ NEWLINE
COMMENT: /[ \t]*/ "#" /[^\r\n]*/
SPACE: /[ \t]+/
%import common.INT
%import common.SIGNED_INT
%import common.FLOAT
%import common.SIGNED_FLOAT
%import common.ESCAPED_STRING
%import common.WS
%import common.NEWLINE
%ignore WS
%ignore SPACE
%ignore NEWLINE
%ignore COMMENT
///////////////////////////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -475,11 +476,12 @@
///////////////////////////////////////////////////////////////////////////////////////////////////
%import common.NEWLINE
COMMENT: "#" /[^\r\n]*/ NEWLINE
%ignore COMMENT
SPACE: /[ \t]+/
COMMENT: /[ \t]*/ "#" /[^\r\n]*/
%import common.WS
%ignore WS
%ignore SPACE
%ignore NEWLINE
%ignore COMMENT
"""
keywords["development"] = set(
"Array Float Int Map None Pair String alias as call command else false if import input left meta object output parameter_meta right runtime scatter struct task then true workflow".split(
Expand Down
68 changes: 51 additions & 17 deletions WDL/_parser.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,31 @@
# pylint: skip-file
import inspect
from typing import List, Optional, Set
import threading
from typing import List, Optional, Set, Tuple
import lark
from .Error import SourcePosition
from . import Error, Tree, Type, Expr, _grammar

# memoize Lark parsers constructed for version & start symbol
_lark_cache = {}


def parse(grammar: str, txt: str, start: str) -> lark.Tree:
if (grammar, start) not in _lark_cache:
_lark_cache[(grammar, start)] = lark.Lark(
grammar, start=start, parser="lalr", propagate_positions=True
)
return _lark_cache[(grammar, start)].parse(txt + ("\n" if not txt.endswith("\n") else ""))
_lark_comments_buffer = []
_lark_lock = threading.Lock()


def parse(grammar: str, txt: str, start: str) -> Tuple[lark.Tree, List[lark.Token]]:
    """
    Parse WDL source text using a (memoized) lark parser for the given grammar & start symbol.

    :returns: the parse tree, plus the COMMENT tokens collected by the lexer callback during
              this parse (for attaching source comments to the Document)
    """
    with _lark_lock:
        if (grammar, start) not in _lark_cache:
            _lark_cache[(grammar, start)] = lark.Lark(
                grammar,
                start=start,
                parser="lalr",
                propagate_positions=True,
                # lexer callback appends each COMMENT token to the shared module buffer
                lexer_callbacks={"COMMENT": _lark_comments_buffer.append},
            )
        try:
            # ensure text ends with newline, which the grammar's terminals expect
            tree = _lark_cache[(grammar, start)].parse(
                txt + ("\n" if not txt.endswith("\n") else "")
            )
            comments = _lark_comments_buffer.copy()
        finally:
            # clear the buffer even if parsing raised, so comment tokens from a failed parse
            # can't leak into the next parse() call (which would corrupt its comment list)
            _lark_comments_buffer.clear()
    return (tree, comments)


def to_int(x):
Expand Down Expand Up @@ -228,11 +239,15 @@ class _DocTransformer(_ExprTransformer, _TypeTransformer):

_keywords: Set[str]
_source_text: str
_comments: List[lark.Token]

def __init__(
    self, source_text: str, keywords: Set[str], comments: List[lark.Token], *args, **kwargs
):
    """Record the original source text, reserved keywords, and lexed comment tokens."""
    super().__init__(*args, **kwargs)
    self._comments = comments
    self._keywords = keywords
    self._source_text = source_text

def _check_keyword(self, pos, name):
if name in self._keywords:
Expand Down Expand Up @@ -499,7 +514,24 @@ def document(self, items, meta):
imports.append(item)
else:
assert False
return Tree.Document(self._source_text, self._sp(meta), imports, structs, tasks, workflow)
comments = [
Tree.SourceComment(
SourcePosition(
uri=self.uri,
abspath=self.abspath,
line=comment.line,
column=comment.column,
end_line=comment.end_line or comment.line,
end_column=comment.end_column or (comment.column + len(comment.value)),
),
text=comment.value,
)
for comment in self._comments
]

return Tree.Document(
self._source_text, self._sp(meta), imports, structs, tasks, workflow, comments
)


# have lark pass the 'meta' with line/column numbers to each transformer method
Expand All @@ -511,7 +543,7 @@ def document(self, items, meta):

def parse_expr(txt: str, version: Optional[str] = None) -> Expr.Base:
try:
return _ExprTransformer().transform(parse(_grammar.get(version)[0], txt, "expr"))
return _ExprTransformer().transform(parse(_grammar.get(version)[0], txt, "expr")[0])
except lark.exceptions.UnexpectedInput as exn:
pos = SourcePosition(
uri="(buffer)",
Expand All @@ -529,8 +561,9 @@ def parse_expr(txt: str, version: Optional[str] = None) -> Expr.Base:
def parse_tasks(txt: str, version: Optional[str] = None) -> List[Tree.Task]:
    """
    Parse task definitions from WDL source text.

    :param txt: WDL source code containing task definition(s)
    :param version: WDL language version (selects the grammar; None = default)
    :raises: the underlying error when the transformer fails inside lark's VisitError wrapper
    """
    try:
        (grammar, keywords) = _grammar.get(version)
        raw_ast, comments = parse(grammar, txt, "tasks")
        return _DocTransformer(source_text=txt, keywords=keywords, comments=comments).transform(
            raw_ast
        )
    except lark.exceptions.VisitError as exn:
        # unwrap & re-raise the original exception raised within the transformer
        raise exn.__context__
Expand All @@ -541,7 +574,7 @@ def parse_document(
) -> Tree.Document:
npos = SourcePosition(uri=uri, abspath=abspath, line=0, column=0, end_line=0, end_column=0)
if not txt.strip():
return Tree.Document(txt, npos, [], {}, [], None,)
return Tree.Document(txt, npos, [], {}, [], None, [])
if version is None:
# for now assume the version is 1.0 if the first line is "version <number>"
# otherwise draft-2
Expand All @@ -557,9 +590,10 @@ def parse_document(
except KeyError:
raise Error.SyntaxError(npos, "unknown WDL version " + version) from None
try:
raw_ast, comments = parse(grammar, txt, "document")
return _DocTransformer(
source_text=txt, uri=uri, abspath=abspath, keywords=keywords
).transform(parse(grammar, txt, "document"))
source_text=txt, uri=uri, abspath=abspath, keywords=keywords, comments=comments
).transform(raw_ast)
except lark.exceptions.UnexpectedInput as exn:
pos = SourcePosition(
uri=(uri if uri else "(buffer)"),
Expand Down
15 changes: 6 additions & 9 deletions examples/paste_wdl_imports.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,6 @@ def main():
# run SetParents to facilitate getting from a called task to its containing document
WDL.Walker.SetParents()(doc)

# original document lines for surgery
doc_lines = doc.source_text.split("\n")

# for each call
tasks_processed = set()
for call in calls(doc.workflow):
Expand All @@ -57,30 +54,30 @@ def main():
new_task_name = "__".join(call.callee_id)
assert isinstance(call.callee, WDL.Task), "can't import sub-workflows"
# rewrite the call with the new task name
doc_lines[call.pos.line - 1] = rewrite_line(
doc_lines[call.pos.line - 1],
doc.source_lines[call.pos.line - 1] = rewrite_line(
doc.source_lines[call.pos.line - 1],
"call",
f"{new_task_name} as {call.name}",
old_name="[0-9A-Za-z_\\.]+(\\s+as\\s+[0-9A-Za-z_]+)?",
)
if new_task_name not in tasks_processed:
task_lines = task_source_lines(call.callee)
task_lines[0] = rewrite_line(task_lines[0], "task", new_task_name)
doc_lines += ["\n"] + task_lines + ["\n"]
doc.source_lines += ["\n"] + task_lines + ["\n"]
tasks_processed.add(new_task_name)

# blank out the imports
for imp in doc.imports:
for ln in range(imp.pos.line - 1, imp.pos.end_line):
doc_lines[ln] = ""
doc.source_lines[ln] = ""

# print output
if args.o:
with open(args.o, "w") as outfile:
for line in doc_lines:
for line in doc.source_lines:
print(line, file=outfile)
else:
for line in doc_lines:
for line in doc.source_lines:
print(line)


Expand Down
8 changes: 8 additions & 0 deletions stubs/lark/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@

from typing import Any

class Token:
    """
    Minimal type stub for ``lark.Token``: the lexed text plus its source position fields
    (presumably one-based line/column, per lark's convention — the parser feeds these
    directly into one-based ``SourcePosition`` values).
    """

    value: str
    line: int
    end_line: int
    column: int
    end_column: int
    ...

class Transformer:
def transform(self,tree) -> Any:
...
Expand Down
62 changes: 34 additions & 28 deletions tests/test_1doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,43 +502,49 @@ def test_unify(self):

class TestDoc(unittest.TestCase):
def test_count_foo(self):
doc = r"""
workflow count_lines_matching {
call grep
call count_lines {
input:
in = grep.out
}
}
task grep {
File in
String pattern
doc = r"""#foo
workflow count_lines_matching {
call grep
call count_lines {
input:
in = grep.out # bar
}
}
task grep {
File in
String pattern
command {
grep ${pattern} ${in} > ans
}
command {
grep ${pattern} ${in} > ans
}
output {
File out = "ans"
}
}
task count_lines {
File in
output {
File out = "ans"
}
}
#baz
task count_lines {
File in
command {
wc -l ${in}
}
command {
wc -l ${in}
}
output {
Int out = read_int(stdout())
}
}
"""
output {
Int out = read_int(stdout())
}
}
#bas
"""
doc = WDL.parse_document(doc)
self.assertIsInstance(doc.workflow, WDL.Tree.Workflow)
self.assertEqual(len(doc.workflow.body), 2)
self.assertEqual(len(doc.tasks), 2)
doc.typecheck()
self.assertEqual(doc.source_comments[0].text, "#foo")
self.assertEqual(doc.source_comments[5].text, " # bar")
self.assertEqual(doc.source_comments[20].text, " #baz ")
self.assertEqual(doc.source_comments[32].text, " #bas ")

def test_bam_chrom_counter(self):
doc = r"""
Expand Down

0 comments on commit 2ab7c4b

Please sign in to comment.