refactoring load() to permit overriding file-reading logic (#205)

* replaces the `import_uri` callback with a more general override for reading the WDL souce code * `SourcePosition.filename` is replaced with `SourcePosition.uri` and `SourcePosition.abspath`. The former field is the uri as given to `load()` or in the WDL import statement. The latter field is the absolute filename after resolution of relative uri. * simplify and expose relative import resolution logic
chanzuckerberg · Jul 16, 2019 · 61f58eb · 61f58eb
1 parent 9dacf83
commit 61f58eb
Show file tree

Hide file tree

Showing 7 changed files with 295 additions and 180 deletions.
diff --git a/WDL/CLI.py b/WDL/CLI.py
@@ -117,7 +117,7 @@ def check(uri=None, path=None, check_quant=True, shellcheck=True, **kwargs):
         Lint._shellcheck_available = False
 
     for uri1 in uri or []:
-        doc = load(uri1, path or [], check_quant=check_quant, import_uri=import_uri)
+        doc = load(uri1, path or [], check_quant=check_quant, read_source=read_source)
 
         Lint.lint(doc)
 
@@ -218,9 +218,7 @@ def print_error(exn):
     else:
         if isinstance(getattr(exn, "pos", None), SourcePosition):
             print(
-                "({} Ln {} Col {}) {}".format(
-                    exn.pos.filename, exn.pos.line, exn.pos.column, str(exn)
-                ),
+                "({} Ln {} Col {}) {}".format(exn.pos.uri, exn.pos.line, exn.pos.column, str(exn)),
                 file=sys.stderr,
             )
         else:
@@ -249,10 +247,14 @@ def print_error(exn):
                 quant_warning = True
 
 
-def import_uri(uri):
-    dn = tempfile.mkdtemp(prefix="miniwdl_import_uri_")
-    subprocess.check_call(["wget", "-nv", uri], cwd=dn)
-    return glob.glob(dn + "/*")[0]
+async def read_source(uri, path, importer_uri):
+    if uri.startswith("http:") or uri.startswith("https:"):
+        dn = tempfile.mkdtemp(prefix="miniwdl_import_uri_")
+        subprocess.check_call(["wget", "-nv", uri], cwd=dn)
+        fn = glob.glob(dn + "/*")[0]
+        with open(fn, "r") as infile:
+            return ReadSourceResult(infile.read(), os.path.abspath(fn))
+    return await read_source_default(uri, path, importer_uri)
 
 
 def fill_run_subparser(subparsers):
@@ -301,7 +303,7 @@ def runner(
     uri, inputs, input_file, json_only, empty, check_quant, rundir=None, path=None, **kwargs
 ):
     # load WDL document
-    doc = load(uri, path or [], check_quant=check_quant, import_uri=import_uri)
+    doc = load(uri, path or [], check_quant=check_quant, read_source=read_source)
 
     # validate the provided inputs and prepare Cromwell-style JSON
     target, input_env, input_json = runner_input(doc, inputs, input_file, empty)
@@ -323,7 +325,7 @@ def runner(
     except Error.EvalError as exn:
         print(
             "({} Ln {} Col {}) {}, {}".format(
-                exn.pos.filename, exn.pos.line, exn.pos.column, exn.__class__.__name__, str(exn)
+                exn.pos.uri, exn.pos.line, exn.pos.column, exn.__class__.__name__, str(exn)
             ),
             file=sys.stderr,
         )
@@ -335,7 +337,7 @@ def runner(
             pos = getattr(exn.__cause__, "pos")
             print(
                 "({} Ln {} Col {}) {}, {}".format(
-                    pos.filename,
+                    pos.uri,
                     pos.line,
                     pos.column,
                     exn.__cause__.__class__.__name__,
@@ -364,7 +366,7 @@ def runner_input_completer(prefix, parsed_args, **kwargs):
             argcomplete.warn("file not found: " + uri)
             return []
         try:
-            doc = load(uri, parsed_args.path, parsed_args.check_quant, import_uri=import_uri)
+            doc = load(uri, parsed_args.path, parsed_args.check_quant, read_source=read_source)
         except Exception as exn:
             argcomplete.warn(
                 "unable to load {}; try 'miniwdl check' on it ({})".format(uri, str(exn))
@@ -699,7 +701,7 @@ def cromwell(
     path = path or []
 
     # load WDL document
-    doc = load(uri, path, check_quant=check_quant, import_uri=import_uri)
+    doc = load(uri, path, check_quant=check_quant, read_source=read_source)
 
     # validate the provided inputs and prepare Cromwell-style JSON
     target, _, input_json = runner_input(doc, inputs, input_file, empty)

diff --git a/WDL/Error.py b/WDL/Error.py
@@ -5,11 +5,24 @@
 from . import Type
 
 
-SourcePosition = NamedTuple(
-    "SourcePosition",
-    [("filename", str), ("line", int), ("column", int), ("end_line", int), ("end_column", int)],
-)
-"""Source file, line, and column, attached to each AST node"""
+class SourcePosition(
+    NamedTuple(
+        "SourcePosition",
+        [
+            ("uri", str),
+            ("abspath", str),
+            ("line", int),
+            ("column", int),
+            ("end_line", int),
+            ("end_column", int),
+        ],
+    )
+):
+    """
+    Source position attached to AST nodes and exceptions; NamedTuple of ``uri`` the filename/URI
+    passed to :func:`WDL.load` or a WDL import statement, which may be relative; ``abspath`` the
+    absolute filename/URI; and int positions ``line`` ``end_line`` ``column`` ``end_column``
+    """
 
 
 class SyntaxError(Exception):
@@ -53,13 +66,13 @@ def __init__(self, pos: SourcePosition) -> None:
     def __lt__(self, rhs: TVSourceNode) -> bool:
         if isinstance(rhs, SourceNode):
             return (
-                self.pos.filename,
+                self.pos.abspath,
                 self.pos.line,
                 self.pos.column,
                 self.pos.end_line,
                 self.pos.end_column,
             ) < (
-                rhs.pos.filename,
+                rhs.pos.abspath,
                 rhs.pos.line,
                 rhs.pos.column,
                 rhs.pos.end_line,

diff --git a/WDL/Lint.py b/WDL/Lint.py
@@ -9,7 +9,8 @@
         assert isinstance(pos, WDL.SourcePosition)
         assert isinstance(lint_class, str) and isinstance(message, str)
         print(json.dumps({
-            "filename"   : pos.filename,
+            "uri"        : pos.uri,
+            "abspath"    : pos.abspath,
             "line"       : pos.line,
             "end_line"   : pos.end_line,
             "column"     : pos.column,
@@ -816,7 +817,8 @@ def task(self, obj: Tree.Task) -> Any:
                         obj,
                         "SC{} {}".format(item["code"], item["message"]),
                         Error.SourcePosition(
-                            filename=obj.command.pos.filename,
+                            uri=obj.command.pos.uri,
+                            abspath=obj.command.pos.abspath,
                             line=line,
                             column=column,
                             end_line=line,
@@ -861,7 +863,8 @@ def task(self, obj: Tree.Task) -> Any:
                     obj,
                     "command indented with both spaces & tabs",
                     Error.SourcePosition(
-                        filename=obj.command.pos.filename,
+                        uri=obj.command.pos.uri,
+                        abspath=obj.command.pos.abspath,
                         line=obj.command.pos.line + ofs,
                         column=1,
                         end_line=obj.command.pos.line + ofs,

diff --git a/WDL/Tree.py b/WDL/Tree.py
@@ -12,6 +12,7 @@
 import os
 import errno
 import itertools
+import asyncio
 from typing import (
     Any,
     List,
@@ -24,6 +25,7 @@
     Generator,
     Set,
     NamedTuple,
+    Awaitable,
 )
 from abc import ABC, abstractmethod
 from .Error import SourcePosition, SourceNode
@@ -1255,54 +1257,79 @@ def typecheck(self, check_quant: bool = True) -> None:
             self.workflow.typecheck(self, check_quant=check_quant)
 
 
-def load(
-    uri: str,
-    path: Optional[List[str]] = None,
-    check_quant: bool = True,
-    import_uri: Optional[Callable[[str], str]] = None,
-    import_max_depth: int = 10,
-    source_text: Optional[str] = None,
-) -> Document:
-    path = path or []
-    if source_text is None:
-        if uri.startswith("file://"):
-            uri = uri[7:]
-        elif uri.find("://") > 0 and import_uri:
-            uri = import_uri(uri)
-        # search cwd and path for an extant file
-        fn = next(
+async def resolve_file_import(uri: str, path: List[str], importer: Optional[Document]) -> str:
+    if uri.startswith("http://") or uri.startswith("https://"):
+        # for now we do nothing with web URIs
+        return uri
+    if uri.startswith("file:///"):
+        uri = uri[7:]
+    if os.path.isabs(uri):
+        # given an already-absolute filename, just normalize it
+        ans = os.path.abspath(uri)
+    else:
+        # resolving a relative import: before searching the user-provided path directories, try the
+        # directory of the importing document (if any), or the current working directory
+        # (otherwise)
+        path = path + [os.path.dirname(importer.pos.abspath) if importer else os.getcwd()]
+        ans = next(
             (
                 fn
-                for fn in ([uri] + [os.path.join(dn, uri) for dn in reversed(path)])
-                if os.path.exists(fn)
+                for fn in (os.path.abspath(os.path.join(dn, uri)) for dn in reversed(path))
+                if os.path.isfile(fn)
             ),
             None,
         )
-        if not fn:
-            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), uri)
-        # read the document source text
-        with open(fn, "r") as infile:
-            source_text = infile.read()
-        path = path + [os.path.dirname(fn)]
+    if ans and os.path.isfile(ans):
+        return ans
+    raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), uri)
+
+
+ReadSourceResult = NamedTuple("ReadSourceResult", [("source_text", str), ("abspath", str)])
+
+
+async def read_source_default(
+    uri: str, path: List[str], importer: Optional[Document]
+) -> ReadSourceResult:
+    abspath = await resolve_file_import(uri, path, importer)
+    # TODO: actual async read
+    with open(abspath, "r") as infile:
+        return ReadSourceResult(source_text=infile.read(), abspath=abspath)
+
+
+async def load_async(
+    uri: str,
+    path: Optional[List[str]] = None,
+    check_quant: bool = True,
+    read_source: Optional[
+        Callable[[str, List[str], Optional[Document]], Awaitable[ReadSourceResult]]
+    ] = None,
+    import_max_depth: int = 10,
+    importer: Optional[Document] = None,
+) -> Document:
+    path = list(path) if path is not None else []
+    read_source = read_source or read_source_default
+    read_rslt = await read_source(uri, path, importer)
     # parse the document
-    doc = _parser.parse_document(source_text, uri=uri)
-    assert isinstance(doc, Document)
+    doc = _parser.parse_document(read_rslt.source_text, uri=uri, abspath=read_rslt.abspath)
+    assert doc.pos.uri == uri and doc.pos.abspath.endswith(os.path.basename(doc.pos.uri))
     # recursively descend into document's imports, and store the imported
     # documents into doc.imports
     # TODO: are we supposed to do something smart for relative imports
     #       within a document loaded by URI?
+    # TODO: concurrent imports
     for i in range(len(doc.imports)):
         imp = doc.imports[i]
         if import_max_depth <= 1:
             raise Error.ImportError(
                 imp.pos, imp.uri, "exceeded import_max_depth; circular imports?"
             )
         try:
-            subdoc = load(
+            subdoc = await load_async(
                 imp.uri,
-                path,
+                path=path,
                 check_quant=check_quant,
-                import_uri=import_uri,
+                read_source=read_source,
+                importer=doc,
                 import_max_depth=(import_max_depth - 1),
             )
         except Exception as exn:
@@ -1313,16 +1340,38 @@ def load(
     try:
         doc.typecheck(check_quant=check_quant)
     except Error.ValidationError as exn:
-        exn.source_text = source_text
+        exn.source_text = read_rslt.source_text
         raise exn
     except Error.MultipleValidationErrors as multi:
         for exn in multi.exceptions:
             if not exn.source_text:
-                exn.source_text = source_text
+                exn.source_text = read_rslt.source_text
         raise multi
     return doc
 
 
+def load(
+    uri: str,
+    path: Optional[List[str]] = None,
+    check_quant: bool = True,
+    read_source: Optional[
+        Callable[[str, List[str], Optional[Document]], Awaitable[ReadSourceResult]]
+    ] = None,
+    import_max_depth: int = 10,
+    importer: Optional[Document] = None,
+) -> Document:
+    return asyncio.get_event_loop().run_until_complete(
+        load_async(
+            uri,
+            path=path,
+            importer=importer,
+            check_quant=check_quant,
+            read_source=read_source,
+            import_max_depth=import_max_depth,
+        )
+    )
+
+
 #
 # Typechecking helper functions
 #