diff --git a/python/extractor/BUILD.bazel b/python/extractor/BUILD.bazel index 3b5a5b3a6179..697bf8d49a48 100644 --- a/python/extractor/BUILD.bazel +++ b/python/extractor/BUILD.bazel @@ -5,7 +5,6 @@ py_binary( srcs = [ "make_zips.py", "python_tracer.py", - "unparse.py", ], data = [ "LICENSE-PSF.md", diff --git a/python/extractor/licenses.md b/python/extractor/licenses.md index f3a5c6cc458e..d585a84626dc 100644 --- a/python/extractor/licenses.md +++ b/python/extractor/licenses.md @@ -6,7 +6,6 @@ | `tsg-python/tree-sitter-python` | Y | MIT | Used in `tsg-python` to parse Python files | | `tsg-python` | Y | MIT / Apache | This is our own creation, so are free to choose what license it is covered by. | | `tree-sitter-graph` | N | MIT / Apache | Used in `tsg-python` to execute files written in the `tree-sitter-graph` language. | -| `unparse.py` | Y | PSF | Copied and adapted from `Tools/unparse.py` from the `cpython` source code, with attribution. | | `imp.py` | Y | PSF | Copied and adapted from `Lib/imp.py` from the `cpython` source code, with attribution. | | `semmle/data/*.trap` | Y | PSF | These files were derived from the C source code of the `cpython` project, and are used in our modelling of built-in objects. No attribution, currently. | | `semmle/thrift/parse.py` | Y | Apache | Includes a grammar based on https://github.com/apache/thrift/blob/master/doc/specs/idl.md, with comment stating this attribution. | diff --git a/python/extractor/make_zips.py b/python/extractor/make_zips.py index b91b1bf458d8..faf595acc378 100755 --- a/python/extractor/make_zips.py +++ b/python/extractor/make_zips.py @@ -8,7 +8,6 @@ import compileall from python_tracer import getzipfilename -from unparse import strip_comments_and_docstrings # TO DO -- Add options to set destination directory and source directory @@ -84,9 +83,7 @@ def write_source(zipped, root, name, extensions=[".py"]): if ext not in extensions: continue path = os.path.join(dirpath, name) - temp = strip_comments_and_docstrings(path) - zipped.write(temp, os.path.relpath(path, root)) - os.remove(temp) + zipped.write(path, os.path.relpath(path, root)) def main(): parser = optparse.OptionParser(usage = "usage: %prog [install-dir]") diff --git a/python/extractor/tsg-python/Cargo.toml b/python/extractor/tsg-python/Cargo.toml index feecd2541593..ac4d40931895 100644 --- a/python/extractor/tsg-python/Cargo.toml +++ b/python/extractor/tsg-python/Cargo.toml @@ -7,9 +7,14 @@ authors = ["Taus Brock-Nannestad "] edition = "2018" # When changing/updating these, the `Cargo.Bazel.lock` file has to be regenerated. -# Check out the documentation at https://bazelbuild.github.io/rules_rust/crate_universe.html#repinning--updating-dependencies -# for how to do so. The bazel repository for the tsg-python project is called `py_deps`, -# and instead of calling `bazel sync`, `./build --bazel sync` should be used instead, to always use the correct bazel version. +# Run `CARGO_BAZEL_REPIN=true CARGO_BAZEL_REPIN_ONLY=py_deps ./build --bazel sync --only=py_deps` +# in the `semmle-code` repository to do so. +# For more information, check out the documentation at +# https://bazelbuild.github.io/rules_rust/crate_universe.html#repinning--updating-dependencies +# In the future, the hope is to move this handling of the dependencies entirely into the `codeql` repository, +# but that depends on `rules_rust` being fully compatible with bzlmod, which they aren't yet +# (c.f. https://github.com/bazelbuild/rules_rust/issues/2452). +# Warning: The process takes >5min on my M1 mac, so do wait for a while. [dependencies] anyhow = "1.0" regex = "1" diff --git a/python/extractor/tsg-python/tsp/BUILD.bazel b/python/extractor/tsg-python/tsp/BUILD.bazel index 71319e894f6a..e3389fce1ccd 100644 --- a/python/extractor/tsg-python/tsp/BUILD.bazel +++ b/python/extractor/tsg-python/tsp/BUILD.bazel @@ -7,7 +7,7 @@ package(default_visibility = ["//visibility:public"]) # This will run the build script from the root of the workspace, and # collect the outputs. cargo_build_script( - name = "tsg-build", + name = "tsp-build", srcs = ["bindings/rust/build.rs"], data = glob([ "src/**", @@ -32,7 +32,7 @@ rust_library( proc_macro_deps = all_crate_deps( proc_macro = True, ), - deps = [":tsg-build"] + all_crate_deps( + deps = [":tsp-build"] + all_crate_deps( normal = True, ), ) diff --git a/python/extractor/unparse.py b/python/extractor/unparse.py deleted file mode 100644 index b12f1501592d..000000000000 --- a/python/extractor/unparse.py +++ /dev/null @@ -1,709 +0,0 @@ -#Copied Tools.unparse.py with modifications. Copyright PSF. - -"Usage: unparse.py " -import sys -import ast -import tokenize -import io -import os -import shutil - -# Large float and imaginary literals get turned into infinities in the AST. -# We unparse those infinities to INFSTR. -INFSTR = "1e" + repr(sys.float_info.max_10_exp + 1) - -def interleave(inter, f, seq): - """Call f on each item in seq, calling inter() in between. - """ - seq = iter(seq) - try: - f(next(seq)) - except StopIteration: - pass - else: - for x in seq: - inter() - f(x) - -class Unparser: - """Methods in this class recursively traverse an AST and - output source code for the abstract syntax; original formatting - is disregarded. """ - - def __init__(self, tree, file = sys.stdout): - """Unparser(tree, file=sys.stdout) -> None. - Print the source for tree to file.""" - self.f = file - self._indent = 0 - self.dispatch(tree) - print("", file=self.f) - self.f.flush() - - def fill(self, text = ""): - "Indent a piece of text, according to the current indentation level" - self.f.write("\n"+" "*self._indent + text) - - def write(self, text): - "Append a piece of text to the current line." - self.f.write(text) - - def enter(self): - "Print ':', and increase the indentation." - self.write(":") - self._indent += 1 - - def leave(self): - "Decrease the indentation level." - self._indent -= 1 - - def dispatch(self, tree): - "Dispatcher function, dispatching tree type T to method _T." - if isinstance(tree, list): - for t in tree: - self.dispatch(t) - return - meth = getattr(self, "_"+tree.__class__.__name__) - meth(tree) - - def remove_docstring(self, t): - if hasattr(t, "docstring"): - return - if not t.body: - return - if not isinstance(t.body[0], ast.Expr): - return - if not isinstance(t.body[0].value, ast.Str): - return - t.body = t.body[1:] - - def add_pass(self, t): - if t.body: - #No pass needed - return - t.body = [ast.Pass()] - - ############### Unparsing methods ###################### - # There should be one method per concrete grammar type # - # Constructors should be grouped by sum type. Ideally, # - # this would follow the order in the grammar, but # - # currently doesn't. # - ######################################################## - - def _Module(self, tree): - self.remove_docstring(tree) - self.add_pass(tree) - for stmt in tree.body: - self.dispatch(stmt) - - # stmt - def _Expr(self, tree): - self.fill() - self.dispatch(tree.value) - - def _Import(self, t): - self.fill("import ") - interleave(lambda: self.write(", "), self.dispatch, t.names) - - def _ImportFrom(self, t): - self.fill("from ") - self.write("." * t.level) - if t.module: - self.write(t.module) - self.write(" import ") - interleave(lambda: self.write(", "), self.dispatch, t.names) - - def _Assign(self, t): - self.fill() - for target in t.targets: - self.dispatch(target) - self.write(" = ") - self.dispatch(t.value) - - def _AugAssign(self, t): - self.fill() - self.dispatch(t.target) - self.write(" "+self.binop[t.op.__class__.__name__]+"= ") - self.dispatch(t.value) - - def _AnnAssign(self, t): - self.fill() - if not t.simple and isinstance(t.target, ast.Name): - self.write('(') - self.dispatch(t.target) - if not t.simple and isinstance(t.target, ast.Name): - self.write(')') - self.write(": ") - self.dispatch(t.annotation) - if t.value: - self.write(" = ") - self.dispatch(t.value) - - def _Return(self, t): - self.fill("return") - if t.value: - self.write(" ") - self.dispatch(t.value) - - def _Pass(self, t): - self.fill("pass") - - def _Break(self, t): - self.fill("break") - - def _Continue(self, t): - self.fill("continue") - - def _Delete(self, t): - self.fill("del ") - interleave(lambda: self.write(", "), self.dispatch, t.targets) - - def _Assert(self, t): - self.fill("assert ") - self.dispatch(t.test) - if t.msg: - self.write(", ") - self.dispatch(t.msg) - - def _Global(self, t): - self.fill("global ") - interleave(lambda: self.write(", "), self.write, t.names) - - def _Nonlocal(self, t): - self.fill("nonlocal ") - interleave(lambda: self.write(", "), self.write, t.names) - - def _Await(self, t): - self.write("(") - self.write("await") - if t.value: - self.write(" ") - self.dispatch(t.value) - self.write(")") - - def _Yield(self, t): - self.write("(") - self.write("yield") - if t.value: - self.write(" ") - self.dispatch(t.value) - self.write(")") - - def _YieldFrom(self, t): - self.write("(") - self.write("yield from") - if t.value: - self.write(" ") - self.dispatch(t.value) - self.write(")") - - def _Raise(self, t): - self.fill("raise") - if not t.exc: - assert not t.cause - return - self.write(" ") - self.dispatch(t.exc) - if t.cause: - self.write(" from ") - self.dispatch(t.cause) - - def _Try(self, t): - self.fill("try") - self.enter() - self.dispatch(t.body) - self.leave() - for ex in t.handlers: - self.dispatch(ex) - if t.orelse: - self.fill("else") - self.enter() - self.dispatch(t.orelse) - self.leave() - if t.finalbody: - self.fill("finally") - self.enter() - self.dispatch(t.finalbody) - self.leave() - - def _ExceptHandler(self, t): - self.fill("except") - if t.type: - self.write(" ") - self.dispatch(t.type) - if t.name: - self.write(" as ") - self.write(t.name) - self.enter() - self.dispatch(t.body) - self.leave() - - def _ClassDef(self, t): - self.write("\n") - for deco in t.decorator_list: - self.fill("@") - self.dispatch(deco) - self.fill("class "+t.name) - self.write("(") - comma = False - for e in t.bases: - if comma: self.write(", ") - else: comma = True - self.dispatch(e) - for e in t.keywords: - if comma: self.write(", ") - else: comma = True - self.dispatch(e) - self.write(")") - - self.enter() - self.remove_docstring(t) - self.add_pass(t) - self.dispatch(t.body) - self.leave() - - def _FunctionDef(self, t): - self.__FunctionDef_helper(t, "def") - - def _AsyncFunctionDef(self, t): - self.__FunctionDef_helper(t, "async def") - - def __FunctionDef_helper(self, t, fill_suffix): - self.write("\n") - for deco in t.decorator_list: - self.fill("@") - self.dispatch(deco) - def_str = fill_suffix+" "+t.name + "(" - self.fill(def_str) - self.dispatch(t.args) - self.write(")") - if t.returns: - self.write(" -> ") - self.dispatch(t.returns) - self.enter() - self.remove_docstring(t) - self.add_pass(t) - self.dispatch(t.body) - self.leave() - - def _For(self, t): - self.__For_helper("for ", t) - - def _AsyncFor(self, t): - self.__For_helper("async for ", t) - - def __For_helper(self, fill, t): - self.fill(fill) - self.dispatch(t.target) - self.write(" in ") - self.dispatch(t.iter) - self.enter() - self.dispatch(t.body) - self.leave() - if t.orelse: - self.fill("else") - self.enter() - self.dispatch(t.orelse) - self.leave() - - def _If(self, t): - self.fill("if ") - self.dispatch(t.test) - self.enter() - self.dispatch(t.body) - self.leave() - # collapse nested ifs into equivalent elifs. - while (t.orelse and len(t.orelse) == 1 and - isinstance(t.orelse[0], ast.If)): - t = t.orelse[0] - self.fill("elif ") - self.dispatch(t.test) - self.enter() - self.dispatch(t.body) - self.leave() - # final else - if t.orelse: - self.fill("else") - self.enter() - self.dispatch(t.orelse) - self.leave() - - def _While(self, t): - self.fill("while ") - self.dispatch(t.test) - self.enter() - self.dispatch(t.body) - self.leave() - if t.orelse: - self.fill("else") - self.enter() - self.dispatch(t.orelse) - self.leave() - - def _With(self, t): - self.fill("with ") - interleave(lambda: self.write(", "), self.dispatch, t.items) - self.enter() - self.dispatch(t.body) - self.leave() - - def _AsyncWith(self, t): - self.fill("async with ") - interleave(lambda: self.write(", "), self.dispatch, t.items) - self.enter() - self.dispatch(t.body) - self.leave() - - # expr - def _Bytes(self, t): - self.write(repr(t.s)) - - def _Str(self, tree): - s = repr(tree.s).encode("ascii", errors="backslashreplace").decode("ascii") - self.write(s) - - def _JoinedStr(self, t): - self.write("f") - string = io.StringIO() - self._fstring_JoinedStr(t, string.write) - self.write(repr(string.getvalue())) - - def _FormattedValue(self, t): - self.write("f") - string = io.StringIO() - self._fstring_FormattedValue(t, string.write) - self.write(repr(string.getvalue())) - - def _fstring_JoinedStr(self, t, write): - for value in t.values: - meth = getattr(self, "_fstring_" + type(value).__name__) - meth(value, write) - - def _fstring_Str(self, t, write): - value = t.s.replace("{", "{{").replace("}", "}}") - write(value) - - def _fstring_Constant(self, t, write): - assert isinstance(t.value, str) - value = t.value.replace("{", "{{").replace("}", "}}") - write(value) - - def _fstring_FormattedValue(self, t, write): - write("{") - expr = io.StringIO() - Unparser(t.value, expr) - expr = expr.getvalue().rstrip("\n") - if expr.startswith("{"): - write(" ") # Separate pair of opening brackets as "{ {" - write(expr) - if t.conversion != -1: - conversion = chr(t.conversion) - assert conversion in "sra" - write("!%s" % conversion) - if t.format_spec: - write(":") - meth = getattr(self, "_fstring_" + type(t.format_spec).__name__) - meth(t.format_spec, write) - write("}") - - def _Name(self, t): - self.write(t.id) - - def _write_constant(self, value): - if isinstance(value, (float, complex)): - self.write(repr(value).replace("inf", INFSTR)) - else: - self.write(repr(value)) - - def _Constant(self, t): - value = t.value - if isinstance(value, tuple): - self.write("(") - if len(value) == 1: - self._write_constant(value[0]) - self.write(",") - else: - interleave(lambda: self.write(", "), self._write_constant, value) - self.write(")") - else: - self._write_constant(t.value) - - def _NameConstant(self, t): - self.write(repr(t.value)) - - def _Num(self, t): - # Substitute overflowing decimal literal for AST infinities. - self.write(repr(t.n).replace("inf", INFSTR)) - - def _List(self, t): - self.write("[") - interleave(lambda: self.write(", "), self.dispatch, t.elts) - self.write("]") - - def _ListComp(self, t): - self.write("[") - self.dispatch(t.elt) - for gen in t.generators: - self.dispatch(gen) - self.write("]") - - def _GeneratorExp(self, t): - self.write("(") - self.dispatch(t.elt) - for gen in t.generators: - self.dispatch(gen) - self.write(")") - - def _SetComp(self, t): - self.write("{") - self.dispatch(t.elt) - for gen in t.generators: - self.dispatch(gen) - self.write("}") - - def _DictComp(self, t): - self.write("{") - self.dispatch(t.key) - self.write(": ") - self.dispatch(t.value) - for gen in t.generators: - self.dispatch(gen) - self.write("}") - - def _comprehension(self, t): - if hasattr(t, "is_async") and t.is_async: - self.write(" async for ") - else: - self.write(" for ") - self.dispatch(t.target) - self.write(" in ") - self.dispatch(t.iter) - for if_clause in t.ifs: - self.write(" if ") - self.dispatch(if_clause) - - def _IfExp(self, t): - self.write("(") - self.dispatch(t.body) - self.write(" if ") - self.dispatch(t.test) - self.write(" else ") - self.dispatch(t.orelse) - self.write(")") - - def _Set(self, t): - assert(t.elts) # should be at least one element - self.write("{") - interleave(lambda: self.write(", "), self.dispatch, t.elts) - self.write("}") - - def _Dict(self, t): - self.write("{") - def write_key_value_pair(k, v): - self.dispatch(k) - self.write(": ") - self.dispatch(v) - - def write_item(item): - k, v = item - if k is None: - # for dictionary unpacking operator in dicts {**{'y': 2}} - # see PEP 448 for details - self.write("**") - self.dispatch(v) - else: - write_key_value_pair(k, v) - interleave(lambda: self.write(", "), write_item, zip(t.keys, t.values)) - self.write("}") - - def _Tuple(self, t): - self.write("(") - if len(t.elts) == 1: - elt = t.elts[0] - self.dispatch(elt) - self.write(",") - else: - interleave(lambda: self.write(", "), self.dispatch, t.elts) - self.write(")") - - unop = {"Invert":"~", "Not": "not", "UAdd":"+", "USub":"-"} - def _UnaryOp(self, t): - self.write("(") - self.write(self.unop[t.op.__class__.__name__]) - self.write(" ") - self.dispatch(t.operand) - self.write(")") - - binop = { "Add":"+", "Sub":"-", "Mult":"*", "MatMult":"@", "Div":"/", "Mod":"%", - "LShift":"<<", "RShift":">>", "BitOr":"|", "BitXor":"^", "BitAnd":"&", - "FloorDiv":"//", "Pow": "**"} - def _BinOp(self, t): - self.write("(") - self.dispatch(t.left) - self.write(" " + self.binop[t.op.__class__.__name__] + " ") - self.dispatch(t.right) - self.write(")") - - cmpops = {"Eq":"==", "NotEq":"!=", "Lt":"<", "LtE":"<=", "Gt":">", "GtE":">=", - "Is":"is", "IsNot":"is not", "In":"in", "NotIn":"not in"} - def _Compare(self, t): - self.write("(") - self.dispatch(t.left) - for o, e in zip(t.ops, t.comparators): - self.write(" " + self.cmpops[o.__class__.__name__] + " ") - self.dispatch(e) - self.write(")") - - boolops = {ast.And: 'and', ast.Or: 'or'} - def _BoolOp(self, t): - self.write("(") - s = " %s " % self.boolops[t.op.__class__] - interleave(lambda: self.write(s), self.dispatch, t.values) - self.write(")") - - def _Attribute(self,t): - self.dispatch(t.value) - # Special case: 3.__abs__() is a syntax error, so if t.value - # is an integer literal then we need to either parenthesize - # it or add an extra space to get 3 .__abs__(). - if isinstance(t.value, ast.Num) and isinstance(t.value.n, int): - self.write(" ") - self.write(".") - self.write(t.attr) - - def _Call(self, t): - self.dispatch(t.func) - self.write("(") - comma = False - for e in t.args: - if comma: self.write(", ") - else: comma = True - self.dispatch(e) - for e in t.keywords: - if comma: self.write(", ") - else: comma = True - self.dispatch(e) - self.write(")") - - def _Subscript(self, t): - self.dispatch(t.value) - self.write("[") - self.dispatch(t.slice) - self.write("]") - - def _Starred(self, t): - self.write("*") - self.dispatch(t.value) - - # slice - def _Ellipsis(self, t): - self.write("...") - - def _Index(self, t): - self.dispatch(t.value) - - def _Slice(self, t): - if t.lower: - self.dispatch(t.lower) - self.write(":") - if t.upper: - self.dispatch(t.upper) - if t.step: - self.write(":") - self.dispatch(t.step) - - def _ExtSlice(self, t): - interleave(lambda: self.write(', '), self.dispatch, t.dims) - - # argument - def _arg(self, t): - self.write(t.arg) - if t.annotation: - self.write(": ") - self.dispatch(t.annotation) - - # others - def _arguments(self, t): - first = True - # normal arguments - defaults = [None] * (len(t.args) - len(t.defaults)) + t.defaults - for a, d in zip(t.args, defaults): - if first:first = False - else: self.write(", ") - self.dispatch(a) - if d: - self.write("=") - self.dispatch(d) - - # varargs, or bare '*' if no varargs but keyword-only arguments present - if t.vararg or t.kwonlyargs: - if first:first = False - else: self.write(", ") - self.write("*") - if t.vararg: - self.write(t.vararg.arg) - if t.vararg.annotation: - self.write(": ") - self.dispatch(t.vararg.annotation) - - # keyword-only arguments - if t.kwonlyargs: - for a, d in zip(t.kwonlyargs, t.kw_defaults): - if first:first = False - else: self.write(", ") - self.dispatch(a), - if d: - self.write("=") - self.dispatch(d) - - # kwargs - if t.kwarg: - if first:first = False - else: self.write(", ") - self.write("**"+t.kwarg.arg) - if t.kwarg.annotation: - self.write(": ") - self.dispatch(t.kwarg.annotation) - - def _keyword(self, t): - if t.arg is None: - self.write("**") - else: - self.write(t.arg) - self.write("=") - self.dispatch(t.value) - - def _Lambda(self, t): - self.write("(") - self.write("lambda ") - self.dispatch(t.args) - self.write(": ") - self.dispatch(t.body) - self.write(")") - - def _alias(self, t): - self.write(t.name) - if t.asname: - self.write(" as "+t.asname) - - def _withitem(self, t): - self.dispatch(t.context_expr) - if t.optional_vars: - self.write(" as ") - self.dispatch(t.optional_vars) - -def roundtrip(filename, outpath): - with open(filename, "rb") as pyfile: - encoding = tokenize.detect_encoding(pyfile.readline)[0] - with open(filename, "r", encoding=encoding) as pyfile: - source = pyfile.read() - tree = compile(source, filename, "exec", ast.PyCF_ONLY_AST) - with open(outpath, "w", encoding=encoding) as output: - Unparser(tree, output) - -def strip_comments_and_docstrings(path): - tmp = path + ".tmp" - if path.endswith(".py"): - roundtrip(path, tmp) - else: - shutil.copy(path, tmp) - return tmp