diff --git a/python/ql/src/experimental/dataflow/TypeTracker.qll b/python/ql/src/experimental/dataflow/TypeTracker.qll index 4491279a971e..a5e3493c12d0 100644 --- a/python/ql/src/experimental/dataflow/TypeTracker.qll +++ b/python/ql/src/experimental/dataflow/TypeTracker.qll @@ -6,7 +6,7 @@ private import internal.DataFlowPrivate /** Any string that may appear as the name of an attribute or access path. */ class AttributeName extends string { - AttributeName() { this = any(Attribute a).getName() } + AttributeName() { this = any(AttrRef a).getAttributeName() } } /** Either an attribute name, or the empty string (representing no attribute). */ @@ -115,11 +115,10 @@ predicate returnStep(ReturnNode nodeFrom, Node nodeTo) { * assignment to `z` inside `bar`, even though this attribute write happens _after_ `bar` is called. */ predicate basicStoreStep(Node nodeFrom, Node nodeTo, string attr) { - exists(AttributeAssignment a, Node var | - a.getName() = attr and - simpleLocalFlowStep*(nodeTo, var) and - var.asVar() = a.getInput() and - nodeFrom.asCfgNode() = a.getValue() + exists(AttrWrite a | + a.mayHaveAttributeName(attr) and + nodeFrom = a.getValue() and + simpleLocalFlowStep*(nodeTo, a.getObject()) ) } @@ -127,7 +126,11 @@ predicate basicStoreStep(Node nodeFrom, Node nodeTo, string attr) { * Holds if `nodeTo` is the result of accessing the `attr` attribute of `nodeFrom`. */ predicate basicLoadStep(Node nodeFrom, Node nodeTo, string attr) { - exists(AttrNode s | nodeTo.asCfgNode() = s and s.getObject(attr) = nodeFrom.asCfgNode()) + exists(AttrRead a | + a.mayHaveAttributeName(attr) and + nodeFrom = a.getObject() and + nodeTo = a + ) } /** diff --git a/python/ql/src/experimental/dataflow/internal/Attributes.qll b/python/ql/src/experimental/dataflow/internal/Attributes.qll new file mode 100644 index 000000000000..6ba1f7e9cc07 --- /dev/null +++ b/python/ql/src/experimental/dataflow/internal/Attributes.qll @@ -0,0 +1,256 @@ +/** This module provides an API for attribute reads and writes. */ + +import DataFlowUtil +import DataFlowPublic +private import DataFlowPrivate + +/** + * A data flow node that reads or writes an attribute of an object. + * + * This abstract base class only knows about the base object on which the attribute is being + * accessed, and the attribute itself, if it is statically inferrable. + */ +abstract class AttrRef extends Node { + /** + * Gets the data flow node corresponding to the object whose attribute is being read or written. + */ + abstract Node getObject(); + + /** + * Gets the expression node that defines the attribute being accessed, if any. This is + * usually an identifier or literal. + */ + abstract ExprNode getAttributeNameExpr(); + + /** + * Holds if this attribute reference may access an attribute named `attrName`. + * Uses local data flow to track potential attribute names, which may lead to imprecision. If more + * precision is needed, consider using `getAttributeName` instead. + */ + predicate mayHaveAttributeName(string attrName) { + attrName = this.getAttributeName() + or + exists(Node nodeFrom | + localFlow(nodeFrom, this.getAttributeNameExpr()) and + attrName = nodeFrom.asExpr().(StrConst).getText() + ) + } + + /** + * Gets the name of the attribute being read or written. For dynamic attribute accesses, this + * method is not guaranteed to return a result. For such cases, using `mayHaveAttributeName` may yield + * better results. + */ + abstract string getAttributeName(); +} + +/** + * A data flow node that writes an attribute of an object. This includes + * - Simple attribute writes: `object.attr = value` + * - Dynamic attribute writes: `setattr(object, attr, value)` + * - Fields written during class initialization: `class MyClass: attr = value` + */ +abstract class AttrWrite extends AttrRef { + /** Gets the data flow node corresponding to the value that is written to the attribute. */ + abstract Node getValue(); +} + +/** + * Represents a control flow node for a simple attribute assignment. That is, + * ```python + * object.attr = value + * ``` + * Also gives access to the `value` being written, by extending `DefinitionNode`. + */ +private class AttributeAssignmentNode extends DefinitionNode, AttrNode, DataFlowCfgNode { + override ControlFlowNode getValue() { result = DefinitionNode.super.getValue() } +} + +/** A simple attribute assignment: `object.attr = value`. */ +private class AttributeAssignmentAsAttrWrite extends AttrWrite, CfgNode { + override AttributeAssignmentNode node; + + override Node getValue() { result.asCfgNode() = node.getValue() } + + override Node getObject() { result.asCfgNode() = node.getObject() } + + override ExprNode getAttributeNameExpr() { + // Attribute names don't exist as `Node`s in the control flow graph, as they can only ever be + // identifiers, and are therefore represented directly as strings. + // Use `getAttributeName` to access the name of the attribute. + none() + } + + override string getAttributeName() { result = node.getName() } +} + +import semmle.python.types.Builtins + +/** Represents `CallNode`s that may refer to calls to built-in functions or classes. */ +private class BuiltInCallNode extends CallNode, DataFlowCfgNode { + string name; + + BuiltInCallNode() { + // TODO disallow instances where the name of the built-in may refer to an in-scope variable of that name. + exists(NameNode id | this.getFunction() = id and id.getId() = name and id.isGlobal()) and + name = any(Builtin b).getName() + } + + /** Gets the name of the built-in function that is called at this `CallNode` */ + string getBuiltinName() { result = name } +} + +/** + * Represents a call to the built-ins that handle dynamic inspection and modification of + * attributes: `getattr`, `setattr`, `hasattr`, and `delattr`. + */ +private class BuiltinAttrCallNode extends BuiltInCallNode { + BuiltinAttrCallNode() { name in ["setattr", "getattr", "hasattr", "delattr"] } + + /** Gets the control flow node for object on which the attribute is accessed. */ + ControlFlowNode getObject() { result in [this.getArg(0), this.getArgByName("object")] } + + /** + * Gets the control flow node for the value that is being written to the attribute. + * Only relevant for `setattr` calls. + */ + ControlFlowNode getValue() { + // only valid for `setattr` + name = "setattr" and + result in [this.getArg(2), this.getArgByName("value")] + } + + /** Gets the control flow node that defines the name of the attribute being accessed. */ + ControlFlowNode getName() { result in [this.getArg(1), this.getArgByName("name")] } +} + +/** Represents calls to the built-in `setattr`. */ +private class SetAttrCallNode extends BuiltinAttrCallNode { + SetAttrCallNode() { name = "setattr" } +} + +/** Represents calls to the built-in `getattr`. */ +private class GetAttrCallNode extends BuiltinAttrCallNode { + GetAttrCallNode() { name = "getattr" } +} + +/** An attribute assignment using `setattr`, e.g. `setattr(object, attr, value)` */ +private class SetAttrCallAsAttrWrite extends AttrWrite, CfgNode { + override SetAttrCallNode node; + + override Node getValue() { result.asCfgNode() = node.getValue() } + + override Node getObject() { result.asCfgNode() = node.getObject() } + + override ExprNode getAttributeNameExpr() { result.asCfgNode() = node.getName() } + + override string getAttributeName() { + result = this.getAttributeNameExpr().asExpr().(StrConst).getText() + } +} + +/** + * Represents an attribute of a class that is assigned statically during class definition. For instance + * ```python + * class MyClass: + * attr = value + * ... + * ``` + * Instances of this class correspond to the `NameNode` for `attr`, and also gives access to `value` by + * virtue of being a `DefinitionNode`. + */ +private class ClassAttributeAssignmentNode extends DefinitionNode, NameNode, DataFlowCfgNode { } + +/** + * An attribute assignment via a class field, e.g. + * ```python + * class MyClass: + * attr = value + * ``` + * is treated as equivalent to `MyClass.attr = value`. + */ +private class ClassDefinitionAsAttrWrite extends AttrWrite, CfgNode { + ClassExpr cls; + override ClassAttributeAssignmentNode node; + + ClassDefinitionAsAttrWrite() { node.getScope() = cls.getInnerScope() } + + override Node getValue() { result.asCfgNode() = node.getValue() } + + override Node getObject() { result.asCfgNode() = cls.getAFlowNode() } + + override ExprNode getAttributeNameExpr() { none() } + + override string getAttributeName() { result = node.getId() } +} + +/** + * A read of an attribute on an object. This includes + * - Simple attribute reads: `object.attr` + * - Dynamic attribute reads using `getattr`: `getattr(object, attr)` + * - Qualified imports: `from module import attr as name` + */ +abstract class AttrRead extends AttrRef, Node { } + +/** + * A convenience class for embedding `AttrNode` into `DataFlowCfgNode`, as the former is not + * obviously a subtype of the latter. + */ +private class DataFlowAttrNode extends AttrNode, DataFlowCfgNode { } + +/** A simple attribute read, e.g. `object.attr` */ +private class AttributeReadAsAttrRead extends AttrRead, CfgNode { + override DataFlowAttrNode node; + + override Node getObject() { result.asCfgNode() = node.getObject() } + + override ExprNode getAttributeNameExpr() { + // Attribute names don't exist as `Node`s in the control flow graph, as they can only ever be + // identifiers, and are therefore represented directly as strings. + // Use `getAttributeName` to access the name of the attribute. + none() + } + + override string getAttributeName() { result = node.getName() } +} + +/** An attribute read using `getattr`: `getattr(object, attr)` */ +private class GetAttrCallAsAttrRead extends AttrRead, CfgNode { + override GetAttrCallNode node; + + override Node getObject() { result.asCfgNode() = node.getObject() } + + override ExprNode getAttributeNameExpr() { result.asCfgNode() = node.getName() } + + override string getAttributeName() { + result = this.getAttributeNameExpr().asExpr().(StrConst).getText() + } +} + +/** + * A convenience class for embedding `ImportMemberNode` into `DataFlowCfgNode`, as the former is not + * obviously a subtype of the latter. + */ +private class DataFlowImportMemberNode extends ImportMemberNode, DataFlowCfgNode { } + +/** + * Represents a named import as an attribute read. That is, + * ```python + * from module import attr as attr_ref + * ``` + * is treated as if it is a read of the attribute `module.attr`, even if `module` is not imported directly. + */ +private class ModuleAttributeImportAsAttrRead extends AttrRead, CfgNode { + override DataFlowImportMemberNode node; + + override Node getObject() { result.asCfgNode() = node.getModule(_) } + + override ExprNode getAttributeNameExpr() { + // The name of an imported attribute doesn't exist as a `Node` in the control flow graph, as it + // can only ever be an identifier, and is therefore represented directly as a string. + // Use `getAttributeName` to access the name of the attribute. + none() + } + + override string getAttributeName() { exists(node.getModule(result)) } +} diff --git a/python/ql/src/experimental/dataflow/internal/DataFlowPublic.qll b/python/ql/src/experimental/dataflow/internal/DataFlowPublic.qll index fb68fe3b1f14..9cb9d115b226 100644 --- a/python/ql/src/experimental/dataflow/internal/DataFlowPublic.qll +++ b/python/ql/src/experimental/dataflow/internal/DataFlowPublic.qll @@ -5,6 +5,7 @@ private import python private import DataFlowPrivate import experimental.dataflow.TypeTracker +import Attributes private import semmle.python.essa.SsaCompute /** diff --git a/python/ql/src/experimental/dataflow/internal/DataFlowUtil.qll b/python/ql/src/experimental/dataflow/internal/DataFlowUtil.qll index c868e1762ecc..762ce7fb9218 100644 --- a/python/ql/src/experimental/dataflow/internal/DataFlowUtil.qll +++ b/python/ql/src/experimental/dataflow/internal/DataFlowUtil.qll @@ -18,7 +18,7 @@ predicate localFlowStep(Node nodeFrom, Node nodeTo) { simpleLocalFlowStep(nodeFr predicate localFlow(Node source, Node sink) { localFlowStep*(source, sink) } /** - * Gets an EssaNode that holds the module imported by `name`. + * Gets a `Node` that refers to the module referenced by `name`. * Note that for the statement `import pkg.mod`, the new variable introduced is `pkg` that is a * reference to the module `pkg`. * @@ -27,6 +27,9 @@ predicate localFlow(Node source, Node sink) { localFlowStep*(source, sink) } * 2. `from import ` when ` = + "." + ` * 3. `from import ` when ` = + "." + ` * + * Finally, in `from import ` we consider the `ImportExpr` corresponding to + * `` to be a reference to that module. + * * Note: * While it is technically possible that `import mypkg.foo` and `from mypkg import foo` can give different values, * it's highly unlikely that this will be a problem in production level code. @@ -36,7 +39,7 @@ predicate localFlow(Node source, Node sink) { localFlowStep*(source, sink) } * * Also see `DataFlow::importMember` */ -EssaNode importModule(string name) { +Node importModule(string name) { exists(Variable var, Import imp, Alias alias | alias = imp.getAName() and alias.getAsname() = var.getAStore() and @@ -45,8 +48,29 @@ EssaNode importModule(string name) { or name = alias.getValue().(ImportExpr).getImportedModuleName() ) and - result.getVar().(AssignmentDefinition).getSourceVariable() = var + result.(EssaNode).getVar().(AssignmentDefinition).getSourceVariable() = var ) + or + // Although it may seem superfluous to consider the `foo` part of `from foo import bar as baz` to + // be a reference to a module (since that reference only makes sense locally within the `import` + // statement), it's important for our use of type trackers to consider this local reference to + // also refer to the `foo` module. That way, if one wants to track references to the `bar` + // attribute using a type tracker, one can simply write + // + // ```ql + // DataFlow::Node bar_attr_tracker(TypeTracker t) { + // t.startInAttr("bar") and + // result = foo_module_tracker() + // or + // exists(TypeTracker t2 | result = bar_attr_tracker(t2).track(t2, t)) + // } + // ``` + // + // Where `foo_module_tracker` is a type tracker that tracks references to the `foo` module. + // Because named imports are modelled as `AttrRead`s, the statement `from foo import bar as baz` + // is interpreted as if it was an assignment `baz = foo.bar`, which means `baz` gets tracked as a + // reference to `foo.bar`, as desired. + result.asCfgNode().getNode() = any(ImportExpr i | i.getAnImportedModuleName() = name) } /** diff --git a/python/ql/test/experimental/dataflow/import-helper/ImportHelper.expected b/python/ql/test/experimental/dataflow/import-helper/ImportHelper.expected index 521bb780a56a..2085902c6cfe 100644 --- a/python/ql/test/experimental/dataflow/import-helper/ImportHelper.expected +++ b/python/ql/test/experimental/dataflow/import-helper/ImportHelper.expected @@ -1,16 +1,36 @@ importModule +| test1.py:1:8:1:12 | ControlFlowNode for ImportExpr | mypkg | | test1.py:1:8:1:12 | GSSA Variable mypkg | mypkg | +| test2.py:1:6:1:10 | ControlFlowNode for ImportExpr | mypkg | +| test2.py:1:6:1:10 | ControlFlowNode for ImportExpr | mypkg | | test2.py:1:19:1:21 | GSSA Variable foo | mypkg.foo | | test2.py:1:24:1:26 | GSSA Variable bar | mypkg.bar | +| test3.py:1:8:1:16 | ControlFlowNode for ImportExpr | mypkg | +| test3.py:1:8:1:16 | ControlFlowNode for ImportExpr | mypkg.foo | +| test3.py:2:8:2:16 | ControlFlowNode for ImportExpr | mypkg | +| test3.py:2:8:2:16 | ControlFlowNode for ImportExpr | mypkg.bar | | test3.py:2:8:2:16 | GSSA Variable mypkg | mypkg | +| test4.py:1:8:1:16 | ControlFlowNode for ImportExpr | mypkg | +| test4.py:1:8:1:16 | ControlFlowNode for ImportExpr | mypkg.foo | | test4.py:1:21:1:24 | GSSA Variable _foo | mypkg.foo | +| test4.py:2:8:2:16 | ControlFlowNode for ImportExpr | mypkg | +| test4.py:2:8:2:16 | ControlFlowNode for ImportExpr | mypkg.bar | | test4.py:2:21:2:24 | GSSA Variable _bar | mypkg.bar | +| test5.py:1:8:1:12 | ControlFlowNode for ImportExpr | mypkg | | test5.py:1:8:1:12 | GSSA Variable mypkg | mypkg | +| test5.py:9:6:9:10 | ControlFlowNode for ImportExpr | mypkg | | test5.py:9:26:9:29 | GSSA Variable _bar | mypkg.bar | +| test6.py:1:8:1:12 | ControlFlowNode for ImportExpr | mypkg | | test6.py:1:8:1:12 | GSSA Variable mypkg | mypkg | +| test6.py:5:8:5:16 | ControlFlowNode for ImportExpr | mypkg | +| test6.py:5:8:5:16 | ControlFlowNode for ImportExpr | mypkg.foo | | test6.py:5:8:5:16 | GSSA Variable mypkg | mypkg | +| test7.py:1:6:1:10 | ControlFlowNode for ImportExpr | mypkg | | test7.py:1:19:1:21 | GSSA Variable foo | mypkg.foo | +| test7.py:5:8:5:16 | ControlFlowNode for ImportExpr | mypkg | +| test7.py:5:8:5:16 | ControlFlowNode for ImportExpr | mypkg.foo | | test7.py:5:8:5:16 | GSSA Variable mypkg | mypkg | +| test7.py:9:6:9:10 | ControlFlowNode for ImportExpr | mypkg | | test7.py:9:19:9:21 | GSSA Variable foo | mypkg.foo | importMember | test2.py:1:19:1:21 | GSSA Variable foo | mypkg | foo | diff --git a/python/ql/test/experimental/dataflow/typetracking/attribute_tests.py b/python/ql/test/experimental/dataflow/typetracking/attribute_tests.py index 7d88489842f9..5e8e87f8ae32 100644 --- a/python/ql/test/experimental/dataflow/typetracking/attribute_tests.py +++ b/python/ql/test/experimental/dataflow/typetracking/attribute_tests.py @@ -29,3 +29,73 @@ def test_incompatible_types(): expects_int(x) # $int=field $f+:str=field x.field = str("Hello") # $f+:int=field $str=field $f+:int $str expects_string(x) # $f+:int=field $str=field + + +# Attributes assigned statically to a class + +class MyClass: # $tracked=field + field = tracked # $tracked + +lookup = MyClass.field # $tracked $tracked=field +instance = MyClass() # $tracked=field +lookup2 = instance.field # $f-:tracked + +## Dynamic attribute access + +# Via `getattr`/`setattr` + +def setattr_immediate_write(): + x = SomeClass() # $tracked=foo + setattr(x,"foo", tracked) # $tracked $tracked=foo + y = x.foo # $tracked $tracked=foo + do_stuff(y) # $tracked + +def getattr_immediate_read(): + x = SomeClass() # $tracked=foo + x.foo = tracked # $tracked $tracked=foo + y = getattr(x,"foo") # $tracked $tracked=foo + do_stuff(y) # $tracked + +def setattr_indirect_write(): + attr = "foo" + x = SomeClass() # $tracked=foo + setattr(x, attr, tracked) # $tracked $tracked=foo + y = x.foo # $tracked $tracked=foo + do_stuff(y) # $tracked + +def getattr_indirect_read(): + attr = "foo" + x = SomeClass() # $tracked=foo + x.foo = tracked # $tracked $tracked=foo + y = getattr(x, attr) #$tracked $tracked=foo + do_stuff(y) # $tracked + +# Via `__dict__` -- not currently implemented. + +def dunder_dict_immediate_write(): + x = SomeClass() # $f-:tracked=foo + x.__dict__["foo"] = tracked # $tracked $f-:tracked=foo + y = x.foo # $f-:tracked $f-:tracked=foo + do_stuff(y) # $f-:tracked + +def dunder_dict_immediate_read(): + x = SomeClass() # $tracked=foo + x.foo = tracked # $tracked $tracked=foo + y = x.__dict__["foo"] # $f-:tracked $tracked=foo + do_stuff(y) # $f-:tracked + +def dunder_dict_indirect_write(): + attr = "foo" + x = SomeClass() # $f-:tracked=foo + x.__dict__[attr] = tracked # $tracked $f-:tracked=foo + y = x.foo # $f-:tracked $f-:tracked=foo + do_stuff(y) # $f-:tracked + +def dunder_dict_indirect_read(): + attr = "foo" + x = SomeClass() # $tracked=foo + x.foo = tracked # $tracked $tracked=foo + y = x.__dict__[attr] # $f-:tracked $tracked=foo + do_stuff(y) # $f-:tracked + + diff --git a/python/ql/test/experimental/dataflow/typetracking/import_as_attr.py b/python/ql/test/experimental/dataflow/typetracking/import_as_attr.py new file mode 100644 index 000000000000..1e2085f126dc --- /dev/null +++ b/python/ql/test/experimental/dataflow/typetracking/import_as_attr.py @@ -0,0 +1,9 @@ +from module import attr as attr_ref + +x = attr_ref + +def fun(): + y = attr_ref + +# The following should _not_ be a reference to the above module, since we don't actually import it. +z = module diff --git a/python/ql/test/experimental/dataflow/typetracking/moduleattr.expected b/python/ql/test/experimental/dataflow/typetracking/moduleattr.expected new file mode 100644 index 000000000000..adfc8c5a3795 --- /dev/null +++ b/python/ql/test/experimental/dataflow/typetracking/moduleattr.expected @@ -0,0 +1,10 @@ +module_tracker +| import_as_attr.py:1:6:1:11 | ControlFlowNode for ImportExpr | +module_attr_tracker +| import_as_attr.py:0:0:0:0 | ModuleVariableNode for Global Variable attr_ref in Module import_as_attr | +| import_as_attr.py:1:20:1:35 | ControlFlowNode for ImportMember | +| import_as_attr.py:1:28:1:35 | GSSA Variable attr_ref | +| import_as_attr.py:3:1:3:1 | GSSA Variable x | +| import_as_attr.py:3:5:3:12 | ControlFlowNode for attr_ref | +| import_as_attr.py:6:5:6:5 | SSA variable y | +| import_as_attr.py:6:9:6:16 | ControlFlowNode for attr_ref | diff --git a/python/ql/test/experimental/dataflow/typetracking/moduleattr.ql b/python/ql/test/experimental/dataflow/typetracking/moduleattr.ql new file mode 100644 index 000000000000..15616d918609 --- /dev/null +++ b/python/ql/test/experimental/dataflow/typetracking/moduleattr.ql @@ -0,0 +1,23 @@ +import python +import experimental.dataflow.DataFlow +import experimental.dataflow.TypeTracker + +DataFlow::Node module_tracker(TypeTracker t) { + t.start() and + result = DataFlow::importModule("module") + or + exists(TypeTracker t2 | result = module_tracker(t2).track(t2, t)) +} + +query DataFlow::Node module_tracker() { result = module_tracker(DataFlow::TypeTracker::end()) } + +DataFlow::Node module_attr_tracker(TypeTracker t) { + t.startInAttr("attr") and + result = module_tracker() + or + exists(TypeTracker t2 | result = module_attr_tracker(t2).track(t2, t)) +} + +query DataFlow::Node module_attr_tracker() { + result = module_attr_tracker(DataFlow::TypeTracker::end()) +}