-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Python: Port URL sanitisation queries to API graphs #5250
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
e812eb7
2942a11
cac6c4a
e77c105
af644a0
404649d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
@@ -1,34 +1,39 @@ | ||||||||||||
import python | ||||||||||||
import semmle.python.objects.ObjectInternal | ||||||||||||
deprecated import semmle.python.objects.ObjectInternal as OI | ||||||||||||
private import semmle.python.ApiGraphs | ||||||||||||
|
||||||||||||
private predicate re_module_function(string name, int flags) { | ||||||||||||
name = "compile" and flags = 1 | ||||||||||||
/** | ||||||||||||
* Gets the positional argument index containing the regular expression flags for the member of the | ||||||||||||
* `re` module with the name `name`. | ||||||||||||
*/ | ||||||||||||
private int re_member_flags_arg(string name) { | ||||||||||||
name = "compile" and result = 1 | ||||||||||||
or | ||||||||||||
name = "search" and flags = 2 | ||||||||||||
name = "search" and result = 2 | ||||||||||||
or | ||||||||||||
name = "match" and flags = 2 | ||||||||||||
name = "match" and result = 2 | ||||||||||||
or | ||||||||||||
name = "split" and flags = 3 | ||||||||||||
name = "split" and result = 3 | ||||||||||||
or | ||||||||||||
name = "findall" and flags = 2 | ||||||||||||
name = "findall" and result = 2 | ||||||||||||
or | ||||||||||||
name = "finditer" and flags = 2 | ||||||||||||
name = "finditer" and result = 2 | ||||||||||||
or | ||||||||||||
name = "sub" and flags = 4 | ||||||||||||
name = "sub" and result = 4 | ||||||||||||
or | ||||||||||||
name = "subn" and flags = 4 | ||||||||||||
name = "subn" and result = 4 | ||||||||||||
} | ||||||||||||
|
||||||||||||
/** | ||||||||||||
* Gets the names and corresponding values of attributes of the `re` module that are likely to be | ||||||||||||
* Gets the names and corresponding API nodes of members of the `re` module that are likely to be | ||||||||||||
* methods taking regular expressions as arguments. | ||||||||||||
* | ||||||||||||
* This is a helper predicate that fixes a bad join order, and should not be inlined without checking | ||||||||||||
* that this is safe. | ||||||||||||
*/ | ||||||||||||
pragma[nomagic] | ||||||||||||
private Value relevant_re_attr(string name) { | ||||||||||||
result = Module::named("re").attr(name) and | ||||||||||||
private API::Node relevant_re_member(string name) { | ||||||||||||
result = API::moduleImport("re").getMember(name) and | ||||||||||||
name != "escape" | ||||||||||||
} | ||||||||||||
|
||||||||||||
|
@@ -39,24 +44,78 @@ private Value relevant_re_attr(string name) { | |||||||||||
predicate used_as_regex(Expr s, string mode) { | ||||||||||||
(s instanceof Bytes or s instanceof Unicode) and | ||||||||||||
/* Call to re.xxx(regex, ... [mode]) */ | ||||||||||||
exists(CallNode call, string name | | ||||||||||||
call.getArg(0).pointsTo(_, _, s.getAFlowNode()) and | ||||||||||||
call.getFunction().pointsTo(relevant_re_attr(name)) | ||||||||||||
exists(DataFlow::CallCfgNode call, string name | | ||||||||||||
call.getArg(0).asExpr() = s and | ||||||||||||
call = relevant_re_member(name).getACall() | ||||||||||||
| | ||||||||||||
mode = "None" | ||||||||||||
or | ||||||||||||
exists(Value obj | mode = mode_from_mode_object(obj) | | ||||||||||||
exists(int flags_arg | | ||||||||||||
re_module_function(name, flags_arg) and | ||||||||||||
call.getArg(flags_arg).pointsTo(obj) | ||||||||||||
) | ||||||||||||
or | ||||||||||||
call.getArgByName("flags").pointsTo(obj) | ||||||||||||
mode = mode_from_node([call.getArg(re_member_flags_arg(name)), call.getArgByName("flags")]) | ||||||||||||
) | ||||||||||||
} | ||||||||||||
|
||||||||||||
/** | ||||||||||||
* Gets the canonical name for the API graph node corresponding to the `re` flag `flag`. For flags | ||||||||||||
* that have multiple names, we pick the long-form name as a canonical representative. | ||||||||||||
*/ | ||||||||||||
private string canonical_name(API::Node flag) { | ||||||||||||
result in ["ASCII", "IGNORECASE", "LOCALE", "UNICODE", "MULTILINE", "TEMPLATE"] and | ||||||||||||
flag = API::moduleImport("re").getMember([result, result.prefix(1)]) | ||||||||||||
or | ||||||||||||
flag = API::moduleImport("re").getMember(["DOTALL", "S"]) and result = "DOTALL" | ||||||||||||
or | ||||||||||||
flag = API::moduleImport("re").getMember(["VERBOSE", "X"]) and result = "VERBOSE" | ||||||||||||
} | ||||||||||||
|
||||||||||||
/** | ||||||||||||
* A type tracker for regular expression flag names. Holds if the result is a node that may refer | ||||||||||||
* to the `re` flag with the canonical name `flag_name` | ||||||||||||
*/ | ||||||||||||
private DataFlow::LocalSourceNode re_flag_tracker(string flag_name, DataFlow::TypeTracker t) { | ||||||||||||
t.start() and | ||||||||||||
exists(API::Node flag | flag_name = canonical_name(flag) and result = flag.getAUse()) | ||||||||||||
or | ||||||||||||
exists(BinaryExprNode binop, DataFlow::Node operand | | ||||||||||||
operand.getALocalSource() = re_flag_tracker(flag_name, t.continue()) and | ||||||||||||
Comment on lines
+78
to
+79
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Dismissed for the reasons outlined in #5250 (comment) |
||||||||||||
operand.asCfgNode() = binop.getAnOperand() and | ||||||||||||
(binop.getOp() instanceof BitOr or binop.getOp() instanceof Add) and | ||||||||||||
result.asCfgNode() = binop | ||||||||||||
) | ||||||||||||
or | ||||||||||||
// Due to bad performance when using normal setup with `re_flag_tracker(t2, attr_name).track(t2, t)` | ||||||||||||
// we have inlined that code and forced a join | ||||||||||||
exists(DataFlow::TypeTracker t2 | | ||||||||||||
exists(DataFlow::StepSummary summary | | ||||||||||||
re_flag_tracker_first_join(t2, flag_name, result, summary) and | ||||||||||||
t = t2.append(summary) | ||||||||||||
) | ||||||||||||
) | ||||||||||||
} | ||||||||||||
|
||||||||||||
string mode_from_mode_object(Value obj) { | ||||||||||||
pragma[nomagic] | ||||||||||||
private predicate re_flag_tracker_first_join( | ||||||||||||
DataFlow::TypeTracker t2, string flag_name, DataFlow::Node res, DataFlow::StepSummary summary | ||||||||||||
) { | ||||||||||||
DataFlow::StepSummary::step(re_flag_tracker(flag_name, t2), res, summary) | ||||||||||||
} | ||||||||||||
|
||||||||||||
/** | ||||||||||||
* A type tracker for regular expression flag names. Holds if the result is a node that may refer | ||||||||||||
* to the `re` flag with the canonical name `flag_name` | ||||||||||||
*/ | ||||||||||||
private DataFlow::Node re_flag_tracker(string flag_name) { | ||||||||||||
re_flag_tracker(flag_name, DataFlow::TypeTracker::end()).flowsTo(result) | ||||||||||||
} | ||||||||||||
|
||||||||||||
/** Gets a regular expression mode flag associated with the given data flow node. */ | ||||||||||||
string mode_from_node(DataFlow::Node node) { node = re_flag_tracker(result) } | ||||||||||||
|
||||||||||||
/** | ||||||||||||
* DEPRECATED 2021-02-24 -- use `mode_from_node` instead. | ||||||||||||
* | ||||||||||||
* Gets a regular expression mode flag associated with the given value. | ||||||||||||
*/ | ||||||||||||
deprecated string mode_from_mode_object(Value obj) { | ||||||||||||
( | ||||||||||||
result = "DEBUG" or | ||||||||||||
result = "IGNORECASE" or | ||||||||||||
|
@@ -67,8 +126,8 @@ string mode_from_mode_object(Value obj) { | |||||||||||
result = "VERBOSE" | ||||||||||||
) and | ||||||||||||
exists(int flag | | ||||||||||||
flag = Value::named("sre_constants.SRE_FLAG_" + result).(ObjectInternal).intValue() and | ||||||||||||
obj.(ObjectInternal).intValue().bitAnd(flag) = flag | ||||||||||||
flag = Value::named("sre_constants.SRE_FLAG_" + result).(OI::ObjectInternal).intValue() and | ||||||||||||
obj.(OI::ObjectInternal).intValue().bitAnd(flag) = flag | ||||||||||||
) | ||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As far as I can tell, this does not involve points-to, but please reassure me that this is not code we will be moving away from :-) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Your statement confuses me. The change you have highlighted above does indeed involve points-to, and it is code we will be moving away from. However, it's a predicate that could have seen use outside of this codebase, and so I couldn't simply delete it. I should add a comment saying it's deprecated, though, to make this clear. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have added the indicated statement. (And dated it, so we can actually see when we can delete it. 🙂) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks, and I guess that the signature makes it rather awkward to rewrite it to not use points-to... |
||||||||||||
} | ||||||||||||
|
||||||||||||
|
This file was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have a bit of trouble understanding if
t.continue()
is the only right choice here.I would have thought that if we were able to track a flag with some type tracker
t2
, if that flag is used in a binary or operation, the resulting type-tracker would be the continuation oft2
. SoBut I'm also wondering if we need this continue stuff at all, or we could just use
re_flag_tracker(flag_name)
instead -- and if there would be any bad consequences of that.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Your suggested rewrite would have at least two consequences, both of which may affect performance, and one of which will affect behaviour:
re_flag_tracker/1
, the fixpoint computation now has to evaluate both this andre_flag_tracker/2
at the same time. Currently,re_flag_tracker/1
is simply an extra join on top of the result of the other predicate. The impact on performance probably isn't terribly big, but I imagine there is some overhead in doing this.Finally, rewriting to use
t2
actually doesn't change the behaviour.t = t2.continue()
is equivalent tot = t2 and t.attr = ""
(with a slight abuse of notation). Thus,continue
really just checks that we're not tracking an attribute (which makes sense -- we can addre
flags, but not objects that happen to have anre
flag in an attribute).There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
👍
Right. Although the behavior ends up being the same, for me, it just reads wrong semantically. I would like to use the
t2
approach, and have made a suggestion for that (couldn't do it in this thread, since that only covered one of the lines 🤦)There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Everywhere else (in Python and JavaScript) where this construction appears, and I do mean everywhere is in the form
t.continue()
. We only seet2.continue()
(and currently only in JavaScript) when(back)track(t2,t)
is used in an adjacent disjunct.So I think to suddenly change what is the standard idiom doesn't make much sense.
Rather, it seems to me that a better solution would be to add
predicate canContinue
toTypeTracker
, with the meaning thatthis.attr
is empty. Then we can reuset
in both places, as long as we make sure to check... and t.canContinue()
and we don't have to waste precious space on declaring a variable that is exactly equal tot
anyway.This, however, I think it outside the scope of the present PR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree that changing the way it's used probably isn't in the scope of this PR.