Skip to content

Commit

Permalink
Optimize SubstringSetMatcher [patch 1/5, Build]
Browse files Browse the repository at this point in the history
Do all work in a Build() member function, instead of in the constructor.
This allows it to fail (without exceptions); right now, this can de
facto never happen, but as we will reduce the maximum number of nodes to
2^23 in a later patch in this series, and start building these trees based on
user-provided data, we need to be able to signal failure a little more
gracefully.

Change-Id: Icda6a3287c96410cf98a6204b4dc5067104fff8c
Bug: 1319422
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/3596139
Reviewed-by: Devlin Cronin <rdevlin.cronin@chromium.org>
Reviewed-by: Dominic Battré <battre@chromium.org>
Commit-Queue: Steinar H Gunderson <sesse@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1001423}
  • Loading branch information
Steinar H. Gunderson authored and Chromium LUCI CQ committed May 10, 2022
1 parent 25ac5db commit 45e5abb
Show file tree
Hide file tree
Showing 7 changed files with 85 additions and 39 deletions.
5 changes: 3 additions & 2 deletions components/url_matcher/regex_set_matcher.cc
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,9 @@ void RegexSetMatcher::RebuildMatcher() {
for (size_t i = 0; i < strings_to_match.size(); ++i)
substring_patterns.emplace_back(std::move(strings_to_match[i]), i);

substring_matcher_ =
std::make_unique<SubstringSetMatcher>(substring_patterns);
substring_matcher_ = std::make_unique<SubstringSetMatcher>();
bool success = substring_matcher_->Build(substring_patterns);
CHECK(success);
}

} // namespace url_matcher
27 changes: 20 additions & 7 deletions components/url_matcher/substring_set_matcher.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,11 @@ std::vector<const StringPattern*> GetVectorOfPointers(

} // namespace

SubstringSetMatcher::SubstringSetMatcher(
const std::vector<StringPattern>& patterns)
: SubstringSetMatcher(GetVectorOfPointers(patterns)) {}
bool SubstringSetMatcher::Build(const std::vector<StringPattern>& patterns) {
return Build(GetVectorOfPointers(patterns));
}

SubstringSetMatcher::SubstringSetMatcher(
std::vector<const StringPattern*> patterns) {
bool SubstringSetMatcher::Build(std::vector<const StringPattern*> patterns) {
// Ensure there are no duplicate IDs and all pattern strings are distinct.
#if DCHECK_IS_ON()
{
Expand All @@ -57,8 +56,20 @@ SubstringSetMatcher::SubstringSetMatcher(
}
#endif

// Check that all the match labels fit into an edge.
for (const StringPattern* pattern : patterns) {
if (pattern->id() < 0 ||
base::checked_cast<NodeID>(pattern->id()) >= kInvalidNodeID) {
return false;
}
}

// Compute the total number of tree nodes needed.
std::sort(patterns.begin(), patterns.end(), ComparePatterns);
NodeID tree_size = GetTreeSize(patterns);
if (tree_size >= kInvalidNodeID) {
return false;
}
tree_.reserve(GetTreeSize(patterns));
BuildAhoCorasickTree(patterns);

Expand All @@ -67,6 +78,7 @@ SubstringSetMatcher::SubstringSetMatcher(
DCHECK_EQ(tree_.size(), static_cast<size_t>(GetTreeSize(patterns)));

is_empty_ = patterns.empty() && tree_.size() == 1u;
return true;
}

SubstringSetMatcher::~SubstringSetMatcher() = default;
Expand Down Expand Up @@ -290,8 +302,9 @@ SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() = default;
SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode(AhoCorasickNode&& other) =
default;

SubstringSetMatcher::AhoCorasickNode& SubstringSetMatcher::AhoCorasickNode::
operator=(AhoCorasickNode&& other) = default;
SubstringSetMatcher::AhoCorasickNode&
SubstringSetMatcher::AhoCorasickNode::operator=(AhoCorasickNode&& other) =
default;

SubstringSetMatcher::NodeID SubstringSetMatcher::AhoCorasickNode::GetEdge(
char c) const {
Expand Down
25 changes: 15 additions & 10 deletions components/url_matcher/substring_set_matcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,22 +22,27 @@ namespace url_matcher {
// which string patterns occur in S.
class URL_MATCHER_EXPORT SubstringSetMatcher {
public:
SubstringSetMatcher() = default;
SubstringSetMatcher(const SubstringSetMatcher&) = delete;
SubstringSetMatcher& operator=(const SubstringSetMatcher&) = delete;

~SubstringSetMatcher();

// Registers all |patterns|. Each pattern needs to have a unique ID and all
// pattern strings must be unique.
// pattern strings must be unique. Build() should be called exactly once
// (before it is called, the tree is empty).
//
// Complexity:
// Let n = number of patterns.
// Let S = sum of pattern lengths.
// Let k = range of char. Generally 256.
// Complexity = O(nlogn + S * logk)
// nlogn comes from sorting the patterns.
// log(k) comes from our usage of std::map to store edges.
SubstringSetMatcher(const std::vector<StringPattern>& patterns);
SubstringSetMatcher(std::vector<const StringPattern*> patterns);

SubstringSetMatcher(const SubstringSetMatcher&) = delete;
SubstringSetMatcher& operator=(const SubstringSetMatcher&) = delete;

~SubstringSetMatcher();
//
// Returns true on success (may fail if e.g. if the tree gets too many nodes).
bool Build(const std::vector<StringPattern>& patterns);
bool Build(std::vector<const StringPattern*> patterns);

// Matches |text| against all registered StringPatterns. Stores the IDs
// of matching patterns in |matches|. |matches| is not cleared before adding
Expand All @@ -60,8 +65,8 @@ class URL_MATCHER_EXPORT SubstringSetMatcher {
private:
// Represents the index of the node within |tree_|. It is specifically
// uint32_t so that we can be sure it takes up 4 bytes. If the computed size
// of |tree_| is larger than what can be stored within an uint32_t, there will
// be a CHECK failure.
// of |tree_| is larger than what can be stored within an uint32_t,
// Build() will fail.
using NodeID = uint32_t;

// This is the maximum possible size of |tree_| and hence can't be a valid ID.
Expand Down
3 changes: 2 additions & 1 deletion components/url_matcher/substring_set_matcher_perftest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ TEST(SubstringSetMatcherPerfTest, RandomKeys) {

// Allocate SubstringSetMatcher on the heap so that EstimateMemoryUsage below
// also includes its stack allocated memory.
auto matcher = std::make_unique<SubstringSetMatcher>(patterns);
auto matcher = std::make_unique<SubstringSetMatcher>();
ASSERT_TRUE(matcher->Build(patterns));
base::TimeDelta init_time = init_timer.Elapsed();

// Match patterns against a random string of 500 characters.
Expand Down
49 changes: 35 additions & 14 deletions components/url_matcher/substring_set_matcher_unittest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@ namespace {
void TestOnePattern(const std::string& test_string,
const std::string& pattern,
bool is_match) {
std::string test =
"TestOnePattern(" + test_string + ", " + pattern + ", " +
(is_match ? "1" : "0") + ")";
std::string test = "TestOnePattern(" + test_string + ", " + pattern + ", " +
(is_match ? "1" : "0") + ")";
std::vector<StringPattern> patterns;
patterns.emplace_back(pattern, 1);
SubstringSetMatcher matcher(patterns);
SubstringSetMatcher matcher;
ASSERT_TRUE(matcher.Build(patterns));
std::set<int> matches;
matcher.Match(test_string, &matches);

Expand All @@ -39,9 +39,9 @@ void TestTwoPatterns(const std::string& test_string,
const std::string& pattern_2,
bool is_match_1,
bool is_match_2) {
std::string test =
"TestTwoPatterns(" + test_string + ", " + pattern_1 + ", " + pattern_2 +
", " + (is_match_1 ? "1" : "0") + ", " + (is_match_2 ? "1" : "0") + ")";
std::string test = "TestTwoPatterns(" + test_string + ", " + pattern_1 +
", " + pattern_2 + ", " + (is_match_1 ? "1" : "0") + ", " +
(is_match_2 ? "1" : "0") + ")";
ASSERT_NE(pattern_1, pattern_2);
StringPattern substring_pattern_1(pattern_1, 1);
StringPattern substring_pattern_2(pattern_2, 2);
Expand All @@ -56,7 +56,8 @@ void TestTwoPatterns(const std::string& test_string,
patterns.push_back(&substring_pattern_2);
patterns.push_back(&substring_pattern_1);
}
SubstringSetMatcher matcher(patterns);
SubstringSetMatcher matcher;
ASSERT_TRUE(matcher.Build(patterns));
std::set<int> matches;
matcher.Match(test_string, &matches);

Expand All @@ -75,45 +76,61 @@ TEST(SubstringSetMatcherTest, TestMatcher) {
// Pattern 1 bc
// Pattern 2 cd
TestTwoPatterns("abcde", "bc", "cd", true, true);
if (HasFatalFailure())
return;

// Test subpatterns - part 1
// String abcde
// Pattern 1 bc
// Pattern 2 b
TestTwoPatterns("abcde", "bc", "b", true, true);
if (HasFatalFailure())
return;

// Test subpatterns - part 2
// String abcde
// Pattern 1 bc
// Pattern 2 c
TestTwoPatterns("abcde", "bc", "c", true, true);
if (HasFatalFailure())
return;

// Test identical matches
// String abcde
// Pattern 1 abcde
TestOnePattern("abcde", "abcde", true);
if (HasFatalFailure())
return;

// Test multiple matches
// String aaaaa
// Pattern 1 a
TestOnePattern("abcde", "a", true);
if (HasFatalFailure())
return;

// Test matches at beginning and end
// String abcde
// Pattern 1 ab
// Pattern 2 de
TestTwoPatterns("abcde", "ab", "de", true, true);
if (HasFatalFailure())
return;

// Test non-match
// String abcde
// Pattern 1 fg
TestOnePattern("abcde", "fg", false);
if (HasFatalFailure())
return;

// Test empty pattern and too long pattern
// String abcde
// Pattern 1
// Pattern 2 abcdef
TestTwoPatterns("abcde", std::string(), "abcdef", true, false);
if (HasFatalFailure())
return;
}

TEST(SubstringSetMatcherTest, TestMatcher2) {
Expand All @@ -123,7 +140,8 @@ TEST(SubstringSetMatcherTest, TestMatcher2) {

std::vector<const StringPattern*> patterns = {&pattern_1, &pattern_2,
&pattern_3};
auto matcher = std::make_unique<SubstringSetMatcher>(patterns);
auto matcher = std::make_unique<SubstringSetMatcher>();
ASSERT_TRUE(matcher->Build(patterns));

std::set<int> matches;
matcher->Match("abd", &matches);
Expand All @@ -132,16 +150,17 @@ TEST(SubstringSetMatcherTest, TestMatcher2) {
EXPECT_TRUE(matches.end() != matches.find(2));

patterns = {&pattern_1, &pattern_3};
matcher = std::make_unique<SubstringSetMatcher>(patterns);
matcher = std::make_unique<SubstringSetMatcher>();
ASSERT_TRUE(matcher->Build(patterns));

matches.clear();
matcher->Match("abd", &matches);
EXPECT_EQ(1u, matches.size());
EXPECT_TRUE(matches.end() != matches.find(1));
EXPECT_TRUE(matches.end() == matches.find(2));

matcher = std::make_unique<SubstringSetMatcher>(
std::vector<const StringPattern*>());
matcher = std::make_unique<SubstringSetMatcher>();
ASSERT_TRUE(matcher->Build(std::vector<const StringPattern*>()));
EXPECT_TRUE(matcher->IsEmpty());
}

Expand All @@ -158,7 +177,8 @@ TEST(SubstringSetMatcherTest, TestMatcher3) {
}
}

SubstringSetMatcher matcher(patterns);
SubstringSetMatcher matcher;
matcher.Build(patterns);
std::set<int> matches;
matcher.Match(text, &matches);
EXPECT_EQ(patterns.size(), matches.size());
Expand All @@ -170,7 +190,8 @@ TEST(SubstringSetMatcherTest, TestMatcher3) {

TEST(SubstringSetMatcherTest, TestEmptyMatcher) {
std::vector<StringPattern> patterns;
SubstringSetMatcher matcher(patterns);
SubstringSetMatcher matcher;
matcher.Build(patterns);
std::set<int> matches;
matcher.Match("abd", &matches);
EXPECT_TRUE(matches.empty());
Expand Down
7 changes: 4 additions & 3 deletions components/url_matcher/url_matcher.cc
Original file line number Diff line number Diff line change
Expand Up @@ -930,9 +930,10 @@ void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {
std::unique_ptr<SubstringSetMatcher>& url_matcher =
full_url_conditions ? full_url_matcher_ : url_component_matcher_;

url_matcher =
std::make_unique<SubstringSetMatcher>(std::vector<const StringPattern*>(
new_patterns.begin(), new_patterns.end()));
url_matcher = std::make_unique<SubstringSetMatcher>();
bool success = url_matcher->Build(std::vector<const StringPattern*>(
new_patterns.begin(), new_patterns.end()));
CHECK(success);
}

void URLMatcher::UpdateRegexSetMatcher() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -239,8 +239,12 @@ void RegexRulesMatcher::InitializeMatcher() {
for (size_t i = 0; i < strings_to_match.size(); ++i)
patterns.emplace_back(std::move(strings_to_match[i]), i);

substring_matcher_ =
std::make_unique<url_matcher::SubstringSetMatcher>(patterns);
substring_matcher_ = std::make_unique<url_matcher::SubstringSetMatcher>();

// This is only used for regex rules, which are limited to 1000,
// so hitting the 8MB limit should be all but impossible.
bool success = substring_matcher_->Build(patterns);
CHECK(success);
}

bool RegexRulesMatcher::IsEmpty() const {
Expand Down

0 comments on commit 45e5abb

Please sign in to comment.