Skip to content

Commit

Permalink
Optimize SubstringSetMatcher [patch 4/5, aux info in edge map]
Browse files Browse the repository at this point in the history
Store some auxiliary node information (failure edge, match ID,
output link) together with the edges, making the edge map into a
slightly more generic key/value store. This again saves significantly
on RAM, since we do not need to store these values on nodes that
do not have them (e.g. very few nodes have a match ID).

We lose some of the initialization performance improvements we got
in the previous patch, but still keep a healthy lead overall.

SubstringSetMatcher.init_time:      36772 -> 45214 us (-18.7% perf)
SubstringSetMatcher.match_time:       129 ->   128 us (+ 0.8% perf)
SubstringSetMatcher.memory_usage:   13047 ->  6649 kB (-49.0% RAM)

Change-Id: Iae78ca2f14afb542cb8651db2e375154a9058238
Bug: 1319422
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/3596142
Commit-Queue: Steinar H Gunderson <sesse@chromium.org>
Reviewed-by: Dominic Battré <battre@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1001477}
  • Loading branch information
Steinar H. Gunderson authored and Chromium LUCI CQ committed May 10, 2022
1 parent e953ea9 commit bc88e55
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 41 deletions.
38 changes: 24 additions & 14 deletions components/url_matcher/substring_set_matcher.cc
Original file line number Diff line number Diff line change
Expand Up @@ -246,21 +246,17 @@ void SubstringSetMatcher::CreateFailureAndOutputEdges() {
// Initialize the failure edges for |root| and its children.
AhoCorasickNode* const root = &tree_[0];

// Assigning |root| as the failure edge for itself doesn't strictly abide by
// the definition of "proper" suffix. The proper suffix of an empty string
// should probably be defined as null, but we assign it to the |root| to
// simplify the code and have the invariant that the failure edge is always
// defined.
root->SetFailure(kRootID);

root->SetOutputLink(kInvalidNodeID);

NodeID root_output_link = root->IsEndOfPattern() ? kRootID : kInvalidNodeID;

for (unsigned edge_idx = 0; edge_idx < root->num_edges(); ++edge_idx) {
const AhoCorasickEdge& edge = root->edges()[edge_idx];
if (edge.label >= kFirstSpecialLabel) {
continue;
}
AhoCorasickNode* child = &tree_[edge.node_id];
child->SetFailure(kRootID);
// Failure node is kept as the root.
child->SetOutputLink(root_output_link);
queue.push(child);
}
Expand All @@ -277,6 +273,9 @@ void SubstringSetMatcher::CreateFailureAndOutputEdges() {
for (unsigned edge_idx = 0; edge_idx < current_node->num_edges();
++edge_idx) {
const AhoCorasickEdge& edge = current_node->edges()[edge_idx];
if (edge.label >= kFirstSpecialLabel) {
continue;
}
AhoCorasickNode* child = &tree_[edge.node_id];

const AhoCorasickNode* failure_candidate_parent =
Expand All @@ -295,10 +294,10 @@ void SubstringSetMatcher::CreateFailureAndOutputEdges() {
// have reached the root. Hence the longest proper suffix of the string
// represented by this node is the empty string (represented by root).
failure_candidate_id = kRootID;
} else {
child->SetFailure(failure_candidate_id);
}

child->SetFailure(failure_candidate_id);

const AhoCorasickNode* failure_candidate = &tree_[failure_candidate_id];
// Now |failure_candidate| is |child|'s longest possible proper suffix in
// the trie. We also know that since we are doing a breadth first search,
Expand All @@ -318,6 +317,10 @@ void SubstringSetMatcher::AccumulateMatchesForNode(
std::set<StringPattern::ID>* matches) const {
DCHECK(matches);

if (!node->has_outputs()) {
// Fast reject.
return;
}
if (node->IsEndOfPattern())
matches->insert(node->GetMatchID());

Expand Down Expand Up @@ -362,9 +365,6 @@ SubstringSetMatcher::AhoCorasickNode::operator=(AhoCorasickNode&& other) {
}
num_free_edges_ = other.num_free_edges_;
edges_capacity_ = other.edges_capacity_;
failure_ = other.failure_;
match_id_ = other.match_id_;
output_link_ = other.output_link_;
return *this;
}

Expand Down Expand Up @@ -393,6 +393,10 @@ void SubstringSetMatcher::AhoCorasickNode::SetEdge(uint32_t label,
if (edges_capacity_ == 0 && num_free_edges_ > 0) {
// Still space in the inline storage, so use that.
edges_.inline_edges[num_edges()] = AhoCorasickEdge{label, node};
if (label == kFailureNodeLabel) {
// Make sure that kFailureNodeLabel is first.
std::swap(edges_.inline_edges[0], edges_.inline_edges[num_edges()]);
}
--num_free_edges_;
return;
}
Expand Down Expand Up @@ -420,12 +424,18 @@ void SubstringSetMatcher::AhoCorasickNode::SetEdge(uint32_t label,

// Insert the new edge at the end of our heap storage.
edges_.edges[num_edges()] = AhoCorasickEdge{label, node};
if (label == kFailureNodeLabel) {
// Make sure that kFailureNodeLabel is first.
std::swap(edges_.edges[0], edges_.edges[num_edges()]);
}
--num_free_edges_;
}

void SubstringSetMatcher::AhoCorasickNode::SetFailure(NodeID node) {
DCHECK_NE(kInvalidNodeID, node);
failure_ = node;
if (node != kRootID) {
SetEdge(kFailureNodeLabel, node);
}
}

size_t SubstringSetMatcher::AhoCorasickNode::EstimateMemoryUsage() const {
Expand Down
93 changes: 66 additions & 27 deletions components/url_matcher/substring_set_matcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,19 +115,46 @@ class URL_MATCHER_EXPORT SubstringSetMatcher {

// An edge internal to the tree. We pack the label (character we are
// matching on) and the destination node ID into 32 bits, to save memory.
// We also use these edges as a sort of generic key/value store for
// some special values that not all nodes will have; this also saves on
// memory over the otherwise obvious choice of having them as struct fields,
// as it means we do not need to store them when they are not present.
struct AhoCorasickEdge {
// char (unsigned, so [0..255]), or a special label below.
// Nine bits wide, so that the special labels (0x100 and up) fit next to
// the 256 possible character values.
uint32_t label : 9;
// Destination node for a character edge. For the special labels, this
// field holds the value associated with the label instead (e.g. the
// failure node, or the match ID). 23 bits, so that label and node_id
// together fill exactly 32 bits.
NodeID node_id : 23;
};

// Node index that failure edge leads to. The failure node corresponds to
// the node which represents the longest proper suffix (including the empty
// string) of the string represented by this node. Not stored if it is
// equal to kRootID (since that is the most common value).
//
// NOTE: Assigning |root| as the failure edge for itself doesn't strictly
// abide by the definition of "proper" suffix. The proper suffix of an empty
// string should probably be defined as null, but we assign it to the |root|
// to simplify the code and have the invariant that the failure edge is always
// defined.
static constexpr uint32_t kFailureNodeLabel = 0x100;
static constexpr uint32_t kFirstSpecialLabel = kFailureNodeLabel;

// Node index that corresponds to the longest proper suffix (including empty
// suffix) of this node and which also represents the end of a pattern.
// Does not have to exist.
static constexpr uint32_t kOutputLinkLabel = 0x101;

// If present, this node represents the end of a pattern. It stores the ID of
// the corresponding pattern (i.e., it is not really a NodeID, but a
// StringPattern::ID).
static constexpr uint32_t kMatchIDLabel = 0x102;

// Used for uninitialized label slots; used so that we do not have to test for
// them in other ways, since we know the data will be initialized and never
// match any other labels.
static constexpr uint32_t kEmptyLabel = 0x103;
static constexpr uint32_t kFirstSpecialLabel = kEmptyLabel;

// A node in the trie.
// A node in the trie, packed tightly together so that it occupies 12 bytes
// (both on 32- and 64-bit platforms).
class AhoCorasickNode {
public:
AhoCorasickNode();
Expand All @@ -154,27 +181,47 @@ class URL_MATCHER_EXPORT SubstringSetMatcher {
return edges_capacity_ == 0 ? edges_.inline_edges : edges_.edges;
}

NodeID failure() const { return failure_; }
// Returns the node ID that this node's failure edge leads to. Only the
// first edge slot is inspected: if its label is kFailureNodeLabel, its
// node_id is returned; otherwise the failure node is taken to be kRootID
// (SetFailure() does not store an edge when the failure node is the root).
NodeID failure() const {
// NOTE: Even if num_edges_ == 0, we are not doing anything
// undefined, as we will have room for at least two edges
// and empty edges are set to kEmptyLabel.
const AhoCorasickEdge& first_edge = *edges();
if (first_edge.label == kFailureNodeLabel) {
return first_edge.node_id;
} else {
return kRootID;
}
}
void SetFailure(NodeID failure);

void SetMatchID(StringPattern::ID id) {
DCHECK(!IsEndOfPattern());
match_id_ = id;
SetEdge(kMatchIDLabel, id);
has_outputs_ = true;
}

// Returns true if this node corresponds to a pattern.
bool IsEndOfPattern() const {
return match_id_ != StringPattern::kInvalidId;
if (!has_outputs_) {
// Fast reject.
return false;
}
return GetEdge(kMatchIDLabel) != kInvalidNodeID;
}

// Must only be called if |IsEndOfPattern| returns true for this node.
StringPattern::ID GetMatchID() const {
DCHECK(IsEndOfPattern());
return match_id_;
return GetEdge(kMatchIDLabel);
}

void SetOutputLink(NodeID node) { output_link_ = node; }
NodeID output_link() const { return output_link_; }
// Records |node| as this node's output link by storing it as an edge with
// label kOutputLinkLabel, and marks the node as having outputs. Passing
// kInvalidNodeID is a no-op: no edge is stored and has_outputs_ is left
// untouched.
void SetOutputLink(NodeID node) {
if (node != kInvalidNodeID) {
SetEdge(kOutputLinkLabel, node);
has_outputs_ = true;
}
}
NodeID output_link() const { return GetEdge(kOutputLinkLabel); }

size_t EstimateMemoryUsage() const;
size_t num_edges() const {
Expand All @@ -185,9 +232,7 @@ class URL_MATCHER_EXPORT SubstringSetMatcher {
}
}

bool has_outputs() const {
return IsEndOfPattern() || output_link() != kInvalidNodeID;
}
bool has_outputs() const { return has_outputs_; }

private:
// Outgoing edges of current node, including failure edge and output links.
Expand Down Expand Up @@ -216,12 +261,21 @@ class URL_MATCHER_EXPORT SubstringSetMatcher {
static constexpr int kNumInlineEdges = 2;
union {
// Out-of-line edge storage, having room for edges_capacity_ elements.
// Note that due to __attribute__((packed)) below, this pointer may be
// unaligned on 64-bit platforms, causing slightly less efficient
// access to it in some cases.
AhoCorasickEdge* edges;

// Inline edge storage, used if edges_capacity_ == 0.
AhoCorasickEdge inline_edges[kNumInlineEdges];
} edges_;

// Whether we have an edge for kMatchIDLabel or kOutputLinkLabel,
// i.e., hitting this node during traversal will create one or more
// matches. This is redundant, but since every single lookup during
// traversal needs this, it saves a few searches for us.
bool has_outputs_ = false;

// Number of unused edges left in edges_. Edges are always allocated from the
// beginning and never deleted; those after num_edges_ will be marked with
// kEmptyLabel (and have an undefined node_id). We store the number of
Expand All @@ -234,22 +288,7 @@ class URL_MATCHER_EXPORT SubstringSetMatcher {
// kEmptyLabel + 1). If equal to zero, we are not using heap storage,
// but instead are using inline_edges.
uint16_t edges_capacity_ = 0;

// Node index that failure edge leads to. The failure node corresponds to
// the node which represents the longest proper suffix (include empty
// string) of the string represented by this node. Must be valid, equal to
// kInvalidNodeID when uninitialized.
NodeID failure_ = kInvalidNodeID;

// If valid, this node represents the end of a pattern. It stores the ID of
// the corresponding pattern.
StringPattern::ID match_id_ = StringPattern::kInvalidId;

// Node index that corresponds to the longest proper suffix (including empty
// suffix) of this node and which also represents the end of a pattern. Can
// be invalid.
NodeID output_link_ = kInvalidNodeID;
};
} __attribute__((packed));

using SubstringPatternVector = std::vector<const StringPattern*>;

Expand Down

0 comments on commit bc88e55

Please sign in to comment.