"Compiled" static header maps instead of big trie #33932

Merged: 33 commits, May 23, 2024
Commits (33)
9b0d62c
Add static lookup benchmarks for header_map_impl
ravenblackx May 1, 2024
2e83788
compiled_string_map v1
ravenblackx May 2, 2024
be5dd61
optimize
ravenblackx May 2, 2024
611e1dd
Format
ravenblackx May 2, 2024
150040e
Document
ravenblackx May 2, 2024
90ba622
Format
ravenblackx May 2, 2024
e055146
Header
ravenblackx May 2, 2024
dd3a9fa
Comment and fix issue.
ravenblackx May 2, 2024
1990767
More comment
ravenblackx May 2, 2024
5df1270
Fix and simplify
ravenblackx May 3, 2024
92625f3
Merge branch 'headers' into headers_compiled
ravenblackx May 3, 2024
23042de
Split finalize and compile to support legacy host header injection
ravenblackx May 3, 2024
0808002
Merge branch 'main' into headers_compiled
ravenblackx May 7, 2024
b293819
Empty commit to retest
ravenblackx May 10, 2024
9eccaa9
Reduce string copies while populating lookup table
ravenblackx May 10, 2024
a3f8919
Rearrange node creation to be clearer that it's effectively a single …
ravenblackx May 10, 2024
d1a8ed8
More comments
ravenblackx May 10, 2024
7ece7eb
Explicit move
ravenblackx May 10, 2024
57b9055
Comment explaining string-view by reference
ravenblackx May 10, 2024
facdef8
Spelling
ravenblackx May 10, 2024
941d3fa
len
ravenblackx May 13, 2024
d75190d
key_size to avoid calling size() repeatedly (no difference)
ravenblackx May 13, 2024
30bab4b
Virtual class instead of lambdas
ravenblackx May 13, 2024
5469839
PURE
ravenblackx May 13, 2024
dc91907
Cut out a size comparison
ravenblackx May 13, 2024
4348681
move
ravenblackx May 14, 2024
9bd6190
Trailing underscores, some readability rearranging
ravenblackx May 14, 2024
9ee31c0
Comment on KV ownership.
ravenblackx May 14, 2024
405ac4d
data() not &[0]
ravenblackx May 14, 2024
a743ba2
Comments
ravenblackx May 16, 2024
c7458f4
Spelling
ravenblackx May 16, 2024
c4858ec
Fix comment
ravenblackx May 16, 2024
56c5e4e
Add compiled_string_map.md
ravenblackx May 17, 2024
12 changes: 12 additions & 0 deletions source/common/common/BUILD
@@ -473,6 +473,18 @@ envoy_cc_library(
],
)

envoy_cc_library(
name = "compiled_string_map_lib",
hdrs = ["compiled_string_map.h"],
external_deps = [
"abseil_strings",
],
deps = [
"//envoy/common:pure_lib",
"//source/common/common:assert_lib",
],
)

envoy_cc_library(
name = "packed_struct_lib",
hdrs = ["packed_struct.h"],
224 changes: 224 additions & 0 deletions source/common/common/compiled_string_map.h
@@ -0,0 +1,224 @@
#pragma once

#include <algorithm>
#include <array>
#include <string>
#include <vector>

#include "envoy/common/pure.h"

#include "source/common/common/assert.h"

#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"

namespace Envoy {

/**
* This is a specialized structure intended for static header maps, but
* there may be other use cases.
*
* See `compiled_string_map.md` for details.
*/
template <class Value> class CompiledStringMap {
class Node {
public:
// While it is usual to take a string_view by value, in this
// performance-critical context with repeatedly passing the same
// value, passing it by reference benchmarks out slightly faster.
virtual Value find(const absl::string_view& key) PURE;
virtual ~Node() = default;
};

class LeafNode : public Node {
public:
LeafNode(absl::string_view key, Value&& value) : key_(key), value_(std::move(value)) {}
Value find(const absl::string_view& key) override {
// String comparison unnecessarily checks size equality first, we can skip
// to memcmp here because we already know the sizes are equal.
// Since this is a super-hot path we don't even ASSERT here, to avoid adding
// slowdown in debug builds.
if (memcmp(key.data(), key_.data(), key.size())) {
return {};
}
return value_;
}

private:
const std::string key_;
const Value value_;
};

class BranchNode : public Node {
public:
BranchNode(size_t index, uint8_t min, std::vector<std::unique_ptr<Node>>&& branches)
: index_(index), min_(min), branches_(std::move(branches)) {}
Value find(const absl::string_view& key) override {
const uint8_t k = static_cast<uint8_t>(key[index_]);
// Possible optimization was tried here, populating empty nodes with
// a function that returns {} to reduce branching vs checking for null
// nodes. Checking for null nodes benchmarked faster.
if (k < min_ || k >= min_ + branches_.size() || branches_[k - min_] == nullptr) {
return {};
}
return branches_[k - min_]->find(key);
}

private:
const size_t index_;
const uint8_t min_;
// Possible optimization was tried here, using std::array<std::unique_ptr<Node>, 256>
// rather than a smaller-range vector with bounds, to keep locality and reduce
// comparisons. It didn't help.
const std::vector<std::unique_ptr<Node>> branches_;
};

public:
// The caller owns the string-views during `compile`. Ownership of the passed in
// Values is transferred to the CompiledStringMap.
using KV = std::pair<absl::string_view, Value>;
/**
* Returns the value with a matching key, or the default value
* (typically nullptr) if the key was not present.
* @param key the key to look up.
*/
Value find(absl::string_view key) const {
[review] Reviewer: you found passing by const ref was faster in another case, but not here?

[review] Author (ravenblackx): Mostly left this one for the sake of leaving the external API unchanged. It feels more reasonable to buck the style guide for an internal implementation detail than for the API.
const size_t key_size = key.size();
// Theoretically we could also bottom-cap the size range, but the
// cost of the extra comparison and operation would almost certainly
// outweigh the benefit of omitting 4 or 5 entries.
if (key_size >= table_.size() || table_[key_size] == nullptr) {
return {};
}
return table_[key_size]->find(key);
};
/**
* Construct the lookup table. This can be a somewhat slow multi-pass
* operation if the input table is large.
* @param contents a vector of key->value pairs. This is taken by value because
* we're going to modify it. If the caller still wants the original
* then it can be copied in, if not it can be moved in.
* Note that the keys are string_views - the base string data must
* exist for the duration of compile(). The leaf nodes take copies
* of the key strings, so the string_views can be invalidated once
* compile has completed.
*/
void compile(std::vector<KV> contents) {
if (contents.empty()) {
return;
}
std::sort(contents.begin(), contents.end(),
[](const KV& a, const KV& b) { return a.first.size() < b.first.size(); });
const size_t longest = contents.back().first.size();
// A key length of 0 is possible, and also we don't want to have to
// subtract [min length] every time we index, so the table size must
// be one larger than the longest key.
table_.resize(longest + 1);
auto range_start = contents.begin();
// Populate the sub-nodes for each length of key that exists.
while (range_start != contents.end()) {
// Find the first key whose length differs from the current key length.
// Everything in between is keys with the same length.
const auto range_end =
std::find_if(range_start, contents.end(), [len = range_start->first.size()](const KV& e) {
return e.first.size() != len;
});
std::vector<KV> node_contents;
// Populate a Node for the entries in that range.
node_contents.reserve(range_end - range_start);
std::move(range_start, range_end, std::back_inserter(node_contents));
table_[range_start->first.size()] = createEqualLengthNode(node_contents);
range_start = range_end;
}
}

private:
/**
* Details of a node branch point; the index into the string at which
* characters should be looked up, the lowest valued character in the
* branch, the highest valued character in the branch, and how many
* branches there are.
*/
struct IndexSplitInfo {
// The index to the character being considered for this split.
size_t index_;
// The smallest character value that appears at this index.
uint8_t min_;
// The largest character value that appears at this index.
uint8_t max_;
// The number of distinct characters that appear at this index.
uint8_t count_;
size_t size() const { return max_ - min_ + 1; }
size_t offsetOf(uint8_t c) const { return c - min_; }
};

/**
* @param node_contents the key-value pairs to be branched upon.
* @return details of the index on which the node should branch
* - the index which produces the most child branches.
*/
static IndexSplitInfo findBestSplitPoint(const std::vector<KV>& node_contents) {
ASSERT(node_contents.size() > 1);
IndexSplitInfo best{0, 0, 0, 0};
[review] Reviewer: Consider adding an ASSERT(node_contents.size() > 1);
const size_t key_length = node_contents[0].first.size();
for (size_t i = 0; i < key_length; i++) {
std::array<bool, 256> hits{};
IndexSplitInfo info{static_cast<uint8_t>(i), 255, 0, 0};
for (const KV& pair : node_contents) {
uint8_t v = pair.first[i];
if (!hits[v]) {
hits[v] = true;
info.count_++;
info.min_ = std::min(v, info.min_);
info.max_ = std::max(v, info.max_);
}
}
if (info.count_ > best.count_) {
best = info;
}
}
ASSERT(best.count_ > 1, absl::StrCat("duplicate key: ", node_contents[0].first));
return best;
}

/*
* @param node_contents the set of key-value pairs that will be children of
* this node.
* @return the recursively generated tree node that leads to all of node_contents.
* If there is only one entry in node_contents then a LeafNode, otherwise a BranchNode.
*/
static std::unique_ptr<Node> createEqualLengthNode(std::vector<KV> node_contents) {
if (node_contents.size() == 1) {
return std::make_unique<LeafNode>(node_contents[0].first, std::move(node_contents[0].second));
}
// best contains the index at which this node should be split,
// and the smallest and largest character values that occur at
// that index across all the keys in node_contents.
const IndexSplitInfo best = findBestSplitPoint(node_contents);
std::vector<std::unique_ptr<Node>> nodes;
nodes.resize(best.size());
std::sort(node_contents.begin(), node_contents.end(),
[index = best.index_](const KV& a, const KV& b) {
return a.first[index] < b.first[index];
});
auto range_start = node_contents.begin();
// Populate the sub-nodes for each character-branch.
while (range_start != node_contents.end()) {
// Find the first key whose character at position [best.index_] differs from the
// character of the current range.
// Everything in the range has keys with the same character at this index.
auto range_end = std::find_if(range_start, node_contents.end(),
[index = best.index_, c = range_start->first[best.index_]](
const KV& e) { return e.first[index] != c; });
std::vector<KV> next_contents;
next_contents.reserve(range_end - range_start);
std::move(range_start, range_end, std::back_inserter(next_contents));
nodes[best.offsetOf(range_start->first[best.index_])] = createEqualLengthNode(next_contents);
range_start = range_end;
}
return std::make_unique<BranchNode>(best.index_, best.min_, std::move(nodes));
}
std::vector<std::unique_ptr<Node>> table_;
};

} // namespace Envoy
144 changes: 144 additions & 0 deletions source/common/common/compiled_string_map.md
@@ -0,0 +1,144 @@
# Compiled string map algorithm

## The trie-like structure

The data structure consists of:
1. a length branch node - strings are grouped by length and a zero-indexed length lookup table is generated. Entries in the table may be nullptr, indicating there are no strings of that length, or a pointer to another node.
2. a branch node - similar to a standard trie, a node branches by a set of characters at an index. Unlike a standard trie, the index is not necessarily the "first" index; instead it is the index at which the most branches would be generated. The node contains the index to be branched on, the lowest character value in the branches, and a vector from lowest to highest character value, e.g. if branches c and f exist, a vector representing [c][d][e][f] will be in the node, with the [d] and [e] slots being nullptr (see the sketch after this list).
3. a leaf node. Leaf nodes contain the entire string for final validation, and the value to be returned if the search key matches the string.
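
As a concrete illustration of the branch-node lookup in item 2, here is a minimal standalone sketch of the shrunk-range indexing. This is illustrative only, not the PR code; `Slot` and `selectBranch` are made-up names, and the real structure stores `std::unique_ptr<Node>` slots (see `BranchNode` in `compiled_string_map.h`).

```cpp
#include <cstdint>
#include <vector>

// Hypothetical stand-in for a branch slot.
using Slot = const char*;

// Returns the slot for character `c`, or nullptr if `c` falls outside the stored
// window [min, min + branches.size()) or lands on an empty slot.
Slot selectBranch(const std::vector<Slot>& branches, uint8_t min, uint8_t c) {
  if (c < min || c >= min + branches.size() || branches[c - min] == nullptr) {
    return nullptr;
  }
  return branches[c - min];
}

int main() {
  // The [c][d][e][f] example from item 2: branches exist only for 'c' and 'f', so the
  // node stores a four-slot window with min = 'c'; the [d] and [e] slots are nullptr.
  const std::vector<Slot> branches = {"c-branch", nullptr, nullptr, "f-branch"};
  selectBranch(branches, 'c', 'f');  // -> "f-branch"
  selectBranch(branches, 'c', 'd');  // -> nullptr (a gap in the window)
  selectBranch(branches, 'c', 'z');  // -> nullptr (outside the window)
  return 0;
}
```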

## The compile step

```
┌───────────────────┐
│ x-envoy-banana │
│ x-envoy-pineapple │
│ x-envoy-babana │
│ x-envoy-grape │
│ x-envoy-bacana ├──┐
│ x-envoy-banara │ │
│ something-else │ │
└───────────────────┘ │
split by length
│ ┌──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┐
└─►│ 0│ 1│ 2│ 3│ 4│ 5│ 6│ 7│ 8│ 9│10│11│12│13│14│15│16│17│
└──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴┬─┴┬─┴──┴──┴┬─┘
│ │ │
┌────────────────────────────────────────────────────────────┘ │ │
│ │ │
▼ ┌──────────────────────────────────────────────┘ │
┌─────────────┐ │ ▼
│x-envoy-grape│ │ ┌─────────────────┐
└─────────────┘ │ │x-envoy-pineapple│
▼ └─────────────────┘
┌────────────────┐
│ x-envoy-banana │
│ x-envoy-babana │
│ x-envoy-bacana │
│ x-envoy-banara │
│ something-else │
└─┬──────────────┘
│ Find best branch index (maximum unique branches)

x-envoy-banana
b r
c
something-else

22222222224232

^ best index is here with 4 branches, n b c and e
▼ branch node at position 10, index 0 = b
┌──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┐
│b │c │d │e │f │g │h │i │j │k │l │m │n │
└┬─┴┬─┴──┴┬─┴──┴──┴──┴──┴──┴──┴──┴──┴┬─┘
│ │ │ │
▼ │ ▼ ▼
x-envoy-babana │ something-else ┌────────────────┐
▼ │ x-envoy-banana │
x-envoy-bacana │ x-envoy-banara │
└───────┬────────┘
Find best index
it's position 12 with 2 branches
branch node at position 12, index 0 = n ▼
┌──┬──┬──┬──┬──┐
│n │o │p │q │r │
└┬─┴──┴──┴──┴┬─┘
│ │
▼ ▼
x-envoy-banana x-envoy-banara
```

## The lookup

A lookup operation simply walks the generated tree in much the same way as a regular trie.
For example, given the tree generated above, if you were to search for the header `sponge`,
the length lookup would find nullptr at index 6, and the resulting value would be the null
value.

If you were to search for `x-envoy-banaka`, the length lookup would find the branch with most of
the entries, the index 10 branch would take branch `n`, and the index 12 branch would see that
`k` is lower than the minimum character value in the vector, so the result would be the null value.

If you were to search for `y-envoy-banara`, the length lookup would find the branch with most of
the entries, the index 10 branch would take branch `n`, the index 12 branch would take
branch `r`, and the leaf node would do a final string compare, see that
`y-envoy-banara` != `x-envoy-banara`, and the result would be the null value.
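
For concreteness, here is a small usage sketch of `CompiledStringMap` exercising the walk-throughs above. It is not part of the PR; the `const char*` value type and the `main` harness are chosen purely for illustration.

```cpp
#include <iostream>

#include "source/common/common/compiled_string_map.h"

int main() {
  Envoy::CompiledStringMap<const char*> map;
  // compile() takes the key/value pairs by value; the string_view keys only need to
  // outlive the call, since leaf nodes copy the key strings.
  map.compile({{"x-envoy-banana", "banana"},
               {"x-envoy-pineapple", "pineapple"},
               {"x-envoy-babana", "babana"},
               {"x-envoy-grape", "grape"},
               {"x-envoy-bacana", "bacana"},
               {"x-envoy-banara", "banara"},
               {"something-else", "other"}});

  // Hit: length table -> branch at index 10 -> branch at index 12 -> final memcmp.
  std::cout << map.find("x-envoy-banana") << "\n";               // banana
  // Misses, matching the three walk-throughs above.
  std::cout << (map.find("sponge") == nullptr) << "\n";          // 1
  std::cout << (map.find("x-envoy-banaka") == nullptr) << "\n";  // 1
  std::cout << (map.find("y-envoy-banara") == nullptr) << "\n";  // 1
  return 0;
}
```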

## Performance

In most cases this will be faster than a regular trie, especially for hits - for example,
instead of 14 steps to match `x-envoy-banana` in the example above, it takes 3 steps and a
final memory comparison (which is much faster than comparing one character at a time). For
misses, some cases will be faster (when the target shares a prefix with an entry), but some
will be very slightly slower due to more expensive comparisons and dynamic function selection
in the compiled version.

Benchmarks versus a fixed-size 256-branch trie, for the static header map - the hit benchmark
searches for the full range of static headers, and the miss benchmark uses a small set of
arbitrary non-matching headers:

Benchmark | trie | compiled | trieStdDev | compiledStdDev | change
-- | -- | -- | -- | -- | --
bmHeaderMapImplRequestStaticLookupHits | 47.2ns | 16.4ns | 0.629 | 0.378 | -65.3%
bmHeaderMapImplResponseStaticLookupHits | 34.7ns | 14.3ns | 0.571 | 0.085 | -58.8%
bmHeaderMapImplRequestStaticLookupMisses | 6.89ns | 6.83ns | 0.044 | 0.034 | -0.9%
bmHeaderMapImplResponseStaticLookupMisses | 6.40ns | 7.31ns | 0.028 | 0.057 | +14.2%
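
The numbers above come from the header_map_impl static-lookup benchmarks added in the first commit of this PR. As a rough sketch only (not the actual benchmark code; the map contents and benchmark names here are made up), a Google Benchmark harness for `CompiledStringMap::find` could look like this:

```cpp
#include "source/common/common/compiled_string_map.h"

#include "benchmark/benchmark.h"

namespace {

Envoy::CompiledStringMap<const char*> buildMap() {
  Envoy::CompiledStringMap<const char*> map;
  map.compile({{"x-envoy-banana", "banana"},
               {"x-envoy-pineapple", "pineapple"},
               {"x-envoy-grape", "grape"}});
  return map;
}

// Measures a key that is present: length table, branch nodes, final memcmp.
void bmCompiledStringMapHit(benchmark::State& state) {
  const auto map = buildMap();
  for (auto _ : state) {
    benchmark::DoNotOptimize(map.find("x-envoy-banana"));
  }
}
BENCHMARK(bmCompiledStringMapHit);

// Measures a key that fails only at the final comparison.
void bmCompiledStringMapMiss(benchmark::State& state) {
  const auto map = buildMap();
  for (auto _ : state) {
    benchmark::DoNotOptimize(map.find("x-envoy-banaka"));
  }
}
BENCHMARK(bmCompiledStringMapMiss);

} // namespace

BENCHMARK_MAIN();
```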

Versus an `absl::flat_hash_map`, hits are slightly faster and misses are significantly
faster. (Benchmark has been lost to time.)

In terms of memory, unlike a regular trie, this structure must contain the full keys
in the leaf nodes, which could potentially make it larger due to, e.g. containing
`x-envoy-` multiple times where a regular trie only has one 'branch' for that.
However, it also contains fewer nodes due to not having a node for every character,
and compared to the prior fixed-size-node 256-branch trie, each node is a lot smaller;
each node in the prior trie used 8kb, versus less than 60 bytes for an average node in
this trie.

[review] Reviewer: Just a minor comment: I think the original trie can be converted to
using the same "shrunk-range" technique (storing only the needed range of characters),
so in terms of memory I think this data structure is more expensive. That said, from
the CPU perf point of view I tend to agree that this data structure should perform
better, although it is not easy for me to say which of the changes in the data
structure (using per-length buckets, splitting not at the prefix, condensing the trie)
provides the most benefit.

[review] Author (ravenblackx): Doing just shrunk-range was actually the original PR
from which this one forked off as a prerequisite; it made performance a bit worse for
the miss case (more so than this version does) and made little difference for hits
(worse for short strings, better for long strings). I think this version is also still
better memory-wise, because every pointer is 64 bits and this trie has far fewer
pointers.

Consider the case of even just one entry, `x-envoy-banana` - in this model it's the
root length node and a leaf node, so it's a vector with 13 nullptrs and one pointer
(because the length node doesn't do the range-shrinking), a length, and one leaf node
containing a value and a string.

In a regular "shrunk" trie, this is 14 nodes, each containing a default value (a
pointer), a pointer to the next node, a one-character vector of chars, and a length -
so that's 48 pointers, which is significantly larger than the "short path" trie. Even
if you cut out the common `x-envoy-` prefix entirely and assume it is shared among so
many headers that it can be amortized away entirely, it's still 18 pointers just for
the 'banana' part of the sequence, which is about the same as the "compiled" trie for
the whole thing (which also, in effect, shares common prefixes!)

Splitting at a more optimal branch point is definitely the main performance enhancer
for hits. (It's similar to what gperf does.)

[review] Reviewer: Yes, you are correct. I keep forgetting that this is not a
compressed-prefix trie, but one that has per-character branching.

[review] Author (ravenblackx): A compressed-prefix trie (assuming this is a name for
"a branch is per-character but also matches the longest string common to all its
children") would probably be nearly as fast as the "branch at best position" trie, and
a similar size, maybe slightly smaller, since it would also only be a few hops to
completion. But it would involve more string compares.

## Limitations

This structure is only useful for non-dynamic data - the cost of the compile step
will outweigh any lookup benefits if the contents need to be modified.

Unlike a regular trie, this structure does not facilitate prefix-matching - you
can't find a "nearest prefix" or a "longest common prefix".

A `gperf` hash would likely be faster than this for hits and slightly slower for
misses, but it also has the additional constraint that the contents must be known
before binary compile time; Envoy's use case supports extensions dynamically
adding to the contents, which precludes `gperf`, or at least makes it
impractical.
1 change: 1 addition & 0 deletions source/common/http/BUILD
@@ -449,6 +449,7 @@ envoy_cc_library(
":headers_lib",
"//envoy/http:header_map_interface",
"//source/common/common:assert_lib",
"//source/common/common:compiled_string_map_lib",
"//source/common/common:dump_state_utils",
"//source/common/common:empty_string",
"//source/common/common:non_copyable",