Skip to content

Commit

Permalink
[string matching] Update Hits in FuzzyTokenizedStringMatch.
Browse files Browse the repository at this point in the history
Update the `hits_` calculation so that it reflects the hits produced by the highest-scoring matching algorithm.

Also fix a bug in the hits() calculation in the AcronymMatcher.


Bug: b/259443247
Change-Id: I3ba532dd6460fe9b2a124aa33079de46f376e124
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/4036950
Reviewed-by: Amanda Deacon <amandadeacon@chromium.org>
Commit-Queue: CJ Huang <chenjih@google.com>
Auto-Submit: CJ Huang <chenjih@google.com>
Cr-Commit-Position: refs/heads/main@{#1073885}
  • Loading branch information
CJ Huang authored and Chromium LUCI CQ committed Nov 21, 2022
1 parent 30595ae commit 2bd7d57
Show file tree
Hide file tree
Showing 6 changed files with 159 additions and 45 deletions.
2 changes: 1 addition & 1 deletion chromeos/ash/components/string_matching/acronym_matcher.cc
Expand Up @@ -64,7 +64,7 @@ double AcronymMatcher::CalculateRelevance() {

// update the |hits_| if there is a match.
for (size_t i = 0; i < query_.size(); i++) {
size_t start_pos = text_mapping_[i].start();
size_t start_pos = text_mapping_[found_index + i].start();
hits_.emplace_back(start_pos, start_pos + 1);
}

Expand Down
Expand Up @@ -20,6 +20,9 @@ constexpr double kAbsError = 1e-5;

// Returns a string of |text| marked with the hits in |match| using block
// bracket. e.g. text= "Text", match.hits = [{0,1}], returns "[T]ext".
//
// TODO(crbug.com/1336160): Consider defining it as a |test_util| function as it
// has been used for several unit tests.
std::u16string MatchHit(const std::u16string& text,
const AcronymMatcher& match) {
std::u16string marked = text;
Expand Down Expand Up @@ -104,6 +107,7 @@ TEST_F(AcronymMatcherTest, MatchHit) {
{u"Crash of Crowns", u"coc", u"[C]rash [o]f [C]rowns"},
{u"Crash of Crowns", u"cra", u"Crash of Crowns"},
{u"abcxxx bxxx cxxx", u"abc", u"[a]bcxxx [b]xxx [c]xxx"},
{u"xxx abcxxx bxxx cxxx", u"abc", u"xxx [a]bcxxx [b]xxx [c]xxx"},
};

for (auto& test_case : kTestCases) {
Expand Down
Expand Up @@ -25,6 +25,8 @@ namespace ash::string_matching {

namespace {

using Hits = FuzzyTokenizedStringMatch::Hits;

constexpr double kPartialMatchPenaltyRate = 0.9;

constexpr double kMinScore = 0.0;
Expand All @@ -43,6 +45,10 @@ std::vector<std::u16string> ProcessAndSort(const TokenizedString& text) {
return result;
}

// Maps a raw relevance score in [0, +inf) onto [0, 1): 0 maps to 0, and the
// result approaches 1.0 asymptotically as |relevance| grows.
double ScaledRelevance(const double relevance) {
  const double decay = std::pow(0.5, relevance);
  return 1.0 - decay;
}

} // namespace

// Defaulted out-of-line so the destructor is emitted in this translation unit.
FuzzyTokenizedStringMatch::~FuzzyTokenizedStringMatch() = default;
Expand Down Expand Up @@ -209,13 +215,36 @@ double FuzzyTokenizedStringMatch::PrefixMatcher(const TokenizedString& query,
const TokenizedString& text) {
string_matching::PrefixMatcher match(query, text);
match.Match();
return 1.0 - std::pow(0.5, match.relevance());
return ScaledRelevance(match.relevance());
}

double FuzzyTokenizedStringMatch::AcronymMatcher(const TokenizedString& query,
const TokenizedString& text) {
string_matching::AcronymMatcher match(query, text);
return 1.0 - std::pow(0.5, match.CalculateRelevance());
const double relevance = match.CalculateRelevance();
return ScaledRelevance(relevance);
}

double FuzzyTokenizedStringMatch::PrefixMatcher(
const TokenizedString& query,
const TokenizedString& text,
std::vector<Hits>& hits_vector) {
string_matching::PrefixMatcher match(query, text);
match.Match();

hits_vector.emplace_back(match.hits());
return ScaledRelevance(match.relevance());
}

double FuzzyTokenizedStringMatch::AcronymMatcher(
const TokenizedString& query,
const TokenizedString& text,
std::vector<Hits>& hits_vector) {
string_matching::AcronymMatcher match(query, text);
const double relevance = match.CalculateRelevance();

hits_vector.emplace_back(match.hits());
return ScaledRelevance(relevance);
}

double FuzzyTokenizedStringMatch::Relevance(const TokenizedString& query_input,
Expand Down Expand Up @@ -247,44 +276,53 @@ double FuzzyTokenizedStringMatch::Relevance(const TokenizedString& query_input,
return 1.0;
}

// TODO(crbug.com/1336160): Consider calculating |hits_| based on best
// matching result. Currently, both SequenceMatcher and PrefixMatcher have
// their own |hits_|.
//
// Find |hits_| using SequenceMatcher on original query and text.
for (const auto& match :
SequenceMatcher(query_text, text_text).GetMatchingBlocks()) {
if (match.length > 0) {
hits_.emplace_back(match.pos_second_string,
match.pos_second_string + match.length);
}
}

// If the query is much longer than the text then it's often not a match.
if (query_size >= text_size * 2) {
return 0.0;
}

// The |relevances| stores the |relevance_scores| calculated from different
// string matching methods. The highest result among them will be returned.
std::vector<double> relevances;
// The |hits_vector| stores the |hits| calculated from different string
// matching methods. The final selected instance corresponds to the hits
// generated by the matching algorithm which yielded the highest relevance
// score. The final selected instance will be assigned to |hits_| then.
std::vector<Hits> hits_vector;

double prefix_score = PrefixMatcher(query, text);
double prefix_score = PrefixMatcher(query, text, hits_vector);
// A scoring boost for short prefix matching queries.
if (query_size <= kMaxBoostSize && prefix_score > kMinScore) {
prefix_score = std::min(
1.0, prefix_score + 2.0 / (query_size * (query_size + text_size)));
}

relevances.emplace_back(prefix_score);

// Find hits using SequenceMatcher on original query and text.
Hits sequence_hits;
for (const auto& match :
SequenceMatcher(query_text, text_text).GetMatchingBlocks()) {
if (match.length > 0) {
sequence_hits.emplace_back(match.pos_second_string,
match.pos_second_string + match.length);
}
}
hits_vector.emplace_back(sequence_hits);

relevances.emplace_back(
use_weighted_ratio ? WeightedRatio(query, text)
: SequenceMatcher(base::i18n::ToLower(query_text),
base::i18n::ToLower(text_text))
.Ratio(/*use_text_length_agnosticism=*/true));
if (use_acronym_matcher) {
relevances.emplace_back(AcronymMatcher(query, text));
relevances.emplace_back(AcronymMatcher(query, text, hits_vector));
}

return *std::max_element(relevances.begin(), relevances.end());
size_t best_match_pos =
std::max_element(relevances.begin(), relevances.end()) -
relevances.begin();
hits_ = hits_vector[best_match_pos];
return relevances[best_match_pos];
}

} // namespace ash::string_matching
Expand Up @@ -5,6 +5,8 @@
#ifndef CHROMEOS_ASH_COMPONENTS_STRING_MATCHING_FUZZY_TOKENIZED_STRING_MATCH_H_
#define CHROMEOS_ASH_COMPONENTS_STRING_MATCHING_FUZZY_TOKENIZED_STRING_MATCH_H_

#include <vector>

#include "chromeos/ash/components/string_matching/tokenized_string.h"
#include "ui/gfx/range/range.h"

Expand Down Expand Up @@ -67,9 +69,8 @@ class FuzzyTokenizedStringMatch {
static double WeightedRatio(const TokenizedString& query,
const TokenizedString& text);

// Since prefix match should always be favored over other matches, this
// function is dedicated to calculate a prefix match score in range of [0, 1]
// using PrefixMatcher class.
// This function is dedicated to calculate a prefix match score in range of
// [0, 1] using PrefixMatcher class.
static double PrefixMatcher(const TokenizedString& query,
const TokenizedString& text);

Expand All @@ -89,6 +90,18 @@ class FuzzyTokenizedStringMatch {
const Hits& hits() const { return hits_; }

private:
// This function is dedicated to calculate a prefix match score in range of
// [0, 1] and its hits information using PrefixMatcher class.
static double PrefixMatcher(const TokenizedString& query,
const TokenizedString& text,
std::vector<Hits>& hits_vector);
// This function is dedicated to calculate a first character match (aka
// acronym match) score in range of [0, 1] and its hits information using
// AcronymMatcher class.
static double AcronymMatcher(const TokenizedString& query,
const TokenizedString& text,
std::vector<Hits>& hits_vector);

Hits hits_;
};

Expand Down
Expand Up @@ -4,6 +4,7 @@

#include "chromeos/ash/components/string_matching/fuzzy_tokenized_string_match.h"

#include "base/containers/adapters.h"
#include "base/logging.h"
#include "base/strings/stringprintf.h"
#include "base/strings/utf_string_conversions.h"
Expand All @@ -25,6 +26,7 @@ constexpr double kCompleteMismatchScore = 0.0;

// Default parameters.
constexpr bool kUseWeightedRatio = false;
constexpr bool kStripDiacritics = false;

void ExpectAllNearlyEqualTo(const std::vector<double>& scores,
double target_score,
Expand Down Expand Up @@ -103,6 +105,23 @@ std::string FormatRelevanceResult(const std::u16string& query,
}
}

// Returns a string of |text| marked with the hits using block
// bracket. e.g. text= "Text", hits = [{0,1}], returns "[T]ext".
//
// TODO(crbug.com/1336160): Consider defining it as a |test_util| function as it
// has been used for several unit tests.
// Returns a copy of |text| with each range in |hits| wrapped in square
// brackets. e.g. text = "Text", hits = [{0,1}] returns "[T]ext".
//
// TODO(crbug.com/1336160): Consider defining it as a |test_util| function as it
// has been used for several unit tests.
std::u16string MatchHit(const std::u16string& text,
                        const FuzzyTokenizedStringMatch::Hits& hits) {
  std::u16string marked = text;

  // Walk the hits back-to-front so that inserting brackets does not shift
  // the offsets of hits that have not been processed yet.
  for (auto it = hits.rbegin(); it != hits.rend(); ++it) {
    marked.insert(it->end(), 1, u']');
    marked.insert(it->start(), 1, u'[');
  }

  return marked;
}

} // namespace

class FuzzyTokenizedStringMatchTest : public testing::Test {};
Expand Down Expand Up @@ -831,25 +850,49 @@ TEST_F(FuzzyTokenizedStringMatchTest,
}

TEST_F(FuzzyTokenizedStringMatchTest,
BenchmarkPrefixVsSameLengthNonPrefixMultiToken) {
BenchmarkHitsMatchHighestMatchingAlgorithm) {
FuzzyTokenizedStringMatch match;
std::u16string text = u"xyzabc abcdef";
std::u16string query = u"abc";
const double relevance = match.Relevance(
TokenizedString(query), TokenizedString(text), kUseWeightedRatio);

VLOG(1) << FormatRelevanceResult(query, text, relevance,
/*query_first*/ false);
struct {
const std::u16string text;
const std::u16string query;
const std::u16string expect;
const bool use_acronym_matcher;
} kTestCases[] = {
// Prefix Matcher is favored over Sequence Matcher.
{u"xyzabc abcdef", u"abc", u"xyzabc [abc]def",
/*use_acronym_matcher=*/true},
{u"xyzabc abcdef", u"abc", u"xyzabc [abc]def",
/*use_acronym_matcher=*/false},
// Acronym Matcher is favored over Sequence Matcher.
{u"dabc axyz bxyz cxzy", u"abc", u"dabc [a]xyz [b]xyz [c]xzy",
/*use_acronym_matcher=*/true},
{u"dabc axyz bxyz cxzy", u"abc", u"d[abc] axyz bxyz cxzy",
/*use_acronym_matcher=*/false},
// Prefix Matcher at first token is favored over Acronym Matcher.
{u"abcxyz bxyz cxzy", u"abc", u"[abc]xyz bxyz cxzy",
/*use_acronym_matcher=*/true},
{u"abcxyz bxyz cxzy", u"abc", u"[abc]xyz bxyz cxzy",
/*use_acronym_matcher=*/false},
// Acronym Matcher is favored over Prefix Matcher at non-first token.
{u"def abcxyz bxyz cxzy", u"abc", u"def [a]bcxyz [b]xyz [c]xzy",
/*use_acronym_matcher=*/true},
{u"def abcxyz bxyz cxzy", u"abc", u"def [abc]xyz bxyz cxzy",
/*use_acronym_matcher=*/false},
};

EXPECT_EQ(match.hits().size(), 1u);
for (auto& test_case : kTestCases) {
const TokenizedString query(test_case.query);
const TokenizedString text(test_case.text);

// TODO(crbug.com/1336160): Currently the "abc" of "abcdef" is matched, but
// the |hit_| marks the "abc" of "xyzabc" instead. Consider an implementation
// which |hit_| matches the highest matching algorithm, instead of the result
// of sequence match, i.e.:
//
// EXPECT_EQ(match.hits()[0].start(), 7u);
// EXPECT_EQ(match.hits()[0].end(), 10u);
double relevance =
match.Relevance(query, text, kUseWeightedRatio, kStripDiacritics,
/*use_acronym_matcher=*/test_case.use_acronym_matcher);

VLOG(1) << FormatRelevanceResult(test_case.query, test_case.text, relevance,
/*query_first*/ false);
EXPECT_EQ(test_case.expect, MatchHit(test_case.text, match.hits()));
}
}

TEST_F(FuzzyTokenizedStringMatchTest,
Expand Down Expand Up @@ -1118,7 +1161,6 @@ TEST_F(FuzzyTokenizedStringMatchTest, BenchmarkKeyboardShortcutsScreenshot) {
}
}

// TODO(crbug.com/1323910): Improve word order flexibility.
TEST_F(FuzzyTokenizedStringMatchTest, BenchmarkKeyboardShortcutsDesk) {
std::u16string text = u"Create a new desk";
std::u16string text_lower = u"create a new desk";
Expand All @@ -1137,12 +1179,18 @@ TEST_F(FuzzyTokenizedStringMatchTest, BenchmarkKeyboardShortcutsDesk) {
u"create a new des",
u"create a new desk"};
std::vector<std::u16string> queries_missing_words = {
u"create a d", u"create a de", u"create a des", u"create a desk",
u"create d", u"create de", u"create des", u"create desk",
u"create n", u"create ne", u"create new", u"create new ",
u"create new d", u"create new de", u"create new des", u"create new desk",
u"new ", u"new d", u"new de", u"new des",
u"new desk", u"desk"};
u"new ", u"desk",
u"new d", u"new de",
u"new des", u"new desk",
u"create d", u"create n",
u"create de", u"create ne",
u"create a d", u"create a de",
u"create new", u"create des",
u"create new ", u"create desk",
u"create a des", u"create new d",
u"create a desk", u"create new de",
u"create new des", u"create new desk",
};

std::vector<double> scores;
for (const auto& query : queries_strict_prefix) {
Expand All @@ -1154,11 +1202,19 @@ TEST_F(FuzzyTokenizedStringMatchTest, BenchmarkKeyboardShortcutsDesk) {
// Allow a flexible (rather than strict) increase in scores.
ExpectMostlyIncreasing(scores, /*epsilon*/ 0.005);

scores.clear();
for (const auto& query : queries_missing_words) {
const double relevance = CalculateRelevance(query, text);
scores.push_back(relevance);
VLOG(1) << FormatRelevanceResult(query, text, relevance,
/*query_first*/ false);
}

// With word order flexibility, the scores are expected to increase as the
// query length increases.
//
// Allow a flexible (rather than strict) increase in scores.
ExpectMostlyIncreasing(scores, /*epsilon*/ 0.005);
}

TEST_F(FuzzyTokenizedStringMatchTest, BenchmarkKeyboardShortcutsEmojiPicker) {
Expand Down
Expand Up @@ -18,6 +18,9 @@ namespace {

// Returns a string of |text| marked the hits in |match| using block bracket.
// e.g. text= "Text", match.hits = [{0,1}], returns "[T]ext".
//
// TODO(crbug.com/1336160): Consider defining it as a |test_util| function as it
// has been used for several unit tests.
std::string MatchHit(const std::u16string& text,
const TokenizedStringMatch& match) {
std::u16string marked = text;
Expand Down

0 comments on commit 2bd7d57

Please sign in to comment.