Skip to content

Commit

Permalink
[string matching] Update Hits in FuzzyTokenizedStringMatch.
Browse files Browse the repository at this point in the history
Update the `hits_` calculation so that it reflects the hits produced by the highest-scoring matching algorithm.

Also fix a bug in the hits() calculation in the AcronymMatcher.


Bug: b/259443247
Change-Id: I3ba532dd6460fe9b2a124aa33079de46f376e124
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/4036950
Reviewed-by: Amanda Deacon <amandadeacon@chromium.org>
Commit-Queue: CJ Huang <chenjih@google.com>
Auto-Submit: CJ Huang <chenjih@google.com>
Cr-Commit-Position: refs/heads/main@{#1073885}
  • Loading branch information
CJ Huang authored and Chromium LUCI CQ committed Nov 21, 2022
1 parent 30595ae commit 2bd7d57
Show file tree
Hide file tree
Showing 6 changed files with 159 additions and 45 deletions.
2 changes: 1 addition & 1 deletion chromeos/ash/components/string_matching/acronym_matcher.cc
Expand Up @@ -64,7 +64,7 @@ double AcronymMatcher::CalculateRelevance() {

// update the |hits_| if there is a match.
for (size_t i = 0; i < query_.size(); i++) {
size_t start_pos = text_mapping_[i].start();
size_t start_pos = text_mapping_[found_index + i].start();
hits_.emplace_back(start_pos, start_pos + 1);
}

Expand Down
Expand Up @@ -20,6 +20,9 @@ constexpr double kAbsError = 1e-5;

// Returns a string of |text| marked with the hits in |match| using block
// bracket. e.g. text= "Text", match.hits = [{0,1}], returns "[T]ext".
//
// TODO(crbug.com/1336160): Consider defining it as a |test_util| function as it
// has been used for several unit tests.
std::u16string MatchHit(const std::u16string& text,
const AcronymMatcher& match) {
std::u16string marked = text;
Expand Down Expand Up @@ -104,6 +107,7 @@ TEST_F(AcronymMatcherTest, MatchHit) {
{u"Crash of Crowns", u"coc", u"[C]rash [o]f [C]rowns"},
{u"Crash of Crowns", u"cra", u"Crash of Crowns"},
{u"abcxxx bxxx cxxx", u"abc", u"[a]bcxxx [b]xxx [c]xxx"},
{u"xxx abcxxx bxxx cxxx", u"abc", u"xxx [a]bcxxx [b]xxx [c]xxx"},
};

for (auto& test_case : kTestCases) {
Expand Down
Expand Up @@ -25,6 +25,8 @@ namespace ash::string_matching {

namespace {

using Hits = FuzzyTokenizedStringMatch::Hits;

constexpr double kPartialMatchPenaltyRate = 0.9;

constexpr double kMinScore = 0.0;
Expand All @@ -43,6 +45,10 @@ std::vector<std::u16string> ProcessAndSort(const TokenizedString& text) {
return result;
}

// Maps a raw relevance score in [0, +inf) onto [0, 1): 0 maps to 0, and the
// result approaches 1.0 asymptotically as |relevance| grows.
double ScaledRelevance(const double relevance) {
  const double decay = std::pow(0.5, relevance);
  return 1.0 - decay;
}

} // namespace

// Defaulted out-of-line so the destructor is emitted in this translation unit.
FuzzyTokenizedStringMatch::~FuzzyTokenizedStringMatch() = default;
Expand Down Expand Up @@ -209,13 +215,36 @@ double FuzzyTokenizedStringMatch::PrefixMatcher(const TokenizedString& query,
const TokenizedString& text) {
string_matching::PrefixMatcher match(query, text);
match.Match();
return 1.0 - std::pow(0.5, match.relevance());
return ScaledRelevance(match.relevance());
}

double FuzzyTokenizedStringMatch::AcronymMatcher(const TokenizedString& query,
const TokenizedString& text) {
string_matching::AcronymMatcher match(query, text);
return 1.0 - std::pow(0.5, match.CalculateRelevance());
const double relevance = match.CalculateRelevance();
return ScaledRelevance(relevance);
}

double FuzzyTokenizedStringMatch::PrefixMatcher(
const TokenizedString& query,
const TokenizedString& text,
std::vector<Hits>& hits_vector) {
string_matching::PrefixMatcher match(query, text);
match.Match();

hits_vector.emplace_back(match.hits());
return ScaledRelevance(match.relevance());
}

double FuzzyTokenizedStringMatch::AcronymMatcher(
const TokenizedString& query,
const TokenizedString& text,
std::vector<Hits>& hits_vector) {
string_matching::AcronymMatcher match(query, text);
const double relevance = match.CalculateRelevance();

hits_vector.emplace_back(match.hits());
return ScaledRelevance(relevance);
}

double FuzzyTokenizedStringMatch::Relevance(const TokenizedString& query_input,
Expand Down Expand Up @@ -247,44 +276,53 @@ double FuzzyTokenizedStringMatch::Relevance(const TokenizedString& query_input,
return 1.0;
}

// TODO(crbug.com/1336160): Consider calculating |hits_| based on best
// matching result. Currently, both SequenceMatcher and PrefixMatcher have
// their own |hits_|.
//
// Find |hits_| using SequenceMatcher on original query and text.
for (const auto& match :
SequenceMatcher(query_text, text_text).GetMatchingBlocks()) {
if (match.length > 0) {
hits_.emplace_back(match.pos_second_string,
match.pos_second_string + match.length);
}
}

// If the query is much longer than the text then it's often not a match.
if (query_size >= text_size * 2) {
return 0.0;
}

// The |relevances| stores the |relevance_scores| calculated from different
// string matching methods. The highest result among them will be returned.
std::vector<double> relevances;
// The |hits_vector| stores the |hits| calculated from different string
// matching methods. The final selected instance corresponds to the hits
// generated by the matching algorithm which yielded the highest relevance
// score. The final selected instance will be assigned to |hits_| then.
std::vector<Hits> hits_vector;

double prefix_score = PrefixMatcher(query, text);
double prefix_score = PrefixMatcher(query, text, hits_vector);
// A scoring boost for short prefix matching queries.
if (query_size <= kMaxBoostSize && prefix_score > kMinScore) {
prefix_score = std::min(
1.0, prefix_score + 2.0 / (query_size * (query_size + text_size)));
}

relevances.emplace_back(prefix_score);

// Find hits using SequenceMatcher on original query and text.
Hits sequence_hits;
for (const auto& match :
SequenceMatcher(query_text, text_text).GetMatchingBlocks()) {
if (match.length > 0) {
sequence_hits.emplace_back(match.pos_second_string,
match.pos_second_string + match.length);
}
}
hits_vector.emplace_back(sequence_hits);

relevances.emplace_back(
use_weighted_ratio ? WeightedRatio(query, text)
: SequenceMatcher(base::i18n::ToLower(query_text),
base::i18n::ToLower(text_text))
.Ratio(/*use_text_length_agnosticism=*/true));
if (use_acronym_matcher) {
relevances.emplace_back(AcronymMatcher(query, text));
relevances.emplace_back(AcronymMatcher(query, text, hits_vector));
}

return *std::max_element(relevances.begin(), relevances.end());
size_t best_match_pos =
std::max_element(relevances.begin(), relevances.end()) -
relevances.begin();
hits_ = hits_vector[best_match_pos];
return relevances[best_match_pos];
}

} // namespace ash::string_matching
Expand Up @@ -5,6 +5,8 @@
#ifndef CHROMEOS_ASH_COMPONENTS_STRING_MATCHING_FUZZY_TOKENIZED_STRING_MATCH_H_
#define CHROMEOS_ASH_COMPONENTS_STRING_MATCHING_FUZZY_TOKENIZED_STRING_MATCH_H_

#include <vector>

#include "chromeos/ash/components/string_matching/tokenized_string.h"
#include "ui/gfx/range/range.h"

Expand Down Expand Up @@ -67,9 +69,8 @@ class FuzzyTokenizedStringMatch {
static double WeightedRatio(const TokenizedString& query,
const TokenizedString& text);

// Since prefix match should always be favored over other matches, this
// function is dedicated to calculate a prefix match score in range of [0, 1]
// using PrefixMatcher class.
// This function is dedicated to calculate a prefix match score in range of
// [0, 1] using PrefixMatcher class.
static double PrefixMatcher(const TokenizedString& query,
const TokenizedString& text);

Expand All @@ -89,6 +90,18 @@ class FuzzyTokenizedStringMatch {
const Hits& hits() const { return hits_; }

private:
// This function is dedicated to calculate a prefix match score in range of
// [0, 1] and its hits information using PrefixMatcher class.
static double PrefixMatcher(const TokenizedString& query,
const TokenizedString& text,
std::vector<Hits>& hits_vector);
// This function is dedicated to calculate a first character match (aka
// acronym match) score in range of [0, 1] and its hits information using
// AcronymMatcher class.
static double AcronymMatcher(const TokenizedString& query,
const TokenizedString& text,
std::vector<Hits>& hits_vector);

Hits hits_;
};

Expand Down
Expand Up @@ -4,6 +4,7 @@

#include "chromeos/ash/components/string_matching/fuzzy_tokenized_string_match.h"

#include "base/containers/adapters.h"
#include "base/logging.h"
#include "base/strings/stringprintf.h"
#include "base/strings/utf_string_conversions.h"
Expand All @@ -25,6 +26,7 @@ constexpr double kCompleteMismatchScore = 0.0;

// Default parameters.
constexpr bool kUseWeightedRatio = false;
constexpr bool kStripDiacritics = false;

void ExpectAllNearlyEqualTo(const std::vector<double>& scores,
double target_score,
Expand Down Expand Up @@ -103,6 +105,23 @@ std::string FormatRelevanceResult(const std::u16string& query,
}
}

// Returns a string of |text| marked with the hits using block
// bracket. e.g. text= "Text", hits = [{0,1}], returns "[T]ext".
//
// TODO(crbug.com/1336160): Consider defining it as a |test_util| function as it
// has been used for several unit tests.
// Returns a copy of |text| with each range in |hits| wrapped in square
// brackets. e.g. text = "Text", hits = [{0,1}] returns "[T]ext".
//
// TODO(crbug.com/1336160): Consider defining it as a |test_util| function as it
// has been used for several unit tests.
std::u16string MatchHit(const std::u16string& text,
                        const FuzzyTokenizedStringMatch::Hits& hits) {
  std::u16string marked = text;

  // Walk the hits back-to-front so that inserting brackets does not shift
  // the offsets of hits that have not been processed yet.
  for (auto it = hits.rbegin(); it != hits.rend(); ++it) {
    marked.insert(it->end(), 1, u']');
    marked.insert(it->start(), 1, u'[');
  }

  return marked;
}

} // namespace

class FuzzyTokenizedStringMatchTest : public testing::Test {};
Expand Down Expand Up @@ -831,25 +850,49 @@ TEST_F(FuzzyTokenizedStringMatchTest,
}

TEST_F(FuzzyTokenizedStringMatchTest,
BenchmarkPrefixVsSameLengthNonPrefixMultiToken) {
BenchmarkHitsMatchHighestMatchingAlgorithm) {
FuzzyTokenizedStringMatch match;
std::u16string text = u"xyzabc abcdef";
std::u16string query = u"abc";
const double relevance = match.Relevance(
TokenizedString(query), TokenizedString(text), kUseWeightedRatio);

VLOG(1) << FormatRelevanceResult(query, text, relevance,
/*query_first*/ false);
struct {
const std::u16string text;
const std::u16string query;
const std::u16string expect;
const bool use_acronym_matcher;
} kTestCases[] = {
// Prefix Matcher is favored over Sequence Matcher.
{u"xyzabc abcdef", u"abc", u"xyzabc [abc]def",
/*use_acronym_matcher=*/true},
{u"xyzabc abcdef", u"abc", u"xyzabc [abc]def",
/*use_acronym_matcher=*/false},
// Acronym Matcher is favored over Sequence Matcher.
{u"dabc axyz bxyz cxzy", u"abc", u"dabc [a]xyz [b]xyz [c]xzy",
/*use_acronym_matcher=*/true},
{u"dabc axyz bxyz cxzy", u"abc", u"d[abc] axyz bxyz cxzy",
/*use_acronym_matcher=*/false},
// Prefix Matcher at first token is favored over Acronym Matcher.
{u"abcxyz bxyz cxzy", u"abc", u"[abc]xyz bxyz cxzy",
/*use_acronym_matcher=*/true},
{u"abcxyz bxyz cxzy", u"abc", u"[abc]xyz bxyz cxzy",
/*use_acronym_matcher=*/false},
// Acronym Matcher is favored over Prefix Matcher at non-first token.
{u"def abcxyz bxyz cxzy", u"abc", u"def [a]bcxyz [b]xyz [c]xzy",
/*use_acronym_matcher=*/true},
{u"def abcxyz bxyz cxzy", u"abc", u"def [abc]xyz bxyz cxzy",
/*use_acronym_matcher=*/false},
};

EXPECT_EQ(match.hits().size(), 1u);
for (auto& test_case : kTestCases) {
const TokenizedString query(test_case.query);
const TokenizedString text(test_case.text);

// TODO(crbug.com/1336160): Currently the "abc" of "abcdef" is matched, but
// the |hit_| marks the "abc" of "xyzabc" instead. Consider an implementation
// which |hit_| matches the highest matching algorithm, instead of the result
// of sequence match, i.e.:
//
// EXPECT_EQ(match.hits()[0].start(), 7u);
// EXPECT_EQ(match.hits()[0].end(), 10u);
double relevance =
match.Relevance(query, text, kUseWeightedRatio, kStripDiacritics,
/*use_acronym_matcher=*/test_case.use_acronym_matcher);

VLOG(1) << FormatRelevanceResult(test_case.query, test_case.text, relevance,
/*query_first*/ false);
EXPECT_EQ(test_case.expect, MatchHit(test_case.text, match.hits()));
}
}

TEST_F(FuzzyTokenizedStringMatchTest,
Expand Down Expand Up @@ -1118,7 +1161,6 @@ TEST_F(FuzzyTokenizedStringMatchTest, BenchmarkKeyboardShortcutsScreenshot) {
}
}

// TODO(crbug.com/1323910): Improve word order flexibility.
TEST_F(FuzzyTokenizedStringMatchTest, BenchmarkKeyboardShortcutsDesk) {
std::u16string text = u"Create a new desk";
std::u16string text_lower = u"create a new desk";
Expand All @@ -1137,12 +1179,18 @@ TEST_F(FuzzyTokenizedStringMatchTest, BenchmarkKeyboardShortcutsDesk) {
u"create a new des",
u"create a new desk"};
std::vector<std::u16string> queries_missing_words = {
u"create a d", u"create a de", u"create a des", u"create a desk",
u"create d", u"create de", u"create des", u"create desk",
u"create n", u"create ne", u"create new", u"create new ",
u"create new d", u"create new de", u"create new des", u"create new desk",
u"new ", u"new d", u"new de", u"new des",
u"new desk", u"desk"};
u"new ", u"desk",
u"new d", u"new de",
u"new des", u"new desk",
u"create d", u"create n",
u"create de", u"create ne",
u"create a d", u"create a de",
u"create new", u"create des",
u"create new ", u"create desk",
u"create a des", u"create new d",
u"create a desk", u"create new de",
u"create new des", u"create new desk",
};

std::vector<double> scores;
for (const auto& query : queries_strict_prefix) {
Expand All @@ -1154,11 +1202,19 @@ TEST_F(FuzzyTokenizedStringMatchTest, BenchmarkKeyboardShortcutsDesk) {
// Allow a flexible (rather than strict) increase in scores.
ExpectMostlyIncreasing(scores, /*epsilon*/ 0.005);

scores.clear();
for (const auto& query : queries_missing_words) {
const double relevance = CalculateRelevance(query, text);
scores.push_back(relevance);
VLOG(1) << FormatRelevanceResult(query, text, relevance,
/*query_first*/ false);
}

// With word order flexibility, the scores are expected to increase as the
// query length increases.
//
// Allow a flexible (rather than strict) increase in scores.
ExpectMostlyIncreasing(scores, /*epsilon*/ 0.005);
}

TEST_F(FuzzyTokenizedStringMatchTest, BenchmarkKeyboardShortcutsEmojiPicker) {
Expand Down
Expand Up @@ -18,6 +18,9 @@ namespace {

// Returns a string of |text| marked the hits in |match| using block bracket.
// e.g. text= "Text", match.hits = [{0,1}], returns "[T]ext".
//
// TODO(crbug.com/1336160): Consider defining it as a |test_util| function as it
// has been used for several unit tests.
std::string MatchHit(const std::u16string& text,
const TokenizedStringMatch& match) {
std::u16string marked = text;
Expand Down

0 comments on commit 2bd7d57

Please sign in to comment.