Skip to content

Commit 22adec5

Browse files
taku910hiroyuki-komatsu
authored andcommitted
- Cleanup RemoveNgramChain by hiding the internal data for recursive call.
- Introduced RemoveEntryWithInnerSegment to remove the user-specifieed key/value that are the prefix of the entry, considering the inner boundary information. PiperOrigin-RevId: 816101364
1 parent 727f876 commit 22adec5

File tree

3 files changed

+236
-106
lines changed

3 files changed

+236
-106
lines changed

src/prediction/user_history_predictor.cc

Lines changed: 143 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
#include <cstddef>
3636
#include <cstdint>
3737
#include <cstdlib>
38+
#include <functional>
3839
#include <iterator>
3940
#include <limits>
4041
#include <memory>
@@ -585,77 +586,148 @@ void UserHistoryPredictor::EraseNextEntries(uint64_t fp, Entry* entry) {
585586
// chain like
586587
// ("aaa", "AAA") -- ("bbb", "BBB") -- ("ccc", "CCC"),
587588
// and if target_key == "aaabbbccc" and target_value == "AAABBBCCC", the link
588-
// from ("bbb", "BBB") to ("ccc", "CCC") is removed. If a link was removed, this
589-
// method returns DONE. If no history entries can produce the target key
590-
// value, then NOT_FOUND is returned. TAIL is returned only when the
591-
// tail was found, e.g., in the above example, when the method finds the tail
592-
// node ("ccc", "CCC").
593-
UserHistoryPredictor::RemoveNgramChainResult
594-
UserHistoryPredictor::RemoveNgramChain(
589+
// from ("bbb", "BBB") to ("ccc", "CCC") is removed.
590+
// return true if a link was removed.
591+
bool UserHistoryPredictor::RemoveNgramChain(
595592
const absl::string_view target_key, const absl::string_view target_value,
596-
Entry* entry, std::vector<absl::string_view>* key_ngrams,
597-
size_t key_ngrams_len, std::vector<absl::string_view>* value_ngrams,
598-
size_t value_ngrams_len) {
593+
Entry* entry) {
599594
DCHECK(entry);
600-
DCHECK(key_ngrams);
601-
DCHECK(value_ngrams);
602-
603-
// Updates the lengths with the current entry node.
604-
key_ngrams_len += entry->key().size();
605-
value_ngrams_len += entry->value().size();
606-
607-
// This is the case where ngram key and value are shorter than the target key
608-
// and value, respectively. In this case, we need to find further entries to
609-
// concatenate in order to make |target_key| and |target_value|.
610-
if (key_ngrams_len < target_key.size() &&
611-
value_ngrams_len < target_value.size()) {
612-
key_ngrams->push_back(entry->key());
613-
value_ngrams->push_back(entry->value());
614-
for (const uint64_t fp : entry->next_entry_fps()) {
615-
Entry* e = dic_->MutableLookupWithoutInsert(fp);
616-
if (e == nullptr) {
617-
continue;
595+
596+
if (!target_key.starts_with(entry->key()) ||
597+
!target_value.starts_with(entry->value())) {
598+
return false;
599+
}
600+
601+
// Returns value of RemoveNgramChain() method. See the comments in
602+
// implementation.
603+
enum RemoveNgramChainResult {
604+
DONE,
605+
TAIL,
606+
NOT_FOUND,
607+
};
608+
609+
std::vector<absl::string_view> key_ngrams, value_ngrams;
610+
std::function<RemoveNgramChainResult(Entry*, size_t, size_t)>
611+
remove_ngram_chain_internal =
612+
[&](Entry* entry, size_t key_ngrams_len,
613+
size_t value_ngrams_len) -> RemoveNgramChainResult {
614+
// Updates the lengths with the current entry node.
615+
key_ngrams_len += entry->key().size();
616+
value_ngrams_len += entry->value().size();
617+
618+
// This is the case where ngram key and value are shorter than the target
619+
// key and value, respectively. In this case, we need to find further
620+
// entries to concatenate in order to make |target_key| and |target_value|.
621+
if (key_ngrams_len < target_key.size() &&
622+
value_ngrams_len < target_value.size()) {
623+
key_ngrams.push_back(entry->key());
624+
value_ngrams.push_back(entry->value());
625+
for (const uint64_t fp : entry->next_entry_fps()) {
626+
Entry* e = dic_->MutableLookupWithoutInsert(fp);
627+
if (e == nullptr) {
628+
continue;
629+
}
630+
switch (
631+
remove_ngram_chain_internal(e, key_ngrams_len, value_ngrams_len)) {
632+
case DONE:
633+
return DONE;
634+
case TAIL:
635+
// |entry| is the second-to-the-last node. So cut the link to the
636+
// child entry.
637+
EraseNextEntries(fp, entry);
638+
return DONE;
639+
default:
640+
break;
641+
}
618642
}
619-
const RemoveNgramChainResult r =
620-
RemoveNgramChain(target_key, target_value, e, key_ngrams,
621-
key_ngrams_len, value_ngrams, value_ngrams_len);
622-
switch (r) {
623-
case RemoveNgramChainResult::DONE:
624-
return RemoveNgramChainResult::DONE;
625-
case RemoveNgramChainResult::TAIL:
626-
// |entry| is the second-to-the-last node. So cut the link to the
627-
// child entry.
628-
EraseNextEntries(fp, entry);
629-
return RemoveNgramChainResult::DONE;
630-
default:
631-
break;
643+
// Recovers the state.
644+
key_ngrams.pop_back();
645+
value_ngrams.pop_back();
646+
return NOT_FOUND;
647+
}
648+
649+
// This is the case where the current ngram key and value have the same
650+
// lengths as those of |target_key| and |target_value|, respectively.
651+
if (key_ngrams_len == target_key.size() &&
652+
value_ngrams_len == target_value.size()) {
653+
key_ngrams.push_back(entry->key());
654+
value_ngrams.push_back(entry->value());
655+
const std::string ngram_key = absl::StrJoin(key_ngrams, "");
656+
const std::string ngram_value = absl::StrJoin(value_ngrams, "");
657+
if (ngram_key == target_key && ngram_value == target_value) {
658+
// |entry| is the last node. So return TAIL to tell the caller so that
659+
// it can remove the link to this last node.
660+
return TAIL;
632661
}
662+
key_ngrams.pop_back();
663+
value_ngrams.pop_back();
664+
return NOT_FOUND;
633665
}
634-
// Recovers the state.
635-
key_ngrams->pop_back();
636-
value_ngrams->pop_back();
637-
return RemoveNgramChainResult::NOT_FOUND;
666+
667+
return NOT_FOUND;
668+
};
669+
670+
return remove_ngram_chain_internal(entry, 0, 0) == DONE;
671+
}
672+
673+
// static
674+
bool UserHistoryPredictor::RemoveEntryWithInnerSegment(absl::string_view key,
675+
absl::string_view value,
676+
Entry* entry) {
677+
DCHECK(entry);
678+
679+
if (entry->inner_segment_boundary_size() == 0) {
680+
return false;
638681
}
639682

640-
// This is the case where the current ngram key and value have the same
641-
// lengths as those of |target_key| and |target_value|, respectively.
642-
if (key_ngrams_len == target_key.size() &&
643-
value_ngrams_len == target_value.size()) {
644-
key_ngrams->push_back(entry->key());
645-
value_ngrams->push_back(entry->value());
646-
const std::string ngram_key = absl::StrJoin(*key_ngrams, "");
647-
const std::string ngram_value = absl::StrJoin(*value_ngrams, "");
648-
if (ngram_key == target_key && ngram_value == target_value) {
649-
// |entry| is the last node. So return TAIL to tell the caller so
650-
// that it can remove the link to this last node.
651-
return RemoveNgramChainResult::TAIL;
683+
// TODO(taku): Support multiple matches.
684+
685+
// Checks from value as value is less likely matched.
686+
const auto value_start = absl::string_view(entry->value()).find(value);
687+
if (value_start == absl::string_view::npos) return false;
688+
689+
const auto key_start = absl::string_view(entry->key()).find(key);
690+
if (key_start == absl::string_view::npos) return false;
691+
692+
const converter::InnerSegments inner_segments(
693+
entry->key(), entry->value(), entry->inner_segment_boundary());
694+
695+
const uint32_t key_end = key_start + key.size();
696+
const uint32_t value_end = value_start + value.size();
697+
698+
uint32_t key_len = 0;
699+
uint32_t value_len = 0;
700+
bool start_match = false;
701+
702+
for (const auto& it : inner_segments) {
703+
if (key_len == key_start && value_len == value_start) {
704+
start_match = true;
705+
}
706+
707+
// Ends at the key/value or content_key/value boundary.
708+
if ((key_len + it.GetKey().size() == key_end &&
709+
value_len + it.GetValue().size() == value_end) ||
710+
(key_len + it.GetContentKey().size() == key_end &&
711+
value_len + it.GetContentValue().size() == value_end)) {
712+
if (start_match) {
713+
entry->set_suggestion_freq(0);
714+
entry->set_shown_freq(0);
715+
entry->set_removed(true);
716+
return true;
717+
}
718+
return false;
719+
}
720+
721+
key_len += it.GetKey().size();
722+
value_len += it.GetValue().size();
723+
724+
// Exceeds the boundary.
725+
if (!start_match && (key_len > key_start || value_len > value_start)) {
726+
return false;
652727
}
653-
key_ngrams->pop_back();
654-
value_ngrams->pop_back();
655-
return RemoveNgramChainResult::NOT_FOUND;
656728
}
657729

658-
return RemoveNgramChainResult::NOT_FOUND;
730+
return false;
659731
}
660732

661733
bool UserHistoryPredictor::ClearHistoryEntry(const absl::string_view key,
@@ -674,26 +746,26 @@ bool UserHistoryPredictor::ClearHistoryEntry(const absl::string_view key,
674746
deleted = true;
675747
}
676748
}
749+
677750
{
678-
// Finds a chain of history entries that produces key and value. If exists,
679-
// remove the link so that N-gram history prediction never generates this
680-
// key value pair..
751+
// Two case
752+
// 1) Finds a chain of history entries that produces key and value.
753+
// If exists, remove the link so that N-gram history prediction never
754+
// generates this key value pair.
755+
// 2) key/value are the substring of entry.
681756
for (DicElement& elm : *dic_) {
682757
Entry* entry = &elm.value;
683-
if (!key.starts_with(entry->key()) ||
684-
!value.starts_with(entry->value())) {
685-
continue;
686-
}
687-
std::vector<absl::string_view> key_ngrams, value_ngrams;
688-
if (RemoveNgramChain(key, value, entry, &key_ngrams, 0, &value_ngrams,
689-
0) == RemoveNgramChainResult::DONE) {
758+
if (RemoveNgramChain(key, value, entry) ||
759+
RemoveEntryWithInnerSegment(key, value, entry)) {
690760
deleted = true;
691761
}
692762
}
693763
}
764+
694765
if (deleted) {
695766
updated_ = true;
696767
}
768+
697769
return deleted;
698770
}
699771

src/prediction/user_history_predictor.h

Lines changed: 10 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -194,14 +194,6 @@ class UserHistoryPredictor : public PredictorInterface {
194194
EXACT_MATCH, // right string == left string
195195
};
196196

197-
// Returns value of RemoveNgramChain() method. See the comments in
198-
// implementation.
199-
enum class RemoveNgramChainResult {
200-
DONE,
201-
TAIL,
202-
NOT_FOUND,
203-
};
204-
205197
// Returns true if this predictor should return results for the input.
206198
bool ShouldPredict(const ConversionRequest& request) const;
207199

@@ -431,13 +423,16 @@ class UserHistoryPredictor : public PredictorInterface {
431423

432424
static void EraseNextEntries(uint64_t fp, Entry* entry);
433425

434-
// Recursively removes a chain of Entries in |dic_|. See the comment in
435-
// implementation for details.
436-
RemoveNgramChainResult RemoveNgramChain(
437-
absl::string_view target_key, absl::string_view target_value,
438-
Entry* entry, std::vector<absl::string_view>* key_ngrams,
439-
size_t key_ngrams_len, std::vector<absl::string_view>* value_ngrams,
440-
size_t value_ngrams_len);
426+
// Recursively removes a chain of Entries in |dic_|.
427+
// Returns true if at least one chain is removed.
428+
bool RemoveNgramChain(absl::string_view target_key,
429+
absl::string_view target_value, Entry* entry);
430+
431+
// Removes entry when key/value are the prefix of entry with
432+
// the inner boundary constraint. entry->removed() gets true when removed.
433+
static bool RemoveEntryWithInnerSegment(absl::string_view key,
434+
absl::string_view value,
435+
Entry* entry);
441436

442437
// Returns true if the input first candidate seems to be a privacy sensitive
443438
// such like password.

0 commit comments

Comments
 (0)