35
35
#include < cstddef>
36
36
#include < cstdint>
37
37
#include < cstdlib>
38
+ #include < functional>
38
39
#include < iterator>
39
40
#include < limits>
40
41
#include < memory>
@@ -585,77 +586,148 @@ void UserHistoryPredictor::EraseNextEntries(uint64_t fp, Entry* entry) {
585
586
// chain like
586
587
// ("aaa", "AAA") -- ("bbb", "BBB") -- ("ccc", "CCC"),
587
588
// and if target_key == "aaabbbccc" and target_value == "AAABBBCCC", the link
588
- // from ("bbb", "BBB") to ("ccc", "CCC") is removed. If a link was removed, this
589
- // method returns DONE. If no history entries can produce the target key
590
- // value, then NOT_FOUND is returned. TAIL is returned only when the
591
- // tail was found, e.g., in the above example, when the method finds the tail
592
- // node ("ccc", "CCC").
593
- UserHistoryPredictor::RemoveNgramChainResult
594
- UserHistoryPredictor::RemoveNgramChain (
589
+ // from ("bbb", "BBB") to ("ccc", "CCC") is removed.
590
+ // return true if a link was removed.
591
+ bool UserHistoryPredictor::RemoveNgramChain (
595
592
const absl::string_view target_key, const absl::string_view target_value,
596
- Entry* entry, std::vector<absl::string_view>* key_ngrams,
597
- size_t key_ngrams_len, std::vector<absl::string_view>* value_ngrams,
598
- size_t value_ngrams_len) {
593
+ Entry* entry) {
599
594
DCHECK (entry);
600
- DCHECK (key_ngrams);
601
- DCHECK (value_ngrams);
602
-
603
- // Updates the lengths with the current entry node.
604
- key_ngrams_len += entry->key ().size ();
605
- value_ngrams_len += entry->value ().size ();
606
-
607
- // This is the case where ngram key and value are shorter than the target key
608
- // and value, respectively. In this case, we need to find further entries to
609
- // concatenate in order to make |target_key| and |target_value|.
610
- if (key_ngrams_len < target_key.size () &&
611
- value_ngrams_len < target_value.size ()) {
612
- key_ngrams->push_back (entry->key ());
613
- value_ngrams->push_back (entry->value ());
614
- for (const uint64_t fp : entry->next_entry_fps ()) {
615
- Entry* e = dic_->MutableLookupWithoutInsert (fp);
616
- if (e == nullptr ) {
617
- continue ;
595
+
596
+ if (!target_key.starts_with (entry->key ()) ||
597
+ !target_value.starts_with (entry->value ())) {
598
+ return false ;
599
+ }
600
+
601
+ // Returns value of RemoveNgramChain() method. See the comments in
602
+ // implementation.
603
+ enum RemoveNgramChainResult {
604
+ DONE,
605
+ TAIL,
606
+ NOT_FOUND,
607
+ };
608
+
609
+ std::vector<absl::string_view> key_ngrams, value_ngrams;
610
+ std::function<RemoveNgramChainResult (Entry*, size_t , size_t )>
611
+ remove_ngram_chain_internal =
612
+ [&](Entry* entry, size_t key_ngrams_len,
613
+ size_t value_ngrams_len) -> RemoveNgramChainResult {
614
+ // Updates the lengths with the current entry node.
615
+ key_ngrams_len += entry->key ().size ();
616
+ value_ngrams_len += entry->value ().size ();
617
+
618
+ // This is the case where ngram key and value are shorter than the target
619
+ // key and value, respectively. In this case, we need to find further
620
+ // entries to concatenate in order to make |target_key| and |target_value|.
621
+ if (key_ngrams_len < target_key.size () &&
622
+ value_ngrams_len < target_value.size ()) {
623
+ key_ngrams.push_back (entry->key ());
624
+ value_ngrams.push_back (entry->value ());
625
+ for (const uint64_t fp : entry->next_entry_fps ()) {
626
+ Entry* e = dic_->MutableLookupWithoutInsert (fp);
627
+ if (e == nullptr ) {
628
+ continue ;
629
+ }
630
+ switch (
631
+ remove_ngram_chain_internal (e, key_ngrams_len, value_ngrams_len)) {
632
+ case DONE:
633
+ return DONE;
634
+ case TAIL:
635
+ // |entry| is the second-to-the-last node. So cut the link to the
636
+ // child entry.
637
+ EraseNextEntries (fp, entry);
638
+ return DONE;
639
+ default :
640
+ break ;
641
+ }
618
642
}
619
- const RemoveNgramChainResult r =
620
- RemoveNgramChain (target_key, target_value, e, key_ngrams,
621
- key_ngrams_len, value_ngrams, value_ngrams_len);
622
- switch (r) {
623
- case RemoveNgramChainResult::DONE:
624
- return RemoveNgramChainResult::DONE;
625
- case RemoveNgramChainResult::TAIL:
626
- // |entry| is the second-to-the-last node. So cut the link to the
627
- // child entry.
628
- EraseNextEntries (fp, entry);
629
- return RemoveNgramChainResult::DONE;
630
- default :
631
- break ;
643
+ // Recovers the state.
644
+ key_ngrams.pop_back ();
645
+ value_ngrams.pop_back ();
646
+ return NOT_FOUND;
647
+ }
648
+
649
+ // This is the case where the current ngram key and value have the same
650
+ // lengths as those of |target_key| and |target_value|, respectively.
651
+ if (key_ngrams_len == target_key.size () &&
652
+ value_ngrams_len == target_value.size ()) {
653
+ key_ngrams.push_back (entry->key ());
654
+ value_ngrams.push_back (entry->value ());
655
+ const std::string ngram_key = absl::StrJoin (key_ngrams, " " );
656
+ const std::string ngram_value = absl::StrJoin (value_ngrams, " " );
657
+ if (ngram_key == target_key && ngram_value == target_value) {
658
+ // |entry| is the last node. So return TAIL to tell the caller so that
659
+ // it can remove the link to this last node.
660
+ return TAIL;
632
661
}
662
+ key_ngrams.pop_back ();
663
+ value_ngrams.pop_back ();
664
+ return NOT_FOUND;
633
665
}
634
- // Recovers the state.
635
- key_ngrams->pop_back ();
636
- value_ngrams->pop_back ();
637
- return RemoveNgramChainResult::NOT_FOUND;
666
+
667
+ return NOT_FOUND;
668
+ };
669
+
670
+ return remove_ngram_chain_internal (entry, 0 , 0 ) == DONE;
671
+ }
672
+
673
+ // static
674
+ bool UserHistoryPredictor::RemoveEntryWithInnerSegment (absl::string_view key,
675
+ absl::string_view value,
676
+ Entry* entry) {
677
+ DCHECK (entry);
678
+
679
+ if (entry->inner_segment_boundary_size () == 0 ) {
680
+ return false ;
638
681
}
639
682
640
- // This is the case where the current ngram key and value have the same
641
- // lengths as those of |target_key| and |target_value|, respectively.
642
- if (key_ngrams_len == target_key.size () &&
643
- value_ngrams_len == target_value.size ()) {
644
- key_ngrams->push_back (entry->key ());
645
- value_ngrams->push_back (entry->value ());
646
- const std::string ngram_key = absl::StrJoin (*key_ngrams, " " );
647
- const std::string ngram_value = absl::StrJoin (*value_ngrams, " " );
648
- if (ngram_key == target_key && ngram_value == target_value) {
649
- // |entry| is the last node. So return TAIL to tell the caller so
650
- // that it can remove the link to this last node.
651
- return RemoveNgramChainResult::TAIL;
683
+ // TODO(taku): Support multiple matches.
684
+
685
+ // Checks from value as value is less likely matched.
686
+ const auto value_start = absl::string_view (entry->value ()).find (value);
687
+ if (value_start == absl::string_view::npos) return false ;
688
+
689
+ const auto key_start = absl::string_view (entry->key ()).find (key);
690
+ if (key_start == absl::string_view::npos) return false ;
691
+
692
+ const converter::InnerSegments inner_segments (
693
+ entry->key (), entry->value (), entry->inner_segment_boundary ());
694
+
695
+ const uint32_t key_end = key_start + key.size ();
696
+ const uint32_t value_end = value_start + value.size ();
697
+
698
+ uint32_t key_len = 0 ;
699
+ uint32_t value_len = 0 ;
700
+ bool start_match = false ;
701
+
702
+ for (const auto & it : inner_segments) {
703
+ if (key_len == key_start && value_len == value_start) {
704
+ start_match = true ;
705
+ }
706
+
707
+ // Ends at the key/value or content_key/value boundary.
708
+ if ((key_len + it.GetKey ().size () == key_end &&
709
+ value_len + it.GetValue ().size () == value_end) ||
710
+ (key_len + it.GetContentKey ().size () == key_end &&
711
+ value_len + it.GetContentValue ().size () == value_end)) {
712
+ if (start_match) {
713
+ entry->set_suggestion_freq (0 );
714
+ entry->set_shown_freq (0 );
715
+ entry->set_removed (true );
716
+ return true ;
717
+ }
718
+ return false ;
719
+ }
720
+
721
+ key_len += it.GetKey ().size ();
722
+ value_len += it.GetValue ().size ();
723
+
724
+ // Exceeds the boundary.
725
+ if (!start_match && (key_len > key_start || value_len > value_start)) {
726
+ return false ;
652
727
}
653
- key_ngrams->pop_back ();
654
- value_ngrams->pop_back ();
655
- return RemoveNgramChainResult::NOT_FOUND;
656
728
}
657
729
658
- return RemoveNgramChainResult::NOT_FOUND ;
730
+ return false ;
659
731
}
660
732
661
733
bool UserHistoryPredictor::ClearHistoryEntry (const absl::string_view key,
@@ -674,26 +746,26 @@ bool UserHistoryPredictor::ClearHistoryEntry(const absl::string_view key,
674
746
deleted = true ;
675
747
}
676
748
}
749
+
677
750
{
678
- // Finds a chain of history entries that produces key and value. If exists,
679
- // remove the link so that N-gram history prediction never generates this
680
- // key value pair..
751
+ // Two case
752
+ // 1) Finds a chain of history entries that produces key and value.
753
+ // If exists, remove the link so that N-gram history prediction never
754
+ // generates this key value pair.
755
+ // 2) key/value are the substring of entry.
681
756
for (DicElement& elm : *dic_) {
682
757
Entry* entry = &elm.value ;
683
- if (!key.starts_with (entry->key ()) ||
684
- !value.starts_with (entry->value ())) {
685
- continue ;
686
- }
687
- std::vector<absl::string_view> key_ngrams, value_ngrams;
688
- if (RemoveNgramChain (key, value, entry, &key_ngrams, 0 , &value_ngrams,
689
- 0 ) == RemoveNgramChainResult::DONE) {
758
+ if (RemoveNgramChain (key, value, entry) ||
759
+ RemoveEntryWithInnerSegment (key, value, entry)) {
690
760
deleted = true ;
691
761
}
692
762
}
693
763
}
764
+
694
765
if (deleted) {
695
766
updated_ = true ;
696
767
}
768
+
697
769
return deleted;
698
770
}
699
771
0 commit comments