Skip to content

Commit

Permalink
[RQ] Extracts Most Repeated Search Terms for use in Most Visited tiles
Browse files Browse the repository at this point in the history
Segments and Keyword Search Terms, which power the most visited sites
and the zero-prefix suggestions in the omnibox respectively, are
queried from different tables in the database and scored differently.

In order to organically show the most repeated search terms along with
the most visited segments/sites in the MV tiles, the search terms and
the segments can continue to be queried differently, but scored and
ranked consistently to be ultimately merged into a single list of tiles.

Segments are currently ranked using a version of frecency score which
is accumulated across timeslots (i.e., visit days) given the visit
count for the segment in those timeslots. To achieve a consistent score
for the search terms, this CL introduces,

1) A simple utility enumerator class which is initialized by the
   URLDatabase to enumerate KeywordSearchTermVisits ordered first by
   |normalized_term| then |last_visit_time| in ascending order.

2) A helper class that uses the enumerator to group the visits to
   unique search terms into timeslots and produce the final score for
   the search terms accumulated across the timeslots.

Note that having the visits returned by the enumerator ordered first
by the search term allows the helper class to efficiently aggregate
the visits to unique search terms. Also having the visits ordered then
from the oldest to the newest helps ensure the metadata associated with
the most recent visit such as the |last_visit_time| are present in the
aggregated search terms produced by the helper. In the future, the
enumerator may be used by other helper classes to produce prefix and
zero-prefix suggestions in the omnibox.

Bug: 1317829
Change-Id: Id1aee371fb58a79bd000f388748ea330fe4ce57c
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/3611078
Commit-Queue: Mohamad Ahmadi <mahmadi@chromium.org>
Reviewed-by: Scott Violet <sky@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1001651}
  • Loading branch information
Moe Ahmadi authored and Chromium LUCI CQ committed May 10, 2022
1 parent 425151a commit 0587998
Show file tree
Hide file tree
Showing 8 changed files with 404 additions and 13 deletions.
2 changes: 2 additions & 0 deletions components/history/core/browser/BUILD.gn
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ static_library("browser") {
"keyword_id.h",
"keyword_search_term.cc",
"keyword_search_term.h",
"keyword_search_term_util.cc",
"keyword_search_term_util.h",
"page_usage_data.cc",
"page_usage_data.h",
"sync/delete_directive_handler.cc",
Expand Down
41 changes: 36 additions & 5 deletions components/history/core/browser/keyword_search_term.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,33 @@

namespace history {

namespace {

// Builds a KeywordSearchTermVisit from the current row of |statement|. The row
// must expose the columns below, in this order and with these types. They map
// one-to-one onto the KeywordSearchTermVisit fields, except for the score,
// which is a calculated field and is not populated here.
// +----------+-----------------+-------------+-----------------+
// | term     | normalized_term | visit_count | last_visit_time |
// +----------+-----------------+-------------+-----------------+
// | string16 | string16        | int         | int64           |
// +----------+-----------------+-------------+-----------------+
std::unique_ptr<KeywordSearchTermVisit> KeywordSearchTermVisitFromStatement(
    sql::Statement& statement) {
  auto visit = std::make_unique<KeywordSearchTermVisit>();
  KeywordSearchTermVisit& row = *visit;
  row.term = statement.ColumnString16(0);
  row.normalized_term = statement.ColumnString16(1);
  row.visit_count = statement.ColumnInt(2);
  // The last visit time is read as a raw integer and converted to base::Time.
  const auto last_visit_internal = statement.ColumnInt64(3);
  row.last_visit_time = base::Time::FromInternalValue(last_visit_internal);
  return visit;
}

} // namespace

// Default constructor, copy constructor and destructor of
// KeywordSearchTermVisit. They are declared in keyword_search_term.h and
// defined out of line here with the compiler-generated behavior.
KeywordSearchTermVisit::KeywordSearchTermVisit() = default;
KeywordSearchTermVisit::KeywordSearchTermVisit(
const KeywordSearchTermVisit& other) = default;
KeywordSearchTermVisit::~KeywordSearchTermVisit() = default;

double KeywordSearchTermVisit::GetFrecency(base::Time now,
Expand All @@ -20,11 +47,15 @@ double KeywordSearchTermVisit::GetFrecency(base::Time now,
return frequency_powered * recency_decayed;
}

KeywordSearchTermRow::KeywordSearchTermRow() : keyword_id(0), url_id(0) {}
// KeywordSearchTermVisitEnumerator --------------------------------------------

KeywordSearchTermRow::KeywordSearchTermRow(const KeywordSearchTermRow& other) =
default;

KeywordSearchTermRow::~KeywordSearchTermRow() {}
// Steps |statement_| and returns the visit built from the resulting row.
// Returns nullptr, and marks the enumerator as no longer initialized, when the
// enumerator was never initialized or the statement produces no further row.
std::unique_ptr<KeywordSearchTermVisit>
KeywordSearchTermVisitEnumerator::GetNextVisit() {
  // Short-circuit keeps |statement_| from being stepped when uninitialized.
  const bool has_row = initialized_ && statement_.Step();
  if (!has_row) {
    initialized_ = false;
    return nullptr;
  }
  return KeywordSearchTermVisitFromStatement(statement_);
}

} // namespace history
44 changes: 36 additions & 8 deletions components/history/core/browser/keyword_search_term.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
#include "base/time/time.h"
#include "components/history/core/browser/keyword_id.h"
#include "components/history/core/browser/url_row.h"
#include "sql/statement.h"
#include "third_party/abseil-cpp/absl/types/optional.h"

namespace history {

Expand All @@ -19,7 +21,8 @@ namespace history {
// visit or a set of keyword visits, depending on the overloaded functions it is
// returned from.
struct KeywordSearchTermVisit {
KeywordSearchTermVisit() = default;
KeywordSearchTermVisit();
KeywordSearchTermVisit(const KeywordSearchTermVisit& other);
~KeywordSearchTermVisit();

// Returns the frecency score of the visit based on the following formula:
Expand All @@ -38,23 +41,48 @@ struct KeywordSearchTermVisit {
std::u16string term; // The search term that was used.
std::u16string normalized_term; // The search term, in lower case and with
// extra whitespaces collapsed.
int visit_count{0}; // The visit count.
base::Time last_visit_time; // The time of the most recent visit.
int visit_count{0}; // The search term visit count.
base::Time last_visit_time; // The time of the last visit.
absl::optional<double> score; // The optional calculated frecency score.
};

// Used for URLs that have a search term associated with them.
struct KeywordSearchTermRow {
KeywordSearchTermRow();
KeywordSearchTermRow(const KeywordSearchTermRow& other);
~KeywordSearchTermRow();
KeywordSearchTermRow() = default;
KeywordSearchTermRow(const KeywordSearchTermRow& other) = default;
~KeywordSearchTermRow() = default;

KeywordID keyword_id; // ID of the keyword.
URLID url_id; // ID of the url.
KeywordID keyword_id{0}; // ID of the keyword.
URLID url_id{0}; // ID of the url.
std::u16string term; // The search term that was used.
std::u16string normalized_term; // The search term, in lower case and with
// extra whitespaces collapsed.
};

// KeywordSearchTermVisitEnumerator --------------------------------------------

// A simple enumerator over keyword search term visits. The constructor is
// private and URLDatabase is a friend, so only URLDatabase can create and
// initialize instances.
class KeywordSearchTermVisitEnumerator {
 public:
  ~KeywordSearchTermVisitEnumerator() = default;

  // Not copyable or copy-assignable.
  KeywordSearchTermVisitEnumerator(const KeywordSearchTermVisitEnumerator&) =
      delete;
  KeywordSearchTermVisitEnumerator& operator=(
      const KeywordSearchTermVisitEnumerator&) = delete;

  // Returns the next search term visit, or nullptr once no visits are left.
  std::unique_ptr<KeywordSearchTermVisit> GetNextVisit();

 private:
  friend class URLDatabase;
  KeywordSearchTermVisitEnumerator() = default;

  // The statement whose rows are turned into KeywordSearchTermVisits.
  sql::Statement statement_;
  // Whether |statement_| can be executed.
  bool initialized_ = false;
};

} // namespace history

#endif // COMPONENTS_HISTORY_CORE_BROWSER_KEYWORD_SEARCH_TERM_H_
156 changes: 156 additions & 0 deletions components/history/core/browser/keyword_search_term_util.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
// Copyright 2022 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/history/core/browser/keyword_search_term_util.h"

#include <algorithm>
#include <cmath>

#include "base/time/time.h"
#include "components/history/core/browser/keyword_search_term.h"

namespace history {

namespace {

// Calculates the score for the given number of visits in a given day.
// Recent visits count more than historical ones, so we multiply in a boost
// depending on how long ago this day was. This boost is a curve that
// smoothly goes through these values: Today gets 3x, a week ago 2x, three
// weeks ago 1.5x, falling off to 1x at the limit of how far we reach into
// the past.
//
// |visit_count| is the number of visits in the timeslot |day|; |now| is the
// reference time the age of |day| is measured against. A non-positive
// |visit_count| contributes a score of 0.
double GetMostVisitedFrecencyScore(int visit_count,
                                   base::Time day,
                                   base::Time now) {
  // std::log() yields -infinity for 0 and NaN for negative input. Either value
  // would poison the score accumulated across timeslots and, for NaN, break
  // the ordering used to rank the search terms.
  if (visit_count <= 0)
    return 0.0;
  const double day_score = 1.0 + std::log(static_cast<double>(visit_count));

  // A |day| later than |now| (e.g., after the clock was moved back) would give
  // a negative age, which pushes the boost above its 3x ceiling, divides by
  // zero at -7 days, and turns the boost negative beyond that. Treat such days
  // as today.
  const int days_ago = std::max(0, (now - day).InDays());
  const double recency_boost = 1.0 + (2.0 * (1.0 / (1.0 + days_ago / 7.0)));
  return recency_boost * day_score;
}

// Two search terms are considered the same when their normalized search terms
// compare equal; the raw |term| text is not consulted.
bool IsSameSearchTerm(const KeywordSearchTermVisit& search_term,
                      const KeywordSearchTermVisit& other_search_term) {
  const auto& normalized = search_term.normalized_term;
  const auto& other_normalized = other_search_term.normalized_term;
  return normalized == other_normalized;
}

// Maps a visit time to its timeslot, i.e., the day of the visit, represented
// by the local midnight of |visit_time|.
base::Time VisitTimeToTimeslot(base::Time visit_time) {
  const base::Time timeslot = visit_time.LocalMidnight();
  return timeslot;
}

// Returns whether two search term visits are in the same timeslot.
bool IsSameTimeslot(const KeywordSearchTermVisit& search_term,
const KeywordSearchTermVisit& other_search_term) {
return VisitTimeToTimeslot(search_term.last_visit_time) ==
VisitTimeToTimeslot(other_search_term.last_visit_time);
}

} // namespace

// MostRepeatedSearchTermHelper ------------------------------------------------

// A helper class to return keyword search terms with frecency scores
// accumulated across days for use in the Most Visited tiles.
class MostRepeatedSearchTermHelper {
public:
MostRepeatedSearchTermHelper() = default;

MostRepeatedSearchTermHelper(const MostRepeatedSearchTermHelper&) = delete;
MostRepeatedSearchTermHelper& operator=(const MostRepeatedSearchTermHelper&) =
delete;

~MostRepeatedSearchTermHelper() = default;

// |enumerator| enumerates keyword search term visits from the URLDatabase.
// |now| is the time used to score the search term.
std::unique_ptr<KeywordSearchTermVisit> GetNextSearchTermFromEnumerator(
KeywordSearchTermVisitEnumerator& enumerator,
base::Time now) {
// |next_search_term| acts as the fast pointer and |last_search_term_| acts
// as the slow pointer accumulating the search term score across visits.
while (auto next_search_term = enumerator.GetNextVisit()) {
bool is_same_search_term =
last_search_term_ &&
IsSameSearchTerm(*next_search_term, *last_search_term_);
if (is_same_search_term &&
IsSameTimeslot(*next_search_term, *last_search_term_)) {
// We are in the same timeslot for the same search term:
// 1. Move |last_search_term_| forward.
// 2. Add up the search term visit count in the timeslot.
// 3. Carry over the search term score.
int visit_count = last_search_term_->visit_count;
double score = last_search_term_->score.value_or(0.0);
last_search_term_ = std::move(next_search_term);
last_search_term_->visit_count += visit_count;
last_search_term_->score =
last_search_term_->score.value_or(0.0) + score;

} else if (is_same_search_term) {
// We are in a new timeslot for the same search term:
// 1. Update the search term score by adding the last timeslot's score.
// 2. Move |last_search_term_| forward.
// 3. Carry over the search term score.
double score =
last_search_term_->score.value_or(0.0) +
GetMostVisitedFrecencyScore(
last_search_term_->visit_count,
VisitTimeToTimeslot(last_search_term_->last_visit_time), now);
last_search_term_ = std::move(next_search_term);
last_search_term_->score = score;

} else if (last_search_term_) {
// We encountered a new search term and |last_search_term_| has a value:
// 1. Update the search term score by adding the last timeslot's score.
// 2. Move |last_search_term_| forward.
// 3. Return the old |last_search_term_|.
double score =
last_search_term_->score.value_or(0.0) +
GetMostVisitedFrecencyScore(
last_search_term_->visit_count,
VisitTimeToTimeslot(last_search_term_->last_visit_time), now);
last_search_term_->score = score;
auto search_term_to_return = std::move(last_search_term_);
last_search_term_ = std::move(next_search_term);
return search_term_to_return;
} else {
// We encountered a new search term and |last_search_term_| has no
// value:
// 1. Move |last_search_term_| forward.
last_search_term_ = std::move(next_search_term);
}
}

// |last_search_term_| has a value:
// 1. Update the search term score by adding the last timeslot's score.
if (last_search_term_) {
double score =
last_search_term_->score.value_or(0.0) +
GetMostVisitedFrecencyScore(
last_search_term_->visit_count,
VisitTimeToTimeslot(last_search_term_->last_visit_time), now);
last_search_term_->score = score;
}

return last_search_term_ ? std::move(last_search_term_) : nullptr;
}

// The last seen search term.
std::unique_ptr<KeywordSearchTermVisit> last_search_term_;
};

// Drains |enumerator| through a MostRepeatedSearchTermHelper, appending every
// aggregated search term to |search_terms|, then stably sorts the whole vector
// by descending frecency score. All terms are scored against a single
// timestamp taken once up front.
void GetMostRepeatedSearchTermsFromEnumerator(
    KeywordSearchTermVisitEnumerator& enumerator,
    std::vector<std::unique_ptr<KeywordSearchTermVisit>>* search_terms) {
  const base::Time now = base::Time::Now();
  MostRepeatedSearchTermHelper helper;
  for (auto scored_term =
           helper.GetNextSearchTermFromEnumerator(enumerator, now);
       scored_term;
       scored_term = helper.GetNextSearchTermFromEnumerator(enumerator, now)) {
    search_terms->push_back(std::move(scored_term));
  }

  // Highest score first; equally scored terms keep their relative order.
  const auto by_descending_score = [](const auto& lhs, const auto& rhs) {
    return lhs->score > rhs->score;
  };
  std::stable_sort(search_terms->begin(), search_terms->end(),
                   by_descending_score);
}

} // namespace history
27 changes: 27 additions & 0 deletions components/history/core/browser/keyword_search_term_util.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// Copyright 2022 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef COMPONENTS_HISTORY_CORE_BROWSER_KEYWORD_SEARCH_TERM_UTIL_H_
#define COMPONENTS_HISTORY_CORE_BROWSER_KEYWORD_SEARCH_TERM_UTIL_H_

#include <memory>
#include <vector>

namespace history {

class KeywordSearchTermVisitEnumerator;
struct KeywordSearchTermVisit;

// Returns keyword search terms ordered by descending frecency scores
// accumulated across days for use in the Most Visited tiles. |enumerator|
// enumerates keyword search term visits from the URLDatabase. It must return
// visits ordered first by |normalized_term| and then by |last_visit_time| in
// ascending order, i.e., from the oldest to the newest.
// |search_terms| is the output vector and must not be null. The aggregated
// search terms are appended to it and the entire vector, including any
// elements it already held, is then stably sorted by descending score.
void GetMostRepeatedSearchTermsFromEnumerator(
KeywordSearchTermVisitEnumerator& enumerator,
std::vector<std::unique_ptr<KeywordSearchTermVisit>>* search_terms);

} // namespace history

#endif // COMPONENTS_HISTORY_CORE_BROWSER_KEYWORD_SEARCH_TERM_UTIL_H_
32 changes: 32 additions & 0 deletions components/history/core/browser/url_database.cc
Original file line number Diff line number Diff line change
Expand Up @@ -682,6 +682,38 @@ void URLDatabase::GetMostRecentKeywordSearchTerms(
}
}

// Creates an enumerator over the search term visits recorded for |keyword_id|
// whose URL was last visited strictly after |age_threshold|. Rows with an
// empty normalized term are excluded, and rows are ordered by normalized term
// and then by last visit time. Returns nullptr if |keyword_id| is 0.
std::unique_ptr<KeywordSearchTermVisitEnumerator>
URLDatabase::CreateKeywordSearchTermVisitEnumerator(KeywordID keyword_id,
base::Time age_threshold) {
// NOTE: the keyword_id can be zero if on first run the user does a query
// before the TemplateURLService has finished loading. As the chances of this
// occurring are small, we ignore it.
if (!keyword_id)
return nullptr;

// The enumerator's constructor is private (URLDatabase is its friend), so it
// is allocated with new and wrapped rather than created via std::make_unique.
auto enumerator = base::WrapUnique<KeywordSearchTermVisitEnumerator>(
new KeywordSearchTermVisitEnumerator());
// The selected columns must stay in the order read by
// KeywordSearchTermVisitFromStatement(): term, normalized_term, visit_count,
// last_visit_time.
enumerator->statement_.Assign(GetDB().GetCachedStatement(SQL_FROM_HERE,
R"(
SELECT
kst.term,
kst.normalized_term,
u.visit_count,
u.last_visit_time
FROM
keyword_search_terms kst JOIN urls u ON kst.url_id = u.id
WHERE
kst.keyword_id = ? AND
u.last_visit_time > ? AND
kst.normalized_term <> ''
ORDER BY kst.normalized_term, u.last_visit_time
)"));
// Bind the two placeholders of the WHERE clause, in order.
enumerator->statement_.BindInt64(0, keyword_id);
enumerator->statement_.BindInt64(1, age_threshold.ToInternalValue());
// GetNextVisit() only steps the statement when |initialized_| is true.
enumerator->initialized_ = enumerator->statement_.is_valid();
return enumerator;
}

bool URLDatabase::DeleteKeywordSearchTerm(const std::u16string& term) {
sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
"DELETE FROM keyword_search_terms WHERE term=?"));
Expand Down
9 changes: 9 additions & 0 deletions components/history/core/browser/url_database.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ class Database;

namespace history {

class KeywordSearchTermVisitEnumerator;
struct KeywordSearchTermRow;
struct KeywordSearchTermVisit;

Expand Down Expand Up @@ -231,6 +232,14 @@ class URLDatabase {
base::Time age_threshold,
std::vector<KeywordSearchTermVisit>* visits);

// Returns an enumerator over the KeywordSearchTermVisits for `keyword_id`
// whose last visit time is later than `age_threshold`. The visits are ordered
// first by `normalized_term` and then by `last_visit_time` in ascending order,
// i.e., from the oldest to the newest. Returns nullptr if `keyword_id` is 0.
std::unique_ptr<KeywordSearchTermVisitEnumerator>
CreateKeywordSearchTermVisitEnumerator(KeywordID keyword_id,
base::Time age_threshold);

// Deletes all searches matching `term`.
bool DeleteKeywordSearchTerm(const std::u16string& term);

Expand Down

0 comments on commit 0587998

Please sign in to comment.