Skip to content

Commit

Permalink
Lookalikes: Remove the suspicious keyword heuristic
Browse files Browse the repository at this point in the history
This heuristic is superseded by the Combo Squatting heuristic. This CL
removes dead code and adjusts tests accordingly.

A follow-up CL should remove the directory that generates the static
data for this heuristic
(components/url_formatter/spoof_checks/common_words/).

Bug: 1394573, 1386300
Change-Id: Iaa2e959a876fad0aff03fd9591f70fd57aa366f9
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/4054821
Commit-Queue: Mustafa Emre Acer <meacer@chromium.org>
Reviewed-by: Joe DeBlasio <jdeblasio@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1077631}
  • Loading branch information
meacer authored and Chromium LUCI CQ committed Nov 30, 2022
1 parent 3db73ed commit 905be78
Show file tree
Hide file tree
Showing 9 changed files with 54 additions and 416 deletions.
21 changes: 2 additions & 19 deletions chrome/browser/lookalikes/lookalike_test_helper.cc
Expand Up @@ -4,24 +4,17 @@

#include "chrome/browser/lookalikes/lookalike_test_helper.h"

#include "chrome/browser/profiles/profile.h"
#include "chrome/browser/reputation/reputation_service.h"
#include "chrome/browser/ui/browser.h"
#include "components/lookalikes/core/lookalike_url_util.h"
#include "components/reputation/core/safety_tip_test_utils.h"
#include "components/reputation/core/safety_tips_config.h"
#include "components/ukm/test_ukm_recorder.h"
#include "components/url_formatter/spoof_checks/idn_spoof_checker.h"
#include "components/url_formatter/spoof_checks/top_domains/test_top500_domains.h"

namespace test {
#include "components/url_formatter/spoof_checks/top_domains/browsertest_domains-trie-inc.cc"
}

// Constructs the helper and stores |browser| in |browser_|; nothing else
// happens at construction time.
LookalikeTestHelper::LookalikeTestHelper(Browser* browser)
: browser_(browser) {}

void LookalikeTestHelper::SetUp() {
void SetUpLookalikeTestParams() {
// Use test top domain lists instead of the actual list.
url_formatter::IDNSpoofChecker::HuffmanTrieParams trie_params{
test::kTopDomainsHuffmanTree, sizeof(test::kTopDomainsHuffmanTree),
Expand All @@ -35,20 +28,10 @@ void LookalikeTestHelper::SetUp() {
test_top500_domains::kNumTop500EditDistanceSkeletons};
SetTop500DomainsParamsForTesting(top500_params);

// Use test keywords instead of the actual list. This isn't strictly
// necessary as this test doesn't use reputation service, but it's good
// practice.
ReputationService* rep_service = ReputationService::Get(browser_->profile());
rep_service->SetSensitiveKeywordsForTesting(
test_top500_domains::kTopKeywords, test_top500_domains::kNumTopKeywords);

reputation::InitializeSafetyTipConfig();
}

void LookalikeTestHelper::TearDown() {
void TearDownLookalikeTestParams() {
url_formatter::IDNSpoofChecker::RestoreTrieParamsForTesting();
ResetTop500DomainsParamsForTesting();

ReputationService* rep_service = ReputationService::Get(browser_->profile());
rep_service->ResetSensitiveKeywordsForTesting();
}
22 changes: 5 additions & 17 deletions chrome/browser/lookalikes/lookalike_test_helper.h
Expand Up @@ -5,22 +5,10 @@
#ifndef CHROME_BROWSER_LOOKALIKES_LOOKALIKE_TEST_HELPER_H_
#define CHROME_BROWSER_LOOKALIKES_LOOKALIKE_TEST_HELPER_H_

#include "base/memory/raw_ptr.h"
#include "chrome/test/base/in_process_browser_test.h"

class Browser;

// A class that provides helper methods for interstitial and safety tips
// lookalike tests.
//
// The helper swaps production lookalike data for test data in SetUp() and
// puts the original data back in TearDown().
class LookalikeTestHelper {
public:
// Keeps a pointer to |browser|; SetUp() and TearDown() go through it to reach
// the browser's profile.
explicit LookalikeTestHelper(Browser* browser);

// Installs the test data: test top domain lists and test sensitive keywords,
// then initializes the safety tip config.
void SetUp();
// Restores the trie params, the top 500 domains params and the sensitive
// keywords that SetUp() replaced.
void TearDown();

private:
// The browser passed to the constructor.
raw_ptr<Browser> browser_;
};
// Helper methods for interstitial and safety tips lookalike tests.
// These allow the tests to use test data instead of prod, such as test top
// domain lists.

// Installs the test top domain lists in place of the real ones and
// initializes the safety tip config. Tests call this while setting up (e.g.
// from SetUpOnMainThread()).
void SetUpLookalikeTestParams();
// Restores the IDN spoof checker trie params and resets the top 500 domains
// params. Tests call this while tearing down (e.g. from
// TearDownOnMainThread()).
void TearDownLookalikeTestParams();

#endif
Expand Up @@ -250,14 +250,13 @@ class LookalikeUrlNavigationThrottleBrowserTest : public InProcessBrowserTest {
LookalikeUrlService::Get(browser()->profile());
lookalike_service->SetClockForTesting(&test_clock_);

test_helper_ = std::make_unique<LookalikeTestHelper>(browser());
test_helper_->SetUp();
SetUpLookalikeTestParams();
InProcessBrowserTest::SetUpOnMainThread();
}

void TearDownOnMainThread() override {
InProcessBrowserTest::TearDownOnMainThread();
test_helper_->TearDown();
TearDownLookalikeTestParams();
}

GURL GetURL(const char* hostname) const {
Expand Down Expand Up @@ -439,7 +438,6 @@ class LookalikeUrlNavigationThrottleBrowserTest : public InProcessBrowserTest {
base::test::ScopedFeatureList feature_list_;
std::unique_ptr<ukm::TestAutoSetUkmRecorder> test_ukm_recorder_;
base::SimpleTestClock test_clock_;
std::unique_ptr<LookalikeTestHelper> test_helper_;
};

// Navigating to a non-IDN shouldn't show an interstitial or record metrics.
Expand Down
95 changes: 0 additions & 95 deletions chrome/browser/reputation/local_heuristics.cc
Expand Up @@ -7,7 +7,6 @@
#include "base/bind.h"
#include "base/callback.h"
#include "base/metrics/field_trial_params.h"
#include "base/strings/string_split.h"
#include "chrome/browser/lookalikes/lookalike_url_blocking_page.h"
#include "chrome/browser/lookalikes/lookalike_url_navigation_throttle.h"
#include "chrome/browser/lookalikes/lookalike_url_service.h"
Expand All @@ -16,25 +15,6 @@
#include "components/lookalikes/core/features.h"
#include "components/lookalikes/core/lookalike_url_util.h"
#include "components/reputation/core/safety_tips_config.h"
#include "components/url_formatter/spoof_checks/top_domains/top_domain_util.h"
#include "net/base/registry_controlled_domains/registry_controlled_domain.h"

namespace {

// Returns true if |needle| is one of the |num_words| C strings in |words|.
// The lookup is a binary search, so |words| is expected to be sorted in
// strcmp() order.
bool SortedWordListContains(const std::string& needle,
                            const char* const words[],
                            const size_t num_words) {
  const char* const* const list_end = words + num_words;
  const char* const target = needle.c_str();

  // Compare the raw C strings with strcmp(): the default ordering of (char *)
  // is by address rather than by content, and wrapping each entry in a
  // std::string just to compare it would be needlessly expensive.
  const char* const* const candidate = std::lower_bound(
      words, list_end, target, [](const char* entry, const char* value) {
        return strcmp(entry, value) < 0;
      });

  // |candidate| is the first entry that does not order before |target|; it is
  // a hit only when |target| does not order before it either.
  return candidate != list_end && !(strcmp(target, *candidate) < 0);
}

} // namespace

bool ShouldTriggerSafetyTipFromLookalike(
const GURL& url,
Expand Down Expand Up @@ -129,78 +109,3 @@ bool ShouldTriggerSafetyTipFromLookalike(
NOTREACHED();
return false;
}

// Thin wrapper around HostnameContainsKeyword(): checks the hostname of |url|
// against |sensitive_keywords|, using |navigated_domain|'s domain and registry
// as the eTLD+1 and always including the e2LD in the search.
bool ShouldTriggerSafetyTipFromKeywordInURL(
    const GURL& url,
    const DomainInfo& navigated_domain,
    const char* const sensitive_keywords[],
    const size_t num_sensitive_keywords) {
  const auto& etld_plus_one = navigated_domain.domain_and_registry;
  const bool search_e2ld = true;
  return HostnameContainsKeyword(url, etld_plus_one, sensitive_keywords,
                                 num_sensitive_keywords, search_e2ld);
}

bool HostnameContainsKeyword(const GURL& url,
const std::string& eTLD_plus_one,
const char* const keywords[],
const size_t num_keywords,
bool search_e2ld) {
// We never want to trigger this heuristic on any non-http / https sites.
if (!url.SchemeIsHTTPOrHTTPS()) {
return false;
}

// The URL's eTLD + 1 will be empty whenever we're given a host that's
// invalid.
if (eTLD_plus_one.empty()) {
return false;
}

// TODO(jdeblasio): This should use GetETLDPlusOne() from Lookalike Utils to
// benefit from de-facto-private registries.
size_t registry_length = net::registry_controlled_domains::GetRegistryLength(
url, net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);

// Getting a registry length of 0 means that our URL has an unknown registry.
if (registry_length == 0) {
return false;
}

// e2LD: effective 2nd-level domain, e.g. "google" for "www.google.co.uk".
std::string e2LD =
eTLD_plus_one.substr(0, eTLD_plus_one.size() - registry_length - 1);
// search_substr is the hostname except the eTLD (e.g. "www.google").
std::string search_substr =
url.host().substr(0, url.host().size() - registry_length - 1);

// We should never end up with a "." in our e2LD.
DCHECK_EQ(e2LD.find("."), std::string::npos);
// Any problems that would result in an empty e2LD should have been caught via
// the |eTLD_plus_one| check.

// If we want to exclude the e2LD, or if the e2LD is itself a keyword, then
// chop that off and only search the rest of it. Otherwise, we keep the full
// e2LD included to detect hyphenated spoofs (e.g. "evil-google.com").
if (!search_e2ld || SortedWordListContains(e2LD, keywords, num_keywords)) {
// If the user visited the eTLD+1 directly, bail here.
if (search_substr.size() == e2LD.size()) {
return false;
}

search_substr =
search_substr.substr(0, search_substr.size() - e2LD.size() - 1);
// e.g. search_substr goes from "www.google" -> "www".
}

const std::vector<std::string> search_parts = base::SplitString(
search_substr, ".-", base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);

for (const auto& part : search_parts) {
if (SortedWordListContains(part, keywords, num_keywords)) {
return true;
}
}

return false;
}
28 changes: 1 addition & 27 deletions chrome/browser/reputation/local_heuristics.h
Expand Up @@ -5,11 +5,9 @@
#ifndef CHROME_BROWSER_REPUTATION_LOCAL_HEURISTICS_H_
#define CHROME_BROWSER_REPUTATION_LOCAL_HEURISTICS_H_

#include <cstddef>
#include <string>
#include <vector>

#include "chrome/browser/lookalikes/lookalike_url_service.h"
#include "components/lookalikes/core/lookalike_url_util.h"
#include "url/gurl.h"

// Checks to see whether a given URL qualifies as a lookalike domain, and thus
Expand All @@ -25,28 +23,4 @@ bool ShouldTriggerSafetyTipFromLookalike(
const std::vector<DomainInfo>& engaged_sites,
GURL* safe_url);

// Checks to see whether a given URL contains sensitive keywords in a way
// that it should trigger a safety tip. This is a thin wrapper around
// HostnameContainsKeyword.
//
// Returns false when called with a URL without a TLD or with an unknown TLD.
bool ShouldTriggerSafetyTipFromKeywordInURL(
const GURL& url,
const DomainInfo& navigated_domain,
const char* const sensitive_keywords[],
const size_t num_sensitive_keywords);

// Checks to see whether a hostname contains sensitive keywords in a way
// that violates our hostname elision policy.
//
// If |search_e2ld| is false, only finds keywords in subdomains below the e2LD
// (e.g. it will only search through "foo.bar" in "foo.bar.example.com").
//
// Returns false when called with a URL without a TLD or with an unknown TLD.
bool HostnameContainsKeyword(const GURL& url,
const std::string& eTLD_plus_one,
const char* const sensitive_keywords[],
const size_t num_sensitive_keywords,
bool search_e2ld);

#endif // CHROME_BROWSER_REPUTATION_LOCAL_HEURISTICS_H_

0 comments on commit 905be78

Please sign in to comment.