Skip to content

Commit

Permalink
[AutofillDataModel] Set up generated parsing expressions structures.
Browse files Browse the repository at this point in the history
The CL introduces the first set of per country parsing expressions. It
also includes the C++ representation of parsing instructions (e.g.
decomposition cascade, extract parts).

There is some work left regarding cleaning up repeated expressions (e.g.
NAME_FULL), but it will come as a follow-up.

Change-Id: I1ad618e897aee7e74464efdfcdee1f7ffcdd332f
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/4798462
Code-Coverage: findit-for-me@appspot.gserviceaccount.com <findit-for-me@appspot.gserviceaccount.com>
Reviewed-by: Dominic Battre <battre@chromium.org>
Commit-Queue: Norge Vizcay <vizcay@google.com>
Cr-Commit-Position: refs/heads/main@{#1187737}
  • Loading branch information
norgevz authored and Chromium LUCI CQ committed Aug 24, 2023
1 parent 4bda68f commit 5e0ccf4
Show file tree
Hide file tree
Showing 5 changed files with 550 additions and 0 deletions.
4 changes: 4 additions & 0 deletions components/autofill/core/browser/BUILD.gn
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,9 @@ static_library("browser") {
"data_model/autofill_i18n_api.h",
"data_model/autofill_i18n_formatting_expressions.h",
"data_model/autofill_i18n_hierarchies.h",
"data_model/autofill_i18n_parsing_expression_components.cc",
"data_model/autofill_i18n_parsing_expression_components.h",
"data_model/autofill_i18n_parsing_expressions.h",
"data_model/autofill_metadata.cc",
"data_model/autofill_metadata.h",
"data_model/autofill_offer_data.cc",
Expand Down Expand Up @@ -948,6 +951,7 @@ source_set("unit_tests") {
"data_model/address_unittest.cc",
"data_model/autofill_data_model_unittest.cc",
"data_model/autofill_i18n_api_unittest.cc",
"data_model/autofill_i18n_parsing_expression_components_unittest.cc",
"data_model/autofill_profile_comparator_unittest.cc",
"data_model/autofill_profile_unittest.cc",
"data_model/autofill_structured_address_component_unittest.cc",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
// Copyright 2023 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/autofill/core/browser/data_model/autofill_i18n_parsing_expression_components.h"

#include "components/autofill/core/browser/data_model/autofill_structured_address_utils.h"

namespace autofill::i18n_model_definition {
namespace {

// Strips the version suffix from a capture group name: everything from the
// first "__" onwards is dropped (e.g. "foo__2" becomes "foo"). A name that
// contains no "__" is returned unchanged.
inline std::string RemoveVersionSuffix(const std::string& token) {
  const std::string::size_type suffix_start = token.find("__");
  if (suffix_start == std::string::npos) {
    return token;
  }
  return std::string(token, 0, suffix_start);
}

// Applies `pattern` to `value` via RE2::PartialMatchN and collects the
// non-empty named capture groups.
//
// Returns `absl::nullopt` if the cached expression is missing or not ok(), or
// if `value` does not match. Otherwise returns a map from capture group name
// (with its version suffix removed, see RemoveVersionSuffix) to the captured
// text. Named groups whose capture is empty are omitted.
absl::optional<base::flat_map<std::string, std::string>> ParseUsingRegex(
    const std::string& value,
    const std::string& pattern) {
  const RE2* compiled = Re2RegExCache::Instance()->GetRegEx(pattern);
  const bool usable = compiled && compiled->ok();
  if (!usable) {
    return absl::nullopt;
  }

  // One slot per capturing group plus one spare slot; only the first
  // `slot_count - 1` slots are handed to RE2 below.
  const size_t slot_count = compiled->NumberOfCapturingGroups() + 1;

  // `captured` owns the strings, `args` wraps each string for RE2, and
  // `arg_ptrs` is the pointer array shape that PartialMatchN expects.
  std::vector<std::string> captured(slot_count);
  std::vector<RE2::Arg> args(slot_count);
  std::vector<RE2::Arg*> arg_ptrs(slot_count);
  for (size_t slot = 0; slot < slot_count; ++slot) {
    args[slot] = &captured[slot];
    arg_ptrs[slot] = &args[slot];
  }

  const bool matched =
      RE2::PartialMatchN(value, *compiled, arg_ptrs.data(), slot_count - 1);
  if (!matched) {
    return absl::nullopt;
  }

  // The index reported for a named group is shifted by one relative to its
  // slot in `captured`, hence the `- 1`.
  std::vector<std::pair<std::string, std::string>> named_matches;
  for (const auto& [group_name, group_index] :
       compiled->NamedCapturingGroups()) {
    const std::string& captured_text = captured[group_index - 1];
    if (!captured_text.empty()) {
      named_matches.emplace_back(RemoveVersionSuffix(group_name),
                                 captured_text);
    }
  }

  return base::MakeFlatMap<std::string, std::string>(std::move(named_matches));
}

// Returns true if `value` satisfies `condition_regex`.
//
// An empty `condition_regex` means "no condition", so every value passes.
// Otherwise the condition is evaluated with RE2::PartialMatch. If the cached
// expression is unavailable or failed to compile, the condition is treated as
// not matched.
bool ConditionIsMatched(const std::string& condition_regex,
                        const std::string& value) {
  if (condition_regex.empty()) {
    return true;
  }
  const RE2* regex = Re2RegExCache::Instance()->GetRegEx(condition_regex);
  // Apply the same validity guard as ParseUsingRegex so that a missing or
  // broken expression is never dereferenced.
  if (!regex || !regex->ok()) {
    return false;
  }
  return RE2::PartialMatch(value, *regex);
}
} // namespace

// Parses `value` with `parsing_regex_`, prepending "^" when the beginning is
// anchored and appending "$" when the end is anchored.
ValueParsingResults Decomposition::Parse(const std::string& value) const {
  std::string anchored_regex;
  if (anchor_beginning_) {
    anchored_regex += "^";
  }
  anchored_regex += parsing_regex_;
  if (anchor_end_) {
    anchored_regex += "$";
  }
  return ParseUsingRegex(value, anchored_regex);
}

// Tries the alternatives in order and returns the result of the first one
// that parses `value`. Returns `absl::nullopt` if the condition is not met or
// if no alternative produces a result.
ValueParsingResults DecompositionCascade::Parse(
    const std::string& value) const {
  if (ConditionIsMatched(condition_regex_, value)) {
    for (const AutofillParsingProcess* candidate : alternatives_) {
      if (ValueParsingResults parsed = candidate->Parse(value)) {
        return parsed;
      }
    }
  }
  return absl::nullopt;
}

// Parses `value` with `parsing_regex_`, but only if `value` satisfies the
// condition; otherwise returns `absl::nullopt`.
ValueParsingResults ExtractPart::Parse(const std::string& value) const {
  const bool condition_holds = ConditionIsMatched(condition_regex_, value);
  if (condition_holds) {
    return ParseUsingRegex(value, parsing_regex_);
  }
  return absl::nullopt;
}

// Runs every piece against `value` and merges the individual results into a
// single map. When several pieces produce the same key, the value from the
// later piece overwrites the earlier one. Returns `absl::nullopt` if the
// condition is not met or if no piece produced any entry.
ValueParsingResults ExtractParts::Parse(const std::string& value) const {
  if (!ConditionIsMatched(condition_regex_, value)) {
    return absl::nullopt;
  }

  base::flat_map<std::string, std::string> merged;
  for (const ExtractPart* part : pieces_) {
    const ValueParsingResults part_result = part->Parse(value);
    if (!part_result) {
      continue;
    }
    for (const auto& entry : *part_result) {
      merged.insert_or_assign(entry.first, entry.second);
    }
  }

  if (merged.empty()) {
    return absl::nullopt;
  }
  return merged;
}

} // namespace autofill::i18n_model_definition
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
// Copyright 2023 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef COMPONENTS_AUTOFILL_CORE_BROWSER_DATA_MODEL_AUTOFILL_I18N_PARSING_EXPRESSION_COMPONENTS_H_
#define COMPONENTS_AUTOFILL_CORE_BROWSER_DATA_MODEL_AUTOFILL_I18N_PARSING_EXPRESSION_COMPONENTS_H_

#include "base/containers/flat_map.h"
#include "base/containers/span.h"
#include "base/memory/raw_ptr.h"
#include "components/autofill/core/browser/field_types.h"

namespace autofill::i18n_model_definition {

// Result of a parsing operation. On success it holds the matches as a map
// from capture group name to the captured substring. On failure it is
// `absl::nullopt`.
using ValueParsingResults =
absl::optional<base::flat_map<std::string, std::string>>;

// Abstract base for all parsing processes. A parsing process turns an
// unstructured data model value into structured information; each subclass
// supplies its own strategy by overriding `Parse`.
//
// Example: a parsing process can take the address text
//   "Avenida Mem de Sá, 1234
//    apto 12
//    1 andar
//    referência: foo"
// and produce the structured information
//   ADDRESS_HOME_STREET_NAME: "Avenida Mem de Sá"
//   ADDRESS_HOME_HOUSE_NUMBER: "1234"
//   ADDRESS_HOME_APT_NUM: "apto 12"
//   ADDRESS_HOME_FLOOR: "1"
//   ADDRESS_HOME_LANDMARK: "foo"
class AutofillParsingProcess {
 public:
  constexpr AutofillParsingProcess() = default;

  // Parsing processes are neither copyable nor assignable.
  AutofillParsingProcess(const AutofillParsingProcess&) = delete;
  AutofillParsingProcess& operator=(const AutofillParsingProcess&) = delete;

  virtual ~AutofillParsingProcess() = default;

  // Parses `value` and returns the extracted field type matches, or
  // `absl::nullopt` if nothing could be extracted.
  virtual ValueParsingResults Parse(const std::string& value) const = 0;
};

// A Decomposition matches a value against a single parsing expression and
// extracts the captured field type values. By default the whole string has to
// match; `anchor_beginning` and `anchor_end` control whether the expression
// is anchored at the start and at the end of the value respectively.
class Decomposition : public AutofillParsingProcess {
 public:
  // `parsing_regex` is the expression to apply. `anchor_beginning` and
  // `anchor_end` select whether it must match from the first character and up
  // to the last character of the value.
  constexpr Decomposition(std::string parsing_regex,
                          bool anchor_beginning,
                          bool anchor_end)
      : parsing_regex_{std::move(parsing_regex)},
        anchor_beginning_{anchor_beginning},
        anchor_end_{anchor_end} {}

  Decomposition(const Decomposition&) = delete;
  Decomposition& operator=(const Decomposition&) = delete;

  ~Decomposition() override = default;

  ValueParsingResults Parse(const std::string& value) const override;

 private:
  // Expression applied to the value, without anchors.
  const std::string parsing_regex_;
  // Whether the match has to start at the beginning of the value.
  const bool anchor_beginning_{true};
  // Whether the match has to extend to the end of the value.
  const bool anchor_end_{true};
};

// A DecompositionCascade tries a list of parsing processes one after another
// and stops at the first one that produces a match. It can carry a condition:
// the cascade is only applied to values that fulfil it. An empty condition
// string means that there is no condition.
class DecompositionCascade : public AutofillParsingProcess {
 public:
  // The cascade only stores a view of `alternatives`: the underlying storage
  // and the pointed-to processes must stay alive for as long as the
  // DecompositionCascade is used.
  constexpr DecompositionCascade(
      std::string condition_regex,
      base::span<const AutofillParsingProcess* const> alternatives)
      : condition_regex_{std::move(condition_regex)},
        alternatives_(alternatives) {}

  DecompositionCascade(const DecompositionCascade&) = delete;
  DecompositionCascade& operator=(const DecompositionCascade&) = delete;

  ~DecompositionCascade() override = default;

  ValueParsingResults Parse(const std::string& value) const override;

 private:
  // Condition a value must fulfil; empty if unconditional.
  const std::string condition_regex_;
  // Non-owning view of the processes to try, in order.
  const base::span<const AutofillParsingProcess* const> alternatives_;
};

// An ExtractPart matches a value against a parsing expression and extracts
// the captured field type values. It can carry a condition: the expression is
// only applied to values that fulfil it. An empty condition string means that
// there is no condition.
// While a Decomposition attempts to match the entire string, an ExtractPart
// is designed to contain an anchor term (e.g. "Apt.") after which information
// should be extracted (the apartment number).
class ExtractPart : public AutofillParsingProcess {
 public:
  // `condition_regex` is the optional precondition on the value and
  // `parsing_regex` is the expression used for the extraction.
  constexpr ExtractPart(std::string condition_regex, std::string parsing_regex)
      : condition_regex_{std::move(condition_regex)},
        parsing_regex_{std::move(parsing_regex)} {}

  ExtractPart(const ExtractPart&) = delete;
  ExtractPart& operator=(const ExtractPart&) = delete;

  ~ExtractPart() override = default;

  ValueParsingResults Parse(const std::string& value) const override;

 private:
  // Condition a value must fulfil; empty if unconditional.
  const std::string condition_regex_;
  // Expression whose named capture groups provide the extracted values.
  const std::string parsing_regex_;
};

// ExtractParts applies a sequence of ExtractPart operations to a value.
// In contrast to a DecompositionCascade it does not stop at the first match:
// every piece is applied in order, so for conflicting results the last match
// wins. This also makes it possible to extract different data (e.g. an
// apartment and a floor) from the same value. It can carry a condition: the
// pieces are only applied to values that fulfil it. An empty condition string
// means that there is no condition.
class ExtractParts : public AutofillParsingProcess {
 public:
  // Only a view of `pieces` is stored: the underlying storage and the
  // pointed-to ExtractPart objects must stay alive for as long as the
  // ExtractParts is used.
  constexpr ExtractParts(std::string condition_regex,
                         base::span<const ExtractPart* const> pieces)
      : condition_regex_{std::move(condition_regex)}, pieces_(pieces) {}

  ExtractParts(const ExtractParts&) = delete;
  ExtractParts& operator=(const ExtractParts&) = delete;

  ~ExtractParts() override = default;

  ValueParsingResults Parse(const std::string& value) const override;

 private:
  // Condition a value must fulfil; empty if unconditional.
  const std::string condition_regex_;
  // Non-owning view of the pieces to apply, in order.
  const base::span<const ExtractPart* const> pieces_;
};

} // namespace autofill::i18n_model_definition

#endif // COMPONENTS_AUTOFILL_CORE_BROWSER_DATA_MODEL_AUTOFILL_I18N_PARSING_EXPRESSION_COMPONENTS_H_
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
// Copyright 2023 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/autofill/core/browser/data_model/autofill_i18n_parsing_expression_components.h"

#include <vector>

#include "testing/gmock/include/gmock/gmock.h"
#include "testing/gtest/include/gtest/gtest.h"

using testing::ElementsAre;
using testing::Eq;
using testing::Optional;
using testing::Pair;
using testing::UnorderedElementsAre;

namespace autofill::i18n_model_definition {

// A Decomposition anchored at both ends only accepts values that match the
// expression in full.
TEST(AutofillI18nParsingStructures, Decomposition) {
  const Decomposition anchored("(?P<foo>\\w+)", /*anchor_beginning=*/true,
                               /*anchor_end=*/true);

  // A single word matches entirely and is captured as "foo".
  EXPECT_THAT(anchored.Parse("aaa"),
              Optional(ElementsAre(Pair("foo", "aaa"))));

  // Two words separated by a space are rejected.
  EXPECT_THAT(anchored.Parse("aaa aaa"), Eq(absl::nullopt));
}

// Without anchoring, a Decomposition also accepts values of which only a part
// matches the expression.
TEST(AutofillI18nParsingStructures, DecompositionAnchoringDisabled) {
  const Decomposition unanchored("(?P<foo>\\w+)", /*anchor_beginning=*/false,
                                 /*anchor_end=*/false);

  EXPECT_THAT(unanchored.Parse("aaa"),
              Optional(ElementsAre(Pair("foo", "aaa"))));

  // With anchoring disabled the expression matches here and the first word is
  // captured.
  EXPECT_THAT(unanchored.Parse("aaa aaa"),
              Optional(ElementsAre(Pair("foo", "aaa"))));
}

// A DecompositionCascade returns the result of the first matching alternative
// and respects its condition.
TEST(AutofillI18nParsingStructures, DecompositionCascade) {
  Decomposition ends_with_a("(?P<foo>.*a+)", /*anchor_beginning=*/true,
                            /*anchor_end=*/true);
  Decomposition ends_with_b("(?P<foo__2>.*b+)", /*anchor_beginning=*/true,
                            /*anchor_end=*/true);
  const std::vector<const AutofillParsingProcess*> alternatives = {
      &ends_with_a, &ends_with_b};
  DecompositionCascade cascade("^1", alternatives);

  // The first alternative matches.
  EXPECT_THAT(cascade.Parse("1aaa"),
              Optional(ElementsAre(Pair("foo", "1aaa"))));

  // The second alternative matches. This also checks that the version suffix
  // ("__2") is removed from the capture group name.
  EXPECT_THAT(cascade.Parse("1bbb"),
              Optional(ElementsAre(Pair("foo", "1bbb"))));

  // The condition (a "1" at the beginning) is violated, therefore nothing is
  // returned.
  EXPECT_THAT(cascade.Parse("bbb"), Eq(absl::nullopt));
}

// An ExtractPart extracts the named group from a value that fulfils the
// condition.
TEST(AutofillI18nParsingStructures, ExtractPart) {
  const ExtractPart middle_name_part("^1",
                                     "(?:prefix(?P<NAME_MIDDLE>[_a]+)suffix)");

  // The value starts with "1", so the condition holds and the text between
  // "prefix" and "suffix" is captured.
  EXPECT_THAT(middle_name_part.Parse("1prefix_a_suffix"),
              Optional(ElementsAre(Pair("NAME_MIDDLE", "_a_"))));
}

// ExtractParts applies all pieces and merges what each of them extracted.
TEST(AutofillI18nParsingStructures, ExtractParts) {
  ExtractPart house_number_part(
      "", "(?:house number\\s+(?P<ADDRESS_HOME_HOUSE_NUMBER>\\d+))");
  ExtractPart apartment_part("",
                             "(?:apartment\\s+(?P<ADDRESS_HOME_APT_NUM>\\d+))");
  const std::vector<const ExtractPart*> pieces = {&house_number_part,
                                                  &apartment_part};
  ExtractParts extract_parts("2$", pieces);

  // The value ends with "2", so the condition holds and both pieces
  // contribute one entry each.
  EXPECT_THAT(
      extract_parts.Parse("1 house number 1 apartment 2"),
      Optional(UnorderedElementsAre(Pair("ADDRESS_HOME_HOUSE_NUMBER", "1"),
                                    Pair("ADDRESS_HOME_APT_NUM", "2"))));
}

// Capture groups that only differ in their version suffix ("__1", "__2") are
// reported under the same, suffix-free name.
TEST(AutofillI18nParsingStructures, RemoveVersionSuffix) {
  const ExtractPart floor_part(
      "",
      "(?:floor\\s+(?P<ADDRESS_HOME_FLOOR__1>\\d+)|(?P<ADDRESS_"
      "HOME_FLOOR__2>\\d+)(?:st|nd|rd|th) floor)");

  // Matches through the second group ("<n>rd floor").
  EXPECT_THAT(floor_part.Parse("3rd floor"),
              Optional(ElementsAre(Pair("ADDRESS_HOME_FLOOR", "3"))));

  // Matches through the first group ("floor <n>").
  EXPECT_THAT(floor_part.Parse("floor 4"),
              Optional(ElementsAre(Pair("ADDRESS_HOME_FLOOR", "4"))));
}

} // namespace autofill::i18n_model_definition

0 comments on commit 5e0ccf4

Please sign in to comment.