Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion docs/CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,8 @@ issue: {ml-issue}724[#724].)
* Fixes potential memory corruption when determining seasonality. (See {ml-pull}852[#852].)
* Prevent prediction_field_name clashing with other fields in ml results.
(See {ml-pull}861[#861].)

* Include out-of-order as well as in-order terms in categorization reverse searches.
(See {ml-pull}950[#950], issue: {ml-issue}949[#949].)

== {es} version 7.5.2

Expand Down
20 changes: 9 additions & 11 deletions include/model/CTokenListReverseSearchCreator.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,9 @@ class MODEL_EXPORT CTokenListReverseSearchCreator : public CTokenListReverseSear
bool createNullSearch(std::string& part1, std::string& part2) const override;

//! If possible, create a reverse search for the case where there are no
//! unique tokens identifying the type. (If this is not possible return
//! unique tokens identifying the category. (If this is not possible return
//! false.)
bool createNoUniqueTokenSearch(int type,
bool createNoUniqueTokenSearch(int categoryId,
const std::string& example,
size_t maxMatchingStringLen,
std::string& part1,
Expand All @@ -55,27 +55,25 @@ class MODEL_EXPORT CTokenListReverseSearchCreator : public CTokenListReverseSear
//! Initialise the two strings that form a reverse search. For example,
//! this could be as simple as clearing the strings or setting them to
//! some sort of one-off preamble.
void initStandardSearch(int type,
void initStandardSearch(int categoryId,
const std::string& example,
size_t maxMatchingStringLen,
std::string& part1,
std::string& part2) const override;

//! Modify the two strings that form a reverse search to account for the
//! specified token, which may occur anywhere within the original
//! message, but has been determined to be a good thing to distinguish
//! this type of messages from other types.
void addCommonUniqueToken(const std::string& token,
std::string& part1,
std::string& part2) const override;

//! Modify the two strings that form a reverse search to account for the
//! specified token.
void addInOrderCommonToken(const std::string& token,
bool first,
std::string& part1,
std::string& part2) const override;

//! Modify the two strings that form a reverse search to account for the
//! specified token, which may occur anywhere within the original
//! message, but has been determined to be a good thing to distinguish
//! this category of messages from other categories.
void addOutOfOrderCommonToken(const std::string&, std::string&, std::string&) const override;

//! Close off the two strings that form a reverse search. For example,
//! this may be when closing brackets need to be appended.
void closeStandardSearch(std::string& part1, std::string& part2) const override;
Expand Down
16 changes: 8 additions & 8 deletions include/model/CTokenListReverseSearchCreatorIntf.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,21 +71,21 @@ class MODEL_EXPORT CTokenListReverseSearchCreatorIntf {
std::string& part1,
std::string& part2) const = 0;

//! Modify the two strings that form a reverse search to account for the
//! specified token, which may occur anywhere within the original
//! message, but has been determined to be a good thing to distinguish
//! this category of messages from other categories.
virtual void addCommonUniqueToken(const std::string& token,
std::string& part1,
std::string& part2) const = 0;

//! Modify the two strings that form a reverse search to account for the
//! specified token.
virtual void addInOrderCommonToken(const std::string& token,
bool first,
std::string& part1,
std::string& part2) const = 0;

//! Modify the two strings that form a reverse search to account for the
//! specified token, which may occur anywhere within the original
//! message, but has been determined to be a good thing to distinguish
//! this category of messages from other categories.
virtual void addOutOfOrderCommonToken(const std::string& token,
std::string& part1,
std::string& part2) const = 0;

//! Close off the two strings that form a reverse search. For example,
//! this may be when closing brackets need to be appended.
virtual void closeStandardSearch(std::string& part1, std::string& part2) const;
Expand Down
47 changes: 47 additions & 0 deletions lib/api/unittest/CFieldDataCategorizerTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,53 @@ BOOST_AUTO_TEST_CASE(testNodeReverseSearch) {
BOOST_TEST_REQUIRE(output.find("\"message\"") == std::string::npos);
}

BOOST_AUTO_TEST_CASE(testJobKilledReverseSearch) {
model::CLimits limits;
CFieldConfig config;
BOOST_TEST_REQUIRE(config.initFromFile("testfiles/new_persist_categorization.conf"));

std::ostringstream outputStrm;
{
CNullOutput nullOutput;
core::CJsonOutputStreamWrapper wrappedOutputStream(outputStrm);
CJsonOutputWriter writer("job", wrappedOutputStream);

CFieldDataCategorizer categorizer("job", config, limits, nullOutput, writer);

CFieldDataCategorizer::TStrStrUMap dataRowFields;
dataRowFields["message"] = "[count_tweets] Killing job";

BOOST_TEST_REQUIRE(categorizer.handleRecord(dataRowFields));

dataRowFields["message"] = "Killing job [count_tweets]";

BOOST_TEST_REQUIRE(categorizer.handleRecord(dataRowFields));

dataRowFields["message"] = "[tweets_by_location] Killing job";

BOOST_TEST_REQUIRE(categorizer.handleRecord(dataRowFields));

dataRowFields["message"] = "Killing job [tweets_by_location]";

BOOST_TEST_REQUIRE(categorizer.handleRecord(dataRowFields));

categorizer.finalise();
}

const std::string& output = outputStrm.str();
LOG_DEBUG(<< "Output is: " << output);

// Assert that the reverse search contains all expected tokens when
// categorization is run end-to-end (obviously computation of categories and
// reverse search creation are tested more thoroughly in the unit tests for
// their respective classes, but this test helps to confirm that they work
// together)
BOOST_TEST_REQUIRE(output.find("\"terms\":\"Killing job\"") != std::string::npos);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The fix of this PR makes terms here Killing job where before this PR it would have been empty.

BOOST_TEST_REQUIRE(output.find("\"regex\":\".*\"") != std::string::npos);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The regex is still poor, .*, after this PR. A followup that will not be backported as far will fix this.

See #949 (comment) for details.

// The input data should NOT be in the output
BOOST_TEST_REQUIRE(output.find("\"message\"") == std::string::npos);
}

BOOST_AUTO_TEST_CASE(testPassOnControlMessages) {
model::CLimits limits;
CFieldConfig config;
Expand Down
24 changes: 12 additions & 12 deletions lib/model/CTokenListDataCategorizerBase.cc
Original file line number Diff line number Diff line change
Expand Up @@ -297,20 +297,20 @@ bool CTokenListDataCategorizerBase::createReverseSearch(int categoryId,
category.maxMatchingStringLen(),
part1, part2);

for (auto costedCommonUniqueTokenId : costedCommonUniqueTokenIds) {
m_ReverseSearchCreator->addCommonUniqueToken(
m_TokenIdLookup[costedCommonUniqueTokenId].str(), part1, part2);
}

bool first(true);
size_t end(category.outOfOrderCommonTokenIndex());
for (size_t index = 0; index < end; ++index) {
size_t tokenId(baseTokenIds[index].first);
bool firstInOrderToken{true};
std::size_t endOfOrdered{category.outOfOrderCommonTokenIndex()};
for (std::size_t index = 0; index < baseTokenIds.size(); ++index) {
std::size_t tokenId(baseTokenIds[index].first);
if (costedCommonUniqueTokenIds.find(tokenId) !=
costedCommonUniqueTokenIds.end()) {
m_ReverseSearchCreator->addInOrderCommonToken(
m_TokenIdLookup[tokenId].str(), first, part1, part2);
first = false;
if (index < endOfOrdered) {
m_ReverseSearchCreator->addInOrderCommonToken(
m_TokenIdLookup[tokenId].str(), firstInOrderToken, part1, part2);
firstInOrderToken = false;
} else {
m_ReverseSearchCreator->addOutOfOrderCommonToken(
m_TokenIdLookup[tokenId].str(), part1, part2);
}
}
}

Expand Down
14 changes: 9 additions & 5 deletions lib/model/CTokenListReverseSearchCreator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,6 @@ void CTokenListReverseSearchCreator::initStandardSearch(int /*categoryId*/,
part2.clear();
}

void CTokenListReverseSearchCreator::addCommonUniqueToken(const std::string& /*token*/,
std::string& /*part1*/,
std::string& /*part2*/) const {
}

void CTokenListReverseSearchCreator::addInOrderCommonToken(const std::string& token,
bool first,
std::string& part1,
Expand All @@ -75,6 +70,15 @@ void CTokenListReverseSearchCreator::addInOrderCommonToken(const std::string& to
part2 += core::CRegex::escapeRegexSpecial(token);
}

void CTokenListReverseSearchCreator::addOutOfOrderCommonToken(const std::string& token,
std::string& part1,
std::string& /*part2*/) const {
if (part1.empty() == false) {
part1 += ' ';
}
part1 += token;
}

void CTokenListReverseSearchCreator::closeStandardSearch(std::string& /*part1*/,
std::string& part2) const {
part2 += ".*";
Expand Down
27 changes: 14 additions & 13 deletions lib/model/unittest/CTokenListReverseSearchCreatorTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -57,19 +57,6 @@ BOOST_AUTO_TEST_CASE(testInitStandardSearch) {
BOOST_REQUIRE_EQUAL(std::string(""), reverseSearchPart2);
}

BOOST_AUTO_TEST_CASE(testAddCommonUniqueToken) {
CTokenListReverseSearchCreator reverseSearchCreator("foo");

std::string reverseSearchPart1;
std::string reverseSearchPart2;

reverseSearchCreator.addCommonUniqueToken("user", reverseSearchPart1, reverseSearchPart2);
reverseSearchCreator.addCommonUniqueToken("logged", reverseSearchPart1, reverseSearchPart2);

BOOST_REQUIRE_EQUAL(std::string(""), reverseSearchPart1);
BOOST_REQUIRE_EQUAL(std::string(""), reverseSearchPart2);
}

BOOST_AUTO_TEST_CASE(testAddInOrderCommonToken) {
CTokenListReverseSearchCreator reverseSearchCreator("foo");

Expand All @@ -90,6 +77,20 @@ BOOST_AUTO_TEST_CASE(testAddInOrderCommonToken) {
reverseSearchPart2);
}

BOOST_AUTO_TEST_CASE(testAddOutOfOrderCommonToken) {
CTokenListReverseSearchCreator reverseSearchCreator("foo");

std::string reverseSearchPart1;
std::string reverseSearchPart2;

reverseSearchCreator.addOutOfOrderCommonToken("user", reverseSearchPart1, reverseSearchPart2);
reverseSearchCreator.addOutOfOrderCommonToken("logged", reverseSearchPart1,
reverseSearchPart2);

BOOST_REQUIRE_EQUAL(std::string("user logged"), reverseSearchPart1);
BOOST_REQUIRE_EQUAL(std::string(""), reverseSearchPart2);
}

BOOST_AUTO_TEST_CASE(testCloseStandardSearch) {
CTokenListReverseSearchCreator reverseSearchCreator("foo");

Expand Down