elastic · droberts195 · Jan 20, 2020 · Jan 17, 2020 · Jan 17, 2020 · Jan 20, 2020
diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc
@@ -76,7 +76,8 @@ issue: {ml-issue}724[#724].)
 * Fixes potential memory corruption when determining seasonality. (See {ml-pull}852[#852].)
 * Prevent prediction_field_name clashing with other fields in ml results.
 (See {ml-pull}861[#861].)
-
+* Include out-of-order as well as in-order terms in categorization reverse searches.
+(See {ml-pull}950[#950], issue: {ml-issue}949[#949].)
 
 == {es} version 7.5.2
 

diff --git a/include/model/CTokenListReverseSearchCreator.h b/include/model/CTokenListReverseSearchCreator.h
@@ -44,9 +44,9 @@ class MODEL_EXPORT CTokenListReverseSearchCreator : public CTokenListReverseSear
     bool createNullSearch(std::string& part1, std::string& part2) const override;
 
     //! If possible, create a reverse search for the case where there are no
-    //! unique tokens identifying the type.  (If this is not possible return
+    //! unique tokens identifying the category.  (If this is not possible return
     //! false.)
-    bool createNoUniqueTokenSearch(int type,
+    bool createNoUniqueTokenSearch(int categoryId,
                                    const std::string& example,
                                    size_t maxMatchingStringLen,
                                    std::string& part1,
@@ -55,27 +55,25 @@ class MODEL_EXPORT CTokenListReverseSearchCreator : public CTokenListReverseSear
     //! Initialise the two strings that form a reverse search.  For example,
     //! this could be as simple as clearing the strings or setting them to
     //! some sort of one-off preamble.
-    void initStandardSearch(int type,
+    void initStandardSearch(int categoryId,
                             const std::string& example,
                             size_t maxMatchingStringLen,
                             std::string& part1,
                             std::string& part2) const override;
 
-    //! Modify the two strings that form a reverse search to account for the
-    //! specified token, which may occur anywhere within the original
-    //! message, but has been determined to be a good thing to distinguish
-    //! this type of messages from other types.
-    void addCommonUniqueToken(const std::string& token,
-                              std::string& part1,
-                              std::string& part2) const override;
-
     //! Modify the two strings that form a reverse search to account for the
     //! specified token.
     void addInOrderCommonToken(const std::string& token,
                                bool first,
                                std::string& part1,
                                std::string& part2) const override;
 
+    //! Modify the two strings that form a reverse search to account for the
+    //! specified token, which may occur anywhere within the original
+    //! message, but has been determined to be a good thing to distinguish
+    //! this category of messages from other categories.
+    void addOutOfOrderCommonToken(const std::string&, std::string&, std::string&) const override;
+
     //! Close off the two strings that form a reverse search.  For example,
     //! this may be when closing brackets need to be appended.
     void closeStandardSearch(std::string& part1, std::string& part2) const override;

diff --git a/include/model/CTokenListReverseSearchCreatorIntf.h b/include/model/CTokenListReverseSearchCreatorIntf.h
@@ -71,21 +71,21 @@ class MODEL_EXPORT CTokenListReverseSearchCreatorIntf {
                                     std::string& part1,
                                     std::string& part2) const = 0;
 
-    //! Modify the two strings that form a reverse search to account for the
-    //! specified token, which may occur anywhere within the original
-    //! message, but has been determined to be a good thing to distinguish
-    //! this category of messages from other categories.
-    virtual void addCommonUniqueToken(const std::string& token,
-                                      std::string& part1,
-                                      std::string& part2) const = 0;
-
     //! Modify the two strings that form a reverse search to account for the
     //! specified token.
     virtual void addInOrderCommonToken(const std::string& token,
                                        bool first,
                                        std::string& part1,
                                        std::string& part2) const = 0;
 
+    //! Modify the two strings that form a reverse search to account for the
+    //! specified token, which may occur anywhere within the original
+    //! message, but has been determined to be a good thing to distinguish
+    //! this category of messages from other categories.
+    virtual void addOutOfOrderCommonToken(const std::string& token,
+                                          std::string& part1,
+                                          std::string& part2) const = 0;
+
     //! Close off the two strings that form a reverse search.  For example,
     //! this may be when closing brackets need to be appended.
     virtual void closeStandardSearch(std::string& part1, std::string& part2) const;

diff --git a/lib/api/unittest/CFieldDataCategorizerTest.cc b/lib/api/unittest/CFieldDataCategorizerTest.cc
@@ -222,6 +222,53 @@ BOOST_AUTO_TEST_CASE(testNodeReverseSearch) {
     BOOST_TEST_REQUIRE(output.find("\"message\"") == std::string::npos);
 }
 
+BOOST_AUTO_TEST_CASE(testJobKilledReverseSearch) {
+    model::CLimits limits;
+    CFieldConfig config;
+    BOOST_TEST_REQUIRE(config.initFromFile("testfiles/new_persist_categorization.conf"));
+
+    std::ostringstream outputStrm;
+    {
+        CNullOutput nullOutput;
+        core::CJsonOutputStreamWrapper wrappedOutputStream(outputStrm);
+        CJsonOutputWriter writer("job", wrappedOutputStream);
+
+        CFieldDataCategorizer categorizer("job", config, limits, nullOutput, writer);
+
+        CFieldDataCategorizer::TStrStrUMap dataRowFields;
+        dataRowFields["message"] = "[count_tweets] Killing job";
+
+        BOOST_TEST_REQUIRE(categorizer.handleRecord(dataRowFields));
+
+        dataRowFields["message"] = "Killing job [count_tweets]";
+
+        BOOST_TEST_REQUIRE(categorizer.handleRecord(dataRowFields));
+
+        dataRowFields["message"] = "[tweets_by_location] Killing job";
+
+        BOOST_TEST_REQUIRE(categorizer.handleRecord(dataRowFields));
+
+        dataRowFields["message"] = "Killing job [tweets_by_location]";
+
+        BOOST_TEST_REQUIRE(categorizer.handleRecord(dataRowFields));
+
+        categorizer.finalise();
+    }
+
+    const std::string& output = outputStrm.str();
+    LOG_DEBUG(<< "Output is: " << output);
+
+    // Assert that the reverse search contains all expected tokens when
+    // categorization is run end-to-end (obviously computation of categories and
+    // reverse search creation are tested more thoroughly in the unit tests for
+    // their respective classes, but this test helps to confirm that they work
+    // together)
+    BOOST_TEST_REQUIRE(output.find("\"terms\":\"Killing job\"") != std::string::npos);
+    BOOST_TEST_REQUIRE(output.find("\"regex\":\".*\"") != std::string::npos);
+    // The input data should NOT be in the output
+    BOOST_TEST_REQUIRE(output.find("\"message\"") == std::string::npos);
+}
+
 BOOST_AUTO_TEST_CASE(testPassOnControlMessages) {
     model::CLimits limits;
     CFieldConfig config;

diff --git a/lib/model/CTokenListDataCategorizerBase.cc b/lib/model/CTokenListDataCategorizerBase.cc
@@ -297,20 +297,20 @@ bool CTokenListDataCategorizerBase::createReverseSearch(int categoryId,
                                                category.maxMatchingStringLen(),
                                                part1, part2);
 
-    for (auto costedCommonUniqueTokenId : costedCommonUniqueTokenIds) {
-        m_ReverseSearchCreator->addCommonUniqueToken(
-            m_TokenIdLookup[costedCommonUniqueTokenId].str(), part1, part2);
-    }
-
-    bool first(true);
-    size_t end(category.outOfOrderCommonTokenIndex());
-    for (size_t index = 0; index < end; ++index) {
-        size_t tokenId(baseTokenIds[index].first);
+    bool firstInOrderToken{true};
+    std::size_t endOfOrdered{category.outOfOrderCommonTokenIndex()};
+    for (std::size_t index = 0; index < baseTokenIds.size(); ++index) {
+        std::size_t tokenId(baseTokenIds[index].first);
         if (costedCommonUniqueTokenIds.find(tokenId) !=
             costedCommonUniqueTokenIds.end()) {
-            m_ReverseSearchCreator->addInOrderCommonToken(
-                m_TokenIdLookup[tokenId].str(), first, part1, part2);
-            first = false;
+            if (index < endOfOrdered) {
+                m_ReverseSearchCreator->addInOrderCommonToken(
+                    m_TokenIdLookup[tokenId].str(), firstInOrderToken, part1, part2);
+                firstInOrderToken = false;
+            } else {
+                m_ReverseSearchCreator->addOutOfOrderCommonToken(
+                    m_TokenIdLookup[tokenId].str(), part1, part2);
+            }
         }
     }
 

diff --git a/lib/model/CTokenListReverseSearchCreator.cc b/lib/model/CTokenListReverseSearchCreator.cc
@@ -56,11 +56,6 @@ void CTokenListReverseSearchCreator::initStandardSearch(int /*categoryId*/,
     part2.clear();
 }
 
-void CTokenListReverseSearchCreator::addCommonUniqueToken(const std::string& /*token*/,
-                                                          std::string& /*part1*/,
-                                                          std::string& /*part2*/) const {
-}
-
 void CTokenListReverseSearchCreator::addInOrderCommonToken(const std::string& token,
                                                            bool first,
                                                            std::string& part1,
@@ -75,6 +70,15 @@ void CTokenListReverseSearchCreator::addInOrderCommonToken(const std::string& to
     part2 += core::CRegex::escapeRegexSpecial(token);
 }
 
+void CTokenListReverseSearchCreator::addOutOfOrderCommonToken(const std::string& token,
+                                                              std::string& part1,
+                                                              std::string& /*part2*/) const {
+    if (part1.empty() == false) {
+        part1 += ' ';
+    }
+    part1 += token;
+}
+
 void CTokenListReverseSearchCreator::closeStandardSearch(std::string& /*part1*/,
                                                          std::string& part2) const {
     part2 += ".*";

diff --git a/lib/model/unittest/CTokenListReverseSearchCreatorTest.cc b/lib/model/unittest/CTokenListReverseSearchCreatorTest.cc
@@ -57,19 +57,6 @@ BOOST_AUTO_TEST_CASE(testInitStandardSearch) {
     BOOST_REQUIRE_EQUAL(std::string(""), reverseSearchPart2);
 }
 
-BOOST_AUTO_TEST_CASE(testAddCommonUniqueToken) {
-    CTokenListReverseSearchCreator reverseSearchCreator("foo");
-
-    std::string reverseSearchPart1;
-    std::string reverseSearchPart2;
-
-    reverseSearchCreator.addCommonUniqueToken("user", reverseSearchPart1, reverseSearchPart2);
-    reverseSearchCreator.addCommonUniqueToken("logged", reverseSearchPart1, reverseSearchPart2);
-
-    BOOST_REQUIRE_EQUAL(std::string(""), reverseSearchPart1);
-    BOOST_REQUIRE_EQUAL(std::string(""), reverseSearchPart2);
-}
-
 BOOST_AUTO_TEST_CASE(testAddInOrderCommonToken) {
     CTokenListReverseSearchCreator reverseSearchCreator("foo");
 
@@ -90,6 +77,20 @@ BOOST_AUTO_TEST_CASE(testAddInOrderCommonToken) {
                         reverseSearchPart2);
 }
 
+BOOST_AUTO_TEST_CASE(testAddOutOfOrderCommonToken) {
+    CTokenListReverseSearchCreator reverseSearchCreator("foo");
+
+    std::string reverseSearchPart1;
+    std::string reverseSearchPart2;
+
+    reverseSearchCreator.addOutOfOrderCommonToken("user", reverseSearchPart1, reverseSearchPart2);
+    reverseSearchCreator.addOutOfOrderCommonToken("logged", reverseSearchPart1,
+                                                  reverseSearchPart2);
+
+    BOOST_REQUIRE_EQUAL(std::string("user logged"), reverseSearchPart1);
+    BOOST_REQUIRE_EQUAL(std::string(""), reverseSearchPart2);
+}
+
 BOOST_AUTO_TEST_CASE(testCloseStandardSearch) {
     CTokenListReverseSearchCreator reverseSearchCreator("foo");