From f887c2f64d4562135b69fff83ea88a98ff6022c6 Mon Sep 17 00:00:00 2001 From: Mark Stemm Date: Mon, 26 Sep 2016 18:32:14 -0700 Subject: [PATCH] Path prefix search (#660) * Add ability to test a path against many prefixes Add a data structure that allows testing a path (e.g. /var/log/messages) against a set of path prefixes (/usr, /bin, /var/log, ...). Thinking ahead to use in libsinsp, the search structure uses pointer + length pairs instead of strings, meaning that it does not copy any data, only refers to it. This way the structure doesn't copy filtercheck values. In order to share the idea of a pair of pointer + length between this structure and the unordered_set used by "in (...)" set membership tests, move the hashing and equality functions from filterchecks to a standalone header filter_value.h and use them for the unordered_map. The paths are held in a tree-like structure. At each level, an unordered map has the path components for that level and a sub-tree of paths for that root. The tree can change if new search paths are prefixes of any of the current paths (the sub-tree for the longer path is replaced by the prefix). If an existing path is a prefix of a new search path, no change is made, as the new path is already covered by the existing prefix. Matching involves splitting off the first directory component and testing it against the values at that level. If a match is found, it recursively calls match for the subtree. * Add pmatch operator, using prefix_search struct. Remove the definitions of filter_value_member_t/hash function/equality function from filterchecks.h, renaming filter_value_member_t to filter_value_t along the way. Add an operator CO_PMATCH/"pmatch" which takes a set of values like CO_IN does, but requires that the left hand side of the comparison is a PT_CHARBUF. When filter values are added to a check whose operator is CO_PMATCH, they are also added to the new path_prefix_search object m_val_storages_paths. In ::flt_compare, when the operator is CO_PMATCH, test the value against m_val_storages_paths. 
As a result, you can run sysdig using a command line like: sudo ./userspace/sysdig/sysdig "evt.type=open and fd.directory pmatch (/var, /usr)" and see all the file opens for files below either /var or /usr. --- userspace/libsinsp/CMakeLists.txt | 1 + userspace/libsinsp/filter.cpp | 55 +++++-- userspace/libsinsp/filter.h | 3 +- userspace/libsinsp/filter_value.h | 60 ++++++++ userspace/libsinsp/filterchecks.h | 44 +----- userspace/libsinsp/lua_parser_api.cpp | 6 +- userspace/libsinsp/prefix_search.cpp | 197 ++++++++++++++++++++++++++ userspace/libsinsp/prefix_search.h | 81 +++++++++++ 8 files changed, 392 insertions(+), 55 deletions(-) create mode 100644 userspace/libsinsp/filter_value.h create mode 100644 userspace/libsinsp/prefix_search.cpp create mode 100644 userspace/libsinsp/prefix_search.h diff --git a/userspace/libsinsp/CMakeLists.txt b/userspace/libsinsp/CMakeLists.txt index 5e782658fd..9530922f6b 100644 --- a/userspace/libsinsp/CMakeLists.txt +++ b/userspace/libsinsp/CMakeLists.txt @@ -65,6 +65,7 @@ add_library(sinsp STATIC "${JSONCPP_LIB_SRC}" logger.cpp parsers.cpp + prefix_search.cpp protodecoder.cpp threadinfo.cpp sinsp.cpp diff --git a/userspace/libsinsp/filter.cpp b/userspace/libsinsp/filter.cpp index 457a1a7092..5644216a86 100644 --- a/userspace/libsinsp/filter.cpp +++ b/userspace/libsinsp/filter.cpp @@ -1024,7 +1024,7 @@ void sinsp_filter_check::add_filter_value(const char* str, uint32_t len, uint16_ // XXX/mstemm this doesn't work if someone called // add_filter_value more than once for a given index. - filter_value_member_t item(filter_value_p(i), len); + filter_value_t item(filter_value_p(i), len); m_val_storages_members.insert(item); if(len < m_val_storages_min_size) @@ -1036,8 +1036,13 @@ void sinsp_filter_check::add_filter_value(const char* str, uint32_t len, uint16_ { m_val_storages_max_size = len; } -} + // If the operator is CO_PMATCH, also add the value to the paths set. 
+ if (m_cmpop == CO_PMATCH) + { + m_val_storages_paths.add_search_path(item); + } +} void sinsp_filter_check::parse_filter_value(const char* str, uint32_t len, uint8_t *storage, uint32_t storage_len) { @@ -1066,7 +1071,7 @@ const filtercheck_field_info* sinsp_filter_check::get_field_info() bool sinsp_filter_check::flt_compare(cmpop op, ppm_param_type type, void* operand1, uint32_t op1_len, uint32_t op2_len) { - if (op == CO_IN) + if (op == CO_IN || op == CO_PMATCH) { // For raw strings, the length may not be set. So we do a strlen to find it. if(type == PT_CHARBUF && op1_len == 0) @@ -1074,13 +1079,25 @@ bool sinsp_filter_check::flt_compare(cmpop op, ppm_param_type type, void* operan op1_len = strlen((char *) operand1); } - filter_value_member_t item((uint8_t *) operand1, op1_len); - if(op1_len >= m_val_storages_min_size && - op1_len <= m_val_storages_max_size && - m_val_storages_members.find(item) != m_val_storages_members.end()) + filter_value_t item((uint8_t *) operand1, op1_len); + + if (op == CO_IN) { - return true; + if(op1_len >= m_val_storages_min_size && + op1_len <= m_val_storages_max_size && + m_val_storages_members.find(item) != m_val_storages_members.end()) + { + return true; + } + } + else + { + if (m_val_storages_paths.match(item)) + { + return true; + } } + return false; } else @@ -1355,7 +1372,7 @@ char sinsp_filter_compiler::next() } } -vector sinsp_filter_compiler::next_operand(bool expecting_first_operand, bool in_clause) +vector sinsp_filter_compiler::next_operand(bool expecting_first_operand, bool in_or_pmatch_clause) { vector res; bool is_quoted = false; @@ -1406,7 +1423,7 @@ vector sinsp_filter_compiler::next_operand(bool expecting_first_operand, b } else { - is_end_of_word = (!is_quoted && (isblank(curchar) || is_bracket(curchar) || (in_clause && curchar == ','))) || + is_end_of_word = (!is_quoted && (isblank(curchar) || is_bracket(curchar) || (in_or_pmatch_clause && curchar == ','))) || (is_quoted && escape_state != PES_SLASH && (curchar 
== '"' || curchar == '\'')); } @@ -1423,7 +1440,7 @@ vector sinsp_filter_compiler::next_operand(bool expecting_first_operand, b // ASSERT(m_scanpos >= start); - if(curchar == '(' || curchar == ')' || (in_clause && curchar == ',')) + if(curchar == '(' || curchar == ')' || (in_or_pmatch_clause && curchar == ',')) { m_scanpos--; } @@ -1604,6 +1621,11 @@ cmpop sinsp_filter_compiler::next_comparison_operator() m_scanpos += 2; return CO_IN; } + else if(compare_no_consume("pmatch")) + { + m_scanpos += 6; + return CO_PMATCH; + } else if(compare_no_consume("exists")) { m_scanpos += 6; @@ -1644,7 +1666,7 @@ void sinsp_filter_compiler::parse_check() chk->parse_field_name((char *)&operand1[0], true); - if(co == CO_IN) + if(co == CO_IN || co == CO_PMATCH) { // // Skip spaces @@ -1656,7 +1678,7 @@ void sinsp_filter_compiler::parse_check() if(m_fltstr[m_scanpos] != '(') { - throw sinsp_exception("expected '(' after 'in' operand"); + throw sinsp_exception("expected '(' after 'in/pmatch' operand"); } // @@ -1695,11 +1717,16 @@ void sinsp_filter_compiler::parse_check() } else { - throw sinsp_exception("expected either ')' or ',' after a value inside the 'in' clause"); + throw sinsp_exception("expected either ')' or ',' after a value inside the 'in/pmatch' clause"); } } m_filter->add_check(chk); } + else if (co == CO_PMATCH) + { + // the pmatch operator can only work on charbufs + throw sinsp_exception("pmatch requires all charbuf arguments"); + } else { // diff --git a/userspace/libsinsp/filter.h b/userspace/libsinsp/filter.h index 9bc5de7822..ad38d84a61 100644 --- a/userspace/libsinsp/filter.h +++ b/userspace/libsinsp/filter.h @@ -39,7 +39,8 @@ enum cmpop { CO_EXISTS = 9, CO_ICONTAINS = 10, CO_STARTSWITH = 11, - CO_GLOB = 12 + CO_GLOB = 12, + CO_PMATCH = 13 }; enum boolop diff --git a/userspace/libsinsp/filter_value.h b/userspace/libsinsp/filter_value.h new file mode 100644 index 0000000000..f718ca867b --- /dev/null +++ b/userspace/libsinsp/filter_value.h @@ -0,0 +1,60 @@ +/* 
+Copyright (C) 2013-2016 Draios inc. + +This file is part of sysdig. + +sysdig is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License version 2 as +published by the Free Software Foundation. + +sysdig is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with sysdig. If not, see . +*/ + +#pragma once + +#include +#include + +// Used for CO_IN/CO_PMATCH filterchecks using PT_CHARBUFs to allow +// for quick multi-value comparisons. Should also work for any +// filtercheck with a buffer and length. When compiling with gnu +// compilers, use the built in but not standard _hash_impl::hash +// function, which uses murmurhash2 and is quite fast. Otherwise, uses +// http://www.cse.yorku.ca/~oz/hash.html. + +typedef std::pair filter_value_t; + +struct g_hash_membuf +{ + size_t operator()(filter_value_t val) const + { +#ifdef __GNUC__ + return std::_Hash_impl::hash(val.first, val.second); +#else + size_t hash = 5381; + for(uint8_t *p = val.first; p-val.first < val.second; p++) + { + int c = *p; + + hash = ((hash << 5) + hash) + c; /* hash * 33 + c */ + } + return hash; +#endif + } +}; + +struct g_equal_to_membuf +{ + bool operator()(filter_value_t a, filter_value_t b) const + { + return (a.second == b.second && + memcmp(a.first, b.first, a.second) == 0); + } +}; + diff --git a/userspace/libsinsp/filterchecks.h b/userspace/libsinsp/filterchecks.h index 2f295e0e7d..6265f39a49 100644 --- a/userspace/libsinsp/filterchecks.h +++ b/userspace/libsinsp/filterchecks.h @@ -19,6 +19,8 @@ along with sysdig. If not, see . 
#pragma once #include #include +#include "filter_value.h" +#include "prefix_search.h" #include "k8s.h" #include "mesos.h" @@ -43,44 +45,6 @@ class operand_info string m_description; }; -// Used for CO_IN filterchecks using PT_CHARBUFs to allow for quick -// multi-value comparisons. Should also work for any filtercheck with -// a buffer and length. When compiling with gnu compilers, use the -// built in but not standard _hash_impl::hash function, which uses -// murmurhash2 and is quite fast. Otherwise, uses -// http://www.cse.yorku.ca/~oz/hash.html. - -// Used by m_val_storages_members -typedef pair filter_value_member_t; - -struct g_hash_membuf -{ - size_t operator()(filter_value_member_t val) const - { -#ifdef __GNUC__ - return std::_Hash_impl::hash(val.first, val.second); -#else - size_t hash = 5381; - for(uint8_t *p = val.first; p-val.first < val.second; p++) - { - int c = *p; - - hash = ((hash << 5) + hash) + c; /* hash * 33 + c */ - } - return hash; -#endif - } -}; - -struct g_equal_to_membuf -{ - bool operator()(filter_value_member_t a, filter_value_member_t b) const - { - return (a.second == b.second && - memcmp(a.first, b.first, a.second) == 0); - } -}; - /////////////////////////////////////////////////////////////////////////////// // The filter check interface // NOTE: in order to add a new type of filter check, you need to add a class for @@ -189,10 +153,12 @@ class sinsp_filter_check inline uint8_t* filter_value_p(uint16_t i = 0) { return &m_val_storages[i][0]; } inline vector filter_value(uint16_t i = 0) { return m_val_storages[i]; } - unordered_set m_val_storages_members; + path_prefix_search m_val_storages_paths; + uint32_t m_val_storages_min_size; uint32_t m_val_storages_max_size; diff --git a/userspace/libsinsp/lua_parser_api.cpp b/userspace/libsinsp/lua_parser_api.cpp index 55a0e82eac..8975c53d41 100644 --- a/userspace/libsinsp/lua_parser_api.cpp +++ b/userspace/libsinsp/lua_parser_api.cpp @@ -54,6 +54,10 @@ cmpop string_to_cmpop(const char* str) 
{ return CO_IN; } + else if(strcmp(str, "pmatch") == 0) + { + return CO_PMATCH; + } else if(strcmp(str, "exists") == 0) { return CO_EXISTS; @@ -216,7 +220,7 @@ int lua_parser_cbacks::rel_expr(lua_State *ls) // "exists" is the only unary comparison op if(strcmp(cmpop, "exists")) { - if (strcmp(cmpop, "in") == 0) + if (strcmp(cmpop, "in") == 0 || strcmp(cmpop, "pmatch") == 0) { if (!lua_istable(ls, 3)) { diff --git a/userspace/libsinsp/prefix_search.cpp b/userspace/libsinsp/prefix_search.cpp new file mode 100644 index 0000000000..27149b84b2 --- /dev/null +++ b/userspace/libsinsp/prefix_search.cpp @@ -0,0 +1,197 @@ +/* +Copyright (C) 2013-2016 Draios inc. + +This file is part of sysdig. + +sysdig is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License version 2 as +published by the Free Software Foundation. + +sysdig is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with sysdig. If not, see . +*/ + +#include +#include + +#include "prefix_search.h" + +using namespace std; + +path_prefix_search::path_prefix_search() +{ +} + +path_prefix_search::~path_prefix_search() +{ + for (auto &ent : m_dirs) + { + delete(ent.second); + } +} + +// Split path /var/log/messages into dirent (var) and remainder (/log/messages) +void path_prefix_search::split_path(const filter_value_t &path, filter_value_t &dirent, filter_value_t &remainder) +{ + uint32_t length = path.second; + + if(path.second == 0) + { + // The result of splitting an empty string is 2 empty strings + return; + } + + // Skip any trailing /, not needed + if (path.first[path.second-1] == '/') + { + length--; + } + + uint32_t start = 0; + + // Also skip any leading '/', not needed. 
+ if(path.first[0] == '/') + { + start++; + } + + void *pos = memmem(path.first+start, path.second, "/", 1); + if(pos == NULL || pos >= (path.first + length)) + { + dirent.first = path.first + start; + dirent.second = length-start; + } + else + { + dirent.first = path.first + start; + dirent.second = (uint8_t *) pos-dirent.first; + + remainder.first = (uint8_t *) pos; + remainder.second = length-dirent.second-start; + } +} + +// NOTE: this does not copy, so it is only valid as long as path is valid. +void path_prefix_search::add_search_path(const char *path) +{ + filter_value_t mem((uint8_t *) path, (uint32_t) strlen(path)); + return add_search_path(mem); +} + +void path_prefix_search::add_search_path(const filter_value_t &path) +{ + filter_value_t dirent, remainder; + path_prefix_search *subtree = NULL; + path_prefix_search::split_path(path, dirent, remainder); + auto it = m_dirs.find(dirent); + + if(it == m_dirs.end()) + { + // This path component doesn't match any existing + // dirent. We need to add one and its subtree. + if(remainder.second > 0) + { + subtree = new path_prefix_search(); + subtree->add_search_path(remainder); + } + + m_dirs[dirent] = subtree; + } + else + { + // An entry for this dirent already exists. We will + // either add a new entry to the subtree, do nothing, + // or get rid of the existing subtree. + if(remainder.second == 0) + { + // This path is a prefix of the current path and we + // can drop the existing subtree. For example, we can + // drop /usr/lib when adding /usr. + delete(it->second); + m_dirs.erase(dirent); + m_dirs[dirent] = NULL; + } + else if(it->second == NULL) + { + // The existing path is shorter than the + // current path, in which case we don't have + // to do anything. For example, no need to add + // /usr/lib when /usr exists. + } + else + { + // We need to add the remainder to the + // sub-tree's search path. 
+ it->second->add_search_path(remainder); + } + } +} + +// NOTE: this does not copy, so it is only valid as long as path is valid. +bool path_prefix_search::match(const char *path) +{ + filter_value_t mem((uint8_t *) path, (uint32_t) strlen(path)); + return match(mem); +} + +bool path_prefix_search::match(const filter_value_t &path) +{ + filter_value_t dirent, remainder; + path_prefix_search::split_path(path, dirent, remainder); + auto it = m_dirs.find(dirent); + + if(it == m_dirs.end()) + { + return false; + } + else + { + // If there is nothing left in the match path, the + // subtree must be null. This ensures that /var + // matches only /var and not /var/lib + if(remainder.second == 0) + { + return (it->second == NULL); + } + else if(it->second == NULL) + { + // /foo/bar matched a prefix /foo, so we're + // done. + return true; + } + else + { + return it->second->match(remainder); + } + } +} + +std::string path_prefix_search::as_string() +{ + return as_string(string("")); +} + +// Unlike all the other methods, this does perform copies. +std::string path_prefix_search::as_string(const std::string &prefix) +{ + std::ostringstream os; + + for (auto &it : m_dirs) + { + string dirent((const char *) it.first.first, it.first.second); + os << prefix << dirent << " -> " << endl; + if(it.second) + { + std::string indent = prefix; + indent += " "; + os << it.second->as_string(indent); + } + } + + return os.str(); +} diff --git a/userspace/libsinsp/prefix_search.h b/userspace/libsinsp/prefix_search.h new file mode 100644 index 0000000000..608012688f --- /dev/null +++ b/userspace/libsinsp/prefix_search.h @@ -0,0 +1,81 @@ +/* +Copyright (C) 2013-2016 Draios inc. + +This file is part of sysdig. + +sysdig is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License version 2 as +published by the Free Software Foundation. 
+ +sysdig is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with sysdig. If not, see . +*/ + +#pragma once + +#include +#include + +#include "filter_value.h" + +// +// A data structure that allows testing a path P against a set of +// search paths S. The search succeeds if any of the search paths Si +// is a prefix of the path P. +// +// Here are some examples: +// - search(/var/run/docker, [/var/run, /etc, /lib, /usr/lib]) +// succeeds because /var/run is a prefix of /var/run/docker. +// - search(/boot, [/var/run, /etc, /lib, /usr/lib]) +// does not succeed because no path is a prefix of /boot. +// - search(/var/lib/messages, [/var/run, /etc, /lib, /usr/lib]) +// does not succeed because no path is a prefix of /var/lib/messages. +// /var is a partial match but not /var/run. +// - search(/var, [/var/run, /etc, /lib, /usr/lib]) +// does not succeed because no path is a prefix of /var +// /var is a partial match but the search path is /var/run, not /var. + +class path_prefix_search +{ +public: + path_prefix_search(); + ~path_prefix_search(); + + void add_search_path(const char *path); + void add_search_path(const filter_value_t &path); + + bool match(const char *path); + bool match(const filter_value_t &path); + + std::string as_string(); + +private: + + std::string as_string(const std::string &prefix); + + static void split_path(const filter_value_t &path, filter_value_t &dirent, filter_value_t &remainder); + + // Maps from the path component at the current level to a + // prefix search for the sub-path below the current level. 
+ // For example, if the set of search paths is (/var/run, /etc, + // /lib, /usr, /usr/lib, /var/lib), m_dirs contains: + // - (var, path_prefix_search(/lib, /run)) + // - (etc, NULL) + // - (lib, NULL) + // - (usr, NULL) + // Note that because /usr is a prefix of /usr/lib, the /usr/lib + // path is dropped and only /usr is kept. Both /var paths share a + // single var entry whose subtree holds lib and run. Also note that + // terminator paths have a NULL path_prefix_search object. + std::unordered_map m_dirs; +}; + +