Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix BinaryEdit::getResolvedLibraryPath for Ubuntu 22.04 #1362

Merged
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
223 changes: 139 additions & 84 deletions dyninstAPI/src/linux.C
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,12 @@
#include "linux.h"
#include <dlfcn.h>

#include <exception>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#include "boost/shared_ptr.hpp"

#include "pcEventMuxer.h"
Expand Down Expand Up @@ -174,118 +180,167 @@ bool PCEventMuxer::useCallback(Dyninst::ProcControlAPI::EventType et)
return false;
}

bool BinaryEdit::getResolvedLibraryPath(const string &filename, std::vector<string> &paths) {
char *libPathStr, *libPath;
std::vector<string> libPaths;
struct stat dummy;
char buffer[512];
char *pos, *key, *val;
namespace
{
template <typename ContainerT = std::vector<std::string>,
typename PredicateT = std::string (*)(const std::string&)>
inline ContainerT
delimit(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For as much as I hate in/out parameters, I'd rather replace this with

std::vector<std::string> vals;
boost::split(vals, line, boost::token_compress_on);

just to have one less hand-rolled solution in this codebase.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes perfect sense. I've actually had this fix for a while but hadn't submitted it bc I figured that would be the case and hadn't looked around for any existing solution.

const std::string& line, const std::string& delimiters = ":",
PredicateT&& predicate = [](const std::string& s) -> std::string { return s; })
{
size_t _beginp = 0; // position that is the beginning of the new string
size_t _delimp = 0; // position of the delimiter in the string
ContainerT _result = {};
while(_beginp < line.length() && _delimp < line.length())
{
// find the first character (starting at _delimp) that is not a delimiter
_beginp = line.find_first_not_of(delimiters, _delimp);
// if no a character after or at _end that is not a delimiter is not found
// then we are done
if(_beginp == std::string::npos)
break;
// starting at the position of the new string, find the next delimiter
_delimp = line.find_first_of(delimiters, _beginp);
std::string _tmp{};
try
{
// starting at the position of the new string, get the characters
// between this position and the next delimiter
if(_beginp < line.length())
_tmp = line.substr(_beginp, _delimp - _beginp);
} catch(std::exception& e)
{
// print the exception but don't fail, unless maybe it should?
fprintf(stderr, "[%s:%i] %s (delimiters: %s) :: %s\n", __FILE__, __LINE__,
line.c_str(), delimiters.c_str(), e.what());
}
// don't add empty strings
if(!_tmp.empty())
{
_result.emplace_back(std::forward<PredicateT>(predicate)(_tmp));
}
}
return _result;
}
} // namespace

bool
BinaryEdit::getResolvedLibraryPath(const string& filename, std::vector<string>& paths)
{
auto _path_exists = [](const std::string& _filename) {
struct stat dummy;
return (_filename.empty()) ? false : (stat(_filename.c_str(), &dummy) == 0);
};

auto _emplace_if_exists = [&paths, filename,
_path_exists](const std::string& _directory) {
auto _filename = _directory + "/" + filename;
if(_path_exists(_filename))
paths.emplace_back(std::move(_filename));
};

// prefer qualified file paths
if (stat(filename.c_str(), &dummy) == 0) {
paths.push_back(filename);
}
if(_path_exists(filename))
paths.emplace_back(filename);

// For cross-rewriting
char *dyn_path = getenv("DYNINST_REWRITER_PATHS");
if (dyn_path) {
libPathStr = strdup(dyn_path);
libPath = strtok(libPathStr, ":");
while (libPath != NULL) {
libPaths.push_back(string(libPath));
libPath = strtok(NULL, ":");
}
free(libPathStr);
char* dyn_path = getenv("DYNINST_REWRITER_PATHS");
if(dyn_path)
{
for(const auto& itr : delimit(dyn_path, ":"))
_emplace_if_exists(itr);
}

// search paths from environment variables
char *ld_path = getenv("LD_LIBRARY_PATH");
if (ld_path) {
libPathStr = strdup(ld_path);
libPath = strtok(libPathStr, ":");
while (libPath != NULL) {
libPaths.push_back(string(libPath));
libPath = strtok(NULL, ":");
}
free(libPathStr);
char* ld_path = getenv("LD_LIBRARY_PATH");
if(ld_path)
{
for(const auto& itr : delimit(ld_path, ":"))
_emplace_if_exists(itr);
}

#ifdef DYNINST_COMPILER_SEARCH_DIRS
// search compiler-specific library paths
// NB1: DYNINST_COMPILER_SEARCH_DIRS is defined at build time
// NB2: This is explicitly done _after_ adding directories from
// LD_LIBRARY_PATH so that the user can override these paths.
{
#define xstr(s) str(s)
#define str(s) #s

libPathStr = strdup(xstr(DYNINST_COMPILER_SEARCH_DIRS));
libPath = strtok(libPathStr, ":");
while (libPath != NULL) {
libPaths.emplace_back(libPath);
libPath = strtok(NULL, ":");
}
free(libPathStr);

#undef str
#undef xstr
}
#endif
//libPaths.push_back(string(getenv("PWD")));
for (unsigned int i = 0; i < libPaths.size(); i++) {
string str = libPaths[i] + "/" + filename;
if (stat(str.c_str(), &dummy) == 0) {
paths.push_back(str);
}
{
# define xstr(s) str(s)
# define str(s) # s

for(const auto& itr : delimit(xstr(DYNINST_COMPILER_SEARCH_DIRS), ":"))
_emplace_if_exists(itr);

# undef str
# undef xstr
}
#endif

// search ld.so.cache
// apparently ubuntu doesn't like pclosing NULL, so a shared pointer custom
// destructor is out. Ugh.
FILE* ldconfig = popen("/sbin/ldconfig -p", "r");
if (ldconfig) {
if(!fgets(buffer, 512, ldconfig)) { // ignore first line
return false;
}
while (fgets(buffer, 512, ldconfig) != NULL) {
pos = buffer;
while (*pos == ' ' || *pos == '\t') pos++;
key = pos;
while (*pos != ' ') pos++;
*pos = '\0';
while (*pos != '=' && *(pos + 1) != '>') pos++;
pos += 2;
while (*pos == ' ' || *pos == '\t') pos++;
val = pos;
while (*pos != '\n' && *pos != '\0') pos++;
*pos = '\0';
if (strcmp(key, filename.c_str()) == 0) {
paths.push_back(val);
if(ldconfig)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kupsch @bigtrak

This is just terrible, and I'm honestly surprised it took this long to get a nasty bug here. I vote we replace this and the manual directory list lookup below by just using

if(void *h = dlopen(filename.c_str(), RTLD_LAZY)) {
  char origin[PATH_MAX];
  if(dlinfo(h, RTLD_DI_ORIGIN, name) == 0) {
    _emplace_if_exists(origin);  // check is useless, but uniform
  }
}

The only caveat here is that

If the calling object (i.e., the shared library or executable from which dlopen() is called) contains a DT_RPATH tag, and does not contain a DT_RUNPATH tag, then the directories listed in the DT_RPATH tag are searched.

If we want to work around this, we could just continue to do the manual search of LD_LIBRARY_PATH first to let users override.

Begrudgingly, the DYNINST_COMPILER_SEARCH_DIRS has to stay because ldconfig doesn't always have the compiler-specific search directories available. At least on my system, dlopen wouldn't find libgcc_s.so which lives in /usr/lib/gcc/x86_64-linux-gnu/12/libgcc_s.so.

With this, we'd have the heuristic:

  1. Use the provided filename if file exists (via stat rules)
  2. Search DYNINST_REWRITER_PATHS
  3. Search LD_LIBRARY_PATH
  4. Use ld lookup rules via dlopen.

man dlopen for reference

o (ELF only) If the calling object (i.e., the shared library or executable from which dlopen() is called) contains a DT_RPATH tag, and does not contain a DT_RUNPATH tag, then the directories listed in the DT_RPATH tag are searched.

o If, at the time that the program was started, the environment variable LD_LIBRARY_PATH was defined to contain a colon-separated list of directories, then these are searched. (As a security measure, this variable is ignored for set-user-ID and set-group-ID programs.)

o (ELF only) If the calling object contains a DT_RUNPATH tag, then the directories listed in that tag are searched.

o The cache file /etc/ld.so.cache (maintained by ldconfig(8)) is checked to see whether it contains an entry for filename.

o The directories /lib and /usr/lib are searched (in that order).

Copy link
Contributor Author

@jrmadsen jrmadsen Feb 21, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You want me to proceed with this? I'll probably first use with the RTLD_NOLOAD to make sure it's not already open bc I'm concerned whether there is a guarantee that loading the library will not have any unintended side effects outside of just finding the library, e.g., cause Dyninst to start parsing that library.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We had an internal discussion about this yesterday. The confounding factors are the RTLD_NOLOAD won't let you open a file unless it's already open. This means the only way to inspect RTLD_DI_ORIGIN is to actually load the library into Dyninst's address space. Aside from the security implications, it's just an inefficient way of doing it. We're pretty much stuck to parsing the output of ldconfig.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about NOLOAD on the library containing this function, grabbing that DI_ORIGIN and searching that? In many cases, that could give us the path we want. We could also NOLOAD dlopen the executable, get the link path, and search the dirs of the libs it is linked to

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

@jrmadsen jrmadsen Feb 21, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I fully understand what NOLOAD does. My point is that since this function is being called, unless the function is statically linked, the library is loaded so it will return a handle. And the same goes for passing nullptr to dlopen (which returns the handle to the exe). As long as those are dynamic objects, you can get the handle via NOLOAD

Copy link
Contributor Author

@jrmadsen jrmadsen Feb 21, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this helps to clarify:

auto* _handle = dlopen("libdyninstAPI.so", RTLD_LAZY|RTLD_NOLOAD);
// ... get origin of handle ...
auto _lib_directory = dirname(...);

In other words, the RT library you want is extremely likely to be installed in the same directory as the dyninstAPI library

Copy link
Contributor

@hainest hainest Feb 21, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function is supposed to resolve any library name, so we have to at least consider everything in ld.so.cache which is likely a larger set than the union of DYNINST_REWRITER_PATHS, LD_LIBRARY_PATH, DYNINST_COMPILER_SEARCH_DIRS, and $ORIGIN.

@kupsch @bigtrak
This code is really just parsing the basedirname of the files reported from the cache. Why don't we just read the cache directly and report the basedirname or, if we want to implement a parser, from /etc/ld.so.conf?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function is supposed to resolve any library name

Ok well, then how about adding a vector hints parameter which gets searched after LD_LIBRARY_PATH for situations like finding the RT library. If you are concerned about the ABI, we can just implement one with the same signature that calls the new one with zero hints.

Copy link
Contributor

@hainest hainest Feb 21, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok well, then how about adding a vector hints parameter which gets searched after LD_LIBRARY_PATH for situations like finding the RT library.

What kinds of hints were you thinking? I think adding the "self" $ORIGIN would be good to do regardless. Refactoring this function to check for the file existence before proceeding on to the next lookup location would also be good. This would put off parsing ldconfig unless it's absolutely necessary and would make the $ORIGIN check an actual short circuit. Nevermind. I keep forgetting this function doesn't do what's printed on the tin. It returns all paths that contain the given file name. Shortcircuiting would break that behavior.

P.S. dlopen(nullptr) is guaranteed to give you the current binary whereas dlopen("libdyninstAPI.so") might not.

{
constexpr size_t buffer_size = 512;
char buffer[buffer_size];
// ignore first line
if(fgets(buffer, buffer_size, ldconfig))
{
// each line constaining relevant info should be in form:
// <LIBRARY_BASENAME> (...) => <RESOLVED_ABSOLUTE_PATH>
// example:
// libz.so (libc6,x86-64) => /lib/x86_64-linux-gnu/libz.so
auto _get_entry = [](const std::string& _inp) {
auto _paren_pos = _inp.find('(');
auto _arrow_pos = _inp.find("=>", _paren_pos);
if(_arrow_pos == std::string::npos || _paren_pos == std::string::npos)
return std::string{};
if(_arrow_pos + 2 < _inp.length())
{
auto _pos = _inp.find_first_not_of(" \t", _arrow_pos + 2);
if(_pos < _inp.length())
return _inp.substr(_pos);
}
return std::string{};
};

auto _data = std::stringstream{};
while(fgets(buffer, buffer_size, ldconfig) != nullptr)
{
_data << buffer;
auto _len = strnlen(buffer, buffer_size);
if(_len > 0 && buffer[_len - 1] == '\n')
{
auto _v = _data.str();
if(!_v.empty())
{
_v = _v.substr(_v.find_first_not_of(" \t"));
if(_v.length() > 1)
{
auto _entry = _get_entry(_v.substr(0, _v.length() - 1));
if(!_entry.empty())
_emplace_if_exists(_entry);
}
}
_data = std::stringstream{};
}
}
}
pclose(ldconfig);
}

// search hard-coded system paths
libPaths.clear();
libPaths.push_back("/usr/local/lib");
libPaths.push_back("/usr/share/lib");
libPaths.push_back("/usr/lib");
libPaths.push_back("/usr/lib64");
libPaths.push_back("/usr/lib/x86_64-linux-gnu");
libPaths.push_back("/lib");
libPaths.push_back("/lib64");
libPaths.push_back("/lib/x86_64-linux-gnu");
libPaths.push_back("/usr/lib/i386-linux-gnu");
libPaths.push_back("/usr/lib32");
for (unsigned int i = 0; i < libPaths.size(); i++) {
string str = libPaths[i] + "/" + filename;
if (stat(str.c_str(), &dummy) == 0) {
paths.push_back(str);
}
for(const char* itr :
{ "/usr/local/lib", "/usr/share/lib", "/usr/lib", "/usr/lib64",
"/usr/lib/x86_64-linux-gnu", "/lib", "/lib64", "/lib/x86_64-linux-gnu",
"/usr/lib/i386-linux-gnu", "/usr/lib32" })
{
_emplace_if_exists(itr);
}

return ( 0 < paths.size() );
return (!paths.empty());
}

bool BinaryEdit::archSpecificMultithreadCapable() {
Expand Down