diff --git a/agent/tool-scripts/meta.json b/agent/tool-scripts/meta.json
new file mode 100644
index 0000000000..b9b265b4d1
--- /dev/null
+++ b/agent/tool-scripts/meta.json
@@ -0,0 +1,48 @@
+{
+    "transient":{
+        "blktrace": null,
+        "bpftrace": null,
+        "cpuacct": null,
+        "disk": null,
+        "dm-cache": null,
+        "docker": null,
+        "docker-info": null,
+        "external-data-source": null,
+        "haproxy-ocp": null,
+        "iostat": null,
+        "jmap": null,
+        "jstack": null,
+        "kvm-spinlock": null,
+        "kvmstat": null,
+        "kvmtrace": null,
+        "lockstat": null,
+        "mpstat": null,
+        "numastat": null,
+        "oc": null,
+        "openvswitch": null,
+        "pcp": null,
+        "perf": null,
+        "pidstat": null,
+        "pprof": null,
+        "proc-interrupts": null,
+        "proc-sched_debug": null,
+        "proc-vmstat": null,
+        "prometheus-metrics": null,
+        "qemu-migrate": null,
+        "rabbit": null,
+        "sar": null,
+        "strace": null,
+        "sysfs": null,
+        "systemtap": null,
+        "tcpdump": null,
+        "turbostat": null,
+        "user-tool": null,
+        "virsh-migrate": null,
+        "vmstat": null
+    },
+
+    "persistent":{
+        "node-exporter": {"collector": "prometheus", "port": "9100"},
+        "dcgm": {"collector": "prometheus", "port": "8000"}
+    }
+}
diff --git a/agent/util-scripts/gold/pbench-register-tool/test-44.txt b/agent/util-scripts/gold/pbench-register-tool/test-44.txt
index 1e27e69e0c..4b15e6a57b 100644
--- a/agent/util-scripts/gold/pbench-register-tool/test-44.txt
+++ b/agent/util-scripts/gold/pbench-register-tool/test-44.txt
@@ -19,7 +19,6 @@ Available tools:
 	blktrace
 	bpftrace
 	cpuacct
-	dcgm
 	disk
 	dm-cache
 	docker
@@ -34,7 +33,6 @@
 	kvmtrace
 	lockstat
 	mpstat
-	node-exporter
 	numastat
 	oc
 	openvswitch
@@ -57,6 +55,8 @@
 	user-tool
 	virsh-migrate
 	vmstat
+	node-exporter
+	dcgm
 
 For a list of tool specific options, run:
 	/var/tmp/pbench-test-utils/opt/pbench-agent/tool-scripts/ --help
diff --git a/agent/util-scripts/gold/pbench-register-tool/test-46.txt b/agent/util-scripts/gold/pbench-register-tool/test-46.txt
index a4c01d0932..605883fab7 100644
--- a/agent/util-scripts/gold/pbench-register-tool/test-46.txt
+++ b/agent/util-scripts/gold/pbench-register-tool/test-46.txt
@@ -19,7 +19,6 @@ Available tools:
 	blktrace
 	bpftrace
 	cpuacct
-	dcgm
 	disk
 	dm-cache
 	docker
@@ -34,7 +33,6 @@
 	kvmtrace
 	lockstat
 	mpstat
-	node-exporter
 	numastat
 	oc
 	openvswitch
@@ -57,6 +55,8 @@
 	user-tool
 	virsh-migrate
 	vmstat
+	node-exporter
+	dcgm
 
 For a list of tool specific options, run:
 	/var/tmp/pbench-test-utils/opt/pbench-agent/tool-scripts/ --help
diff --git a/agent/util-scripts/gold/pbench-register-tool/test-47.txt b/agent/util-scripts/gold/pbench-register-tool/test-47.txt
index 312bc2db6d..f43f893c7c 100644
--- a/agent/util-scripts/gold/pbench-register-tool/test-47.txt
+++ b/agent/util-scripts/gold/pbench-register-tool/test-47.txt
@@ -19,7 +19,6 @@ Available tools:
 	blktrace
 	bpftrace
 	cpuacct
-	dcgm
 	disk
 	dm-cache
 	docker
@@ -34,7 +33,6 @@
 	kvmtrace
 	lockstat
 	mpstat
-	node-exporter
 	numastat
 	oc
 	openvswitch
@@ -57,6 +55,8 @@
 	user-tool
 	virsh-migrate
 	vmstat
+	node-exporter
+	dcgm
 
 For a list of tool specific options, run:
 	/var/tmp/pbench-test-utils/opt/pbench-agent/tool-scripts/ --help
diff --git a/agent/util-scripts/pbench-register-tool b/agent/util-scripts/pbench-register-tool
index 8b647c733a..994fd40d4a 100755
--- a/agent/util-scripts/pbench-register-tool
+++ b/agent/util-scripts/pbench-register-tool
@@ -116,9 +116,7 @@ function usage() {
 	printf -- "\tdenoted by a leading hash, or pound (\"#\"), character.\n"
 	printf -- "\nAvailable tools:\n"
 	local tool=""
-	for tool in $(find ${pbench_bin}/tool-scripts -maxdepth 1 ! -type d ! -name '*README*' ! -name base-tool ! -name unittests -printf "%P\n" 2> /dev/null | sort); do
-		printf -- "\t${tool}\n"
-	done
+	python3 -c "import sys, json; meta = json.load(open(sys.argv[1])); [print(f'\t{tool}') for tool in (*meta['transient'].keys(), *meta['persistent'].keys()) ]" ${pbench_bin}/tool-scripts/meta.json
 	#        1         2         3         4         5         6         7         8
 # (no tab) 12345678901234567890123456789012345678901234567890123456789012345678901234567890
 	printf -- "\nFor a list of tool specific options, run:\n"
diff --git a/agent/util-scripts/pbench-tool-meister-start b/agent/util-scripts/pbench-tool-meister-start
index 98b1cfac43..9c0c70d658 100755
--- a/agent/util-scripts/pbench-tool-meister-start
+++ b/agent/util-scripts/pbench-tool-meister-start
@@ -32,6 +32,8 @@ import redis
 
 from pbench.agent.tool_data_sink import main as tds_main
 from pbench.agent.tool_meister import main as tm_main
+from pbench.agent import PbenchAgentConfig
+from pbench.agent.modules import metaclass
 
 
 # Port number is "One Tool" in hex 0x17001
@@ -369,6 +371,28 @@
     ), f"bad channel: {resp!r}"
     assert resp["data"] == 1, f"bad data: {resp!r}"
 
+    # 2.5. Add tool metadata json to redis
+    try:
+        inst_dir = PbenchAgentConfig(os.environ["_PBENCH_AGENT_CONFIG"]).pbench_install_dir
+    except BadConfig as exc:
+        logger.error("%s", exc)
+        return 1
+    except Exception as exc:
+        logger.error("Unexpected error encountered loading the pbench agent configuration: '%s'", exc)
+        return 1
+
+    try:
+        tm_start_path = Path(inst_dir).resolve(strict=True)
+    except FileNotFoundError:
+        logger.error("Unable to determine proper installation directory, '%s' not found", inst_dir)
+        return 1
+    except Exception as exc:
+        logger.exception("Unexpected error encountered resolving installation directory: '%s'", exc)
+        return 1
+    tool_metadata = metaclass.ToolMetadata("json", tm_start_path, logger)
+    tool_metadata.loadIntoRedis(redis_server)
+
+
     # 3. Start the tool-data-sink process
     # - leave a PID file for the tool data sink process
     tds_param_key = "tds-{}".format(group)
diff --git a/lib/pbench/agent/modules/metaclass.py b/lib/pbench/agent/modules/metaclass.py
new file mode 100644
index 0000000000..e3909f9039
--- /dev/null
+++ b/lib/pbench/agent/modules/metaclass.py
@@ -0,0 +1,114 @@
+from pathlib import Path
+import json
+import os
+
+
+class ToolMetadataExc(Exception):
+    pass
+
+
+class ToolMetadata:
+    def __init__(self, mode, context, logger):
+        self.logger = logger
+        assert mode in (
+            "redis",
+            "json",
+        ), f"Logic bomb! Unexpected mode, {mode}, encountered constructing tool meta data"
+        assert (
+            context
+        ), "Logic bomb! No context given on ToolMetadata object construction"
+        self.mode = mode
+        if mode == "redis":
+            self.redis_server = context
+            self.json = None
+        else:
+            self.redis_server = None
+            json_path = Path(context, "tool-scripts", "meta.json")
+            try:
+                self.json = json_path.resolve(strict=True)
+            except FileNotFoundError:
+                raise ToolMetadataExc(f"missing {json_path}")
+            except Exception:
+                raise
+        self.data = self.__getInitialData()
+
+    def __getInitialData(self):
+        if self.mode == "json":
+            if not os.path.isfile(self.json):
+                self.logger.error(
+                    "There is no tool-scripts/meta.json in given install dir"
+                )
+                return None
+            with self.json.open("r") as json_file:
+                metadata = json.load(json_file)
+        elif self.mode == "redis":
+            try:
+                meta_raw = self.redis_server.get("tool-metadata")
+            except Exception:
+                self.logger.exception(
+                    "Failure to fetch tool metadata from the Redis server"
+                )
+                raise
+            else:
+                if meta_raw is None:
+                    self.logger.error("Metadata has not been loaded into redis yet")
+                    return None
+                try:
+                    metadata = json.loads(meta_raw.decode("utf-8"))
+                except Exception as exc:
+                    self.logger.error(
+                        "Bad metadata loaded into Redis server, '%s', json=%r",
+                        exc,
+                        meta_raw,
+                    )
+                    return None
+        return metadata
+
+    def __dataCheck(self):
+        if not self.data:
+            self.data = self.__getInitialData()
+        if not self.data:
+            self.logger.error(f"Unable to access data through {self.mode}")
+            return 0
+        return 1
+
+    def getFullData(self):
+        if self.__dataCheck():
+            return self.data
+        return None
+
+    def getPersistentTools(self):
+        if self.__dataCheck():
+            return list(self.data["persistent"].keys())
+        return None
+
+    def getTransientTools(self):
+        if self.__dataCheck():
+            return list(self.data["transient"].keys())
+        return None
+
+    def getProperties(self, tool):
+        if tool in self.data["persistent"].keys():
+            return self.data["persistent"][tool]
+        elif tool in self.data["transient"].keys():
+            return self.data["transient"][tool]
+
+    def loadIntoRedis(self, info):
+        if self.mode == "redis":
+            try:
+                self.json = Path(info).resolve(strict=True)
+            except FileNotFoundError:
+                raise ToolMetadataExc(f"missing {info}")
+            except Exception:
+                raise
+        elif self.mode == "json":
+            self.redis_server = info
+
+        try:
+            with self.json.open("r") as json_file:
+                metadata = json.load(json_file)
+            self.redis_server.set("tool-metadata", json.dumps(metadata))
+        except Exception:
+            self.logger.error("Failed to load the data into redis")
+            raise
+        return None
diff --git a/lib/pbench/agent/tool_data_sink.py b/lib/pbench/agent/tool_data_sink.py
index 8f875c3b3f..cc5c7ea362 100644
--- a/lib/pbench/agent/tool_data_sink.py
+++ b/lib/pbench/agent/tool_data_sink.py
@@ -16,7 +16,6 @@
 import json
 import logging
 import os
-import shutil
 import socket
 import subprocess
 import sys
@@ -34,6 +33,8 @@
 
 from bottle import Bottle, ServerAdapter, request, abort
 
+from pbench.agent.modules import metaclass
+
 
 # Read in 64 KB chunks off the wire for HTTP PUT requests.
 _BUFFER_SIZE = 65536
@@ -70,8 +71,7 @@ class DataSinkWsgiRequestHandler(WSGIRequestHandler):
     _logger = logger
 
     def log_error(self, format_str, *args):
-        """log_error - log the error message with the client address
-        """
+        """log_error - log the error message with the client address"""
         self._logger.error(
             "%s - - %s", self.address_string(), format_str % args
         )
@@ -85,8 +85,7 @@ def log_message(self, format_str, *args):
         )
 
     def log_request(self, code="-", size="-"):
-        """log_request - log the request as an informational message.
-        """
+        """log_request - log the request as an informational message."""
         if isinstance(code, HTTPStatus):
             code = code.value
         self._logger.info(
@@ -116,11 +115,16 @@ def stop(self):
 class BaseCollector:
     allowed_tools = {"noop-collector": None}
 
-    def __init__(self, benchmark_run_dir, host_tools_dict, logger):
+    def __init__(
+        self, benchmark_run_dir, tool_group, host_tools_dict, logger, tool_metadata
+    ):
         self.run = None
-        self.benchmark_run_dir = str(benchmark_run_dir)
+        self.benchmark_run_dir = benchmark_run_dir
+        self.tool_group = tool_group
         self.host_tools_dict = host_tools_dict
         self.logger = logger
+        self.tool_metadata = tool_metadata
+        self.tool_group_dir = self.benchmark_run_dir / f"tools-{self.tool_group}"
         self.abort_launch = True
 
     def launch(self):
@@ -136,11 +140,9 @@ def terminate(self):
 
 
 class PromCollector(BaseCollector):
-    allowed_tools = {"node-exporter": "9100", "dcgm": "8000"}
-
-    def __init__(self, benchmark_run_dir, host_tools_dict, logger):
-        super().__init__(benchmark_run_dir, host_tools_dict, logger)
-        self.volume = self.benchmark_run_dir + "/prom_vol"
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.volume = self.tool_group_dir / "prometheus"
 
     def launch(self):
 
@@ -167,7 +169,7 @@
                 host_ip = host
 
             for tool in self.host_tools_dict[host]:
-                port = PromCollector.allowed_tools[tool]
+                port = self.tool_metadata.getProperties(tool)["port"]
                 config.write(
                     "    - job_name: '{}_{}'\n      static_configs:\n      - targets: ['{}:{}']\n\n".format(
                         host_ip, tool, host_ip, port
@@ -184,9 +186,7 @@
             return 0
 
         args = ["podman", "pull", "prom/prometheus"]
-        prom_pull = subprocess.Popen(
-            args, stdout=prom_logs, stderr=prom_logs
-        )  # , stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
+        prom_pull = subprocess.Popen(args, stdout=prom_logs, stderr=prom_logs)
         prom_pull.wait()
 
         os.mkdir(self.volume)
@@ -205,9 +205,7 @@
             f"{self.benchmark_run_dir}/tm/prometheus.yml:/etc/prometheus/prometheus.yml:Z",
             "prom/prometheus",
         ]
-        self.run = subprocess.Popen(
-            args, stdout=prom_logs, stderr=prom_logs
-        )  # , stdout =subprocess.DEVNULL, stderr=subprocess.STDOUT)
+        self.run = subprocess.Popen(args, stdout=prom_logs, stderr=prom_logs)
 
         prom_logs.close()
 
@@ -219,21 +217,20 @@ def terminate(self):
 
         self.logger.debug("PROM TERMINATED")
 
-        os.mkdir(str(self.benchmark_run_dir) + "/prom_data")
-
         args = [
             "tar",
+            "--remove-files",
+            "--exclude",
+            "prometheus/prometheus_data.tar.gz",
             "-zcvf",
-            f"{self.benchmark_run_dir}/prom_data/prometheus_data.tar.gz",
+            f"{self.volume}/prometheus_data.tar.gz",
             "-C",
-            f"{self.benchmark_run_dir}/",
-            "prom_vol",
+            f"{self.tool_group_dir}/",
+            "prometheus",
         ]
         data_store = subprocess.Popen(args)
         data_store.wait()
 
-        shutil.rmtree(self.volume)
-
         return 1
 
 
@@ -258,6 +255,7 @@ def __init__(self, redis_server, channel, benchmark_run_dir, tool_group, logger)
         self.state = None
         self.tool_data_ctx = None
         self.directory = None
+        self.tool_metadata = metaclass.ToolMetadata("redis", redis_server, logger)
         self._data = None
         self._prom_server = None
         self._tm_tracking = None
@@ -296,8 +294,7 @@ def __init__(self, redis_server, channel, benchmark_run_dir, tool_group, logger)
         self.web_server_thread = None
 
     def run(self):
-        """run - Start the Bottle web server running and the watcher thread.
-        """
+        """run - Start the Bottle web server running and the watcher thread."""
        self.logger.info("Running Bottle web server ...")
         try:
             super().run(server=self._server)
@@ -307,8 +304,7 @@
         self.logger.info("Bottle web server exited")
 
     def execute(self):
-        """execute - Start the Bottle web server running and the watcher thread.
-        """
+        """execute - Start the Bottle web server running and the watcher thread."""
         self.web_server_thread = Thread(target=self.run)
         self.web_server_thread.start()
         self.logger.debug("web server 'run' thread started, processing payloads ...")
@@ -419,12 +415,16 @@ def _fetch_tms(self):
             persistent_tools = []
             transient_tools = []
             for tool_name in tools.keys():
-                if tool_name in PromCollector.allowed_tools:
+                if tool_name in self.tool_metadata.getPersistentTools():
                     persistent_tools.append(tool_name)
                 elif tool_name in BaseCollector.allowed_tools:
                     noop_tools.append(tool_name)
-                else:
+                elif tool_name in self.tool_metadata.getTransientTools():
                     transient_tools.append(tool_name)
+                else:
+                    self.logger.error(
+                        f"Registered tool {tool_name} is not recognized in tool metadata"
+                    )
             tm["noop_tools"] = noop_tools
             tm["persistent_tools"] = persistent_tools
             tm["transient_tools"] = transient_tools
@@ -560,14 +560,23 @@ def state_change(self, data):
         if self.state == "init":
             prom_tool_dict = {}
             for tm in self._tm_tracking:
-                prom_tools = self._tm_tracking[tm]["persistent_tools"]
+                prom_tools = []
+                persist_tools = self._tm_tracking[tm]["persistent_tools"]
+                for tool in persist_tools:
+                    tool_data = self.tool_metadata.getProperties(tool)
+                    if tool_data["collector"] == "prometheus":
+                        prom_tools.append(tool)
                 if len(prom_tools) > 0:
                     prom_tool_dict[self._tm_tracking[tm]["hostname"]] = prom_tools
             self.logger.debug(prom_tool_dict)
 
             if prom_tool_dict:
                 self._prom_server = PromCollector(
-                    self.benchmark_run_dir, prom_tool_dict, self.logger
+                    self.benchmark_run_dir,
+                    self.tool_group,
+                    prom_tool_dict,
+                    self.logger,
+                    self.tool_metadata,
                 )
                 self._prom_server.launch()
         elif self.state == "end":
diff --git a/lib/pbench/agent/tool_meister.py b/lib/pbench/agent/tool_meister.py
index 7928d2f9d3..80eea0a7e1 100644
--- a/lib/pbench/agent/tool_meister.py
+++ b/lib/pbench/agent/tool_meister.py
@@ -59,6 +59,7 @@
 import redis
 
 from pbench.server.utils import md5sum
+from pbench.agent.modules import metaclass
 
 
 # Path to external tar executable.
@@ -363,7 +364,8 @@ def fetch_params(params):
 
     def __init__(self, pbench_bin, params, redis_server, logger):
         self.logger = logger
-        self.persist_tools = ["node-exporter", "dcgm"]
+        self.tool_metadata = metaclass.ToolMetadata("redis", redis_server, self.logger)
+        self.persist_tools = self.tool_metadata.getPersistentTools()
         self.pbench_bin = pbench_bin
         ret_val = self.fetch_params(params)
         (