From f299d74c0baafd83212d1fe5cf5c88918f434c71 Mon Sep 17 00:00:00 2001
From: Stefan Kaestle
Date: Thu, 26 Oct 2023 11:56:32 +0000
Subject: [PATCH] Wrapper around Prometheus and fix Farm deployment scripts

---
 scalability/common/farm.py                 |  4 +---
 scalability/common/prometheus.py           | 25 ++++++++++++++++---------
 scalability/common/workload_experiment.py  |  2 +-
 scalability/helpers/build-and-run.py       |  6 +++---
 4 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/scalability/common/farm.py b/scalability/common/farm.py
index 9a35edee35d..6f8ce5620e1 100644
--- a/scalability/common/farm.py
+++ b/scalability/common/farm.py
@@ -8,7 +8,6 @@
 import sys
 import time
 import uuid
-from pathlib import Path
 from typing import List
 
 import gflags
@@ -381,8 +380,7 @@ def prepare_and_register_config_image(self):
         from common import ictools
 
         # Generate config image
-        p = Path(__file__).parents[2]
-        path = os.path.join(p, self.artifacts_path, "ic-prep")
+        path = os.path.join(self.artifacts_path, "ic-prep")
         FLAGS.ic_prep_bin = path
         self.ic_config = ictools.ic_prep(
             subnets=self.ic_node_ipv6s,
diff --git a/scalability/common/prometheus.py b/scalability/common/prometheus.py
index 592a9de191d..4c46b2acdc2 100644
--- a/scalability/common/prometheus.py
+++ b/scalability/common/prometheus.py
@@ -18,6 +18,13 @@
     "prometheus_url", "https://ic-metrics-prometheus-staging.ch1-obsstage1.dfinity.network", "The URL to the prometheus service."
 )
 
+def __json_loads_wrapper(str):
+    try:
+        return json.loads(str)
+    except json.JSONDecodeError:
+        print(colored("Failed to parse JSON received from Prometheus: " + str, "red"))
+        raise
+
 
 class Prometheus(metrics.Metric):
     """Abstraction for collecting prometheus metrics."""
@@ -129,7 +136,7 @@ def get_http_request_rate_for_timestamp(testnet, load_hosts, timestamp):
     )
 
     payload = {"time": timestamp, "query": query}
-    return json.loads(get_prometheus(payload).text)
+    return __json_loads_wrapper(get_prometheus(payload).text)
 
 
 def get_http_request_rate(testnet, load_hosts, t_start, t_end, request_type="query"):
@@ -144,7 +151,7 @@ def get_http_request_rate(testnet, load_hosts, t_start, t_end, request_type="que
     payload = {"start": t_start, "end": t_end, "step": "10s", "query": query}
 
     r = get_prometheus_range(payload)
-    j = json.loads(r.text)
+    j = __json_loads_wrapper(r.text)
     return j
 
 
@@ -163,7 +170,7 @@ def get_execution_query_latency(testnet, load_hosts, t_start, t_end):
     print("Prometheus: {}".format(json.dumps(payload, indent=2)))
 
     r = get_prometheus_range(payload)
-    j = json.loads(r.text)
+    j = __json_loads_wrapper(r.text)
     return j
 
 
@@ -175,7 +182,7 @@ def get_canister_install_rate(testnet, hosts, timestamp):
     q = f'rate(execution_subnet_message_duration_seconds_count{{{common},method_name="ic00_install_code"}}[60s])'
 
     payload = {"time": timestamp, "query": q}
-    return json.loads(get_prometheus(payload).text)
+    return __json_loads_wrapper(get_prometheus(payload).text)
 
 
 def get_num_canisters_installed(testnet, hosts, timestamp):
@@ -185,7 +192,7 @@ def get_num_canisters_installed(testnet, hosts, timestamp):
     q = f'replicated_state_registered_canisters{{{common},status="running"}}'
 
     payload = {"time": timestamp, "query": q}
-    return json.loads(get_prometheus(payload).text)
+    return __json_loads_wrapper(get_prometheus(payload).text)
 
 
 def get_xnet_stream_size(testnet, t_start, t_end):
@@ -194,7 +201,7 @@ def get_xnet_stream_size(testnet, t_start, t_end):
     q = f"mr_stream_messages{{{common}}}"
     payload = {"start": t_start, "end": t_end, "step": "10s", "query": q}
     r = get_prometheus_range(payload)
-    return json.loads(r.text)
+    return __json_loads_wrapper(r.text)
 
 
 def get_http_request_duration(testnet, hosts: List[str], t_start, t_end, request_type="query", step=60):
@@ -215,7 +222,7 @@ def get_http_request_duration(testnet, hosts: List[str], t_start, t_end, request
     }
 
     r = get_prometheus_range(payload)
-    data = json.loads(r.text)
+    data = __json_loads_wrapper(r.text)
     print(data)
 
     r = parse(data)
@@ -242,7 +249,7 @@ def get_finalization_rate(testnet, hosts, t_start, t_end):
     }
     r = get_prometheus(payload)
     print(f"Prometheus response is: {r.text}")
-    return json.loads(r.text)
+    return __json_loads_wrapper(r.text)
 
 
 def get_state_sync_duration(testnet, load_hosts, timestamp):
@@ -257,7 +264,7 @@ def get_state_sync_duration(testnet, load_hosts, timestamp):
 
     payload = {"time": timestamp, "query": query}
     r = get_prometheus(payload)
-    j = json.loads(r.text)
+    j = __json_loads_wrapper(r.text)
     return j
 
 
diff --git a/scalability/common/workload_experiment.py b/scalability/common/workload_experiment.py
index 1caf7f7a5fb..9cc2b19eb11 100644
--- a/scalability/common/workload_experiment.py
+++ b/scalability/common/workload_experiment.py
@@ -296,7 +296,7 @@ def __wait_for_quiet(self, max_num_iterations: int = 60, sleep_per_iteration_s:
                     if rate_rps <= self.quiet_rate_rps:
                         recovered = True
 
-            except StatisticsError:
+            except (StatisticsError, json.JSONDecodeError):
                 logging.error(f"Failed to parse prometheus response {r} - {logging.traceback.format_exc()}")
 
             time.sleep(sleep_per_iteration_s)
diff --git a/scalability/helpers/build-and-run.py b/scalability/helpers/build-and-run.py
index df29996223e..e615208cfb8 100644
--- a/scalability/helpers/build-and-run.py
+++ b/scalability/helpers/build-and-run.py
@@ -29,9 +29,9 @@ def build_icos():
     ic_root = get_ic_root()
     if FLAGS.clean:
         print(colored("Doing clean build", "green"))
-        subprocess.check_output(shlex.split("gitlab-ci/container/container-run.sh bazel clean"), cwd=ic_root)
+        subprocess.check_output(shlex.split("bazel clean"), cwd=ic_root)
         subprocess.check_output(
-            shlex.split("gitlab-ci/container/container-run.sh rm -rf $(bazel info repository_cache)"), cwd=ic_root
+            shlex.split("rm -rf ./$(bazel info repository_cache)"), cwd=ic_root
         )
     else:
         print(
@@ -48,7 +48,7 @@ def build_icos():
 
     subprocess.check_output(
         shlex.split(
-            "./gitlab-ci/container/container-run.sh bazel run --config=systest //ic-os/guestos/envs/dev:upload_disk-img"
+            "bazel run --config=systest //ic-os/guestos/envs/dev:upload_disk-img"
         ),
         cwd=ic_root,
    )
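Note (not part of the patch): a minimal, self-contained sketch of the error-handling pattern the prometheus.py change introduces. The json_loads_wrapper name and the fake_response payload below are illustrative stand-ins for the patch's __json_loads_wrapper and a real Prometheus reply; only the standard-library json module is assumed.

    import json


    def json_loads_wrapper(text):
        # Parse `text` as JSON; on failure, log the raw payload so the offending
        # Prometheus response is visible in the experiment output, then re-raise.
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            print("Failed to parse JSON received from Prometheus: " + text)
            raise


    # Illustrative malformed reply, e.g. an HTML error page instead of JSON.
    fake_response = "<html>502 Bad Gateway</html>"

    try:
        result = json_loads_wrapper(fake_response)
    except json.JSONDecodeError:
        # Mirrors the workload_experiment.py change: treat a bad reply as a
        # recoverable parse failure rather than letting it abort the run.
        result = None

    print(result)  # -> None

Callers that expect flaky responses, such as __wait_for_quiet() in workload_experiment.py, now catch json.JSONDecodeError alongside StatisticsError, so a malformed Prometheus reply is logged and retried instead of crashing the experiment.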