From 4154a5d9b7137804140204cd30a997e945ff18f3 Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Tue, 14 Apr 2026 11:18:08 -0600 Subject: [PATCH 1/3] enable metrics in the harness worker, and bugfix for metrics expected labels --- Gemfile | 2 ++ Gemfile.lock | 8 ++++++ harness/main.rb | 11 +++++++- harness/manifests/configmap-gcp.yaml | 4 +-- harness/manifests/deployment.yaml | 5 ++++ .../worker/telemetry/prometheus_backend.rb | 26 ++++++++++++------- 6 files changed, 44 insertions(+), 12 deletions(-) diff --git a/Gemfile b/Gemfile index 2875d15..32c9354 100644 --- a/Gemfile +++ b/Gemfile @@ -4,4 +4,6 @@ source 'https://rubygems.org' gemspec +gem 'prometheus-client', '~> 4.0' gem 'rake', '~> 13.0' +gem 'webrick', '~> 1.8' diff --git a/Gemfile.lock b/Gemfile.lock index fbdd72f..65cf156 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -43,6 +43,8 @@ GEM parser (3.3.10.1) ast (~> 2.4.1) racc + prometheus-client (4.2.5) + base64 pry (0.16.0) coderay (~> 1.1) method_source (~> 1.0) @@ -93,12 +95,14 @@ GEM addressable (>= 2.8.0) crack (>= 0.3.2) hashdiff (>= 0.4.0, < 2.0.0) + webrick (1.9.2) PLATFORMS ruby DEPENDENCIES conductor_ruby! + prometheus-client (~> 4.0) pry (~> 0.14) rake (~> 13.0) rspec (~> 3.0) @@ -106,3 +110,7 @@ DEPENDENCIES rubocop-rspec (~> 2.0) vcr (~> 6.0) webmock (~> 3.0) + webrick (~> 1.8) + +BUNDLED WITH + 2.5.22 diff --git a/harness/main.rb b/harness/main.rb index a22f7fa..5227d52 100644 --- a/harness/main.rb +++ b/harness/main.rb @@ -3,6 +3,7 @@ # Load the SDK from source (relative to repo root) $LOAD_PATH.unshift(File.expand_path('../lib', __dir__)) require 'conductor' +require 'conductor/worker/telemetry/prometheus_backend' require_relative 'simulated_task_worker' require_relative 'workflow_governor' @@ -31,9 +32,16 @@ def self.main batch_size = env_int('HARNESS_BATCH_SIZE', 20) poll_interval_ms = env_int('HARNESS_POLL_INTERVAL_MS', 100) + metrics_port = env_int('HARNESS_METRICS_PORT', 9991) + configuration = Conductor::Configuration.new register_metadata(configuration) + metrics_collector = Conductor::Worker::Telemetry::MetricsCollector.new(backend: :prometheus) + metrics_server = Conductor::Worker::Telemetry::MetricsServer.new(port: metrics_port) + metrics_server.start + puts "Prometheus metrics server started on port #{metrics_port}" + workers = SIMULATED_WORKERS.map do |def_entry| sim = SimulatedTaskWorker.new( def_entry[:task_name], @@ -55,7 +63,8 @@ def self.main task_handler = Conductor::Worker::TaskHandler.new( workers: workers, configuration: configuration, - scan_for_annotated_workers: false + scan_for_annotated_workers: false, + event_listeners: [metrics_collector] ) task_handler.start diff --git a/harness/manifests/configmap-gcp.yaml b/harness/manifests/configmap-gcp.yaml index 67ea706..61652d6 100644 --- a/harness/manifests/configmap-gcp.yaml +++ b/harness/manifests/configmap-gcp.yaml @@ -9,5 +9,5 @@ metadata: labels: app: ruby-sdk-harness-worker data: - CONDUCTOR_SERVER_URL: "https://certification-gcp.orkesconductor.com/api" - CONDUCTOR_AUTH_KEY: "e6c1ac61-286b-11f1-be01-c682b5750c3a" + CONDUCTOR_SERVER_URL: "https://certification-gcp.orkesconductor.io/api" + CONDUCTOR_AUTH_KEY: "25b681c1-34ec-11f1-b07a-9601c7a63373" diff --git a/harness/manifests/deployment.yaml b/harness/manifests/deployment.yaml index e55654d..b63f23f 100644 --- a/harness/manifests/deployment.yaml +++ b/harness/manifests/deployment.yaml @@ -53,6 +53,11 @@ spec: - name: HARNESS_POLL_INTERVAL_MS value: "100" + ports: + - name: metrics + containerPort: 9991 + protocol: TCP + resources: requests: memory: "256Mi" diff --git a/lib/conductor/worker/telemetry/prometheus_backend.rb b/lib/conductor/worker/telemetry/prometheus_backend.rb index 67a22a0..31cdbd7 100644 --- a/lib/conductor/worker/telemetry/prometheus_backend.rb +++ b/lib/conductor/worker/telemetry/prometheus_backend.rb @@ -73,18 +73,24 @@ def load_prometheus_client "Add `gem 'prometheus-client'` to your Gemfile." end + # Each counter declares only the labels it actually receives + COUNTER_LABELS = { + 'task_poll_total' => %i[task_type], + 'task_poll_error_total' => %i[task_type error], + 'task_execute_error_total' => %i[task_type exception retryable], + 'task_update_failed_total' => %i[task_type] + }.freeze + # Setup predefined metrics def setup_metrics - # Counters @counters = {} @histograms = {} @gauges = {} - # Pre-register common metrics - register_counter('task_poll_total', 'Total number of task polls') - register_counter('task_poll_error_total', 'Total number of poll errors') - register_counter('task_execute_error_total', 'Total number of execution errors') - register_counter('task_update_failed_total', 'Total number of failed task updates (CRITICAL)') + register_counter('task_poll_total', 'Total number of task polls', COUNTER_LABELS['task_poll_total']) + register_counter('task_poll_error_total', 'Total number of poll errors', COUNTER_LABELS['task_poll_error_total']) + register_counter('task_execute_error_total', 'Total number of execution errors', COUNTER_LABELS['task_execute_error_total']) + register_counter('task_update_failed_total', 'Total number of failed task updates (CRITICAL)', COUNTER_LABELS['task_update_failed_total']) register_histogram('task_poll_time_seconds', 'Task poll duration in seconds', TIME_BUCKETS) register_histogram('task_execute_time_seconds', 'Task execution duration in seconds', TIME_BUCKETS) @@ -94,14 +100,15 @@ def setup_metrics # Register a counter metric # @param name [String] Metric name # @param docstring [String] Metric description - def register_counter(name, docstring) + # @param labels [Array] Label keys for this counter + def register_counter(name, docstring, labels = %i[task_type]) metric_name = name.to_sym return if @registry.exist?(metric_name) counter = Prometheus::Client::Counter.new( metric_name, docstring: docstring, - labels: %i[task_type error exception retryable] + labels: labels ) @registry.register(counter) @counters[name] = counter @@ -150,10 +157,11 @@ def get_or_create_counter(name) if @registry.exist?(metric_name) @registry.get(metric_name) else + labels = COUNTER_LABELS.fetch(name, %i[task_type]) counter = Prometheus::Client::Counter.new( metric_name, docstring: "Counter for #{name}", - labels: %i[task_type error exception retryable] + labels: labels ) @registry.register(counter) counter From 16705a204e0fd23873f9f4cbb27c5a93ea294c39 Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Fri, 17 Apr 2026 09:10:50 -0600 Subject: [PATCH 2/3] adjust harness image name --- .github/workflows/harness-image.yml | 19 +++++++++++++++++-- harness/manifests/deployment.yaml | 2 +- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/.github/workflows/harness-image.yml b/.github/workflows/harness-image.yml index b042eeb..500bdf7 100644 --- a/.github/workflows/harness-image.yml +++ b/.github/workflows/harness-image.yml @@ -14,6 +14,11 @@ on: release: types: [published] workflow_dispatch: + inputs: + deploy: + description: "Dispatch downstream deploy after the image is built" + type: boolean + default: true concurrency: group: ${{ github.workflow }}-${{ github.ref }} @@ -42,13 +47,21 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} + - name: Calculate branch tag + id: vars + shell: bash + run: | + BRANCH="${{ github.ref_name }}" + CLEANED_BRANCH_NAME=$(echo "$BRANCH" | tr '/' '-' | tr '[:upper:]' '[:lower:]') + echo "cleaned-branch-name=$CLEANED_BRANCH_NAME" >> "$GITHUB_OUTPUT" + - name: Docker metadata id: meta uses: docker/metadata-action@v5 with: images: ghcr.io/conductor-oss/ruby-sdk/harness-worker tags: | - type=raw,value=latest + type=raw,value=${{ steps.vars.outputs.cleaned-branch-name }}-latest,enable=${{ github.event_name != 'release' }} type=raw,value=${{ github.event.release.tag_name }},enable=${{ github.event_name == 'release' }} - name: Build and push @@ -62,7 +75,9 @@ jobs: tags: ${{ steps.meta.outputs.tags }} dispatch-deploy: - if: github.event_name == 'release' + if: | + github.event_name == 'release' || + (github.event_name == 'workflow_dispatch' && inputs.deploy) needs: build-and-push runs-on: ubuntu-latest permissions: diff --git a/harness/manifests/deployment.yaml b/harness/manifests/deployment.yaml index b63f23f..25c9c07 100644 --- a/harness/manifests/deployment.yaml +++ b/harness/manifests/deployment.yaml @@ -18,7 +18,7 @@ spec: # note: imagePullSecrets is not needed for public images containers: - name: harness - image: ghcr.io/conductor-oss/ruby-sdk/harness-worker:latest + image: ghcr.io/conductor-oss/ruby-sdk/harness-worker:certification-worker-metrics-latest imagePullPolicy: Always env: # === CONDUCTOR CONNECTION (from per-cloud ConfigMap) === From 2378ecc20b26809238fccfc85553a9c3c9be2e11 Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Mon, 20 Apr 2026 09:19:00 -0600 Subject: [PATCH 3/3] update harness image to main --- harness/manifests/deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/harness/manifests/deployment.yaml b/harness/manifests/deployment.yaml index 25c9c07..0036ec9 100644 --- a/harness/manifests/deployment.yaml +++ b/harness/manifests/deployment.yaml @@ -18,7 +18,7 @@ spec: # note: imagePullSecrets is not needed for public images containers: - name: harness - image: ghcr.io/conductor-oss/ruby-sdk/harness-worker:certification-worker-metrics-latest + image: ghcr.io/conductor-oss/ruby-sdk/harness-worker:main-latest imagePullPolicy: Always env: # === CONDUCTOR CONNECTION (from per-cloud ConfigMap) ===