Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 17 additions & 2 deletions .github/workflows/harness-image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@ on:
release:
types: [published]
workflow_dispatch:
inputs:
deploy:
description: "Dispatch downstream deploy after the image is built"
type: boolean
default: true

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
Expand Down Expand Up @@ -42,13 +47,21 @@ jobs:
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Calculate branch tag
id: vars
shell: bash
run: |
BRANCH="${{ github.ref_name }}"
CLEANED_BRANCH_NAME=$(echo "$BRANCH" | tr '/' '-' | tr '[:upper:]' '[:lower:]')
echo "cleaned-branch-name=$CLEANED_BRANCH_NAME" >> "$GITHUB_OUTPUT"

- name: Docker metadata
id: meta
uses: docker/metadata-action@v5
with:
images: ghcr.io/conductor-oss/ruby-sdk/harness-worker
tags: |
type=raw,value=latest
type=raw,value=${{ steps.vars.outputs.cleaned-branch-name }}-latest,enable=${{ github.event_name != 'release' }}
type=raw,value=${{ github.event.release.tag_name }},enable=${{ github.event_name == 'release' }}

- name: Build and push
Expand All @@ -62,7 +75,9 @@ jobs:
tags: ${{ steps.meta.outputs.tags }}

dispatch-deploy:
if: github.event_name == 'release'
if: |
github.event_name == 'release' ||
(github.event_name == 'workflow_dispatch' && inputs.deploy)
needs: build-and-push
runs-on: ubuntu-latest
permissions:
Expand Down
2 changes: 2 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,6 @@ source 'https://rubygems.org'

gemspec

gem 'prometheus-client', '~> 4.0'
gem 'rake', '~> 13.0'
gem 'webrick', '~> 1.8'
8 changes: 8 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ GEM
parser (3.3.10.1)
ast (~> 2.4.1)
racc
prometheus-client (4.2.5)
base64
pry (0.16.0)
coderay (~> 1.1)
method_source (~> 1.0)
Expand Down Expand Up @@ -93,16 +95,22 @@ GEM
addressable (>= 2.8.0)
crack (>= 0.3.2)
hashdiff (>= 0.4.0, < 2.0.0)
webrick (1.9.2)

PLATFORMS
ruby

DEPENDENCIES
conductor_ruby!
prometheus-client (~> 4.0)
pry (~> 0.14)
rake (~> 13.0)
rspec (~> 3.0)
rubocop (~> 1.0)
rubocop-rspec (~> 2.0)
vcr (~> 6.0)
webmock (~> 3.0)
webrick (~> 1.8)

BUNDLED WITH
2.5.22
11 changes: 10 additions & 1 deletion harness/main.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# Load the SDK from source (relative to repo root)
$LOAD_PATH.unshift(File.expand_path('../lib', __dir__))
require 'conductor'
require 'conductor/worker/telemetry/prometheus_backend'
require_relative 'simulated_task_worker'
require_relative 'workflow_governor'

Expand Down Expand Up @@ -31,9 +32,16 @@ def self.main
batch_size = env_int('HARNESS_BATCH_SIZE', 20)
poll_interval_ms = env_int('HARNESS_POLL_INTERVAL_MS', 100)

metrics_port = env_int('HARNESS_METRICS_PORT', 9991)

configuration = Conductor::Configuration.new
register_metadata(configuration)

metrics_collector = Conductor::Worker::Telemetry::MetricsCollector.new(backend: :prometheus)
metrics_server = Conductor::Worker::Telemetry::MetricsServer.new(port: metrics_port)
metrics_server.start
puts "Prometheus metrics server started on port #{metrics_port}"

workers = SIMULATED_WORKERS.map do |def_entry|
sim = SimulatedTaskWorker.new(
def_entry[:task_name],
Expand All @@ -55,7 +63,8 @@ def self.main
task_handler = Conductor::Worker::TaskHandler.new(
workers: workers,
configuration: configuration,
scan_for_annotated_workers: false
scan_for_annotated_workers: false,
event_listeners: [metrics_collector]
)
task_handler.start

Expand Down
4 changes: 2 additions & 2 deletions harness/manifests/configmap-gcp.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ metadata:
labels:
app: ruby-sdk-harness-worker
data:
CONDUCTOR_SERVER_URL: "https://certification-gcp.orkesconductor.com/api"
CONDUCTOR_AUTH_KEY: "e6c1ac61-286b-11f1-be01-c682b5750c3a"
CONDUCTOR_SERVER_URL: "https://certification-gcp.orkesconductor.io/api"
CONDUCTOR_AUTH_KEY: "25b681c1-34ec-11f1-b07a-9601c7a63373"
7 changes: 6 additions & 1 deletion harness/manifests/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ spec:
# note: imagePullSecrets is not needed for public images
containers:
- name: harness
image: ghcr.io/conductor-oss/ruby-sdk/harness-worker:latest
image: ghcr.io/conductor-oss/ruby-sdk/harness-worker:main-latest
imagePullPolicy: Always
env:
# === CONDUCTOR CONNECTION (from per-cloud ConfigMap) ===
Expand Down Expand Up @@ -53,6 +53,11 @@ spec:
- name: HARNESS_POLL_INTERVAL_MS
value: "100"

ports:
- name: metrics
containerPort: 9991
protocol: TCP

resources:
requests:
memory: "256Mi"
Expand Down
26 changes: 17 additions & 9 deletions lib/conductor/worker/telemetry/prometheus_backend.rb
Original file line number Diff line number Diff line change
Expand Up @@ -73,18 +73,24 @@ def load_prometheus_client
"Add `gem 'prometheus-client'` to your Gemfile."
end

# Each counter declares only the labels it actually receives
COUNTER_LABELS = {
'task_poll_total' => %i[task_type],
'task_poll_error_total' => %i[task_type error],
'task_execute_error_total' => %i[task_type exception retryable],
'task_update_failed_total' => %i[task_type]
}.freeze

# Setup predefined metrics
def setup_metrics
# Counters
@counters = {}
@histograms = {}
@gauges = {}

# Pre-register common metrics
register_counter('task_poll_total', 'Total number of task polls')
register_counter('task_poll_error_total', 'Total number of poll errors')
register_counter('task_execute_error_total', 'Total number of execution errors')
register_counter('task_update_failed_total', 'Total number of failed task updates (CRITICAL)')
register_counter('task_poll_total', 'Total number of task polls', COUNTER_LABELS['task_poll_total'])
register_counter('task_poll_error_total', 'Total number of poll errors', COUNTER_LABELS['task_poll_error_total'])
register_counter('task_execute_error_total', 'Total number of execution errors', COUNTER_LABELS['task_execute_error_total'])
register_counter('task_update_failed_total', 'Total number of failed task updates (CRITICAL)', COUNTER_LABELS['task_update_failed_total'])

register_histogram('task_poll_time_seconds', 'Task poll duration in seconds', TIME_BUCKETS)
register_histogram('task_execute_time_seconds', 'Task execution duration in seconds', TIME_BUCKETS)
Expand All @@ -94,14 +100,15 @@ def setup_metrics
# Register a counter metric
# @param name [String] Metric name
# @param docstring [String] Metric description
def register_counter(name, docstring)
# @param labels [Array<Symbol>] Label keys for this counter
def register_counter(name, docstring, labels = %i[task_type])
metric_name = name.to_sym
return if @registry.exist?(metric_name)

counter = Prometheus::Client::Counter.new(
metric_name,
docstring: docstring,
labels: %i[task_type error exception retryable]
labels: labels
)
@registry.register(counter)
@counters[name] = counter
Expand Down Expand Up @@ -150,10 +157,11 @@ def get_or_create_counter(name)
if @registry.exist?(metric_name)
@registry.get(metric_name)
else
labels = COUNTER_LABELS.fetch(name, %i[task_type])
counter = Prometheus::Client::Counter.new(
metric_name,
docstring: "Counter for #{name}",
labels: %i[task_type error exception retryable]
labels: labels
)
@registry.register(counter)
counter
Expand Down
Loading