From 10fe923e5cf06f2680807c5a910b9d67bb5d7d22 Mon Sep 17 00:00:00 2001 From: Kratika_Patidar Date: Mon, 27 Apr 2026 19:02:43 +0000 Subject: [PATCH 1/7] vector ldms configuration and deployment --- .../validation_flows/telemetry_validation.py | 41 +++++ .../telemetry/tasks/deploy_vector_ldms.yml | 64 +++++++ .../tasks/derive_sink_support_flags.yml | 46 +++++ .../tasks/generate_telemetry_deployments.yml | 44 +++++ provision/roles/telemetry/tasks/main.yml | 9 + .../telemetry/tasks/telemetry_prereq.yml | 16 ++ .../templates/telemetry/kustomization.yaml.j2 | 13 ++ .../vector/vector-ldms-config.toml.j2 | 169 +++++++++++++++++ .../vector/vector-ldms-configmap.yaml.j2 | 18 ++ .../vector/vector-ldms-deployment.yaml.j2 | 170 ++++++++++++++++++ .../vector/vector-ldms-service.yaml.j2 | 38 ++++ .../vector/vlagent-vector-deployment.yaml.j2 | 79 ++++++++ .../vector/vlagent-vector-service.yaml.j2 | 37 ++++ .../vector/vmagent-vector-deployment.yaml.j2 | 95 ++++++++++ .../vector/vmagent-vector-service.yaml.j2 | 37 ++++ .../telemetry/vector/vmagent-vector.yaml.j2 | 127 +++++++++++++ provision/roles/telemetry/vars/main.yml | 12 +- 17 files changed, 1010 insertions(+), 5 deletions(-) create mode 100644 provision/roles/telemetry/tasks/deploy_vector_ldms.yml create mode 100644 provision/roles/telemetry/templates/telemetry/vector/vector-ldms-config.toml.j2 create mode 100644 provision/roles/telemetry/templates/telemetry/vector/vector-ldms-configmap.yaml.j2 create mode 100644 provision/roles/telemetry/templates/telemetry/vector/vector-ldms-deployment.yaml.j2 create mode 100644 provision/roles/telemetry/templates/telemetry/vector/vector-ldms-service.yaml.j2 create mode 100644 provision/roles/telemetry/templates/telemetry/vector/vlagent-vector-deployment.yaml.j2 create mode 100644 provision/roles/telemetry/templates/telemetry/vector/vlagent-vector-service.yaml.j2 create mode 100644 provision/roles/telemetry/templates/telemetry/vector/vmagent-vector-deployment.yaml.j2 create mode 100644 
provision/roles/telemetry/templates/telemetry/vector/vmagent-vector-service.yaml.j2 create mode 100644 provision/roles/telemetry/templates/telemetry/vector/vmagent-vector.yaml.j2 diff --git a/common/library/module_utils/input_validation/validation_flows/telemetry_validation.py b/common/library/module_utils/input_validation/validation_flows/telemetry_validation.py index 4b14351b21..0eaa380986 100644 --- a/common/library/module_utils/input_validation/validation_flows/telemetry_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/telemetry_validation.py @@ -559,8 +559,11 @@ def validate_telemetry_config( # ========================================================================= # Vector-LDMS bridge can only be enabled when LDMS source is enabled vector_ldms_enabled = vector_ldms.get("metrics_enabled", False) + vector_ome_metrics_enabled = vector_ome.get("metrics_enabled", False) + vector_ome_logs_enabled = vector_ome.get("logs_enabled", False) ldms_source_enabled = ldms_source.get("metrics_enabled", False) + # Validation 1: Vector-LDMS requires LDMS source to be enabled if vector_ldms_enabled and not ldms_source_enabled: errors.append(create_error_msg( "telemetry_bridges.vector_ldms.metrics_enabled", @@ -579,6 +582,44 @@ def validate_telemetry_config( f"ldms_source.metrics_enabled={ldms_source_enabled}" ) + # Validation 2: If LDMS source is enabled, Vector-LDMS bridge must also be enabled + # (LDMS only supports Kafka collection, requires Vector bridge to reach VictoriaMetrics) + if ldms_source_enabled and not vector_ldms_enabled: + errors.append(create_error_msg( + "telemetry_sources.ldms.metrics_enabled", + "true", + "LDMS source is enabled but Vector-LDMS bridge is disabled. " + "LDMS metrics can only reach VictoriaMetrics via the Vector-LDMS bridge. 
" "If you want to check LDMS Metrics on VictoriaMetrics then: " "Set telemetry_bridges.vector_ldms.metrics_enabled to true in telemetry_config.yml" )) + logger.error( + "LDMS source enabled without Vector-LDMS bridge: " + f"ldms_source.metrics_enabled={ldms_source_enabled}, " + f"vector_ldms.metrics_enabled={vector_ldms_enabled}" + ) + + # # Validation 3: Verify Kafka collection target for LDMS + # ldms_collection_targets = ldms_source.get("collection_targets", []) + # if ldms_source_enabled and 'kafka' not in ldms_collection_targets: + # errors.append(create_error_msg( + # "telemetry_sources.ldms.collection_targets", + # str(ldms_collection_targets), + # "LDMS source requires 'kafka' in collection_targets. " + # "LDMS only supports Kafka-based collection." + # )) + # logger.error( + # f"LDMS collection_targets missing 'kafka': {ldms_collection_targets}" + # ) + + # Validation 3: Log Vector-OME bridge status + if vector_ome_metrics_enabled or vector_ome_logs_enabled: + logger.info( + "Vector-OME bridge validation: " + f"metrics_enabled={vector_ome_metrics_enabled}, " + f"logs_enabled={vector_ome_logs_enabled}" + ) + # ========================================================================= # Validate PowerScale telemetry configuration # ========================================================================= diff --git a/provision/roles/telemetry/tasks/deploy_vector_ldms.yml b/provision/roles/telemetry/tasks/deploy_vector_ldms.yml new file mode 100644 index 0000000000..c5b0fc6866 --- /dev/null +++ b/provision/roles/telemetry/tasks/deploy_vector_ldms.yml @@ -0,0 +1,64 @@ +--- +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Deploy Vector-LDMS Pipeline +# Purpose: Kafka-to-VictoriaMetrics ingestion pipeline for LDMS metrics +# Spec Reference: Vector HLD Engineering Spec (ESPEC-VECTOR-2026-001) §4.1.3.6 +# Deployment Flow: U1 (Fresh deployment) and U7 (Configuration change) +# Note: Pre-flight validation removed - Kafka cluster, topics, and secrets are deployed +# via kustomization.yaml AFTER this task generates the Vector manifests. +# Validation happens during Vector pod startup when it connects to Kafka. + +# ============================================================================ +# Render Vector-LDMS ConfigMap (TOML Configuration) +# ============================================================================ +- name: Render Vector-LDMS ConfigMap + ansible.builtin.template: + src: "telemetry/vector/vector-ldms-configmap.yaml.j2" + dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/vector/vector-ldms-configmap.yaml" + mode: "{{ hostvars['localhost']['file_permissions_644'] }}" + register: vector_ldms_configmap_rendered + +- name: Display ConfigMap render result + ansible.builtin.debug: + msg: "✓ Vector-LDMS ConfigMap rendered to {{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/vector/vector-ldms-configmap.yaml" + +# ============================================================================ +# Render Vector-LDMS Deployment +# ============================================================================ +- name: Render Vector-LDMS Deployment + ansible.builtin.template: + src: 
"telemetry/vector/vector-ldms-deployment.yaml.j2" + dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/vector/vector-ldms-deployment.yaml" + mode: "{{ hostvars['localhost']['file_permissions_644'] }}" + register: vector_ldms_deployment_rendered + +- name: Display Deployment render result + ansible.builtin.debug: + msg: "✓ Vector-LDMS Deployment rendered to {{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/vector/vector-ldms-deployment.yaml" + +# ============================================================================ +# Render Vector-LDMS Service +# ============================================================================ +- name: Render Vector-LDMS Service + ansible.builtin.template: + src: "telemetry/vector/vector-ldms-service.yaml.j2" + dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/vector/vector-ldms-service.yaml" + mode: "{{ hostvars['localhost']['file_permissions_644'] }}" + register: vector_ldms_service_rendered + +- name: Display Service render result + ansible.builtin.debug: + msg: "✓ Vector-LDMS Service rendered to {{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/vector/vector-ldms-service.yaml" diff --git a/provision/roles/telemetry/tasks/derive_sink_support_flags.yml b/provision/roles/telemetry/tasks/derive_sink_support_flags.yml index 0a12b5ec0d..319a171dd4 100644 --- a/provision/roles/telemetry/tasks/derive_sink_support_flags.yml +++ b/provision/roles/telemetry/tasks/derive_sink_support_flags.yml @@ -76,6 +76,52 @@ 'kafka' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([])) or 'kafka' in (telemetry_config.telemetry_sources.ldms.collection_targets | default([])) +# ============================================================================= +# VECTOR BRIDGE LOGIC - Determine sink requirements based on Vector bridges +# ============================================================================= +# 
Vector-LDMS bridge: If enabled, requires Kafka + VictoriaMetrics +# Vector-OME bridge: If metrics enabled, requires Kafka + VictoriaMetrics +# If logs enabled, requires Kafka + VictoriaLogs +# ============================================================================= + +- name: Enable Kafka if Vector-LDMS bridge is enabled (requires LDMS source) + ansible.builtin.set_fact: + kafka_support: true + when: + - telemetry_config.telemetry_bridges.vector_ldms.metrics_enabled | default(false) | bool + - ldms_support | default(false) | bool + +- name: Enable VictoriaMetrics if Vector-LDMS bridge is enabled (requires LDMS source) + ansible.builtin.set_fact: + victoria_metrics_support: true + when: + - telemetry_config.telemetry_bridges.vector_ldms.metrics_enabled | default(false) | bool + - ldms_support | default(false) | bool + +- name: Enable Kafka if Vector-OME metrics bridge is enabled + ansible.builtin.set_fact: + kafka_support: true + when: + - telemetry_config.telemetry_bridges.vector_ome.metrics_enabled | default(false) | bool + +- name: Enable VictoriaMetrics if Vector-OME metrics bridge is enabled + ansible.builtin.set_fact: + victoria_metrics_support: true + when: + - telemetry_config.telemetry_bridges.vector_ome.metrics_enabled | default(false) | bool + +- name: Enable Kafka if Vector-OME logs bridge is enabled + ansible.builtin.set_fact: + kafka_support: true + when: + - telemetry_config.telemetry_bridges.vector_ome.logs_enabled | default(false) | bool + +- name: Enable VictoriaLogs if Vector-OME logs bridge is enabled + ansible.builtin.set_fact: + victoria_logs_support: true + when: + - telemetry_config.telemetry_bridges.vector_ome.logs_enabled | default(false) | bool + - name: Log derived sink support flags ansible.builtin.debug: msg: > diff --git a/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml b/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml index ab4ce84999..a24bcd6a62 100644 --- 
a/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml +++ b/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml @@ -149,6 +149,50 @@ when: telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool tags: telemetry_deployment +- name: Vector vmagent configuration + when: + - telemetry_config.telemetry_bridges.vector_ldms.metrics_enabled or telemetry_config.telemetry_bridges.vector_ome.metrics_enabled + block: + - name: Render vmagent-vector Deployment (write buffer for Vector bridges) + ansible.builtin.template: + src: 'telemetry/vector/vmagent-vector-deployment.yaml.j2' + dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/vector/vmagent-vector-deployment.yaml" + mode: "{{ hostvars['localhost']['file_permissions_644'] }}" + when: + - telemetry_config.telemetry_bridges.vector_ldms.metrics_enabled | default(false) | bool + - victoria_metrics_support | default(false) | bool + tags: telemetry_deployment + + - name: Render vmagent-vector Service (write buffer for Vector bridges) + ansible.builtin.template: + src: 'telemetry/vector/vmagent-vector-service.yaml.j2' + dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/vector/vmagent-vector-service.yaml" + mode: "{{ hostvars['localhost']['file_permissions_644'] }}" + when: + - telemetry_config.telemetry_bridges.vector_ldms.metrics_enabled | default(false) | bool + - victoria_metrics_support | default(false) | bool + tags: telemetry_deployment + + - name: Render vlagent-vector Deployment (log write buffer for Vector-OME) + ansible.builtin.template: + src: 'telemetry/vector/vlagent-vector-deployment.yaml.j2' + dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/vector/vlagent-vector-deployment.yaml" + mode: "{{ hostvars['localhost']['file_permissions_644'] }}" + when: + - telemetry_config.telemetry_bridges.vector_ome.logs_enabled | default(false) | bool + - victoria_logs_support | 
default(false) | bool + tags: telemetry_deployment + + - name: Render vlagent-vector Service (log write buffer for Vector-OME) + ansible.builtin.template: + src: 'telemetry/vector/vlagent-vector-service.yaml.j2' + dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/vector/vlagent-vector-service.yaml" + mode: "{{ hostvars['localhost']['file_permissions_644'] }}" + when: + - telemetry_config.telemetry_bridges.vector_ome.logs_enabled | default(false) | bool + - victoria_logs_support | default(false) | bool + tags: telemetry_deployment + - name: Deploy telemetry cleanup script ansible.builtin.template: src: 'telemetry/cleanup_telemetry.sh.j2' diff --git a/provision/roles/telemetry/tasks/main.yml b/provision/roles/telemetry/tasks/main.yml index e9d017036d..32befa6ba1 100644 --- a/provision/roles/telemetry/tasks/main.yml +++ b/provision/roles/telemetry/tasks/main.yml @@ -57,6 +57,15 @@ - name: Generate telemetry deployments ansible.builtin.include_tasks: generate_telemetry_deployments.yml +- name: Deploy Vector-LDMS bridge (Kafka-to-VictoriaMetrics pipeline) + ansible.builtin.include_tasks: deploy_vector_ldms.yml + when: + - telemetry_config.telemetry_bridges.vector_ldms.metrics_enabled | default(false) | bool + - victoria_metrics_support | default(false) | bool + tags: + - telemetry_deployment + - vector_ldms + - name: Configure of k8s telemetry service when: - telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool diff --git a/provision/roles/telemetry/tasks/telemetry_prereq.yml b/provision/roles/telemetry/tasks/telemetry_prereq.yml index 5cc65dc2b3..78776148d3 100644 --- a/provision/roles/telemetry/tasks/telemetry_prereq.yml +++ b/provision/roles/telemetry/tasks/telemetry_prereq.yml @@ -144,3 +144,19 @@ src: telemetry/victoria/victoria-tls-secret.yaml.j2 dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/victoria-tls-secret.yaml" mode: "{{ 
hostvars['localhost']['file_permissions_644'] }}" + +- name: Display Vector bridge enablement flags + ansible.builtin.debug: + msg: "{{ telemetry_config.telemetry_bridges.vector_ldms.metrics_enabled }}, {{ telemetry_config.telemetry_bridges.vector_ome.metrics_enabled }}, {{ telemetry_config.telemetry_bridges.vector_ome.logs_enabled }}" + +# Create Vector deployment subdirectory +- name: Create Vector deployment subdirectory + ansible.builtin.file: + path: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/vector" + state: directory + mode: "{{ hostvars['localhost']['dir_permissions_755'] }}" + when: + - telemetry_config.telemetry_bridges.vector_ldms.metrics_enabled | default(false) or + telemetry_config.telemetry_bridges.vector_ome.metrics_enabled | default(false) or + telemetry_config.telemetry_bridges.vector_ome.logs_enabled | default(false) + tags: telemetry_deployment diff --git a/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 b/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 index 6b8c159a10..b00ff20984 100644 --- a/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 @@ -54,3 +54,16 @@ resources: - telemetry_cleaner_rbac.yaml - telemetry_pod_cleanup.yaml {% endif %} +{% if telemetry_config.telemetry_bridges.vector_ldms.metrics_enabled | default(false) and ldms_support | default(false) and kafka_support | default(false) and victoria_metrics_support | default(false) %} +# Vector-LDMS Bridge Resources (Kafka-to-VictoriaMetrics pipeline) + - vector/vmagent-vector-deployment.yaml + - vector/vmagent-vector-service.yaml + - vector/vector-ldms-configmap.yaml + - vector/vector-ldms-deployment.yaml + - vector/vector-ldms-service.yaml +{% endif %} +{% if telemetry_config.telemetry_bridges.vector_ome.logs_enabled | default(false) and victoria_logs_support | default(false) %} +# vlagent-vector: Log write-buffer for Vector-OME (Kafka-to-VictoriaLogs pipeline) + - vector/vlagent-vector-deployment.yaml + - 
vector/vlagent-vector-service.yaml +{% endif %} diff --git a/provision/roles/telemetry/templates/telemetry/vector/vector-ldms-config.toml.j2 b/provision/roles/telemetry/templates/telemetry/vector/vector-ldms-config.toml.j2 new file mode 100644 index 0000000000..179290a673 --- /dev/null +++ b/provision/roles/telemetry/templates/telemetry/vector/vector-ldms-config.toml.j2 @@ -0,0 +1,169 @@ +# Vector-LDMS Configuration +# Purpose: Consume LDMS metrics from Kafka 'ldms' topic, transform to Prometheus format, +# and write to VictoriaMetrics via vmagent-vector buffer agent +# Architecture: Kafka 'ldms' topic → Vector-LDMS → vmagent-vector:8429 → vminsert:8480 +# Q2 Status: Active — Omnia-deployed +# Spec Reference: Vector HLD Engineering Spec (ESPEC-VECTOR-2026-001) §4.1.3.2 + +# ============================================================================ +# Data Directory +# ============================================================================ +data_dir = "/var/lib/vector" + +# ============================================================================ +# SOURCE: Kafka Consumer for LDMS Topic +# ============================================================================ +[sources.kafka_ldms] +type = "kafka" +bootstrap_servers = "kafka-kafka-bootstrap.{{ telemetry_namespace }}.svc.cluster.local:9093" +group_id = "{{ vector.ldms.consumer_group }}" +topics = ["{{ vector.ldms.kafka_topic }}"] +auto_offset_reset = "earliest" +session_timeout_ms = 30000 +fetch_wait_max_ms = 100 + +# Kafka mTLS authentication (reuses kafkapump KafkaUser secret) +[sources.kafka_ldms.tls] +enabled = true +ca_file = "/etc/vector/kafka-certs/ca.crt" +crt_file = "/etc/vector/kafka-certs/user.crt" +key_file = "/etc/vector/kafka-certs/user.key" +verify_certificate = true +verify_hostname = true + +# Decoding: LDMS store_avro_kafka produces JSON messages +# Message format: {"metric_name": "...", "value": ..., "timestamp": ..., "host": "...", "plugin": "...", "component": "..."} 
+[sources.kafka_ldms.decoding] +codec = "json" + +# ============================================================================ +# TRANSFORM: LDMS Schema Normalizer +# ============================================================================ +# Purpose: Convert LDMS fields to Prometheus label format +# Input: LDMS JSON message from store_avro_kafka plugin +# Output: Prometheus-compatible metric with labels +# Spec Reference: §4.1.3.2 LDMS Data Transformation Chain +[transforms.ldms_schema_normalizer] +type = "remap" +inputs = ["kafka_ldms"] +source = ''' +# Parse LDMS message fields +# LDMS store_avro_kafka produces: {metric_name, value, timestamp, host, plugin, component, ...} + +# Extract metric name and value +metric_name = .metric_name ?? "unknown_metric" +plugin = .plugin ?? "unknown_plugin" +metric_value = to_float!(.value) + +# Build Prometheus metric name: {plugin}_{metric_name} +.__name__ = plugin + "_" + metric_name + +# Map LDMS fields to Prometheus labels +.instance = .host ?? "unknown_host" +.job = "ldms" +.plugin = plugin +.component = .component ?? 
"unknown_component" + +# Preserve timestamp (convert to nanoseconds if needed) +if exists(.timestamp) { + .timestamp = to_unix_timestamp(.timestamp, unit: "nanoseconds") +} else { + .timestamp = now() +} + +# Set metric value +.value = metric_value + +# Remove original LDMS fields to avoid duplication +del(.metric_name) +del(.host) +''' + +# ============================================================================ +# TRANSFORM: Metric Enricher +# ============================================================================ +# Purpose: Add telemetry pipeline metadata labels +# Spec Reference: §4.1.3.2 metric_enricher transform +[transforms.metric_enricher] +type = "remap" +inputs = ["ldms_schema_normalizer"] +source = ''' +# Add source subsystem label +.source_subsystem = "ldms" + +# Add Kafka topic name for traceability +.topic_name = "{{ vector.ldms.kafka_topic }}" + +# Add ingestion timestamp (when Vector processed the message) +.ingestion_timestamp = format_timestamp!(now(), format: "%+") +''' + +# ============================================================================ +# TRANSFORM: Log to Metric Conversion +# ============================================================================ +# Purpose: Convert Vector internal log events to metrics for monitoring +# This ensures all data flowing through Vector is in metric format +[transforms.log_to_metric] +type = "log_to_metric" +inputs = ["metric_enricher"] + +[[transforms.log_to_metric.metrics]] +type = "gauge" +field = "value" +name = "{{ '{{' }} __name__ {{ '}}' }}" +namespace = "ldms" +tags.instance = "{{ '{{' }} instance {{ '}}' }}" +tags.job = "{{ '{{' }} job {{ '}}' }}" +tags.plugin = "{{ '{{' }} plugin {{ '}}' }}" +tags.component = "{{ '{{' }} component {{ '}}' }}" +tags.source_subsystem = "{{ '{{' }} source_subsystem {{ '}}' }}" +tags.topic_name = "{{ '{{' }} topic_name {{ '}}' }}" + +# ============================================================================ +# SINK: Prometheus Remote Write to 
vmagent-vector +# ============================================================================ +# Purpose: Write metrics to vmagent-vector buffer agent +# Data flow: Vector-LDMS → vmagent-vector:8429 → vminsert:8480 +# Note: NO vlinsert sink — LDMS is pure metrics (no logs) +# Spec Reference: §4.1.3.3 Internal Interfaces +[sinks.victoria_metrics] +type = "prometheus_remote_write" +inputs = ["log_to_metric"] +endpoint = "http://{{ vector.vmagent_vector.service_name }}.{{ telemetry_namespace }}.svc.cluster.local:{{ vector.vmagent_vector.port }}/api/v1/write" +healthcheck.enabled = true + +# Batch settings for efficient writes +[sinks.victoria_metrics.batch] +max_bytes = 1048576 # 1 MB +timeout_secs = 5 + +# Buffer settings (in-memory before write) +[sinks.victoria_metrics.buffer] +type = "memory" +max_events = 10000 + +# Request settings +[sinks.victoria_metrics.request] +retry_attempts = 5 +retry_max_duration_secs = 300 +timeout_secs = 60 + +# Encoding +[sinks.victoria_metrics.encoding] +codec = "json" + +# ============================================================================ +# INTERNAL TELEMETRY: Vector Self-Monitoring +# ============================================================================ +# Expose Vector internal metrics on port {{ vector.ldms.metrics_port }} +# Scraped by vmagent for Vector pipeline monitoring +[api] +enabled = true +address = "0.0.0.0:{{ vector.ldms.metrics_port }}" +playground = false + +# Health check endpoint on port {{ vector.ldms.health_port }} +# Used by Kubernetes liveness/readiness probes +[api.healthcheck] +enabled = true +address = "0.0.0.0:{{ vector.ldms.health_port }}" diff --git a/provision/roles/telemetry/templates/telemetry/vector/vector-ldms-configmap.yaml.j2 b/provision/roles/telemetry/templates/telemetry/vector/vector-ldms-configmap.yaml.j2 new file mode 100644 index 0000000000..848f183873 --- /dev/null +++ b/provision/roles/telemetry/templates/telemetry/vector/vector-ldms-configmap.yaml.j2 @@ -0,0 +1,18 @@ 
+--- +# Vector-LDMS ConfigMap +# Purpose: Store Vector TOML configuration for LDMS pipeline +# Spec Reference: Vector HLD Engineering Spec (ESPEC-VECTOR-2026-001) §4.1.3.2 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ vector.ldms.app_name }}-config + namespace: {{ telemetry_namespace }} + labels: + app: {{ vector.ldms.app_name }} + component: telemetry-bridge + subsystem: ldms + managed-by: omnia +data: + vector.toml: | +{{ lookup('template', 'telemetry/vector/vector-ldms-config.toml.j2') | indent(4, first=True) }} diff --git a/provision/roles/telemetry/templates/telemetry/vector/vector-ldms-deployment.yaml.j2 b/provision/roles/telemetry/templates/telemetry/vector/vector-ldms-deployment.yaml.j2 new file mode 100644 index 0000000000..6bf0f909fb --- /dev/null +++ b/provision/roles/telemetry/templates/telemetry/vector/vector-ldms-deployment.yaml.j2 @@ -0,0 +1,170 @@ +--- +# Vector-LDMS Deployment +# Purpose: Kafka-to-VictoriaMetrics ingestion pipeline for LDMS metrics +# Architecture: Kafka 'ldms' topic → Vector-LDMS → vmagent-vector:8429 → vminsert:8480 +# Q2 Status: Active — Omnia-deployed +# Spec Reference: Vector HLD Engineering Spec (ESPEC-VECTOR-2026-001) §4.1.1 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ vector.ldms.app_name }} + namespace: {{ telemetry_namespace }} + labels: + app: {{ vector.ldms.app_name }} + component: telemetry-bridge + subsystem: ldms + managed-by: omnia +spec: + replicas: {{ vector.ldms.replicas }} + selector: + matchLabels: + app: {{ vector.ldms.app_name }} + template: + metadata: + labels: + app: {{ vector.ldms.app_name }} + component: telemetry-bridge + subsystem: ldms + annotations: + # Force pod restart on config change (updated by Ansible on config modification) + omnia.dell.com/config-version: "{{ ansible_date_time.iso8601 }}" + spec: + # Security context for non-root execution + securityContext: + runAsNonRoot: true + runAsUser: 1000 + fsGroup: 1000 + seccompProfile: + type: RuntimeDefault + + 
containers: + - name: {{ vector.ldms.container_name }} + image: {{ vector.image }} + imagePullPolicy: IfNotPresent + + # Command: Vector with TOML config validation + command: + - "/usr/local/bin/vector" + args: + - "--config" + - "/etc/vector/vector.toml" + + # Resource limits from vars/main.yml + resources: + requests: + memory: {{ vector.ldms.resources.requests.memory }} + cpu: {{ vector.ldms.resources.requests.cpu }} + limits: + memory: {{ vector.ldms.resources.limits.memory }} + cpu: {{ vector.ldms.resources.limits.cpu }} + + # Ports: health check and metrics + ports: + - name: health + containerPort: {{ vector.ldms.health_port }} + protocol: TCP + - name: metrics + containerPort: {{ vector.ldms.metrics_port }} + protocol: TCP + + # Liveness probe: Vector health endpoint + # Spec Reference: §4.1.3.3 Provided Interfaces + livenessProbe: + httpGet: + path: /health + port: {{ vector.ldms.health_port }} + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + + # Readiness probe: Vector health endpoint + readinessProbe: + httpGet: + path: /health + port: {{ vector.ldms.health_port }} + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + + # Volume mounts + volumeMounts: + # Vector TOML configuration + - name: vector-config + mountPath: /etc/vector + readOnly: true + + # Kafka mTLS certificates (kafkapump secret) + # Spec Reference: §4.1.3.3 Kubernetes Secrets + - name: kafka-certs + mountPath: /etc/vector/kafka-certs + readOnly: true + + # Kafka cluster CA certificate + - name: kafka-cluster-ca + mountPath: /etc/vector/kafka-ca + readOnly: true + + # Vector data directory (for internal state) + - name: vector-data + mountPath: /var/lib/vector + + # Environment variables + env: + - name: VECTOR_LOG + value: "info" + - name: VECTOR_THREADS + value: "2" + + # Security context for container + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + 
runAsUser: 1000 + capabilities: + drop: + - ALL + + # Volumes + volumes: + # Vector TOML configuration from ConfigMap + - name: vector-config + configMap: + name: {{ vector.ldms.app_name }}-config + items: + - key: vector.toml + path: vector.toml + + # Kafka mTLS certificates from kafkapump secret + # Spec Reference: §4.1.1 Vector-LDMS reuses existing kafkapump KafkaUser + - name: kafka-certs + secret: + secretName: {{ vector.ldms.kafka_user }} + items: + - key: user.crt + path: user.crt + - key: user.key + path: user.key + - key: ca.crt + path: ca.crt + + # Kafka cluster CA certificate + - name: kafka-cluster-ca + secret: + secretName: kafka-cluster-ca-cert + items: + - key: ca.crt + path: ca.crt + + # Vector data directory (emptyDir for ephemeral state) + - name: vector-data + emptyDir: {} + + # Restart policy + restartPolicy: Always + + # DNS policy for Kubernetes service discovery + dnsPolicy: ClusterFirst diff --git a/provision/roles/telemetry/templates/telemetry/vector/vector-ldms-service.yaml.j2 b/provision/roles/telemetry/templates/telemetry/vector/vector-ldms-service.yaml.j2 new file mode 100644 index 0000000000..fa7e383eb4 --- /dev/null +++ b/provision/roles/telemetry/templates/telemetry/vector/vector-ldms-service.yaml.j2 @@ -0,0 +1,38 @@ +--- +# Vector-LDMS Service +# Purpose: Expose Vector-LDMS health and metrics endpoints for monitoring +# Spec Reference: Vector HLD Engineering Spec (ESPEC-VECTOR-2026-001) §4.1.3.3 + +apiVersion: v1 +kind: Service +metadata: + name: {{ vector.ldms.service_name }} + namespace: {{ telemetry_namespace }} + labels: + app: {{ vector.ldms.app_name }} + component: telemetry-bridge + subsystem: ldms + managed-by: omnia + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "{{ vector.ldms.metrics_port }}" + prometheus.io/path: "/metrics" +spec: + type: ClusterIP + selector: + app: {{ vector.ldms.app_name }} + ports: + # Health check endpoint (Kubernetes probes) + - name: health + port: {{ vector.ldms.health_port 
}} + targetPort: {{ vector.ldms.health_port }} + protocol: TCP + + # Metrics endpoint (vmagent scrape) + # Spec Reference: §4.1.3.3 Vector internal metrics for self-monitoring + - name: metrics + port: {{ vector.ldms.metrics_port }} + targetPort: {{ vector.ldms.metrics_port }} + protocol: TCP + + sessionAffinity: None diff --git a/provision/roles/telemetry/templates/telemetry/vector/vlagent-vector-deployment.yaml.j2 b/provision/roles/telemetry/templates/telemetry/vector/vlagent-vector-deployment.yaml.j2 new file mode 100644 index 0000000000..4e269aae79 --- /dev/null +++ b/provision/roles/telemetry/templates/telemetry/vector/vlagent-vector-deployment.yaml.j2 @@ -0,0 +1,79 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +# vlagent-vector Deployment +# Purpose: Dedicated vlagent instance acting as log write-buffer between Vector pods and vlinsert +# Architecture: Vector pods → vlagent-vector:9427 (HTTP JSON Lines) → vlinsert:9428 (HTTP) +# Spec Reference: Vector HLD Engineering Spec §4.1.3.2, §4.3.2 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vlagent-vector + namespace: {{ telemetry_namespace }} + labels: + app: vlagent-vector + component: vector + role: log-write-buffer +spec: + replicas: {{ vector.vlagent_vector.replicas | default(2) }} + selector: + matchLabels: + app: vlagent-vector + template: + metadata: + labels: + app: vlagent-vector + component: vector + role: log-write-buffer + spec: + terminationGracePeriodSeconds: 30 + containers: + - name: vlagent + image: "{{ vector.vlagent_vector.image }}" + args: + - "-httpListenAddr=:9427" + - "-remoteWrite.url={{ vector.vlagent_vector.remote_write_url }}" + - "-remoteWrite.tmpDataPath={{ vector.vlagent_vector.tmp_data_path }}" + ports: + - name: http + containerPort: 9427 + protocol: TCP + livenessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 10 + periodSeconds: 10 + timeoutSeconds: 5 + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 3 + resources: + requests: + memory: "{{ vector.vlagent_vector.resources.requests.memory }}" + cpu: "{{ vector.vlagent_vector.resources.requests.cpu }}" + limits: + memory: "{{ vector.vlagent_vector.resources.limits.memory }}" + cpu: "{{ vector.vlagent_vector.resources.limits.cpu }}" + volumeMounts: + - name: buffer + mountPath: {{ vector.vlagent_vector.tmp_data_path }} + volumes: + - name: buffer + emptyDir: {} diff --git a/provision/roles/telemetry/templates/telemetry/vector/vlagent-vector-service.yaml.j2 b/provision/roles/telemetry/templates/telemetry/vector/vlagent-vector-service.yaml.j2 new file mode 100644 index 0000000000..6fe2b99222 --- /dev/null +++ 
b/provision/roles/telemetry/templates/telemetry/vector/vlagent-vector-service.yaml.j2 @@ -0,0 +1,37 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# vlagent-vector Service +# Purpose: ClusterIP service for vlagent-vector log write-buffer +# Exposes: Port 9427 (HTTP) for JSON Lines from Vector pods +# Spec Reference: Vector HLD Engineering Spec §4.1.3.2 + +apiVersion: v1 +kind: Service +metadata: + name: vlagent-vector + namespace: {{ telemetry_namespace }} + labels: + app: vlagent-vector + component: vector + role: log-write-buffer +spec: + type: ClusterIP + selector: + app: vlagent-vector + ports: + - name: http + port: 9427 + targetPort: 9427 + protocol: TCP diff --git a/provision/roles/telemetry/templates/telemetry/vector/vmagent-vector-deployment.yaml.j2 b/provision/roles/telemetry/templates/telemetry/vector/vmagent-vector-deployment.yaml.j2 new file mode 100644 index 0000000000..55bae905a7 --- /dev/null +++ b/provision/roles/telemetry/templates/telemetry/vector/vmagent-vector-deployment.yaml.j2 @@ -0,0 +1,95 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# vmagent-vector Deployment +# Purpose: Dedicated vmagent instance acting as write-buffer between Vector pods and vminsert +# Architecture: Vector pods → vmagent-vector:8429 (HTTP) → vminsert:8480 (TLS) +# Spec Reference: Vector Component Spec (CD-17) §19.1 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vmagent-vector + namespace: {{ telemetry_namespace }} + labels: + app: vmagent-vector + component: vector + role: write-buffer +spec: + replicas: {{ vector.vmagent_vector.replicas | default(2) }} + selector: + matchLabels: + app: vmagent-vector + template: + metadata: + labels: + app: vmagent-vector + component: vector + role: write-buffer + spec: + terminationGracePeriodSeconds: 30 + containers: + - name: vmagent + image: "{{ vector.vmagent_vector.image }}" + args: + - "-httpListenAddr=:8429" + - "-remoteWrite.url={{ vector.vmagent_vector.remote_write_url }}" + - "-remoteWrite.tmpDataPath={{ vector.vmagent_vector.tmp_data_path }}" +{% if victoria_cluster.tls_enabled | default(false) %} + - "-remoteWrite.tlsCAFile=/etc/vmagent/victoria-certs/ca.crt" +{% endif %} + ports: + - name: http + containerPort: 8429 + protocol: TCP + livenessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 10 + periodSeconds: 10 + timeoutSeconds: 5 + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 3 + resources: + requests: + memory: "{{ vector.vmagent_vector.resources.requests.memory }}" + cpu: "{{ vector.vmagent_vector.resources.requests.cpu }}" + limits: + memory: "{{ 
vector.vmagent_vector.resources.limits.memory }}" + cpu: "{{ vector.vmagent_vector.resources.limits.cpu }}" + volumeMounts: + - name: buffer + mountPath: {{ vector.vmagent_vector.tmp_data_path }} +{% if victoria_cluster.tls_enabled | default(false) %} + - name: victoria-certs + mountPath: /etc/vmagent/victoria-certs + readOnly: true +{% endif %} + volumes: + - name: buffer + emptyDir: {} +{% if victoria_cluster.tls_enabled | default(false) %} + - name: victoria-certs + secret: + secretName: victoria-tls-certs + items: + - key: ca.crt + path: ca.crt +{% endif %} diff --git a/provision/roles/telemetry/templates/telemetry/vector/vmagent-vector-service.yaml.j2 b/provision/roles/telemetry/templates/telemetry/vector/vmagent-vector-service.yaml.j2 new file mode 100644 index 0000000000..1b248f7c61 --- /dev/null +++ b/provision/roles/telemetry/templates/telemetry/vector/vmagent-vector-service.yaml.j2 @@ -0,0 +1,37 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +# vmagent-vector Service +# Purpose: ClusterIP service for vmagent-vector write-buffer +# Exposes: Port 8429 (HTTP) for prometheus_remote_write from Vector pods +# Spec Reference: Vector Component Spec (CD-17) §19.2 + +apiVersion: v1 +kind: Service +metadata: + name: vmagent-vector + namespace: {{ telemetry_namespace }} + labels: + app: vmagent-vector + component: vector + role: write-buffer +spec: + type: ClusterIP + selector: + app: vmagent-vector + ports: + - name: http + port: 8429 + targetPort: 8429 + protocol: TCP diff --git a/provision/roles/telemetry/templates/telemetry/vector/vmagent-vector.yaml.j2 b/provision/roles/telemetry/templates/telemetry/vector/vmagent-vector.yaml.j2 new file mode 100644 index 0000000000..093e804130 --- /dev/null +++ b/provision/roles/telemetry/templates/telemetry/vector/vmagent-vector.yaml.j2 @@ -0,0 +1,127 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +# vmagent-vector: Dedicated VMAgent for Vector-LDMS/Vector-OME write buffering +# Purpose: Acts as a write buffer between Vector pods and VictoriaMetrics cluster +# Architecture: Vector pods → vmagent-vector:8429 → vminsert:8480 → VictoriaMetrics +# Q2 Status: Active — Omnia-deployed +# Spec Reference: Vector HLD Engineering Spec (ESPEC-VECTOR-2026-001) §4.1.3.3 + +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMAgent +metadata: + name: vmagent-vector + namespace: {{ telemetry_namespace }} + labels: + app: vmagent-vector + component: telemetry + managed-by: omnia +spec: + # Replica count: 1 (stateless, can scale if needed) + replicaCount: 1 + + # Image configuration + image: + repository: {{ victoria_cluster.vmagent.image.split(':')[0] }} + tag: {{ victoria_cluster.vmagent.image.split(':')[1] }} + pullPolicy: IfNotPresent + + # Resource limits + resources: + requests: + cpu: "100m" + memory: "256Mi" + limits: + cpu: "500m" + memory: "512Mi" + + # Remote write configuration to VictoriaMetrics cluster + remoteWrite: +{% if victoria_deployment_mode == 'cluster' %} + - url: "{{ victoria_url_scheme }}://vminsert-victoria-cluster.{{ telemetry_namespace }}.svc.cluster.local:8480/insert/0/prometheus/api/v1/write" +{% else %} + - url: "{{ victoria_url_scheme }}://victoria-metric-0.{{ telemetry_namespace }}.svc.cluster.local:8428/api/v1/write" +{% endif %} +{% if victoria_cluster.tls_enabled | default(false) %} + tlsConfig: + ca: + secret: + name: victoria-tls-certs + key: ca.crt + cert: + secret: + name: victoria-tls-certs + key: tls.crt + keySecret: + name: victoria-tls-certs + key: tls.key +{% endif %} + queueConfig: + maxSamplesPerSend: 10000 + maxShards: 10 + capacity: 100000 + + # Service configuration for prometheus_remote_write receiver + serviceSpec: + metadata: + name: vmagent-vector + spec: + type: ClusterIP + ports: + - name: http + port: 8429 + targetPort: 8429 + protocol: TCP + + # Extra arguments for vmagent + extraArgs: + # Enable prometheus 
remote write receiver on port 8429 + promscrape.config: "" + remoteWrite.tmpDataPath: "/tmp/vmagent-remotewrite-data" + # Disable scraping (vmagent-vector only receives from Vector, doesn't scrape) + promscrape.config.strictParse: "false" + + # Pod anti-affinity for high availability (optional, single replica for now) + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - vmagent-vector + topologyKey: "kubernetes.io/hostname" + +{% if victoria_cluster.tls_enabled | default(false) %} + # Mount TLS certificates + volumes: + - name: victoria-tls-certs + secret: + secretName: victoria-tls-certs + items: + - key: tls.crt + path: tls.crt + - key: tls.key + path: tls.key + - key: ca.crt + path: ca.crt + + volumeMounts: + - name: victoria-tls-certs + mountPath: /etc/victoria/certs + readOnly: true +{% endif %} diff --git a/provision/roles/telemetry/vars/main.yml b/provision/roles/telemetry/vars/main.yml index 1ee185e9ec..5fe515b629 100644 --- a/provision/roles/telemetry/vars/main.yml +++ b/provision/roles/telemetry/vars/main.yml @@ -148,7 +148,7 @@ victoria_cluster: cpu: "1000m" vmagent: - replicas: 1 + replicas: 2 image: "{{ telemetry_images['victoriametrics/vmagent'] | default('victoriametrics/vmagent:v1.128.0') }}" resources: requests: @@ -206,7 +206,7 @@ victoria_logs_cluster: # VLAgent: Platform-managed log forwarding agent (Deployment managed by operator via VLAgent CR) vlagent: - replicas: 1 + replicas: 2 image: "{{ telemetry_images['victoriametrics/vlagent'] | default('docker.io/victoriametrics/vlagent:v1.49.0') }}" pvc_size: "5Gi" # Buffer storage for retry during vlinsert unavailability resources: @@ -502,12 +502,13 @@ vector: ldms: app_name: "vector-ldms" service_name: "vector-ldms" + container_name: "vector-ldms" kafka_topic: "ldms" consumer_group: "vector-ldms-group" kafka_user: "kafkapump" # Shared with LDMS 
store_avro_kafka health_port: 8687 metrics_port: 9599 - replicas: 1 + replicas: 2 resources: requests: cpu: "100m" @@ -520,6 +521,7 @@ vector: ome: app_name: "vector-ome" service_name: "vector-ome" + container_name: "vector-ome" # Dynamic pattern based on ome_identifier from telemetry_config.yml # Example: if ome_identifier="ome", pattern="^ome\\..*$" (matches ome.events, ome.alerts, etc.) # Example: if ome_identifier="dell_ome", pattern="^dell_ome\\..*$" (matches dell_ome.events, etc.) @@ -545,7 +547,7 @@ vector: port: 8429 # prometheus_remote_write receiver metrics_port: 8429 # vmagent self-metrics image: "{{ telemetry_images['victoriametrics/vmagent'] | default('docker.io/victoriametrics/vmagent:v1.128.0') }}" - replicas: 1 + replicas: 2 pvc_size: "5Gi" # Disk WAL buffer remote_write_url: "http://vminsert-victoria-cluster.{{ telemetry_namespace }}.svc.cluster.local:8480/insert/0/prometheus/api/v1/write" tmp_data_path: "/vmagent-buffer" @@ -565,7 +567,7 @@ vector: port: 9427 # JSON Lines receiver metrics_port: 9427 # vlagent self-metrics image: "{{ telemetry_images['victoriametrics/vlagent'] | default('docker.io/victoriametrics/vlagent:v1.49.0') }}" - replicas: 1 + replicas: 2 pvc_size: "5Gi" # Disk buffer remote_write_url: "http://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local:9428/insert/jsonline" tmp_data_path: "/vlagent-buffer" From 9dfba37e3ecbbeac07133c3c8351bb954df00e9f Mon Sep 17 00:00:00 2001 From: Kratika_Patidar Date: Tue, 28 Apr 2026 12:06:42 +0000 Subject: [PATCH 2/7] vector updates --- .../telemetry/cleanup_telemetry.sh.j2 | 84 +++++++++++++- .../vector/vector-ldms-config.toml.j2 | 104 ++++++------------ .../vector/vector-ldms-deployment.yaml.j2 | 13 +-- 3 files changed, 122 insertions(+), 79 deletions(-) diff --git a/provision/roles/telemetry/templates/telemetry/cleanup_telemetry.sh.j2 b/provision/roles/telemetry/templates/telemetry/cleanup_telemetry.sh.j2 index a4b391519f..0470d3269b 100644 --- 
a/provision/roles/telemetry/templates/telemetry/cleanup_telemetry.sh.j2 +++ b/provision/roles/telemetry/templates/telemetry/cleanup_telemetry.sh.j2 @@ -18,13 +18,15 @@ # Telemetry Stack Cleanup Script # Removes Kafka, LDMS, iDRAC telemetry, and monitoring resources from the {{ telemetry_namespace }} namespace # -# Usage: ./cleanup_telemetry.sh [kafka] [ldms] [idrac] [victoria] [victorialogs] [powerscale] [all] +# Usage: ./cleanup_telemetry.sh [kafka] [ldms] [idrac] [victoria] [victorialogs] [powerscale] [vector-ldms] [vector-ome] [all] # kafka - Delete Kafka cluster, users, and bridge # ldms - Delete LDMS aggregator and store # idrac - Delete iDRAC telemetry # victoria - Delete VictoriaMetrics monitoring (vmcluster, vmagent) # victorialogs - Delete VictoriaLogs only (vlagent, vlcluster) without affecting VictoriaMetrics # powerscale - Delete PowerScale telemetry (karavi-observability Helm release, CSM Metrics, OTEL Collector) +# vector-ldms - Delete Vector-LDMS bridge (Kafka-to-VictoriaMetrics pipeline) +# vector-ome - Delete Vector-OME bridge (Kafka-to-Victoria pipeline for OME data) # all - Delete everything (default if no arguments) # @@ -39,6 +41,8 @@ CLEAN_IDRAC=false CLEAN_VICTORIA=false CLEAN_VICTORIALOGS=false CLEAN_POWERSCALE=false +CLEAN_VECTOR_LDMS=false +CLEAN_VECTOR_OME=false CLEAN_ALL=false if [ $# -eq 0 ]; then @@ -64,11 +68,17 @@ else powerscale) CLEAN_POWERSCALE=true ;; + vector-ldms) + CLEAN_VECTOR_LDMS=true + ;; + vector-ome) + CLEAN_VECTOR_OME=true + ;; all) CLEAN_ALL=true ;; -h|--help) - echo "Usage: $0 [kafka] [ldms] [idrac] [victoria] [victorialogs] [powerscale] [all]" + echo "Usage: $0 [kafka] [ldms] [idrac] [victoria] [victorialogs] [powerscale] [vector-ldms] [vector-ome] [all]" echo "" echo "Options:" echo " kafka - Delete Kafka cluster, users, and bridge" @@ -77,6 +87,8 @@ else echo " victoria - Delete VictoriaMetrics monitoring (vmcluster, vmagent)" echo " victorialogs - Delete VictoriaLogs only (vlagent, vlcluster) without affecting 
VictoriaMetrics" echo " powerscale - Delete PowerScale telemetry (karavi-observability Helm release)" + echo " vector-ldms - Delete Vector-LDMS bridge (Kafka-to-VictoriaMetrics pipeline)" + echo " vector-ome - Delete Vector-OME bridge (Kafka-to-Victoria pipeline for OME data)" echo " all - Delete everything (default if no arguments)" echo "" echo "Examples:" @@ -86,6 +98,8 @@ else echo " $0 idrac victoria # Delete only iDRAC and VictoriaMetrics" echo " $0 victorialogs # Delete only VictoriaLogs (keeps VictoriaMetrics running)" echo " $0 powerscale # Delete only PowerScale telemetry" + echo " $0 vector-ldms # Delete only Vector-LDMS bridge" + echo " $0 vector-ome # Delete only Vector-OME bridge" exit 0 ;; *) @@ -105,6 +119,8 @@ if [ "$CLEAN_ALL" = true ]; then CLEAN_VICTORIA=true CLEAN_VICTORIALOGS=true CLEAN_POWERSCALE=true + CLEAN_VECTOR_LDMS=true + CLEAN_VECTOR_OME=true fi echo "==========================================" @@ -119,6 +135,8 @@ echo " iDRAC Telemetry: $([ "$CLEAN_IDRAC" = true ] && echo "YES" || echo "NO" echo " VictoriaMetrics: $([ "$CLEAN_VICTORIA" = true ] && echo "YES" || echo "NO")" echo " VictoriaLogs: $([ "$CLEAN_VICTORIALOGS" = true ] && echo "YES" || echo "NO")" echo " PowerScale Tel.: $([ "$CLEAN_POWERSCALE" = true ] && echo "YES" || echo "NO")" +echo " Vector-LDMS: $([ "$CLEAN_VECTOR_LDMS" = true ] && echo "YES" || echo "NO")" +echo " Vector-OME: $([ "$CLEAN_VECTOR_OME" = true ] && echo "YES" || echo "NO")" echo "" read -p "Continue? 
(y/N): " -n 1 -r echo @@ -215,6 +233,52 @@ if [ "$CLEAN_LDMS" = true ]; then echo "" fi +if [ "$CLEAN_VECTOR_LDMS" = true ]; then + echo "Step 3a: Delete Vector-LDMS Bridge" + echo "-----------------------------------" + # Vector-LDMS Deployment + delete_resource deployment {{ vector.ldms.app_name }} + + # Vector-LDMS Service + delete_resource service {{ vector.ldms.service_name }} + + # Vector-LDMS ConfigMap + delete_resource configmap {{ vector.ldms.app_name }}-config + + # Delete Vector-LDMS pods + delete_all pod "app={{ vector.ldms.app_name }}" + + echo "Note: kafkapump KafkaUser secret is shared with LDMS store_avro_kafka" + echo " and is NOT deleted during Vector-LDMS cleanup." + + sleep 2 + echo "" +fi + +if [ "$CLEAN_VECTOR_OME" = true ]; then + echo "Step 3b: Delete Vector-OME Bridge" + echo "----------------------------------" + # Vector-OME Deployment + delete_resource deployment {{ vector.ome.app_name }} + + # Vector-OME Service + delete_resource service {{ vector.ome.service_name }} + + # Vector-OME ConfigMap + delete_resource configmap {{ vector.ome.app_name }}-config + + # Delete Vector-OME pods + delete_all pod "app={{ vector.ome.app_name }}" + + # Delete Vector-OME KafkaUser (dedicated, not shared) + delete_resource kafkauser {{ vector.ome.kafka_user }} + + echo "Note: Vector-OME KafkaUser '{{ vector.ome.kafka_user }}' is dedicated and has been deleted." 
+ + sleep 2 + echo "" +fi + if [ "$CLEAN_KAFKA" = true ]; then echo "Step 4: Delete Kafka Users" echo "--------------------------" @@ -515,6 +579,12 @@ fi if [ "$CLEAN_LDMS" = true ]; then kubectl -n $NAMESPACE delete pod -l app=nersc-ldms --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true fi +if [ "$CLEAN_VECTOR_LDMS" = true ]; then + kubectl -n $NAMESPACE delete pod -l app={{ vector.ldms.app_name }} --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true +fi +if [ "$CLEAN_VECTOR_OME" = true ]; then + kubectl -n $NAMESPACE delete pod -l app={{ vector.ome.app_name }} --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true +fi if [ "$CLEAN_IDRAC" = true ]; then kubectl -n $NAMESPACE delete pod -l app=idrac-telemetry --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true fi @@ -551,6 +621,16 @@ if [ "$CLEAN_LDMS" = true ]; then kubectl -n $NAMESPACE get statefulset,pod,configmap -l app=nersc-ldms 2>/dev/null || echo " None" echo "" fi +if [ "$CLEAN_VECTOR_LDMS" = true ]; then + echo "Remaining Vector-LDMS resources:" + kubectl -n $NAMESPACE get deployment,service,configmap,pod -l app={{ vector.ldms.app_name }} 2>/dev/null || echo " None" + echo "" +fi +if [ "$CLEAN_VECTOR_OME" = true ]; then + echo "Remaining Vector-OME resources:" + kubectl -n $NAMESPACE get deployment,service,configmap,pod -l app={{ vector.ome.app_name }} 2>/dev/null || echo " None" + echo "" +fi if [ "$CLEAN_IDRAC" = true ]; then echo "Remaining iDRAC resources:" kubectl -n $NAMESPACE get statefulset,pod,configmap -l app=idrac-telemetry 2>/dev/null || echo " None" diff --git a/provision/roles/telemetry/templates/telemetry/vector/vector-ldms-config.toml.j2 b/provision/roles/telemetry/templates/telemetry/vector/vector-ldms-config.toml.j2 index 179290a673..0dce70e14c 100644 --- a/provision/roles/telemetry/templates/telemetry/vector/vector-ldms-config.toml.j2 +++ b/provision/roles/telemetry/templates/telemetry/vector/vector-ldms-config.toml.j2 
@@ -25,14 +25,13 @@ fetch_wait_max_ms = 100 # Kafka mTLS authentication (reuses kafkapump KafkaUser secret) [sources.kafka_ldms.tls] enabled = true -ca_file = "/etc/vector/kafka-certs/ca.crt" +ca_file = "/etc/vector/kafka-ca/ca.crt" crt_file = "/etc/vector/kafka-certs/user.crt" key_file = "/etc/vector/kafka-certs/user.key" verify_certificate = true verify_hostname = true # Decoding: LDMS store_avro_kafka produces JSON messages -# Message format: {"metric_name": "...", "value": ..., "timestamp": ..., "host": "...", "plugin": "...", "component": "..."} [sources.kafka_ldms.decoding] codec = "json" @@ -41,83 +40,59 @@ codec = "json" # ============================================================================ # Purpose: Convert LDMS fields to Prometheus label format # Input: LDMS JSON message from store_avro_kafka plugin -# Output: Prometheus-compatible metric with labels +# Output: Normalized log event with Prometheus-compatible field names # Spec Reference: §4.1.3.2 LDMS Data Transformation Chain [transforms.ldms_schema_normalizer] type = "remap" inputs = ["kafka_ldms"] source = ''' -# Parse LDMS message fields -# LDMS store_avro_kafka produces: {metric_name, value, timestamp, host, plugin, component, ...} +# Extract metric name and plugin; ?? is VRL's error-coalescing operator, providing a fallback when to_string fails +metric_name = to_string(.metric_name) ?? "unknown_metric" +plugin = to_string(.plugin) ?? "unknown_plugin" -# Extract metric name and value -metric_name = .metric_name ?? "unknown_metric" -plugin = .plugin ?? "unknown_plugin" -metric_value = to_float!(.value) +# Extract numeric value +metric_value = to_float(.value) ?? 0.0 # Build Prometheus metric name: {plugin}_{metric_name} -.__name__ = plugin + "_" + metric_name +.metric_name_full = plugin + "_" + metric_name # Map LDMS fields to Prometheus labels -.instance = .host ?? "unknown_host" +.instance = to_string(.host) ?? "unknown_host" .job = "ldms" -.plugin = plugin -.component = .component ?? 
"unknown_component" - -# Preserve timestamp (convert to nanoseconds if needed) -if exists(.timestamp) { - .timestamp = to_unix_timestamp(.timestamp, unit: "nanoseconds") -} else { - .timestamp = now() -} +.plugin_name = plugin +.component_name = to_string(.component) ?? "unknown_component" +.source_subsystem = "ldms" +.topic_name = "{{ vector.ldms.kafka_topic }}" -# Set metric value -.value = metric_value +# Set metric value as float +.gauge_value = metric_value -# Remove original LDMS fields to avoid duplication +# Clean up original fields del(.metric_name) del(.host) -''' - -# ============================================================================ -# TRANSFORM: Metric Enricher -# ============================================================================ -# Purpose: Add telemetry pipeline metadata labels -# Spec Reference: §4.1.3.2 metric_enricher transform -[transforms.metric_enricher] -type = "remap" -inputs = ["ldms_schema_normalizer"] -source = ''' -# Add source subsystem label -.source_subsystem = "ldms" - -# Add Kafka topic name for traceability -.topic_name = "{{ vector.ldms.kafka_topic }}" - -# Add ingestion timestamp (when Vector processed the message) -.ingestion_timestamp = format_timestamp!(now(), format: "%+") +del(.plugin) +del(.component) +del(.value) ''' # ============================================================================ # TRANSFORM: Log to Metric Conversion # ============================================================================ -# Purpose: Convert Vector internal log events to metrics for monitoring -# This ensures all data flowing through Vector is in metric format +# Purpose: Convert log events to Prometheus gauge metrics +# Required because prometheus_remote_write sink only accepts metric events [transforms.log_to_metric] type = "log_to_metric" -inputs = ["metric_enricher"] - -[[transforms.log_to_metric.metrics]] -type = "gauge" -field = "value" -name = "{{ '{{' }} __name__ {{ '}}' }}" -namespace = "ldms" -tags.instance 
= "{{ '{{' }} instance {{ '}}' }}" -tags.job = "{{ '{{' }} job {{ '}}' }}" -tags.plugin = "{{ '{{' }} plugin {{ '}}' }}" -tags.component = "{{ '{{' }} component {{ '}}' }}" -tags.source_subsystem = "{{ '{{' }} source_subsystem {{ '}}' }}" -tags.topic_name = "{{ '{{' }} topic_name {{ '}}' }}" +inputs = ["ldms_schema_normalizer"] + + [[transforms.log_to_metric.metrics]] + type = "gauge" + field = "gauge_value" + name = "{% raw %}{{metric_name_full}}{% endraw %}" + tags.instance = "{% raw %}{{instance}}{% endraw %}" + tags.job = "{% raw %}{{job}}{% endraw %}" + tags.plugin = "{% raw %}{{plugin_name}}{% endraw %}" + tags.component = "{% raw %}{{component_name}}{% endraw %}" + tags.source_subsystem = "{% raw %}{{source_subsystem}}{% endraw %}" # ============================================================================ # SINK: Prometheus Remote Write to vmagent-vector @@ -134,7 +109,7 @@ healthcheck.enabled = true # Batch settings for efficient writes [sinks.victoria_metrics.batch] -max_bytes = 1048576 # 1 MB +max_bytes = 1048576 timeout_secs = 5 # Buffer settings (in-memory before write) @@ -148,22 +123,13 @@ retry_attempts = 5 retry_max_duration_secs = 300 timeout_secs = 60 -# Encoding -[sinks.victoria_metrics.encoding] -codec = "json" - # ============================================================================ # INTERNAL TELEMETRY: Vector Self-Monitoring # ============================================================================ -# Expose Vector internal metrics on port {{ vector.ldms.metrics_port }} +# Expose Vector internal metrics and health on port {{ vector.ldms.metrics_port }} +# Vector serves both /health and /metrics on the same API endpoint # Scraped by vmagent for Vector pipeline monitoring [api] enabled = true address = "0.0.0.0:{{ vector.ldms.metrics_port }}" playground = false - -# Health check endpoint on port {{ vector.ldms.health_port }} -# Used by Kubernetes liveness/readiness probes -[api.healthcheck] -enabled = true -address = "0.0.0.0:{{ 
vector.ldms.health_port }}" diff --git a/provision/roles/telemetry/templates/telemetry/vector/vector-ldms-deployment.yaml.j2 b/provision/roles/telemetry/templates/telemetry/vector/vector-ldms-deployment.yaml.j2 index 6bf0f909fb..9f030c77dd 100644 --- a/provision/roles/telemetry/templates/telemetry/vector/vector-ldms-deployment.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/vector/vector-ldms-deployment.yaml.j2 @@ -59,31 +59,28 @@ spec: memory: {{ vector.ldms.resources.limits.memory }} cpu: {{ vector.ldms.resources.limits.cpu }} - # Ports: health check and metrics + # Ports: metrics and health (Vector serves both on same port) ports: - - name: health - containerPort: {{ vector.ldms.health_port }} - protocol: TCP - name: metrics containerPort: {{ vector.ldms.metrics_port }} protocol: TCP - # Liveness probe: Vector health endpoint + # Liveness probe: Vector health endpoint (served on metrics port) # Spec Reference: §4.1.3.3 Provided Interfaces livenessProbe: httpGet: path: /health - port: {{ vector.ldms.health_port }} + port: {{ vector.ldms.metrics_port }} initialDelaySeconds: 30 periodSeconds: 10 timeoutSeconds: 5 failureThreshold: 3 - # Readiness probe: Vector health endpoint + # Readiness probe: Vector health endpoint (served on metrics port) readinessProbe: httpGet: path: /health - port: {{ vector.ldms.health_port }} + port: {{ vector.ldms.metrics_port }} initialDelaySeconds: 10 periodSeconds: 5 timeoutSeconds: 3 From 5132b0efff9688deab4aa0c0976ce9bd8b832970 Mon Sep 17 00:00:00 2001 From: Kratika_Patidar Date: Thu, 30 Apr 2026 12:24:36 +0000 Subject: [PATCH 3/7] vector-ldms metrics chnages and image change --- .../config/x86_64/rhel/10.0/service_k8s.json | 2 +- .../tasks/deploy_telemetry_manifests.yml | 58 ++++++++ .../tasks/derive_sink_support_flags.yml | 12 ++ provision/roles/telemetry/tasks/main.yml | 11 ++ .../vector/vector-ldms-config.toml.j2 | 137 +++++++++++------- .../vector/vector-ldms-deployment.yaml.j2 | 17 ++- 
provision/roles/telemetry/vars/main.yml | 2 +- 7 files changed, 179 insertions(+), 60 deletions(-) create mode 100644 provision/roles/telemetry/tasks/deploy_telemetry_manifests.yml diff --git a/input/config/x86_64/rhel/10.0/service_k8s.json b/input/config/x86_64/rhel/10.0/service_k8s.json index 313fa782d0..d0e2f02f25 100644 --- a/input/config/x86_64/rhel/10.0/service_k8s.json +++ b/input/config/x86_64/rhel/10.0/service_k8s.json @@ -43,7 +43,7 @@ { "package": "docker.io/victoriametrics/operator", "tag": "v0.68.3", "type": "image" }, { "package": "docker.io/victoriametrics/operator", "tag": "config-reloader-v0.68.3", "type": "image" }, { "package": "victoria-metrics-operator-0.59.3", "type": "tarball", "url": "https://github.com/VictoriaMetrics/helm-charts/releases/download/victoria-metrics-operator-0.59.3/victoria-metrics-operator-0.59.3.tgz" }, - { "package": "docker.io/timberio/vector", "tag": "0.54.0-alpine", "type": "image" }, + { "package": "docker.io/timberio/vector", "tag": "0.54.0-debian", "type": "image" }, { "package": "apptainer", "type": "rpm", "repo_name": "epel" }, { "package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } ] diff --git a/provision/roles/telemetry/tasks/deploy_telemetry_manifests.yml b/provision/roles/telemetry/tasks/deploy_telemetry_manifests.yml new file mode 100644 index 0000000000..6bdff200c2 --- /dev/null +++ b/provision/roles/telemetry/tasks/deploy_telemetry_manifests.yml @@ -0,0 +1,58 @@ +--- +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# Deploy Telemetry Manifests +# Purpose: Apply generated Kubernetes manifests to deploy telemetry stack +# This task replicates the functionality of telemetry.sh when running provision.yml directly + +- name: Create telemetry deployment script + ansible.builtin.template: + src: "{{ role_path }}/../configure_ochami/templates/telemetry/telemetry.sh.j2" + dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/telemetry.sh" + mode: "{{ hostvars['localhost']['file_permissions_755'] }}" + vars: + k8s_client_mount_path: "{{ hostvars['localhost']['k8s_client_share_path'] }}" + +- name: Display telemetry deployment script location + ansible.builtin.debug: + msg: "✓ Telemetry deployment script created at {{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/telemetry.sh" + +- name: Execute telemetry deployment script + ansible.builtin.shell: | + cd {{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry + ./telemetry.sh + register: telemetry_deploy_result + when: + - auto_deploy_telemetry | default(false) | bool + +- name: Display deployment result + ansible.builtin.debug: + msg: | + Telemetry deployment script executed. + To manually deploy telemetry stack, run: + {{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/telemetry.sh + when: + - auto_deploy_telemetry | default(false) | bool + - telemetry_deploy_result is defined + +- name: Instructions for manual deployment + ansible.builtin.debug: + msg: | + Telemetry manifests generated successfully. + To deploy the telemetry stack, run one of: + 1. {{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/telemetry.sh + 2. 
kubectl apply -k {{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/ + when: + - not (auto_deploy_telemetry | default(false) | bool) diff --git a/provision/roles/telemetry/tasks/derive_sink_support_flags.yml b/provision/roles/telemetry/tasks/derive_sink_support_flags.yml index 319a171dd4..36d0a78bcb 100644 --- a/provision/roles/telemetry/tasks/derive_sink_support_flags.yml +++ b/provision/roles/telemetry/tasks/derive_sink_support_flags.yml @@ -36,15 +36,18 @@ victoria_metrics_support: false victoria_logs_support: false kafka_support: false + cacheable: true - name: Set ldms_support based on telemetry_config.yml ansible.builtin.set_fact: ldms_support: "{{ telemetry_config.telemetry_sources.ldms.metrics_enabled | default(false) | bool }}" + cacheable: true - name: Map telemetry_sources to legacy feature flags ansible.builtin.set_fact: idrac_telemetry_support: "{{ telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool }}" dcgm_support: "{{ telemetry_config.telemetry_sources.dcgm.metrics_enabled | default(true) | bool }}" + cacheable: true - name: Map powerscale source + configurations to legacy powerscale_configurations ansible.builtin.set_fact: @@ -59,6 +62,7 @@ - name: Check if any source targets victoria_metrics ansible.builtin.set_fact: victoria_metrics_support: true + cacheable: true when: >- 'victoria_metrics' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([])) or 'victoria_metrics' in (telemetry_config.telemetry_sources.powerscale.collection_targets | default([])) @@ -66,12 +70,14 @@ - name: Check if any source targets victoria_logs ansible.builtin.set_fact: victoria_logs_support: true + cacheable: true when: >- 'victoria_logs' in (telemetry_config.telemetry_sources.powerscale.collection_targets | default([])) - name: Check if any source targets Kafka ansible.builtin.set_fact: kafka_support: true + cacheable: true when: >- 'kafka' in 
(telemetry_config.telemetry_sources.idrac.collection_targets | default([])) or 'kafka' in (telemetry_config.telemetry_sources.ldms.collection_targets | default([])) @@ -87,6 +93,7 @@ - name: Enable Kafka if Vector-LDMS bridge is enabled (requires LDMS source) ansible.builtin.set_fact: kafka_support: true + cacheable: true when: - telemetry_config.telemetry_bridges.vector_ldms.metrics_enabled | default(false) | bool - ldms_support | default(false) | bool @@ -94,6 +101,7 @@ - name: Enable VictoriaMetrics if Vector-LDMS bridge is enabled (requires LDMS source) ansible.builtin.set_fact: victoria_metrics_support: true + cacheable: true when: - telemetry_config.telemetry_bridges.vector_ldms.metrics_enabled | default(false) | bool - ldms_support | default(false) | bool @@ -101,24 +109,28 @@ - name: Enable Kafka if Vector-OME metrics bridge is enabled ansible.builtin.set_fact: kafka_support: true + cacheable: true when: - telemetry_config.telemetry_bridges.vector_ome.metrics_enabled | default(false) | bool - name: Enable VictoriaMetrics if Vector-OME metrics bridge is enabled ansible.builtin.set_fact: victoria_metrics_support: true + cacheable: true when: - telemetry_config.telemetry_bridges.vector_ome.metrics_enabled | default(false) | bool - name: Enable Kafka if Vector-OME logs bridge is enabled ansible.builtin.set_fact: kafka_support: true + cacheable: true when: - telemetry_config.telemetry_bridges.vector_ome.logs_enabled | default(false) | bool - name: Enable VictoriaLogs if Vector-OME logs bridge is enabled ansible.builtin.set_fact: victoria_logs_support: true + cacheable: true when: - telemetry_config.telemetry_bridges.vector_ome.logs_enabled | default(false) | bool diff --git a/provision/roles/telemetry/tasks/main.yml b/provision/roles/telemetry/tasks/main.yml index 32befa6ba1..15ce8eebe2 100644 --- a/provision/roles/telemetry/tasks/main.yml +++ b/provision/roles/telemetry/tasks/main.yml @@ -94,6 +94,17 @@ - ldms_support - pxe_changed | default(false) | bool +# - 
name: Deploy telemetry manifests (create telemetry.sh script) +# ansible.builtin.include_tasks: deploy_telemetry_manifests.yml +# when: +# - >- +# (telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool) or +# (telemetry_config.telemetry_sources.ldms.metrics_enabled | default(false) | bool) or +# (telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool) or +# ldms_support | default(false) | bool +# tags: +# - telemetry_deployment + - name: Apply telemetry configurations on upgrade ansible.builtin.include_tasks: apply_telemetry_on_upgrade.yml when: diff --git a/provision/roles/telemetry/templates/telemetry/vector/vector-ldms-config.toml.j2 b/provision/roles/telemetry/templates/telemetry/vector/vector-ldms-config.toml.j2 index 0dce70e14c..a26719b6f4 100644 --- a/provision/roles/telemetry/templates/telemetry/vector/vector-ldms-config.toml.j2 +++ b/provision/roles/telemetry/templates/telemetry/vector/vector-ldms-config.toml.j2 @@ -31,68 +31,98 @@ key_file = "/etc/vector/kafka-certs/user.key" verify_certificate = true verify_hostname = true -# Decoding: LDMS store_avro_kafka produces JSON messages +# Decoding: LDMS store_avro_kafka produces flat JSON messages +# Message format: {"hostname":"node01","instance":"node01/loadavg","component_id":0,"load1min":0.5,...} +# Each message has metadata fields + multiple numeric metric fields (one per sampler metric) [sources.kafka_ldms.decoding] codec = "json" # ============================================================================ -# TRANSFORM: LDMS Schema Normalizer +# TRANSFORM: LDMS Metric Fan-Out (Lua) # ============================================================================ -# Purpose: Convert LDMS fields to Prometheus label format -# Input: LDMS JSON message from store_avro_kafka plugin -# Output: Normalized log event with Prometheus-compatible field names -# Spec Reference: §4.1.3.2 LDMS Data Transformation Chain -[transforms.ldms_schema_normalizer] -type = 
"remap" +# Purpose: Fan out flat LDMS messages into individual metric events +# +# LDMS store_avro_kafka produces one Kafka message per sampler per interval. +# Each message is a FLAT JSON with metadata + multiple numeric metric fields: +# {"hostname":"node01","instance":"node01/loadavg","component_id":0, +# "load1min":0.5,"load5min":0.3,"load15min":0.2,...} +# +# Vector's remap transform outputs exactly ONE event per input. +# Lua's emit() enables the required one-to-many fan-out: +# 1 input message → N output events (one per numeric metric field) +# +# Output events: {MetricName, MetricValue, instance, job, plugin_name, component_id} +# These feed into log_to_metric for conversion to Prometheus gauges. +[transforms.ldms_fan_out] +type = "lua" +version = "2" inputs = ["kafka_ldms"] +hooks.process = "process" source = ''' -# Extract metric name and plugin using string coalescing (not ?? which is for errors) -metric_name = to_string(.metric_name) ?? "unknown_metric" -plugin = to_string(.plugin) ?? "unknown_plugin" - -# Extract numeric value -metric_value = to_float(.value) ?? 0.0 - -# Build Prometheus metric name: {plugin}_{metric_name} -.metric_name_full = plugin + "_" + metric_name - -# Map LDMS fields to Prometheus labels -.instance = to_string(.host) ?? "unknown_host" -.job = "ldms" -.plugin_name = plugin -.component_name = to_string(.component) ?? "unknown_component" -.source_subsystem = "ldms" -.topic_name = "{{ vector.ldms.kafka_topic }}" - -# Set metric value as float -.gauge_value = metric_value - -# Clean up original fields -del(.metric_name) -del(.host) -del(.plugin) -del(.component) -del(.value) +-- Numeric metadata fields to exclude from metric fan-out. +-- String fields (hostname, instance, source_type, topic, etc.) are excluded +-- automatically by the type(v) == "number" check. 
+local skip = { + timestamp = true, + component_id = true, + job_id = true, + app_id = true, + offset = true, + partition = true, +} + +function process(event, emit) + local log = event.log + local hostname = log.hostname or "unknown" + + -- Extract schema from instance: "host.test/loadavg" → "loadavg" + local inst = tostring(log.instance or "") + local schema = inst:match("/([^/]+)$") or "unknown" + + local comp = tostring(log.component_id or "") + + -- Fan out: emit one metric event per numeric field + for k, v in pairs(log) do + if not skip[k] and type(v) == "number" then + emit({ + log = { + MetricName = "ldms_" .. schema .. "_" .. k, + MetricValue = v, + instance = hostname, + job = "ldms", + plugin_name = schema, + component_id = comp, + } + }) + end + end +end ''' # ============================================================================ -# TRANSFORM: Log to Metric Conversion +# TRANSFORM: Log-to-Metric Conversion (MANDATORY for prometheus_remote_write) # ============================================================================ -# Purpose: Convert log events to Prometheus gauge metrics -# Required because prometheus_remote_write sink only accepts metric events -[transforms.log_to_metric] +# Purpose: Convert fan-out log events to Prometheus gauge metrics +# The prometheus_remote_write sink ONLY accepts metric-type events. +# Without this transform, all events are silently dropped by the sink. 
+[transforms.ldms_to_metrics] type = "log_to_metric" -inputs = ["ldms_schema_normalizer"] +inputs = ["ldms_fan_out"] - [[transforms.log_to_metric.metrics]] +{% raw %} + [[transforms.ldms_to_metrics.metrics]] type = "gauge" - field = "gauge_value" - name = "{% raw %}{{metric_name_full}}{% endraw %}" - tags.instance = "{% raw %}{{instance}}{% endraw %}" - tags.job = "{% raw %}{{job}}{% endraw %}" - tags.plugin = "{% raw %}{{plugin_name}}{% endraw %}" - tags.component = "{% raw %}{{component_name}}{% endraw %}" - tags.source_subsystem = "{% raw %}{{source_subsystem}}{% endraw %}" + field = "MetricValue" + name = "{{ MetricName }}" + + [transforms.ldms_to_metrics.metrics.tags] + instance = "{{ instance }}" + job = "{{ job }}" + plugin = "{{ plugin_name }}" + component = "{{ component_id }}" +{% endraw %} + source_subsystem = "ldms" + topic_name = "{{ vector.ldms.kafka_topic }}" # ============================================================================ # SINK: Prometheus Remote Write to vmagent-vector @@ -103,13 +133,15 @@ inputs = ["ldms_schema_normalizer"] # Spec Reference: §4.1.3.3 Internal Interfaces [sinks.victoria_metrics] type = "prometheus_remote_write" -inputs = ["log_to_metric"] +inputs = ["ldms_to_metrics"] endpoint = "http://{{ vector.vmagent_vector.service_name }}.{{ telemetry_namespace }}.svc.cluster.local:{{ vector.vmagent_vector.port }}/api/v1/write" -healthcheck.enabled = true +# Healthcheck disabled: vmagent returns 204 No Content which Vector +# treats as an error (expects 200). This is benign — vmagent is healthy. 
+healthcheck.enabled = false # Batch settings for efficient writes [sinks.victoria_metrics.batch] -max_bytes = 1048576 +max_bytes = 1048576 # 1 MB timeout_secs = 5 # Buffer settings (in-memory before write) @@ -123,6 +155,9 @@ retry_attempts = 5 retry_max_duration_secs = 300 timeout_secs = 60 +# Encoding: prometheus_remote_write uses protobuf by default (correct for VictoriaMetrics) +# No explicit encoding needed + # ============================================================================ # INTERNAL TELEMETRY: Vector Self-Monitoring # ============================================================================ diff --git a/provision/roles/telemetry/templates/telemetry/vector/vector-ldms-deployment.yaml.j2 b/provision/roles/telemetry/templates/telemetry/vector/vector-ldms-deployment.yaml.j2 index 9f030c77dd..a973c7266d 100644 --- a/provision/roles/telemetry/templates/telemetry/vector/vector-ldms-deployment.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/vector/vector-ldms-deployment.yaml.j2 @@ -27,8 +27,8 @@ spec: component: telemetry-bridge subsystem: ldms annotations: - # Force pod restart on config change (updated by Ansible on config modification) - omnia.dell.com/config-version: "{{ ansible_date_time.iso8601 }}" + # Force pod restart on config change (Ansible deployment timestamp) + deployment.omnia/timestamp: "{{ ansible_date_time.iso8601 }}" spec: # Security context for non-root execution securityContext: @@ -45,7 +45,7 @@ spec: # Command: Vector with TOML config validation command: - - "/usr/local/bin/vector" + - "/usr/bin/vector" args: - "--config" - "/etc/vector/vector.toml" @@ -59,14 +59,17 @@ spec: memory: {{ vector.ldms.resources.limits.memory }} cpu: {{ vector.ldms.resources.limits.cpu }} - # Ports: metrics and health (Vector serves both on same port) + # Ports: health check and metrics ports: + - name: health + containerPort: {{ vector.ldms.health_port }} + protocol: TCP - name: metrics containerPort: {{ vector.ldms.metrics_port }} 
protocol: TCP - # Liveness probe: Vector health endpoint (served on metrics port) - # Spec Reference: §4.1.3.3 Provided Interfaces + # Liveness probe: Vector health endpoint + # Vector serves /health and /metrics on the same API port (metrics_port) livenessProbe: httpGet: path: /health @@ -76,7 +79,7 @@ spec: timeoutSeconds: 5 failureThreshold: 3 - # Readiness probe: Vector health endpoint (served on metrics port) + # Readiness probe: Vector health endpoint readinessProbe: httpGet: path: /health diff --git a/provision/roles/telemetry/vars/main.yml b/provision/roles/telemetry/vars/main.yml index 5fe515b629..9bf5df2778 100644 --- a/provision/roles/telemetry/vars/main.yml +++ b/provision/roles/telemetry/vars/main.yml @@ -496,7 +496,7 @@ ps_dependency_fail_msg: >- # Vector image (shared by all Vector pods: vector-ldms, vector-ome) # Registered in service_k8s.json vector: - image: "{{ telemetry_images['timberio/vector'] | default('docker.io/timberio/vector:0.54.0-alpine') }}" + image: "{{ telemetry_images['timberio/vector'] | default('docker.io/timberio/vector:0.54.0-debian') }}" # Vector-LDMS configuration ldms: From b8a07c56f708c934b96cb6bc16fb617275e8b80c Mon Sep 17 00:00:00 2001 From: Kratika Patidar Date: Thu, 30 Apr 2026 18:07:49 +0530 Subject: [PATCH 4/7] Update telemetry_prereq.yml Signed-off-by: Kratika Patidar --- provision/roles/telemetry/tasks/telemetry_prereq.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/provision/roles/telemetry/tasks/telemetry_prereq.yml b/provision/roles/telemetry/tasks/telemetry_prereq.yml index 0c549fb831..c3f11151b1 100644 --- a/provision/roles/telemetry/tasks/telemetry_prereq.yml +++ b/provision/roles/telemetry/tasks/telemetry_prereq.yml @@ -144,7 +144,7 @@ src: telemetry/victoria/victoria-tls-secret.yaml.j2 dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/victoria-tls-secret.yaml" mode: "{{ hostvars['localhost']['file_permissions_644'] }}" - + # Create Vector deployment 
subdirectory # - name: Create Vector deployment subdirectory # ansible.builtin.file: From 2cf1ad8f57b73ae0777206d2314cb1caa78b8d88 Mon Sep 17 00:00:00 2001 From: Kratika Patidar Date: Mon, 4 May 2026 10:39:52 +0530 Subject: [PATCH 5/7] Set changed_when to false for telemetry deployment Prevent change detection for telemetry deployment. Signed-off-by: Kratika Patidar --- provision/roles/telemetry/tasks/deploy_telemetry_manifests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/provision/roles/telemetry/tasks/deploy_telemetry_manifests.yml b/provision/roles/telemetry/tasks/deploy_telemetry_manifests.yml index 6bdff200c2..d422028e62 100644 --- a/provision/roles/telemetry/tasks/deploy_telemetry_manifests.yml +++ b/provision/roles/telemetry/tasks/deploy_telemetry_manifests.yml @@ -34,6 +34,7 @@ cd {{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry ./telemetry.sh register: telemetry_deploy_result + changed_when: false when: - auto_deploy_telemetry | default(false) | bool From ca9e21d9ac4cf32fd80f16f2634102ef39e4e226 Mon Sep 17 00:00:00 2001 From: Kratika_Patidar Date: Mon, 4 May 2026 14:19:50 +0000 Subject: [PATCH 6/7] vecotr-ldms review comments --- ...ce_kube_control_plane_first_x86_64.yaml.j2 | 10 +--- .../tasks/deploy_telemetry_manifests.yml | 59 ------------------- .../telemetry/tasks/deploy_vector_ldms.yml | 12 ---- .../tasks/derive_sink_support_flags.yml | 25 +------- .../tasks/generate_telemetry_deployments.yml | 1 - .../tasks/generate_telemetry_script.yml | 34 +++++++++++ provision/roles/telemetry/tasks/main.yml | 30 ++++++---- .../telemetry/tasks/telemetry_prereq.yml | 20 +++---- .../templates/telemetry/telemetry.sh.j2 | 0 .../victoria/vmagent-scrape-config.yaml.j2 | 35 +++++++++++ 10 files changed, 103 insertions(+), 123 deletions(-) delete mode 100644 provision/roles/telemetry/tasks/deploy_telemetry_manifests.yml create mode 100644 provision/roles/telemetry/tasks/generate_telemetry_script.yml rename provision/roles/{configure_ochami => 
telemetry}/templates/telemetry/telemetry.sh.j2 (100%) diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 index 0d01edee47..21a5900e63 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 @@ -332,14 +332,6 @@ ipAddressPools: - first-pool -{% if idrac_telemetry_support or ldms_support %} - - path: /root/telemetry.sh - owner: root:root - permissions: '0755' - content: | - {{ lookup('template', 'templates/telemetry/telemetry.sh.j2') | indent(12) }} -{% endif %} - runcmd: - /usr/local/bin/set-ssh.sh - "systemctl enable chronyd" @@ -1048,7 +1040,7 @@ {% if idrac_telemetry_support or ldms_support %} echo "Applying Telemetry Kubernetes deployments" - /root/telemetry.sh + {{ k8s_client_mount_path }}/telemetry/telemetry.sh {% endif %} {% if powerscale_log_enabled | default(false) | bool %} diff --git a/provision/roles/telemetry/tasks/deploy_telemetry_manifests.yml b/provision/roles/telemetry/tasks/deploy_telemetry_manifests.yml deleted file mode 100644 index d422028e62..0000000000 --- a/provision/roles/telemetry/tasks/deploy_telemetry_manifests.yml +++ /dev/null @@ -1,59 +0,0 @@ ---- -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -# Deploy Telemetry Manifests -# Purpose: Apply generated Kubernetes manifests to deploy telemetry stack -# This task replicates the functionality of telemetry.sh when running provision.yml directly - -- name: Create telemetry deployment script - ansible.builtin.template: - src: "{{ role_path }}/../configure_ochami/templates/telemetry/telemetry.sh.j2" - dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/telemetry.sh" - mode: "{{ hostvars['localhost']['file_permissions_755'] }}" - vars: - k8s_client_mount_path: "{{ hostvars['localhost']['k8s_client_share_path'] }}" - -- name: Display telemetry deployment script location - ansible.builtin.debug: - msg: "✓ Telemetry deployment script created at {{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/telemetry.sh" - -- name: Execute telemetry deployment script - ansible.builtin.shell: | - cd {{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry - ./telemetry.sh - register: telemetry_deploy_result - changed_when: false - when: - - auto_deploy_telemetry | default(false) | bool - -- name: Display deployment result - ansible.builtin.debug: - msg: | - Telemetry deployment script executed. - To manually deploy telemetry stack, run: - {{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/telemetry.sh - when: - - auto_deploy_telemetry | default(false) | bool - - telemetry_deploy_result is defined - -- name: Instructions for manual deployment - ansible.builtin.debug: - msg: | - Telemetry manifests generated successfully. - To deploy the telemetry stack, run one of: - 1. {{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/telemetry.sh - 2. 
kubectl apply -k {{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/ - when: - - not (auto_deploy_telemetry | default(false) | bool) diff --git a/provision/roles/telemetry/tasks/deploy_vector_ldms.yml b/provision/roles/telemetry/tasks/deploy_vector_ldms.yml index c5b0fc6866..ee7fbdedc1 100644 --- a/provision/roles/telemetry/tasks/deploy_vector_ldms.yml +++ b/provision/roles/telemetry/tasks/deploy_vector_ldms.yml @@ -31,10 +31,6 @@ mode: "{{ hostvars['localhost']['file_permissions_644'] }}" register: vector_ldms_configmap_rendered -- name: Display ConfigMap render result - ansible.builtin.debug: - msg: "✓ Vector-LDMS ConfigMap rendered to {{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/vector/vector-ldms-configmap.yaml" - # ============================================================================ # Render Vector-LDMS Deployment # ============================================================================ @@ -45,10 +41,6 @@ mode: "{{ hostvars['localhost']['file_permissions_644'] }}" register: vector_ldms_deployment_rendered -- name: Display Deployment render result - ansible.builtin.debug: - msg: "✓ Vector-LDMS Deployment rendered to {{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/vector/vector-ldms-deployment.yaml" - # ============================================================================ # Render Vector-LDMS Service # ============================================================================ @@ -58,7 +50,3 @@ dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/vector/vector-ldms-service.yaml" mode: "{{ hostvars['localhost']['file_permissions_644'] }}" register: vector_ldms_service_rendered - -- name: Display Service render result - ansible.builtin.debug: - msg: "✓ Vector-LDMS Service rendered to {{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/vector/vector-ldms-service.yaml" diff --git 
a/provision/roles/telemetry/tasks/derive_sink_support_flags.yml b/provision/roles/telemetry/tasks/derive_sink_support_flags.yml index e248160f2f..0eb7db6bce 100644 --- a/provision/roles/telemetry/tasks/derive_sink_support_flags.yml +++ b/provision/roles/telemetry/tasks/derive_sink_support_flags.yml @@ -94,45 +94,26 @@ # If logs enabled, requires Kafka + VictoriaLogs # ============================================================================= -- name: Enable Kafka if Vector-LDMS bridge is enabled (requires LDMS source) +- name: Enable Kafka and VictoriaMetrics if Vector-LDMS bridge is enabled (requires LDMS source) ansible.builtin.set_fact: kafka_support: true - cacheable: true - when: - - telemetry_config.telemetry_bridges.vector_ldms.metrics_enabled | default(false) | bool - - ldms_support | default(false) | bool - -- name: Enable VictoriaMetrics if Vector-LDMS bridge is enabled (requires LDMS source) - ansible.builtin.set_fact: victoria_metrics_support: true cacheable: true when: - telemetry_config.telemetry_bridges.vector_ldms.metrics_enabled | default(false) | bool - ldms_support | default(false) | bool -- name: Enable Kafka if Vector-OME metrics bridge is enabled +- name: Enable Kafka and Victoria Metrics if Vector-OME metrics bridge is enabled ansible.builtin.set_fact: kafka_support: true - cacheable: true - when: - - telemetry_config.telemetry_bridges.vector_ome.metrics_enabled | default(false) | bool - -- name: Enable VictoriaMetrics if Vector-OME metrics bridge is enabled - ansible.builtin.set_fact: victoria_metrics_support: true cacheable: true when: - telemetry_config.telemetry_bridges.vector_ome.metrics_enabled | default(false) | bool -- name: Enable Kafka if Vector-OME logs bridge is enabled +- name: Enable Kafka and VictoriaLogs if Vector-OME logs bridge is enabled ansible.builtin.set_fact: kafka_support: true - cacheable: true - when: - - telemetry_config.telemetry_bridges.vector_ome.logs_enabled | default(false) | bool - -- name: Enable 
VictoriaLogs if Vector-OME logs bridge is enabled - ansible.builtin.set_fact: victoria_logs_support: true cacheable: true when: diff --git a/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml b/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml index d704a5c2c4..0e64077904 100644 --- a/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml +++ b/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml @@ -88,7 +88,6 @@ - ldms_support - "kafka.topics.ldms.name in kafka_topic_partitions" - - name: Generate Kafka topic files dynamically ansible.builtin.template: src: 'telemetry/kafka/kafka.topic.yaml.j2' diff --git a/provision/roles/telemetry/tasks/generate_telemetry_script.yml b/provision/roles/telemetry/tasks/generate_telemetry_script.yml new file mode 100644 index 0000000000..1bd364ea7e --- /dev/null +++ b/provision/roles/telemetry/tasks/generate_telemetry_script.yml @@ -0,0 +1,34 @@ +--- +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Generate Telemetry Deployment Script +# Purpose: Create telemetry.sh script in shared directory for cloud-init execution +# This replaces the inline script generation in cloud-init templates +# +# The script is created at: {{ k8s_client_share_path }}/telemetry/telemetry.sh +# Cloud-init will execute this script during node provisioning to deploy telemetry stack + +- name: Create telemetry deployment script + ansible.builtin.template: + src: "{{ role_path }}/templates/telemetry/telemetry.sh.j2" + dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/telemetry.sh" + mode: "{{ hostvars['localhost']['file_permissions_755'] }}" + vars: + k8s_client_mount_path: "{{ hostvars['localhost']['k8s_client_share_path'] }}" + +- name: Display telemetry deployment script location + ansible.builtin.debug: + msg: "Telemetry deployment script created at {{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/telemetry.sh" + verbosity: 1 diff --git a/provision/roles/telemetry/tasks/main.yml b/provision/roles/telemetry/tasks/main.yml index a0d1a08df5..d072be8cb7 100644 --- a/provision/roles/telemetry/tasks/main.yml +++ b/provision/roles/telemetry/tasks/main.yml @@ -77,6 +77,16 @@ - telemetry_deployment - vector_ldms +# - name: Deploy Vector-OME bridge (Kafka-to-Victoria pipeline for OME data) +# ansible.builtin.include_tasks: deploy_vector_ome.yml +# when: +# - telemetry_config.telemetry_bridges.vector_ome.metrics_enabled | default(false) | bool or +# telemetry_config.telemetry_bridges.vector_ome.logs_enabled | default(false) | bool +# - kafka_support | default(false) | bool +# tags: +# - telemetry_deployment +# - vector_ome + - name: Configure of k8s telemetry service when: - telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool @@ -105,16 +115,16 @@ - ldms_support - pxe_changed | default(false) | bool -# - name: Deploy telemetry manifests (create telemetry.sh script) -# ansible.builtin.include_tasks: 
deploy_telemetry_manifests.yml -# when: -# - >- -# (telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool) or -# (telemetry_config.telemetry_sources.ldms.metrics_enabled | default(false) | bool) or -# (telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool) or -# ldms_support | default(false) | bool -# tags: -# - telemetry_deployment +- name: Generate telemetry deployment script + ansible.builtin.include_tasks: generate_telemetry_script.yml + when: + - >- + (telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ldms.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool) or + ldms_support | default(false) | bool + tags: + - telemetry_deployment - name: Apply telemetry configurations on upgrade ansible.builtin.include_tasks: apply_telemetry_on_upgrade.yml diff --git a/provision/roles/telemetry/tasks/telemetry_prereq.yml b/provision/roles/telemetry/tasks/telemetry_prereq.yml index fa746b376a..039e66e472 100644 --- a/provision/roles/telemetry/tasks/telemetry_prereq.yml +++ b/provision/roles/telemetry/tasks/telemetry_prereq.yml @@ -144,13 +144,13 @@ mode: "{{ hostvars['localhost']['file_permissions_644'] }}" # Create Vector deployment subdirectory -# - name: Create Vector deployment subdirectory -# ansible.builtin.file: -# path: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/vector" -# state: directory -# mode: "{{ hostvars['localhost']['dir_permissions_755'] }}" -# when: -# - telemetry_config.telemetry_bridges.vector_ldms.metrics_enabled or -# telemetry_config.telemetry_bridges.vector_ome.metrics_enabled or -# telemetry_config.telemetry_bridges.vector_ome.log_enabled -# tags: telemetry_deployment +- name: Create Vector deployment subdirectory + ansible.builtin.file: + path: "{{ hostvars['localhost']['k8s_client_share_path'] 
}}/telemetry/deployments/vector" + state: directory + mode: "{{ hostvars['localhost']['dir_permissions_755'] }}" + when: + - telemetry_config.telemetry_bridges.vector_ldms.metrics_enabled | default(false) | bool or + telemetry_config.telemetry_bridges.vector_ome.metrics_enabled | default(false) | bool or + telemetry_config.telemetry_bridges.vector_ome.logs_enabled | default(false) | bool + tags: telemetry_deployment diff --git a/provision/roles/configure_ochami/templates/telemetry/telemetry.sh.j2 b/provision/roles/telemetry/templates/telemetry/telemetry.sh.j2 similarity index 100% rename from provision/roles/configure_ochami/templates/telemetry/telemetry.sh.j2 rename to provision/roles/telemetry/templates/telemetry/telemetry.sh.j2 diff --git a/provision/roles/telemetry/templates/telemetry/victoria/vmagent-scrape-config.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/vmagent-scrape-config.yaml.j2 index fe8f086c22..74fce2a773 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/vmagent-scrape-config.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/vmagent-scrape-config.yaml.j2 @@ -75,3 +75,38 @@ data: cluster_endpoint: "{{ cluster.endpoint }}" {% endfor %} {% endif %} +{% if telemetry_config.telemetry_bridges.vector_ldms.metrics_enabled | default(false) | bool %} + + # vmagent-vector write-buffer metrics (always deployed with Vector-LDMS) + - job_name: 'vmagent-vector' + kubernetes_sd_configs: + - role: pod + namespaces: + names: ['{{ telemetry_namespace }}'] + relabel_configs: + - source_labels: [__meta_kubernetes_pod_label_app] + regex: vmagent-vector + action: keep + - source_labels: [__meta_kubernetes_pod_name] + target_label: instance + - source_labels: [__meta_kubernetes_namespace] + target_label: namespace + + # Vector-LDMS internal metrics (port {{ vector.ldms.metrics_port }}) + - job_name: 'vector-ldms' + kubernetes_sd_configs: + - role: pod + namespaces: + names: ['{{ telemetry_namespace }}'] + 
relabel_configs: + - source_labels: [__meta_kubernetes_pod_label_app] + regex: {{ vector.ldms.app_name }} + action: keep + - source_labels: [__meta_kubernetes_pod_container_port_number] + regex: "{{ vector.ldms.metrics_port }}" + action: keep + - source_labels: [__meta_kubernetes_pod_name] + target_label: instance + - source_labels: [__meta_kubernetes_namespace] + target_label: namespace +{% endif %} From aff8edf0c6cb37f1c219bcd08e00a70a618f2c28 Mon Sep 17 00:00:00 2001 From: Kratika_Patidar Date: Mon, 4 May 2026 22:55:50 +0530 Subject: [PATCH 7/7] lint-fix --- provision/roles/telemetry/tasks/generate_telemetry_script.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/provision/roles/telemetry/tasks/generate_telemetry_script.yml b/provision/roles/telemetry/tasks/generate_telemetry_script.yml index 1bd364ea7e..17db736c75 100644 --- a/provision/roles/telemetry/tasks/generate_telemetry_script.yml +++ b/provision/roles/telemetry/tasks/generate_telemetry_script.yml @@ -16,7 +16,7 @@ # Generate Telemetry Deployment Script # Purpose: Create telemetry.sh script in shared directory for cloud-init execution # This replaces the inline script generation in cloud-init templates -# +# # The script is created at: {{ k8s_client_share_path }}/telemetry/telemetry.sh # Cloud-init will execute this script during node provisioning to deploy telemetry stack