diff --git a/ansible/install/group_vars/all.yml b/ansible/install/group_vars/all.yml index bcf23db0..593517cf 100644 --- a/ansible/install/group_vars/all.yml +++ b/ansible/install/group_vars/all.yml @@ -333,6 +333,13 @@ ovs_flows_monitoring: false # before enabling this plugin. ovn_monitoring: false +####################### +# OVN RAFT Monitoring +####################### +# Monitors OVN RAFT cluster status metrics on controllers. +ovn_raft_monitoring: false +ovn_raft_controller_collectd_interval: 30 + ####################### # Pacemaker Monitoring ####################### diff --git a/ansible/install/roles/collectd/tasks/main.yml b/ansible/install/roles/collectd/tasks/main.yml index e6e4f371..176730d2 100644 --- a/ansible/install/roles/collectd/tasks/main.yml +++ b/ansible/install/roles/collectd/tasks/main.yml @@ -259,6 +259,10 @@ -v /var/lib/openvswitch/ovn/ovnnb_db.sock:/var/lib/openvswitch/ovn/ovnnb_db.sock \ -v /var/lib/openvswitch/ovn/ovnsb_db.sock:/var/lib/openvswitch/ovn/ovnsb_db.sock \ {% endif %} + {% if ovn_raft_monitoring %} + -v /var/lib/openvswitch/ovn/ovnnb_db.ctl:/var/lib/openvswitch/ovn/ovnnb_db.ctl \ + -v /var/lib/openvswitch/ovn/ovnsb_db.ctl:/var/lib/openvswitch/ovn/ovnsb_db.ctl \ + {% endif %} {% if pacemaker_monitoring %} -v /home/{{ host_remote_user }}/collectd_pipe:/collectd_pipe \ {% endif %} @@ -277,4 +281,4 @@ podman exec -it -u root collectd-controller usermod -G wheel stack become: yes become_user: root - when: "config_type == 'controller' and ovn_monitoring" + when: "config_type == 'controller' and (ovn_monitoring or ovn_raft_monitoring)" diff --git a/ansible/install/roles/collectd/templates/controller.collectd.conf.j2 b/ansible/install/roles/collectd/templates/controller.collectd.conf.j2 index 0918d559..9de8a4be 100644 --- a/ansible/install/roles/collectd/templates/controller.collectd.conf.j2 +++ b/ansible/install/roles/collectd/templates/controller.collectd.conf.j2 @@ -585,6 +585,17 @@ LoadPlugin python {% endif %} +{%if 
ovn_raft_monitoring %} + + ModulePath "/usr/local/bin/" + Import "collectd_ovn_raft_monitoring" + + Interval {{ovn_raft_controller_collectd_interval}} + + + +{% endif %} + {%if gnocchi_status_controller_collectd_plugin %} {%if inventory_hostname == groups['Controller'][0] %} diff --git a/ansible/install/roles/grafana-dashboards/templates/openstack_general_system_performance.yaml.j2 b/ansible/install/roles/grafana-dashboards/templates/openstack_general_system_performance.yaml.j2 index e64ed683..c117a676 100644 --- a/ansible/install/roles/grafana-dashboards/templates/openstack_general_system_performance.yaml.j2 +++ b/ansible/install/roles/grafana-dashboards/templates/openstack_general_system_performance.yaml.j2 @@ -169,6 +169,8 @@ dashboard: {% include 'partials/ovn_db_tables.yaml' %} {% include 'partials/pacemaker_monitoring.yaml' %} + + {% include 'partials/ovn_raft_monitoring.yaml' %} {% endif %} {% include 'partials/ovn_metrics.yaml' %} diff --git a/ansible/install/roles/grafana-dashboards/templates/partials/ovn_raft_monitoring.yaml b/ansible/install/roles/grafana-dashboards/templates/partials/ovn_raft_monitoring.yaml new file mode 100644 index 00000000..56963d11 --- /dev/null +++ b/ansible/install/roles/grafana-dashboards/templates/partials/ovn_raft_monitoring.yaml @@ -0,0 +1,73 @@ + - title: OVN RAFT Metrics + collapse: true + height: 200px + showTitle: true + panels: + - title: $Cloud - $Node - OVN RAFT Northbound DB Metrics + type: graph + legend: + alignAsTable: true + avg: false + current: true + max: true + min: true + rightSide: true + show: true + total: false + values: true + nullPointMode: 'null' + targets: + - target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-nb_disconnections, 'disconnections') + - target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-nb_election_timer, 'election_timer') + - target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-nb_entries_not_applied, 'entries_not_yet_applied') + - target: 
alias($Cloud.$Node.ovn_raft_monitoring.gauge-nb_entries_not_committed, 'entries_not_yet_committed') + - target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-nb_term, 'term') + - title: $Cloud - $Node - OVN RAFT Northbound DB Leader Data + type: graph + legend: + alignAsTable: true + avg: false + current: true + max: true + min: true + rightSide: true + show: true + total: false + values: true + nullPointMode: 'null' + targets: + - target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-nb_is_leader, 'is_selected_controller_leader') + - title: $Cloud - $Node - OVN RAFT Southbound DB Metrics + type: graph + legend: + alignAsTable: true + avg: false + current: true + max: true + min: true + rightSide: true + show: true + total: false + values: true + nullPointMode: 'null' + targets: + - target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-sb_disconnections, 'disconnections') + - target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-sb_election_timer, 'election_timer') + - target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-sb_entries_not_applied, 'entries_not_yet_applied') + - target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-sb_entries_not_committed, 'entries_not_yet_committed') + - target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-sb_term, 'term') + - title: $Cloud - $Node - OVN RAFT Southbound DB Leader Data + type: graph + legend: + alignAsTable: true + avg: false + current: true + max: true + min: true + rightSide: true + show: true + total: false + values: true + nullPointMode: 'null' + targets: + - target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-sb_is_leader, 'is_selected_controller_leader') diff --git a/browbeat-containers/collectd-openstack/Dockerfile b/browbeat-containers/collectd-openstack/Dockerfile index 7da30095..c082a95a 100644 --- a/browbeat-containers/collectd-openstack/Dockerfile +++ b/browbeat-containers/collectd-openstack/Dockerfile @@ -30,6 +30,7 @@ ADD files/collectd_rabbitmq_monitoring.py /usr/local/bin/collectd_rabbitmq_monit ADD 
"""Collectd Python plugin that reports OVN RAFT cluster status metrics.

On every read cycle the plugin runs ``ovs-appctl cluster/status`` against
the OVN Northbound and Southbound database control sockets, extracts a
handful of counters from the textual output and dispatches each one as a
``gauge`` under the ``ovn_raft_monitoring`` plugin name.

Dispatched type instances (``<db>`` is ``nb`` or ``sb``):
    <db>_term, <db>_election_timer, <db>_entries_not_committed,
    <db>_entries_not_applied, <db>_disconnections, <db>_is_leader
"""
import subprocess

import collectd

PLUGIN_NAME = 'ovn_raft_monitoring'

# Interval attached to every dispatched value; overridden by the
# "Interval" option of the plugin's <Module> configuration block.
INTERVAL = 30

# (metric prefix, ovsdb-server control socket, database name)
_DATABASES = (
    ('nb', '/var/lib/openvswitch/ovn/ovnnb_db.ctl', 'OVN_Northbound'),
    ('sb', '/var/lib/openvswitch/ovn/ovnsb_db.ctl', 'OVN_Southbound'),
)

# (metric suffix, label of the matching "Label: value" line in the
# cluster/status output).  Order here is the dispatch order.
_INTEGER_FIELDS = (
    ('term', 'Term'),
    ('election_timer', 'Election timer'),
    ('entries_not_committed', 'Entries not yet committed'),
    ('entries_not_applied', 'Entries not yet applied'),
    ('disconnections', 'Disconnections'),
)


def config_func(config):
    """Apply the plugin configuration handed over by collectd.

    Only the ``Interval`` key (case-insensitive) is recognised; its first
    value replaces the module-level ``INTERVAL``.  Other keys are ignored.
    """
    global INTERVAL

    for node in config.children:
        if node.key.lower() == 'interval':
            INTERVAL = int(node.values[0])


def _leading_int(text, default=0):
    """Return the integer at the start of *text*, or *default*.

    Values are not always a bare number: as far as I recall, OVS appends
    a note such as "(changing to N)" to the election timer while a change
    is pending (TODO confirm against the deployed OVS version).  Only the
    first whitespace-separated token is parsed so such lines do not raise.
    """
    tokens = text.split()
    if not tokens:
        return default
    try:
        return int(tokens[0])
    except ValueError:
        return default


def _parse_cluster_status(output):
    """Turn ``cluster/status`` text into a ``{metric suffix: int}`` dict.

    Every metric defaults to 0 when its line is missing, which is also
    what an empty *output* (failed command) produces.  ``is_leader`` is 1
    only when the ``Role`` line reads exactly ``leader``.
    """
    fields = {}
    for line in output.splitlines():
        label, separator, value = line.partition(':')
        if separator:
            # Keep the first occurrence of a label, mirroring a
            # first-match scan over the output.
            fields.setdefault(label.strip(), value.strip())

    metrics = {}
    for suffix, label in _INTEGER_FIELDS:
        metrics[suffix] = _leading_int(fields.get(label, ''))
    metrics['is_leader'] = 1 if fields.get('Role') == 'leader' else 0
    return metrics


def _cluster_status(ctl_socket, database):
    """Run ``ovs-appctl cluster/status`` and return its stdout as text.

    Failures are logged through collectd instead of raised so that one
    unreachable database does not abort the whole read callback; in that
    case whatever was written to stdout (possibly nothing) is returned.
    """
    command = ['sudo', 'ovs-appctl', '-t', ctl_socket,
               'cluster/status', database]
    try:
        process = subprocess.Popen(command, stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        stdout, stderr = process.communicate()
    except OSError as exc:
        collectd.warning('%s: unable to run %s: %s'
                         % (PLUGIN_NAME, ' '.join(command), exc))
        return ''

    if process.returncode != 0:
        collectd.warning('%s: cluster/status for %s exited with %d: %s'
                         % (PLUGIN_NAME, database, process.returncode,
                            stderr.decode('utf-8', 'replace').strip()))

    # "replace" keeps an unexpected byte from raising inside the callback.
    return stdout.decode('utf-8', 'replace')


def _dispatch(type_instance, value):
    """Send a single gauge value to collectd."""
    metric = collectd.Values()
    metric.plugin = PLUGIN_NAME
    metric.interval = INTERVAL
    metric.type = 'gauge'
    metric.type_instance = type_instance
    metric.values = [value]
    metric.dispatch()


def read_func():
    """Collect and dispatch the RAFT metrics of both OVN databases."""
    suffixes = [suffix for suffix, _ in _INTEGER_FIELDS] + ['is_leader']

    for prefix, ctl_socket, database in _DATABASES:
        metrics = _parse_cluster_status(_cluster_status(ctl_socket, database))
        for suffix in suffixes:
            _dispatch('%s_%s' % (prefix, suffix), metrics[suffix])


collectd.register_config(config_func)
collectd.register_read(read_func)