Skip to content

Commit

Permalink
Record ping stats, fix arp lookup, switch dashboard and dynamic links
Browse files Browse the repository at this point in the history
  • Loading branch information
juztas committed Jun 24, 2024
1 parent 455e0b4 commit 26a87b4
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 30 deletions.
4 changes: 2 additions & 2 deletions autogole-api/packaging/files/etc/rtmon.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ sense_endpoints:
template_links:
- title: 'All Node Monitoring'
url: 'https://autogole-grafana.nrp-nautilus.io/d/D7xOxim4z/full-dtn-monitoring-variable?orgId=1&refresh=1m'
- title: 'All Switches Monitoring'
url: 'https://autogole-grafana.nrp-nautilus.io/d/1J9Zz1mWz/full-switch-monitoring-variable?orgId=1&refresh=1m'
- title: 'All Switches Monitoring $$REPLACEMESITENAME$$'
url: 'https://autogole-grafana.nrp-nautilus.io/d/efe9ac84-9df5-47a4-bea8-67a196771a0d/switch-monitoring?orgId=1&refresh=1m&var-Sitename=$$REPLACEMESITENAME$$&var-switch=All'

# Override URL for the NSI, ESnet, and Fabric mermaid diagrams. Most of those RMs report everything in a single SwitchingSubnet
# and we need to override it to show the actual topology (joint, or not). Additionally - most of those have no site name or
Expand Down
13 changes: 9 additions & 4 deletions autogole-api/src/python/RTMon/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,14 @@ def submit_exe(self, filename, fout):
template['folderId'] = folderInfo['id']
template['overwrite'] = True
self.g_addNewDashboard(template)
# 5. Update State to Running
# 5. Submit SiteRM Action to issue a ping test both ways
tmpOut = self.sr_submit_ping(instance=instance, manifest=manifest)
if tmpOut:
fout['ping'] = tmpOut
# 6. Update State to Running
fout['state'] = 'running'
fout.setdefault('retries', 0)
self._updateState(filename, fout)
# 6. Submit SiteRM Action to issue a ping test both ways
self.sr_submit_ping(instance=instance, manifest=manifest)

def delete_exe(self, filename, fout):
"""Delete Action Execution"""
Expand Down Expand Up @@ -103,7 +105,10 @@ def running_exe(self, filename, fout):
if self.config['template_tag'] in dashbVals['tags']:
self.logger.info('Dashboard is present in Grafana: %s', dashbName)
# Check if we need to re-issue ping test
self.sr_submit_ping(instance=fout.get('instance', {}), manifest=fout.get('manifest', {}))
tmpOut = self.sr_submit_ping(instance=fout.get('instance', {}), manifest=fout.get('manifest', {}))
if tmpOut:
fout['ping'] = tmpOut
self._updateState(filename, fout)
return
# Need to update the dashboard with new template_tag
self.logger.info('Dashboard is present in Grafana, but with old version: %s', dashbName)
Expand Down
4 changes: 4 additions & 0 deletions autogole-api/src/python/RTMonLibs/GrafanaAPI.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
class GrafanaAPI():
"""Autogole SENSE Grafana RTMon API"""
def __init__(self, **kwargs):
# pylint: disable=E1123
# Grafana lib does support timeout, but pylint does not know it.
super().__init__(**kwargs)
self.config = kwargs.get('config')
self.logger = kwargs.get('logger')
Expand All @@ -20,6 +22,8 @@ def __init__(self, **kwargs):

def g_loadAll(self):
"""Load all Dashboards, Alerts, Folders"""
# pylint: disable=E1123
# Grafana lib does support timeout, but pylint does not know it.
self.grafanaapi = GrafanaApi.from_url(
url=self.config['grafana_host'],
credential=self.config['grafana_api_key'],
Expand Down
9 changes: 8 additions & 1 deletion autogole-api/src/python/RTMonLibs/SiteOverride.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,14 @@ def _so_getjointname(self, indata, override):
"""Get joint name"""
sitename = self.override[override]["name"]
tmpPort = indata["Port"].replace(override, '').strip(':').split(':')
return sitename, f"{tmpPort[0]}_{tmpPort[1]}"
nname = ""
for item in tmpPort:
if item != "+":
if nname:
nname = f"{nname}|{item}"
else:
nname = item
return sitename, nname

def so_override(self, indata):
"""Override site specific settings"""
Expand Down
9 changes: 8 additions & 1 deletion autogole-api/src/python/RTMonLibs/SiteRMApi.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"""
Class for interacting with SENSE SiteRMs
"""
import time
from RTMonLibs.GeneralLibs import loadJson
from sense.client.siterm.debug_api import DebugApi

Expand Down Expand Up @@ -55,6 +56,7 @@ def sr_submit_ping(self, **kwargs):
self.logger.info("Start check for ping test if needed")
hosts, allIPs = self._sr_get_all_hosts(**kwargs)
# based on our variables;
ping_out = []
for host in hosts:
# Check if IPv6 or IPv4 is defined
for key, defval in [("IPv4", "?ipv4?"), ("IPv6", "?ipv6?")]:
Expand Down Expand Up @@ -82,4 +84,9 @@ def sr_submit_ping(self, **kwargs):
break
if not actionPresent:
self.logger.info(f"Submitting ping test for {newaction}")
self.siterm_debug.submit_ping(**newaction)
out = self.siterm_debug.submit_ping(**newaction)
newaction['submit_time'] = int(time.time())
newaction['submit_out'] = out[0]
self.logger.info(f"Submitted ping test for {newaction}: {out}")
ping_out.append(newaction)
return ping_out
60 changes: 38 additions & 22 deletions autogole-api/src/python/RTMonLibs/Template.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/usr/bin/env python3
# pylint: disable=E1101
"""Grafana Template Generation"""
import copy
import os.path
Expand Down Expand Up @@ -251,6 +252,14 @@ def __init__(self, **kwargs):
self.nextid = 0
self.gridPos = {"x": 0, "y": 0, "w": 24, "h": 8}

def __getTitlesUrls(self, site, link):
    """Resolve one configured link into a (title, url) pair for a site.

    A missing 'title' or 'url' key falls back to a placeholder string, and
    every $$REPLACEMESITENAME$$ marker in either value is replaced by site.
    """
    resolved = []
    for key, fallback in (('title', "Link-Title-Not-Present-in-Config"),
                          ('url', "https://link-not-present-in-config")):
        # Same lookup-then-substitute step for both fields.
        resolved.append(link.get(key, fallback).replace("$$REPLACEMESITENAME$$", site))
    return resolved[0], resolved[1]

def _getNextID(self):
"""Get Next ID"""
self.nextid += 1
Expand Down Expand Up @@ -279,7 +288,7 @@ def addRowPanel(self, row, panels):
out.append(row)
return out

def _t_getDataSourceUid(self, *args):
def _t_getDataSourceUid(self, *_args):
"""Get Data Source UID"""
# TODO: We need a way for Orchestrator to tell us if this is real time (5sec resolution)
# or historical (1min resolution)
Expand All @@ -294,7 +303,7 @@ def _t_loadTemplate(self, templateName):
template = loadJson(fd.read(), self.logger)
return template

def t_addRow(self, *args, **kwargs):
def t_addRow(self, *_args, **kwargs):
"""Add Row to the Dashboard"""
out = self._t_loadTemplate("row.json")
out["title"] = kwargs.get('title', "Row Title Not Present")
Expand All @@ -307,12 +316,6 @@ def t_addLinks(self, *_args):
return []
ret = []
out = self._t_loadTemplate("links.json")
for link in self.config['template_links']:
tmpcopy = copy.deepcopy(out)
tmpcopy["title"] = link.get('title', "Link-Title-Not-Present-in-Config")
tmpcopy["url"] = link.get('url', "https://link-not-present-in-config")
ret.append(tmpcopy)
# Add all Monitoring links to SiteRM/NetworkRM
sites = []
# First need to identify all sites (only uniques, as it can repeat)
for sitehost, _interfaces in self.m_groups['Hosts'].items():
Expand All @@ -323,12 +326,23 @@ def t_addLinks(self, *_args):
sitename = sitehost.split(":")[0]
if sitename in self.dashboards and sitename not in sites:
sites.append(sitename)
# For all sites - add the monitoring link

# Add dynamic urls from configuration and replace sitename with site
addedUrls = []
for site in sites:
tmpcopy = copy.deepcopy(out)
tmpcopy["title"] = f"Site Monitoring: {site}"
tmpcopy["url"] = f"{self.config['grafana_host']}/d/{self.dashboards[site]['uid']}"
ret.append(tmpcopy)
for link in self.config['template_links']:
title, url = self.__getTitlesUrls(site, link)
if url not in addedUrls:
tmpcopy = copy.deepcopy(out)
tmpcopy["title"] = title
tmpcopy["url"] = url
print(tmpcopy)
ret.append(tmpcopy)
addedUrls.append(url)
return ret


Expand Down Expand Up @@ -382,7 +396,7 @@ def t_createSwitchFlow(self, *args):
self.logger.error(f"Sitehost: {sitehost}")
self.logger.error(f"Interfaces: {interfaces}")
self.logger.error("This happens for Sites/Switches not exposing correct Sitename/Port. Are you missing an override?")
raise Exception("Sitehost not in correct format")
raise Exception("Sitehost not in correct format") from ex
sitename = sitehost.split(":")[0]
hostname = sitehost.split(":")[1]
intfline = "|".join(interfaces.keys())
Expand All @@ -405,8 +419,8 @@ def t_createMermaid(self, *args):
mermaid = self.m_getMermaidContent(args[1])
panel["options"]["content"] = "\n".join(mermaid)
# Need to add correct size for the panel
totalHeight = 18 + (len(self.m_groups['Hosts'])*2) + (len(self.m_groups['Switches'])*2)
panel["gridPos"]["h"] = clamp(totalHeight, 18, 48)
totalHeight = 12 + len(self.m_groups['Hosts']) + len(self.m_groups['Switches'])
panel["gridPos"]["h"] = clamp(totalHeight, 14, 24)
return self.addRowPanel(row, [panel])

def t_addDebug(self, *args):
Expand Down Expand Up @@ -465,15 +479,17 @@ def _t_addHostL2Debugging(self, sitehost, interfaces, refid):
query['refId'] = f'A{refid}'
refid += 1
queries.append(query)
for mhost, macaddr in self.mac_addresses.items():
if mhost != hostname:
query = copy.deepcopy(origin_query)
query['datasource']['uid'] = str(self.t_dsourceuid)
query['expr'] = f'sum(arp_state{{HWaddress=~"{macaddr}.*"}}) OR on() vector(0)'
query['legendFormat'] = f'MAC address of {mhost} end visible in arptable'
query['refId'] = f'A{refid}'
refid += 1
queries.append(query)
if 'Vlan' in intfdata and intfdata['Vlan']:
vlan = intfdata['Vlan']
for mhost, macaddr in self.mac_addresses.items():
if mhost != hostname:
query = copy.deepcopy(origin_query)
query['datasource']['uid'] = str(self.t_dsourceuid)
query['expr'] = f'sum(arp_state{{HWaddress=~"{macaddr}.*",Hostname="{hostname}",sitename="{sitename}",Device="vlan.{vlan}"}}) OR on() vector(0)'
query['legendFormat'] = f'MAC address of {mhost} end visible in arptable under vlan.{vlan}'
query['refId'] = f'A{refid}'
refid += 1
queries.append(query)
panel = self._t_loadTemplate("l2state.json")
panel['id'] = self._getNextID()
panel['title'] = f"L2 Debugging for Host: {sitehost}"
Expand Down Expand Up @@ -506,7 +522,7 @@ def _t_addSwitchL2Debugging(self, sitehost, interfaces, refid):
for mhost, macaddr in self.mac_addresses.items():
query = copy.deepcopy(origin_query)
query['datasource']['uid'] = str(self.t_dsourceuid)
query['expr'] = f'sum(mac_table_info{{hostname="{hostname}", macaddress="{macaddr}", vlan="{vlan}"}}) OR on() vector(0)'
query['expr'] = f'sum(mac_table_info{{sitename="{sitename}",hostname="{hostname}", macaddress="{macaddr}", vlan="{vlan}"}}) OR on() vector(0)'
query['legendFormat'] = f'MAC address of {mhost} visible in mac table ({vlan})'
query['refId'] = f'A{refid}'
refid += 1
Expand Down

0 comments on commit 26a87b4

Please sign in to comment.