Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

async def disable_ztp(device):
await device.run_cmd(f"rm -f /etc/network/if-up.d/ntpdate || true")
for s in ["IhmInfraCommodityZTP", "snmpd"]:
for s in ["snmpd"]:
input_data = [{device.host_name: [{"name": s}]}]
out = await Service.stop(
input_data=input_data,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ async def do_trigger(testbed, trigger_obj):
elif trigger == TRIGGER_RESTART_SERVICES:
services = [
# "frr.service",
"IhmDentTcFlower",
# "networking",
]
for s in services:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,201 +16,6 @@
)


async def check_certificates(dev, devices_dict):
    """Check that the credential files on ``dev`` were refreshed recently.

    Looks for ``*.sts`` files under /var/shared_resources/credentials/ that
    were modified within the last 10 minutes.

    Args:
        dev: device to inspect; must provide an awaitable ``run_cmd`` returning
            ``(rc, out)`` and an ``applog`` logger.
        devices_dict: not used by this check.

    Returns:
        True when ``find`` succeeds and prints at least one file, else False.
    """
    # certificates - should be refreshed every 10mins at /var/shared_resources/credentials/*.sts
    cmd = "find /var/shared_resources/credentials/ -name '*.sts' -mmin -10 -exec ls {} \\;"
    rc, out = await dev.run_cmd(cmd, sudo=True)
    dev.applog.info(f"Ran {cmd} rc {rc} out {out}")
    # NOTE(review): an earlier comment said at least two files are expected,
    # but only "non-empty output" has ever been enforced here - confirm.
    if rc or not out:
        # Record the device clock and the current listing so the failure can
        # be diagnosed from the log; the result used to be discarded.
        diag_cmd = "date && ls -l /var/shared_resources/credentials/*.sts"
        diag_rc, diag_out = await dev.run_cmd(diag_cmd)
        dev.applog.info(f"Ran {diag_cmd} rc {diag_rc} out {diag_out}")
        return False
    dev.applog.info(f"Found {out} files")
    return True


async def check_routes(dev, devices_dict):
    """Verify that every route on ``dev`` is offloaded or a known exception.

    Routes are read with ``IpRoute.show`` (options ``-j -d``). A route passes
    when its flags contain ``offload`` or ``linkdown``, when it has no device
    or uses the ``ma1`` management port, when its device is a ``vlan*-v0``
    interface, or when its destination is in the exempt list below.

    Args:
        dev: device whose routing table is inspected.
        devices_dict: not used by this check.

    Returns:
        True when all routes pass, False at the first route that does not.

    Raises:
        AssertionError: if the route query returns a non-zero rc.
    """
    # Check routing and access to internet - Traffic test - check for offload
    query = [{dev.host_name: [{"cmd_options": "-j -d "}]}]
    out = await IpRoute.show(input_data=query, parse_output=True)
    assert out[0][dev.host_name]["rc"] == 0, f"Failed to get routes on {dev.host_name}"

    exempt_destinations = ("10.1.253.0/24", "10.2.0.222", "10.1.255.0/24", "10.2.96.44/30")
    for route in out[0][dev.host_name]["parsed_output"]:
        dst = route["dst"]
        if "offload" in route["flags"]:
            dev.applog.info(f"route {dst} offloaded")
            continue
        if "linkdown" in route["flags"]:
            dev.applog.info(f"route {dst} linkdown")
            continue

        dev.applog.info(f"checking route {dst} exceptions")
        # a route without a device is treated like the management port
        port = route.get("dev", "ma1")
        if (
            port == "ma1"  # mgmt interface
            or (port.startswith("vlan") and port.endswith("-v0"))
            or dst in exempt_destinations
        ):
            continue

        dev.applog.info(f"route {route} not offloaded")
        return False
    return True


async def check_internet_connectivity(dev, devices_dict):
    """Ping www.amazon.com from ``dev`` via ``tb_ping_device``.

    Args:
        dev: device the ping is issued from.
        devices_dict: not used by this check.

    Returns:
        True when the ping helper returns a falsy rc, otherwise False.
    """
    ping_rc = await tb_ping_device(dev, "www.amazon.com")
    reachable = not ping_rc
    if not reachable:
        dev.applog.info("Failed to reach internet")
    return reachable


async def check_wan_failover(dev, devices_dict):
    """Check internet reachability from ``dev`` while one DIST WAN port is down.

    The ``swp10`` port of the first distribution router in ``devices_dict`` is
    taken down, internet connectivity is probed from ``dev``, and the port is
    brought back up afterwards.

    Args:
        dev: device under test; distribution routers themselves are skipped.
        devices_dict: mapping of device type to the list of devices of that
            type.

    Returns:
        True when the check passes or is skipped (``dev`` is a distribution
        router, or fewer than two distribution routers exist). False when
        swp10 was already down, a link operation failed, or the internet was
        unreachable while the port was down.

    Raises:
        AssertionError: if the swp10 link state cannot be read.
    """
    # Imported locally so the waits below yield to the event loop instead of
    # blocking it the way time.sleep() does inside a coroutine.
    import asyncio

    # WAN failover - do it per dist - turn off wan port and see reachability via other DIST - SWP10
    if dev.type == DeviceType.DISTRIBUTION_ROUTER:
        # TODO should we do this on dist also??
        return True

    if len(devices_dict[DeviceType.DISTRIBUTION_ROUTER]) < 2:
        dev.applog.info("Not sufficient dist router to perform this test")
        return True

    # take down the swp10 link on one of the DISTs
    dist = devices_dict[DeviceType.DISTRIBUTION_ROUTER][0]
    # request the port up first so the state sampled below is meaningful
    await IpLink.set(
        input_data=[{dist.host_name: [{"device": "swp10", "operstate": "up"}]}],
    )
    await asyncio.sleep(2)
    out = await IpLink.show(
        input_data=[{dist.host_name: [{"device": "swp10", "cmd_options": "-j"}]}],
        parse_output=True,
    )

    assert out[0][dist.host_name]["rc"] == 0, f"Failed to get swp10 link info on {dist.host_name}"
    links = out[0][dist.host_name]["parsed_output"]

    if links[0]["operstate"] == "DOWN":
        dev.applog.info("Link to the WAN is already down!!!")
        return False

    # bring down the link in background since the connection might be on this link
    cmd = "(sleep 1; sudo ip link set swp10 down) &"
    rc, out = await dist.run_cmd(cmd, sudo=True)
    if rc:
        dev.applog.info(f"Could not do a link down on swp10 on {dist.host_name}")
        return False

    # disconnect and try now
    devices = []
    for group in devices_dict.values():
        devices.extend(group)
    await tb_reset_ssh_connections(devices)
    await asyncio.sleep(10)

    # check if we can reach the internet
    ret = await check_internet_connectivity(dev, devices_dict)

    # restore the WAN link before reporting the result
    out = await IpLink.set(
        input_data=[{dist.host_name: [{"device": "swp10", "operstate": "up"}]}],
    )
    if out[0][dist.host_name]["rc"]:
        dev.applog.info(f"Could not do a link up on swp10 on {dist.host_name}")
        return False
    await asyncio.sleep(10)
    return ret


async def check_wan_to_lte_failver(dev, devices_dict):
# Purpose: take swp10 down on the distribution routers, probe internet
# reachability from dev, then bring swp10 back up.
# Presumably this exercises the LTE backup path (per the function name) - TODO confirm.
# Returns True when the check is skipped or the probe succeeded; False when
# swp10 was already down, a link command failed, or the probe failed.
# Raises AssertionError when the swp10 link state cannot be read.
# NOTE(review): the time.sleep() calls below block the event loop inside a
# coroutine; consider awaiting asyncio.sleep() instead.
# NOTE(review): "failver" in the name looks like a typo for "failover"; it is
# left unchanged because callers reference this name.
if dev.type == DeviceType.DISTRIBUTION_ROUTER:
# TODO should we do this on dist also??
return True

# skipped (reported as passing) when fewer than two distribution routers exist
if len(devices_dict[DeviceType.DISTRIBUTION_ROUTER]) < 2:
dev.applog.info(f"Not sufficient dist router to perform this test")
return True

# flatten every device in the testbed; used to reset SSH connections below
devices = []
for d in devices_dict.values():
devices.extend(d)

# take down the swp10 link on the DISTs
for dist in devices_dict[DeviceType.DISTRIBUTION_ROUTER]:
# request the port up first so the state sampled below is meaningful
out = await IpLink.set(
input_data=[{dist.host_name: [{"device": "swp10", "operstate": "up"}]}],
)
time.sleep(2)
out = await IpLink.show(
input_data=[{dist.host_name: [{"device": "swp10", "cmd_options": "-j"}]}],
parse_output=True,
)
assert (
out[0][dist.host_name]["rc"] == 0
), f"Failed to get swp10 link info on {dist.host_name}"
links = out[0][dist.host_name]["parsed_output"]

if links[0]["operstate"] == "DOWN":
dev.applog.info("Link to the WAN is already down!!!")
return False

# bring down the link in background since the connection might be on this link
cmd = "(sleep 1; sudo ip link set swp10 down) &"
rc, out = await dist.run_cmd(cmd)
if rc:
dev.applog.info(f"Could not do a link down on swp10 on {dist.host_name}")
return False
# disconnect and try now
await tb_reset_ssh_connections(devices)

# wait, then reset the SSH connections again before probing
time.sleep(30)
await tb_reset_ssh_connections(devices)

# check if we can reach the internet
ret = await check_internet_connectivity(dev, devices_dict)

# bring them back up again.
for dist in devices_dict[DeviceType.DISTRIBUTION_ROUTER]:
# bring the link up in background since the connection might be on this link
cmd = "(sleep 1; sudo ip link set swp10 up) &"
rc, out = await dist.run_cmd(cmd)
if rc:
dev.applog.info(f"Could not do a link up on swp10 on {dist.host_name}")
# NOTE(review): returning here skips the link-up for any remaining DISTs
return False
time.sleep(5)
return ret


async def check_bgp_sessions(dev, devices_dict):
    """Check that the IPv4 unicast BGP peers of ``dev`` are Established.

    The summary comes from ``Bgp.show`` with the ``json`` option. Peers listed
    in ``tolerated_down`` below may be in any state.

    Args:
        dev: device whose BGP summary is inspected.
        devices_dict: not used by this check.

    Returns:
        True when every peer is Established or tolerated; False at the first
        peer that is neither.

    Raises:
        AssertionError: if ``Bgp.show`` returns a non-zero rc.
    """
    # BGP sessions - exceptions due to missing aggs/dists/oobs.
    # Otherwise all should be UP. Check for "state":"Established" - show ip bgp summary json
    tolerated_down = (
        "10.2.96.130",
        "10.2.96.134",
        "10.2.97.173",
        "10.2.96.173",
        "10.2.96.41",
        "10.2.96.45",
        "10.2.96.117",
    )

    out = await Bgp.show(input_data=[{dev.host_name: [{"ip": "", "options": "json"}]}])
    dev.applog.info(f"Ran Bgp.show out {out}")
    rc = out[0][dev.host_name]["rc"]
    assert rc == 0, f"Failed get bgp summary {rc} {out}"

    summary = json.loads(out[0][dev.host_name]["result"])
    peers = summary["ipv4Unicast"]["peers"]
    for prefix, peer in peers.items():
        if peer["state"] != "Established":
            # not established: only acceptable for the tolerated peers
            dev.applog.info(f"{prefix} peer is in not Established state. Checking for excptions")
            if prefix not in tolerated_down:
                dev.applog.info(f"BGP peer {prefix} is supposed to be established {peer}")
                return False
        else:
            dev.applog.info(f"{prefix} peer is in Established state")
    return True


async def check_services(dev, devices_dict):
"""
Processes list check:
Expand All @@ -221,17 +26,7 @@ async def check_services(dev, devices_dict):
services = []
services.append("auditd.service")
services.append("awslogs.service")
# services.append("bridge-interface.service")
services.append("frr.service")
services.append("IhmDentTcFlower.service")
# if provisioned then need more services
if dev.ssh_conn_params.pssh:
services.append("bridge-agent.service")
services.append("identity-agent.service")
services.append("IhmInfraSystemsDeviceProvisioning.service")
services.append("IhmNetworkDeviceHostMetricMonitoring.service")
services.append("IhmNetworkDeviceMetricAgent.service")
services.append("tennant.service")
services.append("inetd.service")
services.append("lldpd.service")
services.append("lm-sensors.service")
Expand All @@ -243,10 +38,6 @@ async def check_services(dev, devices_dict):
if dev.type == DeviceType.INFRA_SWITCH:
services.append("keepalived.service")
services.append("isc-dhcp-server.service")
rc, out = await dev.run_cmd("ls /sputnik/env/IhmDentPoe/bin/poectl")
if rc == 0:
dev.applog.info("Adding poe service to check")
services.append("IhmDentPoe.service")
if dev.type == DeviceType.OUT_OF_BOUND_SWITCH:
services.append("onie-dhcp.service")
try:
Expand Down Expand Up @@ -279,52 +70,13 @@ async def check_ntp_sync(dev, devices_dict):
return True


async def check_infra_to_infra_ping(dev, devices_dict):
    """
    - Check infra to infra pings over vlan100 - 10.1.4.3 on infra1 and 10.1.4.2 on infra2.

    For every other infra switch in ``devices_dict``, the global IPv4 address
    of its ``vlan100`` interface is looked up and pinged from ``dev``.

    Args:
        dev: device the pings are issued from; non-infra devices are skipped.
        devices_dict: mapping of device type to the list of devices of that
            type.

    Returns:
        True when ``dev`` is not an infra switch or every peer answered; False
        when a peer has no usable vlan100 address or a ping fails.

    Raises:
        AssertionError: if the vlan100 address query returns a non-zero rc.
    """
    if dev.type is not DeviceType.INFRA_SWITCH:
        return True

    for infra in devices_dict[DeviceType.INFRA_SWITCH]:
        # no need to ping self
        if infra.host_name == dev.host_name:
            continue

        # get the vlan100 ip of the peer
        out = await IpAddress.show(
            input_data=[{infra.host_name: [{"dev": "vlan100", "cmd_options": "-j"}]}],
            parse_output=True,
        )
        assert (
            out[0][infra.host_name]["rc"] == 0
        ), f"Failed to get ip address for vlan100 on {infra.host_name}"

        addresses = out[0][infra.host_name]["parsed_output"]
        # an empty result is treated like "no address" instead of raising IndexError
        addr_info = addresses[0]["addr_info"] if addresses else []
        infra_ip = None
        for addr in addr_info:
            # the last global IPv4 entry wins, as before
            if addr["family"] == "inet" and addr["scope"] == "global":
                infra_ip = addr["local"]

        if infra_ip is None:
            dev.applog.info(f"Could not get IP address for vlan100 to {infra.host_name}")
            # without an address there is nothing to ping; previously the
            # literal string "None" was pinged here
            return False

        rc = await tb_ping_device(dev, f"{infra_ip}", dump=True)
        if rc != 0:
            dev.applog.info(f"Failed to reach {infra.host_name} {rc}")
            return False

    return True


async def check_poe_devices(dev, devices_dict):
"""
- check if the poectl works.
"""
if dev.type is not DeviceType.INFRA_SWITCH:
return True
rc, out = await dev.run_cmd("ls /sputnik/env/IhmDentPoe/bin/poectl")
rc, out = await dev.run_cmd("which poecli")
if rc != 0:
return True
dev.applog.info("Checking for poectl Health")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,6 @@ async def tb_device_check_services(dev, prev_state, check, healthy_services=None
"ssh.service",
"systemd-udev-trigger.service",
"systemd-udevd.service",
"IhmDentTcFlower.service",
]
if healthy_services is None:
healthy_services = default_healthy_services
Expand Down Expand Up @@ -351,7 +350,6 @@ async def tb_device_reload_firewall(device):
"systemctl restart frr.service",
"iptables -F",
"tc filter delete block 1 ingress",
"systemctl restart IhmDentTcFlower.service",
]:
await device.run_cmd(cmd, sudo=True)

Expand Down