# Testing LB on FABRIC U280

This notebook helps set up a sender and multiple receiver nodes on FABRIC such that they can communicate with a LB provisioned in FABRIC. Unlike the 'FABRIC-live-lb-tester' notebook, this one does NOT use the FABNetv4Ext service - only FABNetv4; consequently the security measures are more limited, since none of this is exposed to the internet. One of the nodes is designated as a sender and the rest as worker-receivers.

See the following diagram:

<div>
    <img src="figs/live-lb.png" width=500>
</div>


## Preamble

This code should *always* be executed regardless of whether you are starting a new slice or returning to an existing slice.

In [1]:
#
# EDIT THIS
#

# GitHub SSH key file (private) registered using the GitHubSSH.ipynb notebook referenced above
github_key = '/home/fabric/work/fabric_config/github_ecdsa'

# Note for best management network IPv4 connectivity pick from
# 'UCSD', 'SRI', 'FIU' or 'TOKY' - these sites have
# IPv4. Other sites use IPv6 management and have trouble
# retrieving git-lfs artifacts.

# Exactly one site_list_override assignment should be active; the
# alternatives are kept commented out for quick switching.

# let the notebook pick random sites
# site_list_override = None

# if you want to force a site list instead of using random
# site_list_override = ['SRI', 'UCSD', 'CLEM']

# (super)core sites - should be low loss
site_list_override = ['KANS', 'NEWY', 'WASH', 'LOSA', 'DALL', 'ATLA']  # SALT removed temporarily

# high capacity sites (may have losses at high bandwidth)
# site_list_override = ['STAR', 'INDI', 'NCSA', 'TACC', 'UCSD', 'PSC']

# these we always exclude
site_exclude_list = ['EDUKY', 'EDC']

# how many workers do we want? (in addition to one sender)
number_of_workers = 1

# base distro 'ubuntu2[012]' or 'rocky[89]'
distro_name = 'ubuntu22'
# split the name into family and version, e.g. 'ubuntu22' -> ('ubuntu', '22')
# and 'rocky9' -> ('rocky', '9'); slicing a fixed two characters would
# mangle single-digit versions
distro_family = distro_name.rstrip('0123456789')
distro_version = distro_name[len(distro_family):]

# map from distro to image name
images = {
    'ubuntu20': 'default_ubuntu_20',
    'ubuntu21': 'default_ubuntu_21',
    'ubuntu22': 'default_ubuntu_22',
    'rocky8': 'default_rocky_8',
    'rocky9': 'default_rocky_9',
}

# note that the below is distribution specific ('ubuntu' for ubuntu and so on)
home_location = {
    'ubuntu': '/home/ubuntu',
    'rocky': '/home/rocky',
}[distro_family]

vm_key_location = f'{home_location}/.ssh/github_ecdsa'

# worker dimensions
node_attribs = {
    'cores': 8,
    'disk': 100,
    'ram': 24,
    'image': images[distro_name],
}

# slice name
slice_name = f'{number_of_workers + 1}-node U280 LB Tester Slice using {distro_name} 1'

# url of e2sar deps. Find the appropriate version for the OS at https://github.com/JeffersonLab/E2SAR/releases
e2sar_branch = "main"
static_release_url = 'https://github.com/JeffersonLab/E2SAR/releases/download/'  # don't need to change this
e2sar_dep_artifcat = 'e2sar-deps_0.1.4_amd64.deb'
e2sar_release_ver = 'E2SAR-main-0.1.4'
# NOTE(review): the '.04' suffix and the .deb artifact follow Ubuntu release
# naming; this URL presumably needs adjusting for non-Ubuntu distros - confirm
e2sar_dep_url = (
    static_release_url + e2sar_release_ver + "-" + distro_family + "-"
    + distro_version + ".04/" + e2sar_dep_artifcat
)

# this is the IPv4 address and Control Plane listening port of the LB node created by the other notebook
lb_node_ip = "10.138.47.254"
lb_cp_port = 18008
# the admin token is also generated by the U280 LB notebook
# NOTE(review): this is a credential - avoid committing a live token
lb_admin_token = "0DTIKKNEYMMOUP283AP8UM4ABI45PNJ3"

# admin URI combining the token, LB address and control plane port
ejfat_admin_uri = f"ejfats://{lb_admin_token}@{lb_node_ip}:{lb_cp_port}/"

#
# SHOULDN'T NEED TO EDIT BELOW
#
# Preamble
import json
import time
from datetime import datetime
from datetime import timezone
from datetime import timedelta

from fabrictestbed_extensions.fablib.fablib import FablibManager as fablib_manager

from ipaddress import ip_address, IPv4Address, IPv6Address, IPv4Network, IPv6Network
import ipaddress

# single FablibManager instance shared by every cell in this notebook
fablib = fablib_manager()             
# print the active configuration; the trailing ';' presumably suppresses
# the notebook echo of the call's return value
fablib.show_config();

# gets prepended to site name - this network is per site
# (see make_net_name, which joins this prefix and the site name with '_')
net_name_prefix = 'fabnetv4'

# this is the NIC to use
# (passed as the component model when nodes are added to the slice)
nic_model = 'NIC_Basic'

def execute_single_node(node, commands):
    """Run each command on ``node`` in order and report any stderr output.

    :param node: object exposing ``get_name()`` and ``execute(command)``,
        where ``execute`` returns a ``(stdout, stderr)`` pair
    :param commands: iterable of shell command strings; may be empty
    :return: None

    Every command is attempted even if an earlier one wrote to stderr.
    Non-empty stderr is reported per command; note that some tools write
    progress messages to stderr, so a report is not necessarily a failure.
    """
    for command in commands:
        print(f'\tExecuting "{command}" on node {node.get_name()}')
        stdout, stderr = node.execute(command)
        # check inside the loop so each command's stderr is reported,
        # not just the last one
        if stderr:
            print(f'Error encountered with "{command}": {stderr}')
        
def execute_commands(node, commands):
    """Run ``commands`` on one node or on every node of a list.

    :param node: a single node, or a ``list`` of nodes
    :param commands: commands handed unchanged to ``execute_single_node``
    """
    targets = node if isinstance(node, list) else [node]
    for target in targets:
        execute_single_node(target, commands)

def execute_single_node_on_thread(node, commands):
    """Submit all ``commands`` to ``node.execute_thread`` as one ';'-joined string.

    :param node: object exposing ``get_name()`` and ``execute_thread()``
    :param commands: list of shell command strings
    Output goes to a log file named ``<node name>_thread.log``.
    """
    joined = ';'.join(commands)
    log_file = node.get_name() + '_thread.log'
    node.execute_thread(joined, output_file=log_file)

def execute_commands_on_threads(node, commands):
    """Threaded counterpart of ``execute_commands``.

    :param node: a single node, or a ``list`` of nodes
    :param commands: commands handed unchanged to
        ``execute_single_node_on_thread`` for each node
    """
    targets = node if isinstance(node, list) else [node]
    for target in targets:
        execute_single_node_on_thread(target, commands)

def make_node_name(site_name, node_idx):
    """Return the worker node name ``Worker<node_idx>_<site_name>``."""
    return f"Worker{node_idx}" + '_' + site_name

def make_net_name(site_name):
    """Return the per-site network name ``<net_name_prefix>_<site_name>``."""
    return net_name_prefix + '_' + site_name

# build a new slice holding a single sender node on one site
def starter_slice(site_name):
    """Define (but do not submit) a slice with one sender node at ``site_name``.

    The node is named ``Sender_<site_name>``, sized by ``node_attribs``,
    given the 'post-boot' upload directory and the sender.sh post-boot
    script, and attached through one NIC to the site's IPv4 L3 network.

    :param site_name: site to place the sender on
    :return: the newly defined slice object
    """
    sender_name = 'Sender_' + site_name
    site_net_name = make_net_name(site_name)

    new_slice = fablib.new_slice(name=slice_name)
    sender = new_slice.add_node(name=sender_name, site=site_name, **node_attribs)

    # postboot configuration is under 'post-boot' directory
    sender.add_post_boot_upload_directory('post-boot', '.')
    sender.add_post_boot_execute('chmod +x post-boot/sender.sh && ./post-boot/sender.sh')

    # one NIC, whose first interface joins the site network
    nic = sender.add_component(model=nic_model, name=f'{sender_name}_{nic_model}_nic')
    sender_iface = nic.get_interfaces()[0]
    new_slice.add_l3network(name=site_net_name, interfaces=[sender_iface], type='IPv4')

    return new_slice

def add_node_to_slice(site_name, node_idx, inc, slice):
    """Add ``inc`` worker nodes at ``site_name`` to an existing slice definition.

    Workers are numbered consecutively starting at ``node_idx``. Each gets
    the 'post-boot' upload directory, the recver.sh post-boot script and one
    NIC whose first interface is attached to the site's IPv4 L3 network
    (the network is created when ``slice.get_network`` returns None).

    :param site_name: site to place the workers on
    :param node_idx: index used for the first new worker's name
    :param inc: number of workers to add
    :param slice: slice object to modify (not submitted here)
    :return: None
    """
    site_net_name = make_net_name(site_name)

    for worker_idx in range(node_idx, node_idx + inc):
        worker_name = make_node_name(site_name, worker_idx)
        worker = slice.add_node(name=worker_name, site=site_name, **node_attribs)

        # postboot configuration is under 'post-boot' directory
        worker.add_post_boot_upload_directory('post-boot', '.')
        worker.add_post_boot_execute('chmod +x post-boot/recver.sh && ./post-boot/recver.sh')

        nic = worker.add_component(model=nic_model, name='_'.join([worker_name, nic_model, 'nic']))
        worker_iface = nic.get_interfaces()[0]

        # reuse the site network when present, otherwise create it
        site_net = slice.get_network(name=site_net_name)
        if site_net is None:
            site_net = slice.add_l3network(name=site_net_name, type='IPv4')

        site_net.add_interface(worker_iface)

def check_modify(slice, selected_site_list, nodes_in_slice, expected_to_add):
    """Report whether every worker just added shows up as an Active sliver.

    For each of the ``expected_to_add`` new workers, slivers are listed
    with a filter matching a node named for site ``selected_site_list[0]``
    and index ``nodes_in_slice + k`` (k = 1..expected_to_add) in state
    'Active'.

    NOTE(review): failure is detected only by ``list_slivers`` returning
    None - confirm that is what it returns when nothing matches.

    :return: True when no lookup returned None, otherwise False
    """
    for offset in range(1, expected_to_add + 1):

        def is_active_new_node(sliver):
            # evaluated lazily by list_slivers, once per sliver
            return (sliver['type'] == 'node'
                    and sliver['name'] == make_node_name(selected_site_list[0], nodes_in_slice + offset)
                    and sliver['state'] == 'Active')

        # find sliver reservation for new node
        matches = slice.list_slivers(fields=['name', 'state'],
                                     filter_function=is_active_new_node)
        if matches is None:
            return False

    return True

# until fablib fixes this
def get_management_os_interface(node) -> str or None:
    """
    Look up the device that carries the node's default route.

    The IPv4 routing table is queried first (``ip -j route list``); the
    IPv6 table is queried only when no IPv4 default route is found.

    :param node: object whose ``execute(cmd, quiet=True)`` returns
        ``(stdout, stderr)`` with JSON route output on stdout
    :return: the ``dev`` of the first route whose ``dst`` is "default",
        or None when neither table has one
    """
    route_queries = ("sudo ip -j route list", "sudo ip -6 -j route list")

    for query in route_queries:
        stdout, stderr = node.execute(query, quiet=True)
        for route in json.loads(stdout):
            if route["dst"] == "default":
                return route["dev"]

    return None

0,1
Log Level,INFO
Log File,/tmp/fablib/fablib.log
Data directory,/tmp/fablib
SSH Command Line,ssh -i {{ _self_.private_ssh_key_file }} -F /home/vscode/work/fabric_config/ssh_config {{ _self_.username }}@{{ _self_.management_ip }}
Orchestrator,orchestrator.fabric-testbed.net
Credential Manager,cm.fabric-testbed.net
Core API,uis.fabric-testbed.net
Artifact Manager,artifacts.fabric-testbed.net
Token File,/home/vscode/.tokens.json
Project ID,bbe0d94c-736b-477a-a2e6-fef9fe7ac9ca


## Helpers

If you ever forget which images are available, run this cell:

In [2]:
# List available images (this step is optional)
# The names returned here are the values used in the `images` map of the
# preamble cell (e.g. 'default_ubuntu_22').
available_images = fablib.get_image_names()

print(f'Available images are: {available_images}')

Available images are: ['default_centos8_stream', 'default_centos9_stream', 'default_centos_7', 'default_centos_8', 'default_debian_11', 'default_debian_12', 'default_fedora_39', 'default_fedora_40', 'default_freebsd_13_zfs', 'default_freebsd_14_zfs', 'default_kali', 'default_openbsd_7', 'default_rocky_8', 'default_rocky_9', 'default_ubuntu_20', 'default_ubuntu_22', 'default_ubuntu_24', 'docker_rocky_8', 'docker_rocky_9', 'docker_ubuntu_20', 'docker_ubuntu_22']


## Prepare to create a new slice (skip if exists)

In [3]:
# list all slices I have running
output_dataframe = fablib.list_slices(output='pandas')

# Decide explicitly whether anything came back: truth-testing a pandas
# DataFrame raises ValueError, so check for None / .empty instead and
# fall back to plain truthiness for any other return type.
if output_dataframe is None:
    has_slices = False
elif hasattr(output_dataframe, 'empty'):
    has_slices = not output_dataframe.empty
else:
    has_slices = bool(output_dataframe)

if has_slices:
    print(output_dataframe)
else:
    print('No active slices under this project')

No active slices under this project


In [4]:
# Identify sites in continental US we want to use (NOOP if override is set)
lon_west = -124.3993243
lon_east = -69.9721573
candidate_sites = 7
free_nodes_worth = 3  # how many nodes worth are we looking per site


def _site_is_suitable(site):
    """True when a site lies between the longitude bounds and has more than
    ``free_nodes_worth`` nodes' worth of cores, RAM and disk available."""
    longitude = site['location'][1]
    return (lon_west < longitude < lon_east
            and all(site[f'{resource}_available'] > free_nodes_worth * node_attribs[resource]
                    for resource in ('cores', 'ram', 'disk')))


# an explicit override wins; otherwise draw random sites, avoiding those
# on the exclude list
if site_list_override is not None:
    selected_site_list = site_list_override
else:
    selected_site_list = fablib.get_random_sites(count=candidate_sites,
                                                 avoid=site_exclude_list,
                                                 filter_function=_site_is_suitable)

print(f'Selected sites are {selected_site_list}' if selected_site_list
      else 'Unable to find a sites matching the requirements')


Selected sites are ['KANS', 'NEWY', 'WASH', 'LOSA', 'DALL', 'ATLA']


## Create slice iteratively (skip if exists)

We may or may not get all the nodes we want immediately - we use iteration with slice modify to get to the max/desired number of nodes across the selected sites.

### Create Starter Slice

In [5]:
# we start by establishing a slice with one sender node at some site, we keep track which sites we failed
# and don't try those again

keep_trying = True
succeeded = False

site_list_iter = iter(selected_site_list)
# sites whose starter slice ended up 'Dead'; filtered out of
# selected_site_list once a slice has been created
failed_sites = []
site_name = None

while keep_trying:

    try:
        site_name = next(site_list_iter)
        print(f'Trying site {site_name} from {selected_site_list}')

        # define a starter slice
        slice = starter_slice(site_name)

        print(f'Submitting starter slice "{slice_name}" with sender on site {site_name}')
        slice_id = slice.submit()

        # check the state of this slice
        slices = fablib.get_slices(excludes=[], slice_id=slice_id)
        if slices[0].get_state() == 'Dead':
            print(f'Failed on site {site_name}, proceeding')
            # remember the failure so the worker-placement step skips this site
            failed_sites.append(site_name)
        else:
            print(f'Succeeded on site {site_name} with state {slices[0].get_state()}')
            keep_trying = False
            succeeded = True
    except StopIteration:
        # every candidate site has been tried
        print('No more sites to look at, exiting')
        keep_trying = False
    except Exception as e:
        # anything unexpected aborts the search rather than moving on
        print(f'Unexpected exception {e}, exiting')
        keep_trying = False

if succeeded:
    print(f'Succeeded in creating a slice on {site_name}, will avoid sites {failed_sites}')
    selected_site_list = list(filter(lambda x: x not in failed_sites, selected_site_list))
    print(f'Proceeding with sites {selected_site_list}')


Retry: 7, Time: 183 sec


0,1
ID,7f21d307-d4a1-4a85-9841-74d8c4196f4d
Name,2-node U280 LB Tester Slice using ubuntu22 1
Lease Expiration (UTC),2025-02-28 05:24:40 +0000
Lease Start (UTC),2025-02-27 05:24:40 +0000
Project ID,bbe0d94c-736b-477a-a2e6-fef9fe7ac9ca
State,StableOK


ID,Name,Cores,RAM,Disk,Image,Image Type,Host,Site,Username,Management IP,State,Error,SSH Command,Public SSH Key File,Private SSH Key File
22d67f3b-6897-4456-9bfe-a87d4e2ec98a,Sender_KANS,8,32,100,default_ubuntu_22,qcow2,kans-w2.fabric-testbed.net,KANS,ubuntu,2001:400:a100:3060:f816:3eff:fe37:d475,Active,,ssh -i /home/vscode/.ssh/fabric_local/silver -F /home/vscode/work/fabric_config/ssh_config ubuntu@2001:400:a100:3060:f816:3eff:fe37:d475,/home/vscode/.ssh/fabric_local/silver.pub,/home/vscode/.ssh/fabric_local/silver


ID,Name,Layer,Type,Site,Subnet,Gateway,State,Error
e3e50fa9-0aa3-4343-81a3-7c8244cdb618,fabnetv4_KANS,L3,FABNetv4,KANS,10.138.1.0/24,10.138.1.1,Active,


Name,Short Name,Node,Network,Bandwidth,Mode,VLAN,MAC,Physical Device,Device,IP Address,Numa Node,Switch Port
Sender_KANS-Sender_KANS_NIC_Basic_nic-p1,p1,Sender_KANS,fabnetv4_KANS,100,config,,06:02:89:97:A2:13,enp7s0,enp7s0,fe80::402:89ff:fe97:a213,4,HundredGigE0/0/0/7



Time to print interfaces 186 seconds
Succeeded on site KANS with state StableOK
Succeeded in creating a slice on KANS, will avoid sites {}
Proceeding with sites ['KANS', 'NEWY', 'WASH', 'LOSA', 'DALL', 'ATLA']


### Modify the Slice to add Workers

Now that the base slice with the sender is created, we will iteratively add workers on sites one at a time using a first-fit policy until we get to the desired number of workers or run out of sites.

In [6]:
# first-fit placement: keep adding workers on the first remaining site,
# dropping a site from the list once provisioning on it fails
remaining_workers = number_of_workers
node_idx = 1
node_increment = 1
nodes_in_slice = 0  # we don't count sender in this case

while remaining_workers > 0 and selected_site_list:
    slice = fablib.get_slice(name=slice_name)

    try:
        site_name = selected_site_list[0]
        print(f'There are {remaining_workers} remaining workers to create. Trying site {site_name} from {selected_site_list}')
        # never ask for more nodes than are still needed
        expected_to_add = min(node_increment, remaining_workers)
        add_node_to_slice(site_name, node_idx, expected_to_add, slice)

        print(f'Submitting slice modification to "{slice_name}" to add {expected_to_add} nodes for site {site_name}')
        slice_id = slice.modify()

        # re-fetch the slice and check the state of the new nodes
        slice = fablib.get_slice(name=slice_name)

        if not check_modify(slice, selected_site_list, nodes_in_slice, expected_to_add):
            print(f'Failed to provision on site {site_name}.')
            # this site is full, moving on
            selected_site_list.remove(site_name)
            continue

        # successfully provisioned
        print(f'Succeeded adding {expected_to_add} nodes on site {site_name}.')
        node_idx += expected_to_add
        remaining_workers -= expected_to_add
        nodes_in_slice += expected_to_add
    except Exception as e:
        # -1 marks an abort, so the summary below reports failure
        remaining_workers = -1
        print(f'Unexpected exception {e}, exiting')
        break

if remaining_workers != 0:
    print(f'Unable to create {remaining_workers}')
else:
    print('Succeeded in creating all workers')



Retry: 13, Time: 171 sec


0,1
ID,7f21d307-d4a1-4a85-9841-74d8c4196f4d
Name,2-node U280 LB Tester Slice using ubuntu22 1
Lease Expiration (UTC),2025-02-28 05:24:40 +0000
Lease Start (UTC),2025-02-27 05:24:40 +0000
Project ID,bbe0d94c-736b-477a-a2e6-fef9fe7ac9ca
State,StableOK


ID,Name,Cores,RAM,Disk,Image,Image Type,Host,Site,Username,Management IP,State,Error,SSH Command,Public SSH Key File,Private SSH Key File
22d67f3b-6897-4456-9bfe-a87d4e2ec98a,Sender_KANS,8,32,100,default_ubuntu_22,qcow2,kans-w2.fabric-testbed.net,KANS,ubuntu,2001:400:a100:3060:f816:3eff:fe37:d475,Active,,ssh -i /home/vscode/.ssh/fabric_local/silver -F /home/vscode/work/fabric_config/ssh_config ubuntu@2001:400:a100:3060:f816:3eff:fe37:d475,/home/vscode/.ssh/fabric_local/silver.pub,/home/vscode/.ssh/fabric_local/silver
79a149a2-134c-4779-bd2c-22304a772e29,Worker1_KANS,8,32,100,default_ubuntu_22,qcow2,kans-w2.fabric-testbed.net,KANS,ubuntu,2001:400:a100:3060:f816:3eff:fedd:79ae,Active,,ssh -i /home/vscode/.ssh/fabric_local/silver -F /home/vscode/work/fabric_config/ssh_config ubuntu@2001:400:a100:3060:f816:3eff:fedd:79ae,/home/vscode/.ssh/fabric_local/silver.pub,/home/vscode/.ssh/fabric_local/silver


ID,Name,Layer,Type,Site,Subnet,Gateway,State,Error
e3e50fa9-0aa3-4343-81a3-7c8244cdb618,fabnetv4_KANS,L3,FABNetv4,KANS,10.138.1.0/24,10.138.1.1,Active,


Name,Short Name,Node,Network,Bandwidth,Mode,VLAN,MAC,Physical Device,Device,IP Address,Numa Node,Switch Port
Sender_KANS-Sender_KANS_NIC_Basic_nic-p1,p1,Sender_KANS,fabnetv4_KANS,100,config,,06:02:89:97:A2:13,enp7s0,enp7s0,fe80::402:89ff:fe97:a213,4,HundredGigE0/0/0/7
Worker1_KANS-Worker1_KANS_NIC_Basic_nic-p1,p1,Worker1_KANS,fabnetv4_KANS,100,config,,0E:B3:46:8A:81:46,enp7s0,enp7s0,fe80::cb3:46ff:fe8a:8146,4,HundredGigE0/0/0/7



Time to print interfaces 177 seconds


Name,State
Worker1_KANS,Active


Succeeded adding 1 nodes on site KANS.
Succeeded in creating all workers


## Get Slice Details (always execute)

The following code sets up data structures so all the follow-up cells work properly. Execute it regardless of whether you just created the slice or are coming back to an existing slice.

In [7]:
def find_net(net_list, name):
    """Return the first item of ``net_list`` whose ``get_name()`` equals
    ``name``, or None when there is no such item."""
    for candidate in net_list:
        if candidate.get_name() == name:
            return candidate
    return None

# this is the full /10 routable FABRIC IPv4
full_subnet = ipaddress.IPv4Network('10.128.0.0/10')

# fetch the slice by name and display its details
slice = fablib.get_slice(name=slice_name)

a = slice.show()
nets = slice.list_networks()
nodes = slice.list_nodes()

# node and network objects, grouped by site below for future convenience
net_objects = slice.get_networks()
node_objects = slice.get_nodes()

# site name -> {'nodes': set of node objects, 'net': that site's network}
slivers_by_site = {}

print('Arranging nodes and networks by site and getting available IP addresses')
for node in node_objects:
    node_site = node.get_site()
    if node_site not in slivers_by_site:
        # first node seen at this site: create its entry and look up its network
        slivers_by_site[node_site] = {
            'nodes': set(),
            'net': find_net(net_objects, make_net_name(node_site)),
        }
    slivers_by_site[node_site]['nodes'].add(node)

print('Listing IPv4 addresses per service (per site)')
for net in net_objects:
    print(f'{net.get_name()} has {net.get_subnet()}')


0,1
ID,7f21d307-d4a1-4a85-9841-74d8c4196f4d
Name,2-node U280 LB Tester Slice using ubuntu22 1
Lease Expiration (UTC),2025-02-28 05:24:40 +0000
Lease Start (UTC),2025-02-27 05:24:40 +0000
Project ID,bbe0d94c-736b-477a-a2e6-fef9fe7ac9ca
State,StableOK


ID,Name,Layer,Type,Site,Subnet,Gateway,State,Error
e3e50fa9-0aa3-4343-81a3-7c8244cdb618,fabnetv4_KANS,L3,FABNetv4,KANS,10.138.1.0/24,10.138.1.1,Active,


ID,Name,Cores,RAM,Disk,Image,Image Type,Host,Site,Username,Management IP,State,Error,SSH Command,Public SSH Key File,Private SSH Key File
22d67f3b-6897-4456-9bfe-a87d4e2ec98a,Sender_KANS,8,32,100,default_ubuntu_22,qcow2,kans-w2.fabric-testbed.net,KANS,ubuntu,2001:400:a100:3060:f816:3eff:fe37:d475,Active,,ssh -i /home/vscode/.ssh/fabric_local/silver -F /home/vscode/work/fabric_config/ssh_config ubuntu@2001:400:a100:3060:f816:3eff:fe37:d475,/home/vscode/.ssh/fabric_local/silver.pub,/home/vscode/.ssh/fabric_local/silver
79a149a2-134c-4779-bd2c-22304a772e29,Worker1_KANS,8,32,100,default_ubuntu_22,qcow2,kans-w2.fabric-testbed.net,KANS,ubuntu,2001:400:a100:3060:f816:3eff:fedd:79ae,Active,,ssh -i /home/vscode/.ssh/fabric_local/silver -F /home/vscode/work/fabric_config/ssh_config ubuntu@2001:400:a100:3060:f816:3eff:fedd:79ae,/home/vscode/.ssh/fabric_local/silver.pub,/home/vscode/.ssh/fabric_local/silver


Arranging nodes and networks by site and getting available IP addresses
Listing IPv4 addresses per service (per site)
fabnetv4_KANS has 10.138.1.0/24


## Perform network configuration

### Set up routing

In [11]:
def get_valid_ips(ip_list, gateway, qty=1):
    """Take ``qty`` addresses off the END of ``ip_list``, skipping the gateway.

    ``ip_list`` is consumed in place (every popped entry, including skipped
    gateway entries, is removed from it).

    :param ip_list: mutable list of candidate addresses
    :param gateway: address that must never be handed out
    :param qty: number of addresses wanted
    :return: list of chosen addresses, in the order they were popped
    :raises IndexError: when ``ip_list`` runs out before ``qty`` are found
    """
    chosen = []
    for _ in range(qty):
        candidate = ip_list.pop()
        # make sure the returned IP isn't the gateway
        while candidate == gateway:
            candidate = ip_list.pop()
        chosen.append(candidate)
    return chosen

# allocate IP addresses in each site network services
print('Allocating IP addresses in sites where slice is present')
for site_name, site_slivers in slivers_by_site.items():
    print(f'Processing {site_name}')
    site_net = site_slivers['net']
    site_nodes = site_slivers['nodes']
    # one host address per node; get_valid_ips pops from the end of the
    # host list and never hands out the gateway address
    site_slivers['ips'] = get_valid_ips(list(site_net.get_subnet().hosts()),
                                        site_net.get_gateway(),
                                        len(site_nodes))
    print(f'Using the following IPs: {site_slivers["ips"]}')

Allocating IP addresses in sites where slice is present
Processing KANS
Using the following IPs: [IPv4Address('10.138.1.254'), IPv4Address('10.138.1.253')]


In [12]:
# give every node its allocated dataplane address plus a route to the
# full FABRIC subnet through the site gateway
print('Configuring interfaces on nodes')
for site_name, site_slivers in slivers_by_site.items():
    print(f'Processing {site_name}')
    site_net = site_slivers['net']
    site_nodes = site_slivers['nodes']
    site_addrs = site_slivers['ips']
    # nodes and addresses are paired in iteration order
    for node, addr in zip(site_nodes, site_addrs):
        print(f'  Adding address {addr} to node {node.get_name()} in subnet {site_net.get_subnet()}')
        node_iface = node.get_interface(network_name=site_net.get_name())
        # make sure the interface is UP (in rare cases comes up in DOWN state)
        link_up_command = f'sudo ip link set {node_iface.get_os_interface()} up'
        execute_single_node(node, [link_up_command])
        node_iface.ip_addr_add(addr=addr, subnet=site_net.get_subnet())
        node.ip_route_add(subnet=full_subnet, gateway=site_net.get_gateway())


Configuring interfaces on nodes
Processing KANS
  Adding address 10.138.1.254 to node Sender_KANS in subnet 10.138.1.0/24
	Executing "sudo ip link set enp7s0 up" on node Sender_KANS
  Adding address 10.138.1.253 to node Worker1_KANS in subnet 10.138.1.0/24
	Executing "sudo ip link set enp7s0 up" on node Worker1_KANS


In [13]:
# We need to setup the firewall to allow UDP traffic to pass between the nodes
# we do this by moving all interfaces (loopback, management and data) to the 'trusted' zone
# since we are in FABNetv4 we can keep things simple

for site_name, site_slivers in slivers_by_site.items():
    print(f'Processing {site_name}')
    site_net = site_slivers['net']
    for node in site_slivers['nodes']:
        # install the firewall-cmd package if it is not already installed
        execute_single_node(node, ['sudo apt-get update',
                                   'sudo apt-get install -yq firewalld'])
        mgmt_iface_name = get_management_os_interface(node)
        data_iface = node.get_interface(network_name=site_net.get_name())
        data_iface_name = data_iface.get_os_interface()
        # data, loopback and management interfaces all go to the trusted zone
        commands = [
            f'sudo firewall-cmd --permanent --zone=trusted --add-interface={iface}'
            for iface in (data_iface_name, 'lo', mgmt_iface_name)
        ]
        commands += [
            # strip every service from the public zone
            'for i in $(sudo firewall-cmd --zone=public --list-services); do sudo firewall-cmd --zone=public --permanent --remove-service=$i; done',
            # apply the permanent rules, then show the resulting trusted zone
            'sudo firewall-cmd --reload',
            'sudo firewall-cmd --list-all --zone=trusted',
        ]
        execute_commands([node], commands)

Processing KANS
	Executing "sudo apt-get update" on node Sender_KANS
Get:1 http://nova.clouds.archive.ubuntu.com/ubuntu jammy InRelease [270 kB]
Get:2 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:3 http://nova.clouds.archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:4 http://nova.clouds.archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:5 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [2103 kB]
Get:6 http://nova.clouds.archive.ubuntu.com/ubuntu jammy/universe amd64 Packages [14.1 MB]
Get:7 http://security.ubuntu.com/ubuntu jammy-security/main Translation-en [328 kB]
Get:8 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [2879 kB]
Get:9 http://security.ubuntu.com/ubuntu jammy-security/restricted Translation-en [505 kB]
Get:10 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [965 kB]
Get:11 http://security.ubuntu.com/ubuntu jammy-security/universe Translation-en [

## Tune Buffers and MTUs

In order to have good performance we need to
- Make the UDP send/receive socket buffer size limit larger (applications are assumed to know how to make their buffers larger up to this limit)
- Set MTU to 9k and test with DF=0 ping

In [14]:
# setup UDP socket buffer sizes to 512M
udp_buffer_limit = 512 * 1024 * 1024
# raise both limits, then read them back as confirmation
commands = [f"sudo sysctl net.core.{knob}={udp_buffer_limit}" for knob in ("rmem_max", "wmem_max")]
commands.append("sysctl net.core.wmem_max net.core.rmem_max")

# walk the nodes
for site_name, site_slivers in slivers_by_site.items():
    for node in site_slivers['nodes']:
        execute_single_node(node, commands)

	Executing "sudo sysctl net.core.rmem_max=536870912" on node Sender_KANS
net.core.rmem_max = 536870912
	Executing "sudo sysctl net.core.wmem_max=536870912" on node Sender_KANS
net.core.wmem_max = 536870912
	Executing "sysctl net.core.wmem_max net.core.rmem_max" on node Sender_KANS
net.core.wmem_max = 536870912
net.core.rmem_max = 536870912
	Executing "sudo sysctl net.core.rmem_max=536870912" on node Worker1_KANS
net.core.rmem_max = 536870912
	Executing "sudo sysctl net.core.wmem_max=536870912" on node Worker1_KANS
net.core.wmem_max = 536870912
	Executing "sysctl net.core.wmem_max net.core.rmem_max" on node Worker1_KANS
net.core.wmem_max = 536870912
net.core.rmem_max = 536870912


In [15]:
# set 9k MTU on dataplane interfaces
mtu = 9000

for site_name, site_slivers in slivers_by_site.items():
    site_net = site_slivers['net']
    for node in site_slivers['nodes']:
        # the dataplane interface is the one attached to the site network
        data_iface = node.get_interface(network_name=site_net.get_name())
        data_iface_name = data_iface.get_os_interface()
        set_mtu_command = f"sudo ip link set dev {data_iface_name} mtu {mtu}"
        execute_single_node(node, [set_mtu_command])

	Executing "sudo ip link set dev enp7s0 mtu 9000" on node Sender_KANS
	Executing "sudo ip link set dev enp7s0 mtu 9000" on node Worker1_KANS


In [36]:
# run a test from every node to the LB node
# (jumbo-frame variant, currently disabled:
#  sudo ping -q -f -s 8972 -c 100 -M do <lb_node_ip>)
lb_ping_command = f"sudo ping -q -f -c 100 {lb_node_ip}"

for site_name, site_slivers in slivers_by_site.items():
    for node in site_slivers['nodes']:
        print(f'Node {node.get_name()} pinging {lb_node_ip}')
        execute_single_node(node, [lb_ping_command])

Node Worker1_KANS pinging 10.138.47.254
	Executing "sudo ping -q -f -c 100 10.138.47.254" on node Worker1_KANS
PING 10.138.47.254 (10.138.47.254) 56(84) bytes of data.

--- 10.138.47.254 ping statistics ---
100 packets transmitted, 0 received, 100% packet loss, time 1587ms

Node Sender_KANS pinging 10.138.47.254
	Executing "sudo ping -q -f -c 100 10.138.47.254" on node Sender_KANS
PING 10.138.47.254 (10.138.47.254) 56(84) bytes of data.

--- 10.138.47.254 ping statistics ---
100 packets transmitted, 0 received, 100% packet loss, time 1587ms



In [16]:
# Find sender and receiver nodes: the first node whose name contains
# 'Sender' and the first whose name contains 'Worker'
sender_name = None
receiver_name = None

for site_name, site_slivers in slivers_by_site.items():
    for node in site_slivers['nodes']:
        node_name = node.get_name()
        if not sender_name and 'Sender' in node_name:
            sender_name = node_name
            sender_net = site_slivers['net']
        if not receiver_name and 'Worker' in node_name:
            receiver_name = node_name
            receiver_net = site_slivers['net']
        if sender_name and receiver_name:
            break
    else:
        # inner loop ran to completion: at least one node is still missing
        continue
    # inner loop was left early because both nodes are known
    break

if sender_name and receiver_name:
    # Get sender and receiver IPs
    sender_ip = slice.get_node(sender_name).get_interface(network_name=sender_net.get_name()).get_ip_addr()
    receiver_ip = slice.get_node(receiver_name).get_interface(network_name=receiver_net.get_name()).get_ip_addr()

    # ping between sender and receiver nodes to verify connectivity
    print(f'Sender node {sender_name} pinging receiver {receiver_name}')
    execute_single_node(slice.get_node(sender_name), [f"sudo ping -q -f -c 100 {receiver_ip}"])
else:
    print("Error: Could not find both sender and worker nodes")





Sender node Sender_KANS pinging receiver Worker1_KANS
	Executing "sudo ping -q -f -c 100 10.138.1.253" on node Sender_KANS
PING 10.138.1.253 (10.138.1.253) 56(84) bytes of data.

--- 10.138.1.253 ping statistics ---
100 packets transmitted, 100 received, 0% packet loss, time 6ms
rtt min/avg/max/mdev = 0.038/0.049/0.479/0.047 ms, ipg/ewma 0.063/0.042 ms


# Prepare the nodes for running e2sar-docker

## Install docker on all nodes

In [17]:
# Install docker on all nodes
for site_name, site_slivers in slivers_by_site.items():
    for node in site_slivers['nodes']:
        execute_single_node(node, [
            'sudo apt-get install -yq docker.io',
            'sudo systemctl start docker',
            'sudo systemctl enable docker',
            # add the current user to the docker group - issued inside the
            # loop so that it runs on every node
            'sudo usermod -aG docker $USER',
        ])



	Executing "sudo apt-get install -yq docker.io" on node Sender_KANS
Reading package lists...
Building dependency tree...
Reading state information...
The following additional packages will be installed:
  bridge-utils containerd dns-root-data dnsmasq-base pigz runc ubuntu-fan
Suggested packages:
  ifupdown aufs-tools cgroupfs-mount | cgroup-lite debootstrap docker-buildx
  docker-compose-v2 docker-doc rinse zfs-fuse | zfsutils
The following NEW packages will be installed:
  bridge-utils containerd dns-root-data dnsmasq-base docker.io pigz runc
  ubuntu-fan
0 upgraded, 8 newly installed, 0 to remove and 66 not upgraded.
Need to get 78.7 MB of archives.
After this operation, 301 MB of additional disk space will be used.
Get:1 http://nova.clouds.archive.ubuntu.com/ubuntu jammy/universe amd64 pigz amd64 2.6-1 [63.6 kB]
Get:2 http://nova.clouds.archive.ubuntu.com/ubuntu jammy/main amd64 bridge-utils amd64 1.7-1ubuntu3 [34.4 kB]
Get:3 http://nova.clouds.archive.ubuntu.com/ubuntu jammy-update

## Pull the e2sar-docker image

In [27]:
# pull image from docker hub jlabtsai/e2sar-container:latest
# and tag it locally as 'e2sar-container'
e2sar_image = 'jlabtsai/e2sar-container:latest'
docker_image_commands = [
    f'sudo docker pull {e2sar_image}',
    f'sudo docker tag {e2sar_image} e2sar-container',
]

for site_name, site_slivers in slivers_by_site.items():
    for node in site_slivers['nodes']:
        execute_single_node(node, docker_image_commands)


	Executing "sudo docker pull jlabtsai/e2sar-container:latest" on node Sender_KANS
latest: Pulling from jlabtsai/e2sar-container
Digest: sha256:c5571cbf0c9983e113689e24471e8fe94df06305bb6cb0d47ace536b03deb0eb
Status: Image is up to date for jlabtsai/e2sar-container:latest
docker.io/jlabtsai/e2sar-container:latest
	Executing "sudo docker tag jlabtsai/e2sar-container:latest e2sar-container" on node Sender_KANS
	Executing "sudo docker pull jlabtsai/e2sar-container:latest" on node Worker1_KANS
latest: Pulling from jlabtsai/e2sar-container
Digest: sha256:c5571cbf0c9983e113689e24471e8fe94df06305bb6cb0d47ace536b03deb0eb
Status: Image is up to date for jlabtsai/e2sar-container:latest
docker.io/jlabtsai/e2sar-container:latest
	Executing "sudo docker tag jlabtsai/e2sar-container:latest e2sar-container" on node Worker1_KANS


## Prepare for running e2sar-docker

In [24]:
# Put a fresh checkout of DPU_K8s (branch e2sar-on-fabric) on every node:
# any earlier copy is removed first, then the branch is cloned again.
dpu_repo_cmds = (
    'rm -rf DPU_K8s',
    'git clone -b e2sar-on-fabric https://github.com/cissieAB/DPU_K8s.git',
)
all_nodes = [
    node
    for site_slivers in slivers_by_site.values()
    for node in site_slivers['nodes']
]
for node in all_nodes:
    for repo_cmd in dpu_repo_cmds:
        execute_single_node(node, [repo_cmd])


	Executing "rm -rf DPU_K8s" on node Sender_KANS
	Executing "git clone -b e2sar-on-fabric https://github.com/cissieAB/DPU_K8s.git" on node Sender_KANS
[31mCloning into 'DPU_K8s'...
[0m	Executing "rm -rf DPU_K8s" on node Worker1_KANS
	Executing "git clone -b e2sar-on-fabric https://github.com/cissieAB/DPU_K8s.git" on node Worker1_KANS
[31mCloning into 'DPU_K8s'...
[0m

# Run e2sar-docker sender and receiver

## Get IPs of sender and receiver nodes

In [None]:
def _first_node_with_ip(name_fragment):
    """Return ``(node, ip)`` for the first node whose name contains *name_fragment*.

    Sites are scanned in the iteration order of ``slivers_by_site``. The IP
    address is read from the node's interface on that site's network
    (``site_slivers['net']``). The search stops at the first match, so a
    matching node at a later site never overrides an earlier one.

    Returns ``(None, None)`` when no node name contains *name_fragment*.
    """
    for site_slivers in slivers_by_site.values():
        for node in site_slivers['nodes']:
            if name_fragment in node.get_name():
                # Get IPv4 address from the interface attached to the site network
                node_iface = node.get_interface(
                    network_name=site_slivers['net'].get_name()
                )
                return node, node_iface.get_ip_addr()
    return None, None


# Get the sender node (with "Sender" in name)
sender_node, sender_ip = _first_node_with_ip('Sender')
if sender_node is None:
    raise Exception("Could not find Sender node")

# Get worker node for receiver (first node with "Worker" in name)
receiver_node, receiver_ip = _first_node_with_ip('Worker')
if receiver_node is None:
    raise Exception("Could not find node for receiver")

print(f"Sender IP: {sender_ip}")
print(f"Receiver IP: {receiver_ip}")

## Run e2sar-docker sender and receiver

In [None]:


# Clear out containers left over from a previous run
for stale_node, stale_container in (
    (sender_node, 'e2sar-sender'),
    (receiver_node, 'e2sar-receiver'),
):
    execute_single_node(stale_node, [f'sudo docker rm -f {stale_container}'])

# Both roles are launched through deploy.sh (run with sudo) from the
# container directory of the DPU_K8s checkout
deploy_prefix = 'cd DPU_K8s/run-E2SAR/container && sudo ./deploy.sh'

# Receiver goes first, on the receiver (worker) node
receiver_cmd = ' '.join([
    deploy_prefix,
    'receiver',
    '--direct',
    f'--ip {receiver_ip}',
    '--threads 4',
    '--duration 300',
])
execute_single_node(receiver_node, [receiver_cmd])

# Give the receiver a few seconds to initialize before sending
time.sleep(5)

# Then the sender, on the sender node, targeting the receiver's address
sender_cmd = ' '.join([
    deploy_prefix,
    'sender',
    '--direct',
    f'--ip {sender_ip}',
    f'--target-ip {receiver_ip}',
    '--rate 20',
    '--length 2000000',
])
execute_single_node(sender_node, [sender_cmd])


## Check logs

In [41]:
# Print the docker logs of both containers: receiver first, then sender.
# Each entry is (banner to print, node to query, container name).
log_targets = (
    ("=== Receiver Logs ===", receiver_node, 'e2sar-receiver'),
    ("\n=== Sender Logs ===", sender_node, 'e2sar-sender'),
)
for banner, log_node, container_name in log_targets:
    print(banner)
    execute_single_node(log_node, [f'sudo docker logs {container_name}'])


=== Receiver Logs ===
	Executing "sudo docker logs e2sar-receiver" on node Worker1_KANS
E2SAR Version: 0.1.4
Control plane will be OFF
Using Unassigned Threads
Will run for 300 sec
*** Make sure the URI reflects proper data address, other parts are ignored.
Receiving on ports 19522:19525
Stats:
	Events Received: 0
	Events Mangled: 0
	Events Lost: 0
	Data Errors: 0
	gRPC Errors: 0
	Events lost so far: 
Stats:
	Events Received: 0
	Events Mangled: 0
	Events Lost: 0
	Data Errors: 0
	gRPC Errors: 0
	Events lost so far: 
Stats:
	Events Received: 0
	Events Mangled: 0
	Events Lost: 0
	Data Errors: 0
	gRPC Errors: 0
	Events lost so far: 
Stats:
	Events Received: 0
	Events Mangled: 0
	Events Lost: 0
	Data Errors: 0
	gRPC Errors: 0
	Events lost so far: 
Stats:
	Events Received: 0
	Events Mangled: 0
	Events Lost: 0
	Data Errors: 0
	gRPC Errors: 0
	Events lost so far: 
Stats:
	Events Received: 0
	Events Mangled: 0
	Events Lost: 0
	Data Errors: 0
	gRPC Errors: 0
	Events lost so far: 
Stats:
	Events 

# Manage the slice

### Extend by two weeks

In [None]:
# New end date: 14 days from the current UTC time, rendered as
# "YYYY-MM-DD HH:MM:SS <utc offset>"
renew_until = datetime.now(timezone.utc) + timedelta(days=14)
end_date = renew_until.strftime("%Y-%m-%d %H:%M:%S %z")

# Look the slice up by name and ask for the new end date; any failure is
# reported as a message rather than a traceback.
try:
    slice = fablib.get_slice(name=slice_name)
    slice.renew(end_date)
except Exception as e:
    print(f"Exception: {e}")

### Delete

In [None]:
# Look the slice up by name and delete it. Run this only when the slice
# is no longer needed.
slice = fablib.get_slice(name=slice_name)
slice.delete()