CI Rework: More Aggressive Culling Of FPGA Resources, Slack/PR Notifications #1316

Merged: 9 commits, Dec 10, 2022
142 changes: 89 additions & 53 deletions .github/scripts/ci_variables.py
@@ -1,61 +1,97 @@
import os
from typing import TypedDict

# This package contains utilities that rely on environment variable
# definitions present only on the CI container instance.

# If not running under a CI pipeline, defaults are provided that
# will suffice to run scripts that do not use GHA API calls.
# To manually provide environment variable settings, export GITHUB_ACTIONS=true
# and provide values for all of the environment variables below.
RUN_LOCAL = not os.environ.get('GITHUB_ACTIONS', False)
RUN_AZURE_CREDITED_ENV = bool(os.environ.get('AZURE_CREDITED_ENV', False))
# environment variables needed by CI
class CIEnvironment(TypedDict):
# If not running under a CI pipeline, defaults are provided that
# will suffice to run scripts that do not use GHA API calls.
# To manually provide environment variable settings, export GITHUB_ACTIONS=true
# and provide values for all of the environment variables listed.
GITHUB_ACTIONS: str

# This is used as a unique tag for all instances launched in a workflow
GITHUB_RUN_ID: str

GITHUB_SHA: str

# Multiple clones of the FireSim repository exist on the manager. We expect state
# to persist between jobs in a workflow and facilitate that by having jobs run
# out of a centralized clone (MANAGER_FIRESIM_LOCATION) -- not the default clones set up
# by the GHA runners (GITHUB_WORKSPACE)

# This is the location of the clone set up by the GHA runner infrastructure by default
# expanduser to replace the ~ present in the default, for portability
GITHUB_WORKSPACE: str

# This is the location of the reused clone. CI scripts should refer to variables
# derived from this path so that they may be reused across workflows that may
# initialize the FireSim repository differently (e.g., as a submodule of a
# larger project).
MANAGER_FIRESIM_LOCATION: str

GITHUB_TOKEN: str
PERSONAL_ACCESS_TOKEN: str
GITHUB_API_URL: str

# We look this up, instead of hardcoding "firesim/firesim", to support running
# this CI pipeline under forks.
GITHUB_REPOSITORY: str

GITHUB_EVENT_PATH: str

# The following are environment variables used by AWS and Azure to set up the
# corresponding self-hosted GitHub Actions runners
AWS_ACCESS_KEY_ID: str
AWS_SECRET_ACCESS_KEY: str
AWS_DEFAULT_REGION: str
AZURE_CLIENT_ID: str
AZURE_CLIENT_SECRET: str
AZURE_TENANT_ID: str
AZURE_SUBSCRIPTION_ID: str
AZURE_DEFAULT_REGION: str
AZURE_RESOURCE_GROUP: str
AZURE_CI_SUBNET_ID: str
AZURE_CI_NSG_ID: str

FIRESIM_PEM: str
FIRESIM_PEM_PUBLIC: str

RUN_LOCAL = os.environ.get('GITHUB_ACTIONS', 'false') == 'false'
# When running locally (not in a CI pipeline) run commands out of the clone hosting this file.
local_fsim_dir = os.path.normpath((os.path.realpath(__file__)) + "/../../..")

# CI instance environment variables

# This is used as a unique tag for all instances launched in a workflow
ci_workflow_run_id = os.environ['GITHUB_RUN_ID'] if not RUN_LOCAL else 0
ci_commit_sha1 = os.environ['GITHUB_SHA'] if not RUN_LOCAL else 0

# Multiple clones of the FireSim repository exist on the manager. We expect state
# to persist between jobs in a workflow and facilitate that by having jobs run
# out of a centralized clone (ci_firesim_dir) -- not the default clones set up by
# the GHA runners (ci_workdir)

# This is the location of the clone set up by the GHA runner infrastructure by default
# expanduser to replace the ~ present in the default, for portability
ci_workdir = os.path.expanduser(os.environ['GITHUB_WORKSPACE']) if not RUN_LOCAL else local_fsim_dir

# This is the location of the reused clone. CI scripts should refer to variables
# derived from this path so that they may be reused across workflows that may
# initialize the FireSim repository differently (e.g., as a submodule of a
# larger project).
ci_firesim_dir = os.path.expanduser(os.environ['MANAGER_FIRESIM_LOCATION']) if not RUN_LOCAL else local_fsim_dir

ci_api_token = os.environ['GITHUB_TOKEN'] if not RUN_LOCAL else 0
ci_personal_api_token = os.environ['PERSONAL_ACCESS_TOKEN'] if not RUN_LOCAL else 0

ci_gha_api_url = os.environ['GITHUB_API_URL'] if not RUN_LOCAL else ""
# We look this up, instead of hardcoding "firesim/firesim", to support running
# this CI pipeline under forks.
ci_repo_name = os.environ['GITHUB_REPOSITORY'] if not RUN_LOCAL else ""

# The following are environment variables used by AWS and Azure to set up the
# corresponding self-hosted GitHub Actions runners

ci_aws_access_key_id = os.environ['AWS_ACCESS_KEY_ID'] if not RUN_LOCAL else ""
ci_aws_secret_access_key = os.environ['AWS_SECRET_ACCESS_KEY'] if not RUN_LOCAL else ""
ci_aws_default_region = os.environ['AWS_DEFAULT_REGION'] if not RUN_LOCAL else ""

ci_azure_client_id = os.environ['AZURE_CLIENT_ID'] if (not RUN_LOCAL) or RUN_AZURE_CREDITED_ENV else ""
ci_azure_client_secret = os.environ['AZURE_CLIENT_SECRET'] if (not RUN_LOCAL) or RUN_AZURE_CREDITED_ENV else ""
ci_azure_tenant_id = os.environ['AZURE_TENANT_ID'] if (not RUN_LOCAL) or RUN_AZURE_CREDITED_ENV else ""
ci_azure_sub_id = os.environ['AZURE_SUBSCRIPTION_ID'] if (not RUN_LOCAL) or RUN_AZURE_CREDITED_ENV else ""
ci_azure_default_region = os.environ['AZURE_DEFAULT_REGION'] if not RUN_LOCAL else ""
ci_azure_resource_group = os.environ['AZURE_RESOURCE_GROUP'] if not RUN_LOCAL else ""
ci_azure_subnet_id = os.environ['AZURE_CI_SUBNET_ID'] if not RUN_LOCAL else ""
ci_azure_nsg_id = os.environ['AZURE_CI_NSG_ID'] if not RUN_LOCAL else ""

ci_firesim_pem = os.environ['FIRESIM_PEM'] if not RUN_LOCAL else ""
ci_firesim_pem_public = os.environ['FIRESIM_PEM_PUBLIC'] if not RUN_LOCAL else ""
def get_ci_value(env_var: str, default_value: str = "") -> str:
if RUN_LOCAL:
return default_value
else:
return os.environ[env_var]

# Create an env dict that is populated from the environment or from defaults
ci_env: CIEnvironment = {
'GITHUB_ACTIONS': 'false' if RUN_LOCAL else 'true',
'GITHUB_RUN_ID': get_ci_value('GITHUB_RUN_ID'),
'GITHUB_SHA': get_ci_value('GITHUB_SHA'),
'GITHUB_WORKSPACE': os.path.expanduser(os.environ['GITHUB_WORKSPACE']) if not RUN_LOCAL else local_fsim_dir,
'MANAGER_FIRESIM_LOCATION': os.path.expanduser(os.environ['MANAGER_FIRESIM_LOCATION']) if not RUN_LOCAL else local_fsim_dir,
'GITHUB_TOKEN': get_ci_value('GITHUB_TOKEN'),
'PERSONAL_ACCESS_TOKEN': get_ci_value('PERSONAL_ACCESS_TOKEN'),
'GITHUB_API_URL': get_ci_value('GITHUB_API_URL'),
'GITHUB_REPOSITORY': get_ci_value('GITHUB_REPOSITORY'),
'GITHUB_EVENT_PATH': get_ci_value('GITHUB_EVENT_PATH'),
'AWS_ACCESS_KEY_ID': get_ci_value('AWS_ACCESS_KEY_ID'),
'AWS_SECRET_ACCESS_KEY': get_ci_value('AWS_SECRET_ACCESS_KEY'),
'AWS_DEFAULT_REGION': get_ci_value('AWS_DEFAULT_REGION'),
'AZURE_CLIENT_ID': get_ci_value('AZURE_CLIENT_ID'),
'AZURE_CLIENT_SECRET': get_ci_value('AZURE_CLIENT_SECRET'),
'AZURE_TENANT_ID': get_ci_value('AZURE_TENANT_ID'),
'AZURE_SUBSCRIPTION_ID': get_ci_value('AZURE_SUBSCRIPTION_ID'),
'AZURE_DEFAULT_REGION': get_ci_value('AZURE_DEFAULT_REGION'),
'AZURE_RESOURCE_GROUP': get_ci_value('AZURE_RESOURCE_GROUP'),
'AZURE_CI_SUBNET_ID': get_ci_value('AZURE_CI_SUBNET_ID'),
'AZURE_CI_NSG_ID': get_ci_value('AZURE_CI_NSG_ID'),
'FIRESIM_PEM': get_ci_value('FIRESIM_PEM'),
'FIRESIM_PEM_PUBLIC': get_ci_value('FIRESIM_PEM_PUBLIC'),
}
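The practical effect of this refactor is that downstream CI scripts read one typed dict instead of a dozen module-level globals. A minimal sketch of a consumer (the script itself is hypothetical and not part of this PR; ci_env and RUN_LOCAL are as defined above):

# Hypothetical consumer script showing the new access pattern.
from ci_variables import ci_env, RUN_LOCAL

def unique_workflow_tag(platform: str) -> str:
    # GITHUB_RUN_ID uniquely tags every instance launched in a workflow.
    return f"{platform}-{ci_env['GITHUB_RUN_ID']}"

if not RUN_LOCAL:
    print(f"tag: {unique_workflow_tag('aws')}, clone: {ci_env['MANAGER_FIRESIM_LOCATION']}")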
54 changes: 5 additions & 49 deletions .github/scripts/common.py
@@ -1,21 +1,16 @@
import math
from fabric.api import *
import requests
from ci_variables import ci_gha_api_url, ci_repo_name, ci_firesim_dir

from typing import Dict, List, Any
from typing import Dict

from platform_lib import PlatformLib, AWSPlatformLib, AzurePlatformLib, Platform

# Github URL related constants
gha_api_url = f"{ci_gha_api_url}/repos/{ci_repo_name}/actions"
gha_runners_api_url = f"{gha_api_url}/runners"
gha_runs_api_url = f"{gha_api_url}/runs"
from ci_variables import ci_env
from github_common import deregister_runners

# Remote paths
manager_home_dir = "/home/centos"
manager_fsim_pem = manager_home_dir + "/firesim.pem"
manager_fsim_dir = ci_firesim_dir
manager_fsim_dir = ci_env['MANAGER_FIRESIM_LOCATION']
manager_marshal_dir = manager_fsim_dir + "/sw/firesim-software"
manager_ci_dir = manager_fsim_dir + "/.github/scripts"

@@ -27,48 +22,9 @@
env.disable_known_hosts = True
env.keepalive = 60 # keep long SSH connections running

def set_fabric_firesim_pem():
def set_fabric_firesim_pem() -> None:
env.key_filename = manager_fsim_pem

def get_header(gh_token: str) -> Dict[str, str]:
return {"Authorization": f"token {gh_token.strip()}", "Accept": "application/vnd.github+json"}

def get_runners(gh_token: str) -> List:
r = requests.get(gha_runners_api_url, headers=get_header(gh_token))
if r.status_code != 200:
raise Exception(f"Unable to retrieve count of GitHub Actions Runners\nFull Response Below:\n{r}")
res_dict = r.json()
runner_count = res_dict["total_count"]

runners = []
for page_idx in range(math.ceil(runner_count / 30)):
r = requests.get(gha_runners_api_url, params={"per_page" : 30, "page" : page_idx + 1}, headers=get_header(gh_token))
if r.status_code != 200:
raise Exception(f"Unable to retrieve (sub)list of GitHub Actions Runners\nFull Response Below\n{r}")
res_dict = r.json()
runners = runners + res_dict["runners"]

return runners

def delete_runner(gh_token: str, runner: Dict[str, Any]) -> bool:
r = requests.delete(f"""{gha_runners_api_url}/{runner["id"]}""", headers=get_header(gh_token))
if r.status_code != 204:
print(f"""Unable to delete runner {runner["name"]} with id: {runner["id"]}\nFull Response Below\n{r}""")
return False
return True

def deregister_offline_runners(gh_token: str) -> None:
runners = get_runners(gh_token)
for runner in runners:
if runner["status"] == "offline":
delete_runner(gh_token, runner)

def deregister_runners(gh_token: str, runner_name: str) -> None:
runners = get_runners(gh_token)
for runner in runners:
if runner_name in runner["name"]:
delete_runner(gh_token, runner)

aws_platform_lib = AWSPlatformLib(deregister_runners)
#azure_platform_lib = AzurePlatformLib(deregister_runners)
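The new github_common module these scripts now import is not included in this excerpt. A plausible sketch of its contents, reconstructed from the helpers deleted from common.py above (the module layout is an assumption; the bodies mirror the removed code):

# github_common.py -- sketch reconstructed from the removed common.py helpers
import math
import requests
from typing import Any, Dict, List

from ci_variables import ci_env

gha_runners_api_url = f"{ci_env['GITHUB_API_URL']}/repos/{ci_env['GITHUB_REPOSITORY']}/actions/runners"

def get_header(gh_token: str) -> Dict[str, str]:
    return {"Authorization": f"token {gh_token.strip()}", "Accept": "application/vnd.github+json"}

def get_runners(gh_token: str) -> List:
    # GitHub paginates this endpoint 30 runners at a time.
    r = requests.get(gha_runners_api_url, headers=get_header(gh_token))
    r.raise_for_status()
    runner_count = r.json()["total_count"]
    runners = []
    for page_idx in range(math.ceil(runner_count / 30)):
        r = requests.get(gha_runners_api_url, params={"per_page": 30, "page": page_idx + 1},
                         headers=get_header(gh_token))
        r.raise_for_status()
        runners += r.json()["runners"]
    return runners

def delete_runner(gh_token: str, runner: Dict[str, Any]) -> bool:
    r = requests.delete(f"{gha_runners_api_url}/{runner['id']}", headers=get_header(gh_token))
    if r.status_code != 204:
        print(f"Unable to delete runner {runner['name']} with id: {runner['id']}")
        return False
    return True

def deregister_offline_runners(gh_token: str) -> None:
    for runner in get_runners(gh_token):
        if runner["status"] == "offline":
            delete_runner(gh_token, runner)

def deregister_runners(gh_token: str, runner_name: str) -> None:
    for runner in get_runners(gh_token):
        if runner_name in runner["name"]:
            delete_runner(gh_token, runner)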

62 changes: 35 additions & 27 deletions .github/scripts/cull-old-ci-instances.py
@@ -4,61 +4,69 @@
# instances that have exceeded a lifetime limit

import datetime
from typing import Iterable, Tuple, Any
from xmlrpc.client import DateTime
import pytz
import boto3
import sys
from platform_lib import Platform
from common import deregister_runners, get_platform_lib

from platform_lib import Platform, find_timed_out_resources
from common import get_platform_lib
from github_common import deregister_runners

# Reuse manager utilities
from ci_variables import ci_workdir, ci_personal_api_token, ci_workflow_run_id, ci_azure_sub_id
sys.path.append(ci_workdir + "/deploy")
from ci_variables import ci_env
sys.path.append(ci_env['GITHUB_WORKSPACE'] + "/deploy")

# The number of hours an instance may exist since its initial launch time
# The number of hours a manager instance may exist since its initial launch time
INSTANCE_LIFETIME_LIMIT_HOURS = 8

def find_timed_out_resources(current_time: DateTime, resource_list: Iterable[Tuple]) -> list:
"""
Because of the differences in how AWS and Azure store time tags, the resource_list
in this case is a list of tuples with the 0 index being the instance/vm and the 1 index
a datetime object corresponding to the time
"""
timed_out = []
for resource_tuple in resource_list:
lifetime_secs = (current_time - resource_tuple[1]).total_seconds()
if lifetime_secs > (INSTANCE_LIFETIME_LIMIT_HOURS * 3600):
timed_out.append(resource_tuple[0])
return timed_out
# The number of hours an FPGA instance may exist since its initial launch time
FPGA_INSTANCE_LIFETIME_LIMIT_HOURS = 1
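
The updated find_timed_out_resources now lives in platform_lib (per the import above) and takes the limit as its first argument, in minutes. Its new body is not shown in this diff; a plausible sketch, inferred from the removed definition above and the new call sites below:

# Sketch only -- the actual platform_lib definition is not in this excerpt.
import datetime
from typing import Any, Iterable, List, Tuple

def find_timed_out_resources(limit_minutes: int, current_time: datetime.datetime,
                             resource_list: Iterable[Tuple[Any, datetime.datetime]]) -> List[Any]:
    # Each entry pairs an instance/VM with its launch time, since AWS and
    # Azure store launch-time tags differently.
    timed_out = []
    for resource, launch_time in resource_list:
        if (current_time - launch_time).total_seconds() > limit_minutes * 60:
            timed_out.append(resource)
    return timed_out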

def cull_aws_instances(current_time: DateTime) -> None:
# Grab all instances with a CI-generated tag
aws_platform_lib = get_platform_lib(Platform.AWS)
all_ci_instances = aws_platform_lib.find_all_ci_instances()

client = boto3.client('ec2')

instances_to_terminate = find_timed_out_resources(current_time, map(lambda x: (x, x['LaunchTime']), all_ci_instances))
run_farm_ci_instances = aws_platform_lib.find_run_farm_ci_instances()
instances_to_terminate = find_timed_out_resources(FPGA_INSTANCE_LIFETIME_LIMIT_HOURS * 60, current_time, map(lambda x: (x, x['LaunchTime']), run_farm_ci_instances))
run_farm_instances_to_terminate = list(set(instances_to_terminate))
print("Terminated Run Farm Instances:")
for inst in run_farm_instances_to_terminate:
client.terminate_instances(InstanceIds=[inst['InstanceId']])
print(" " + inst['InstanceId'])

print("Terminated Instances:")
for inst in instances_to_terminate:
deregister_runners(ci_personal_api_token, f"aws-{ci_workflow_run_id}")
all_ci_instances = aws_platform_lib.find_all_ci_instances()
instances_to_terminate = find_timed_out_resources(INSTANCE_LIFETIME_LIMIT_HOURS * 60, current_time, map(lambda x: (x, x['LaunchTime']), all_ci_instances))
manager_instances_to_terminate = list(set(instances_to_terminate))
print("Terminated Manager Instances:")
for inst in manager_instances_to_terminate:
deregister_runners(ci_env['PERSONAL_ACCESS_TOKEN'], f"aws-{ci_env['GITHUB_RUN_ID']}")
client.terminate_instances(InstanceIds=[inst['InstanceId']])
print(" " + inst['InstanceId'])

if len(manager_instances_to_terminate) > 0 or len(run_farm_instances_to_terminate) > 0:
exit(1)

def cull_azure_resources(current_time: DateTime) -> None:
azure_platform_lib = get_platform_lib(Platform.AZURE)
all_azure_ci_vms = azure_platform_lib.find_all_ci_instances()
run_farm_azure_ci_vms = azure_platform_lib.find_run_farm_ci_instances()

vms_to_terminate = find_timed_out_resources(current_time, \
vms_to_terminate = find_timed_out_resources(FPGA_INSTANCE_LIFETIME_LIMIT_HOURS * 60, current_time, \
map(lambda x: (x, datetime.datetime.strptime(x['LaunchTime'],'%Y-%m-%d %H:%M:%S.%f%z')), run_farm_azure_ci_vms))
vms_to_terminate += find_timed_out_resources(INSTANCE_LIFETIME_LIMIT_HOURS * 60, current_time, \
map(lambda x: (x, datetime.datetime.strptime(x['LaunchTime'],'%Y-%m-%d %H:%M:%S.%f%z')), all_azure_ci_vms))
vms_to_terminate = list(set(vms_to_terminate))

print("VMs:")
print("Terminated VMs:")
for vm in vms_to_terminate:
deregister_runners(ci_personal_api_token, f"azure-{ci_workflow_run_id}")
deregister_runners(ci_env['PERSONAL_ACCESS_TOKEN'], f"azure-{ci_env['GITHUB_RUN_ID']}")
azure_platform_lib.terminate_azure_vms([vm])  # prints are handled in here

if len(vms_to_terminate) > 0:
exit(1)

def main():
# Get a timezone-aware datetime instance
current_time = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
6 changes: 3 additions & 3 deletions .github/scripts/cull-old-ci-runners.py
@@ -3,14 +3,14 @@
# Runs periodically in its own workflow in the CI/CD environment to tear down
# runners that are offline

from common import deregister_offline_runners
from github_common import deregister_offline_runners

# Reuse manager utilities
from ci_variables import ci_personal_api_token
from ci_variables import ci_env

def main():
# deregister all offline runners
deregister_offline_runners(ci_personal_api_token)
deregister_offline_runners(ci_env['PERSONAL_ACCESS_TOKEN'])

if __name__ == "__main__":
main()