Skip to content

Commit

Permalink
Merge pull request #1316 from firesim/fix-ci-leakages
Browse files Browse the repository at this point in the history
CI Rework: More Aggressive Culling Of FPGA Resources, Slack/PR Notifications
  • Loading branch information
abejgonzalez committed Dec 10, 2022
2 parents 873da7d + 46880ce commit 35f4f5a
Show file tree
Hide file tree
Showing 21 changed files with 460 additions and 274 deletions.
142 changes: 89 additions & 53 deletions .github/scripts/ci_variables.py
Original file line number Diff line number Diff line change
@@ -1,61 +1,97 @@
import os
from typing import TypedDict

# This package contains utilities that rely on environment variable
# definitions present only on the CI container instance.

# If not running under a CI pipeline defaults are provided that
# will suffice to run scripts that do not use GHA API calls.
# To manually provide environment variable settings, export GITHUB_ACTIONS=true, and provide
# values for all of the environment variables below.
# NOTE(review): legacy pre-refactor flags from the removed side of the diff.
# RUN_LOCAL is redefined later with an explicit string comparison; the bool()
# wrapper here is truthy for ANY non-empty AZURE_CREDITED_ENV value, including
# the string "false" -- confirm intended semantics with callers.
RUN_LOCAL = not os.environ.get('GITHUB_ACTIONS', False)
RUN_AZURE_CREDITED_ENV = bool(os.environ.get('AZURE_CREDITED_ENV', False))
# environment variables needed by CI
class CIEnvironment(TypedDict):
    """Typed description of the environment variables the CI scripts consume.

    If not running under a CI pipeline, defaults are provided that will
    suffice to run scripts that do not use GHA API calls.  To manually
    provide environment variable settings, export GITHUB_ACTIONS=true, and
    provide values for all of the environment variables listed.
    """

    # 'true' when executing inside a GitHub Actions pipeline, 'false' otherwise
    GITHUB_ACTIONS: str

    # This is used as a unique tag for all instances launched in a workflow
    GITHUB_RUN_ID: str

    # Commit SHA the workflow is running against
    GITHUB_SHA: str

    # Multiple clones of the FireSim repository exist on the manager. We expect state
    # to persist between jobs in a workflow and facilitate that by having jobs run
    # out of a centralized clone (MANAGER_FIRESIM_LOCATION) -- not the default clones
    # set up by the GHA runners (GITHUB_WORKSPACE)

    # This is the location of the clone setup by the GHA runner infrastructure by default
    # expanduser to replace the ~ present in the default, for portability
    GITHUB_WORKSPACE: str

    # This is the location of the reused clone. CI scripts should refer to variables
    # derived from this path so that they may be reused across workflows that may
    # initialize the FireSim repository differently (e.g., as a submodule of a
    # larger project.)
    MANAGER_FIRESIM_LOCATION: str

    # Tokens for GHA API calls. NOTE(review): PERSONAL_ACCESS_TOKEN is presumably
    # broader-scoped than the workflow-provided GITHUB_TOKEN -- verify against
    # the repository's configured secrets.
    GITHUB_TOKEN: str
    PERSONAL_ACCESS_TOKEN: str
    GITHUB_API_URL: str

    # We look this up, instead of hardcoding "firesim/firesim", to support running
    # this CI pipeline under forks.
    GITHUB_REPOSITORY: str

    # Path to the triggering event's payload file (per GHA convention)
    GITHUB_EVENT_PATH: str

    # The following are environment variables used by AWS and AZURE to setup the
    # corresponding self-hosted Github Actions Runners
    AWS_ACCESS_KEY_ID: str
    AWS_SECRET_ACCESS_KEY: str
    AWS_DEFAULT_REGION: str
    AZURE_CLIENT_ID: str
    AZURE_CLIENT_SECRET: str
    AZURE_TENANT_ID: str
    AZURE_SUBSCRIPTION_ID: str
    AZURE_DEFAULT_REGION: str
    AZURE_RESOURCE_GROUP: str
    AZURE_CI_SUBNET_ID: str
    AZURE_CI_NSG_ID: str

    # SSH private/public key material used to reach CI-launched instances
    FIRESIM_PEM: str
    FIRESIM_PEM_PUBLIC: str

# Detect whether we are outside a CI pipeline: GitHub Actions exports
# GITHUB_ACTIONS='true'; anything else (or unset) counts as a local run only
# when it is literally 'false'.
_gha_flag = os.environ.get('GITHUB_ACTIONS', 'false')
RUN_LOCAL = _gha_flag == 'false'

# When running locally (not in a CI pipeline) run commands out of the clone hosting this file.
# Three levels up from .github/scripts/ci_variables.py is the repository root.
local_fsim_dir = os.path.normpath(os.path.join(os.path.realpath(__file__), "..", "..", ".."))

# CI instance environment variables
#
# NOTE(review): this is the legacy flat-variable interface (removed side of the
# diff); the same values are exposed through the ci_env dict below. The local
# fallbacks are inconsistent (0 for some variables, "" for others) -- treat
# them purely as "unset" sentinels.

# This is used as a unique tag for all instances launched in a workflow
ci_workflow_run_id = os.environ['GITHUB_RUN_ID'] if not RUN_LOCAL else 0
ci_commit_sha1 = os.environ['GITHUB_SHA'] if not RUN_LOCAL else 0

# Multiple clones of the FireSim repository exists on manager. We expect state
# to persist between jobs in a workflow and facilitate that by having jobs run
# out of a centralized clone (ci_firesim_dir)-- not the default clones setup by
# the GHA runners (ci_workdir)

# This is the location of the clone setup by the GHA runner infrastructure by default
# expanduser to replace the ~ present in the default, for portability
ci_workdir = os.path.expanduser(os.environ['GITHUB_WORKSPACE']) if not RUN_LOCAL else local_fsim_dir

# This is the location of the reused clone. CI scripts should refer variables
# derived from this path so that they may be reused across workflows that may
# initialize the FireSim repository differently (e.g., as a submodule of a
# larger project.)
ci_firesim_dir = os.path.expanduser(os.environ['MANAGER_FIRESIM_LOCATION']) if not RUN_LOCAL else local_fsim_dir

# GitHub API tokens (0 locally, where GHA API calls are not expected to work)
ci_api_token = os.environ['GITHUB_TOKEN'] if not RUN_LOCAL else 0
ci_personal_api_token = os.environ['PERSONAL_ACCESS_TOKEN'] if not RUN_LOCAL else 0

ci_gha_api_url = os.environ['GITHUB_API_URL'] if not RUN_LOCAL else ""
# We look this up, instead of hardcoding "firesim/firesim", to support running
# this CI pipeline under forks.
ci_repo_name = os.environ['GITHUB_REPOSITORY'] if not RUN_LOCAL else ""

# The following are environment variables used by AWS and AZURE to setup the corresponding
# self-hosted Github Actions Runners

ci_aws_access_key_id = os.environ['AWS_ACCESS_KEY_ID'] if not RUN_LOCAL else ""
ci_aws_secret_access_key = os.environ['AWS_SECRET_ACCESS_KEY'] if not RUN_LOCAL else ""
ci_aws_default_region = os.environ['AWS_DEFAULT_REGION'] if not RUN_LOCAL else ""

# Azure credentials are also required locally when RUN_AZURE_CREDITED_ENV is set
ci_azure_client_id = os.environ['AZURE_CLIENT_ID'] if (not RUN_LOCAL) or RUN_AZURE_CREDITED_ENV else ""
ci_azure_client_secret = os.environ['AZURE_CLIENT_SECRET'] if (not RUN_LOCAL) or RUN_AZURE_CREDITED_ENV else ""
ci_azure_tenant_id = os.environ['AZURE_TENANT_ID'] if (not RUN_LOCAL) or RUN_AZURE_CREDITED_ENV else ""
ci_azure_sub_id = os.environ['AZURE_SUBSCRIPTION_ID'] if (not RUN_LOCAL) or RUN_AZURE_CREDITED_ENV else ""
ci_azure_default_region = os.environ['AZURE_DEFAULT_REGION'] if not RUN_LOCAL else ""
ci_azure_resource_group = os.environ['AZURE_RESOURCE_GROUP'] if not RUN_LOCAL else ""
ci_azure_subnet_id = os.environ['AZURE_CI_SUBNET_ID'] if not RUN_LOCAL else ""
ci_azure_nsg_id = os.environ['AZURE_CI_NSG_ID'] if not RUN_LOCAL else ""

# SSH key material for reaching CI-launched instances
ci_firesim_pem = os.environ['FIRESIM_PEM'] if not RUN_LOCAL else ""
ci_firesim_pem_public = os.environ['FIRESIM_PEM_PUBLIC'] if not RUN_LOCAL else ""
def get_ci_value(env_var: str, default_value: str = "") -> str:
    """Return os.environ[env_var] under CI, or *default_value* when running locally.

    Under a real CI run a missing variable raises KeyError (fail loudly rather
    than silently substituting a default).
    """
    return default_value if RUN_LOCAL else os.environ[env_var]

# Create an env. dict that is populated from the environment or from defaults.
# Keys mirror CIEnvironment; local runs fall back to "" (or the local clone
# path for the workspace entries) so scripts that skip GHA API calls still work.
ci_env: CIEnvironment = {
    'GITHUB_ACTIONS': 'false' if RUN_LOCAL else 'true',
    'GITHUB_RUN_ID': get_ci_value('GITHUB_RUN_ID'),
    # BUG FIX: this previously read GITHUB_RUN_ID, so the GITHUB_SHA entry held
    # the workflow run id instead of the commit SHA.
    'GITHUB_SHA': get_ci_value('GITHUB_SHA'),
    'GITHUB_WORKSPACE': os.path.expanduser(os.environ['GITHUB_WORKSPACE']) if not RUN_LOCAL else local_fsim_dir,
    'MANAGER_FIRESIM_LOCATION': os.path.expanduser(os.environ['MANAGER_FIRESIM_LOCATION']) if not RUN_LOCAL else local_fsim_dir,
    'GITHUB_TOKEN': get_ci_value('GITHUB_TOKEN'),
    'PERSONAL_ACCESS_TOKEN': get_ci_value('PERSONAL_ACCESS_TOKEN'),
    'GITHUB_API_URL': get_ci_value('GITHUB_API_URL'),
    'GITHUB_REPOSITORY': get_ci_value('GITHUB_REPOSITORY'),
    'GITHUB_EVENT_PATH': get_ci_value('GITHUB_EVENT_PATH'),
    'AWS_ACCESS_KEY_ID': get_ci_value('AWS_ACCESS_KEY_ID'),
    'AWS_SECRET_ACCESS_KEY': get_ci_value('AWS_SECRET_ACCESS_KEY'),
    'AWS_DEFAULT_REGION': get_ci_value('AWS_DEFAULT_REGION'),
    'AZURE_CLIENT_ID': get_ci_value('AZURE_CLIENT_ID'),
    'AZURE_CLIENT_SECRET': get_ci_value('AZURE_CLIENT_SECRET'),
    'AZURE_TENANT_ID': get_ci_value('AZURE_TENANT_ID'),
    'AZURE_SUBSCRIPTION_ID': get_ci_value('AZURE_SUBSCRIPTION_ID'),
    'AZURE_DEFAULT_REGION': get_ci_value('AZURE_DEFAULT_REGION'),
    'AZURE_RESOURCE_GROUP': get_ci_value('AZURE_RESOURCE_GROUP'),
    'AZURE_CI_SUBNET_ID': get_ci_value('AZURE_CI_SUBNET_ID'),
    'AZURE_CI_NSG_ID': get_ci_value('AZURE_CI_NSG_ID'),
    'FIRESIM_PEM': get_ci_value('FIRESIM_PEM'),
    'FIRESIM_PEM_PUBLIC': get_ci_value('FIRESIM_PEM_PUBLIC'),
}
54 changes: 5 additions & 49 deletions .github/scripts/common.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,16 @@
import math
from fabric.api import *
import requests
from ci_variables import ci_gha_api_url, ci_repo_name, ci_firesim_dir

from typing import Dict, List, Any
from typing import Dict

from platform_lib import PlatformLib, AWSPlatformLib, AzurePlatformLib, Platform

# Github URL related constants
gha_api_url = f"{ci_gha_api_url}/repos/{ci_repo_name}/actions"
gha_runners_api_url = f"{gha_api_url}/runners"
gha_runs_api_url = f"{gha_api_url}/runs"
from ci_variables import ci_env
from github_common import deregister_runners

# Remote paths on the CI manager instance.
# NOTE(review): "/home/centos" implies the manager image's login user is
# 'centos' -- confirm against the AMI used by the run farm.
manager_home_dir = "/home/centos"
manager_fsim_pem = manager_home_dir + "/firesim.pem"
# NOTE(review): duplicate assignment below is an artifact of the scraped diff
# (old line followed by its replacement); the ci_env-based form is current.
manager_fsim_dir = ci_firesim_dir
manager_fsim_dir = ci_env['MANAGER_FIRESIM_LOCATION']
# FireMarshal checkout and the CI scripts directory inside the shared clone
manager_marshal_dir = manager_fsim_dir + "/sw/firesim-software"
manager_ci_dir = manager_fsim_dir + "/.github/scripts"

Expand All @@ -27,48 +22,9 @@
env.disable_known_hosts = True
env.keepalive = 60 # keep long SSH connections running

def set_fabric_firesim_pem():
def set_fabric_firesim_pem() -> None:
    """Point Fabric's SSH key at the FireSim CI private key on the manager."""
    env.key_filename = manager_fsim_pem

def get_header(gh_token: str) -> Dict[str, str]:
    """Build the standard GitHub REST API request headers for *gh_token*.

    The token is stripped of surrounding whitespace (tokens read from files
    often carry a trailing newline).
    """
    token = gh_token.strip()
    headers = {
        "Authorization": "token " + token,
        "Accept": "application/vnd.github+json",
    }
    return headers

def get_runners(gh_token: str) -> List:
    """Return all self-hosted GHA runners registered for this repository.

    First queries the runners endpoint for ``total_count``, then pages through
    the listing 30 entries at a time. Raises ``Exception`` on any non-200
    response.
    """
    r = requests.get(gha_runners_api_url, headers=get_header(gh_token))
    if r.status_code != 200:
        raise Exception(f"Unable to retrieve count of GitHub Actions Runners\nFull Response Below:\n{r}")
    res_dict = r.json()
    # total_count determines how many per_page=30 requests are needed
    runner_count = res_dict["total_count"]

    runners = []
    for page_idx in range(math.ceil(runner_count / 30)):
        r = requests.get(gha_runners_api_url, params={"per_page" : 30, "page" : page_idx + 1}, headers=get_header(gh_token))
        if r.status_code != 200:
            raise Exception(f"Unable to retrieve (sub)list of GitHub Actions Runners\nFull Response Below\n{r}")
        res_dict = r.json()
        # NOTE(review): if runners register/deregister between page fetches the
        # result may miss or duplicate entries -- acceptable for cleanup usage
        runners = runners + res_dict["runners"]

    return runners

def delete_runner(gh_token: str, runner: Dict[str, Any]) -> bool:
    """Deregister one GHA runner; return True on success (HTTP 204).

    Failures are reported to stdout rather than raised, so bulk cleanup can
    continue past individual errors.
    """
    resp = requests.delete(f"""{gha_runners_api_url}/{runner["id"]}""", headers=get_header(gh_token))
    if resp.status_code == 204:
        return True
    print(f"""Unable to delete runner {runner["name"]} with id: {runner["id"]}\nFull Response Below\n{resp}""")
    return False

def deregister_offline_runners(gh_token: str) -> None:
    """Delete every registered runner whose status is "offline"."""
    offline = [r for r in get_runners(gh_token) if r["status"] == "offline"]
    for runner in offline:
        delete_runner(gh_token, runner)

def deregister_runners(gh_token: str, runner_name: str) -> None:
    """Delete every registered runner whose name contains *runner_name*.

    Substring match, so a workflow-level tag (e.g. "aws-<run id>") removes all
    runners it spawned.
    """
    matching = [r for r in get_runners(gh_token) if runner_name in r["name"]]
    for runner in matching:
        delete_runner(gh_token, runner)

# Shared AWS helper; deregister_runners is passed in as a callback, presumably
# so platform_lib need not import this module directly -- confirm.
aws_platform_lib = AWSPlatformLib(deregister_runners)
#azure_platform_lib = AzurePlatformLib(deregister_runners)

Expand Down
62 changes: 35 additions & 27 deletions .github/scripts/cull-old-ci-instances.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,61 +4,69 @@
# instances that have exceeded a lifetime limit

import datetime
from typing import Iterable, Tuple, Any
from xmlrpc.client import DateTime
import pytz
import boto3
import sys
from platform_lib import Platform
from common import deregister_runners, get_platform_lib

from platform_lib import Platform, find_timed_out_resources
from common import get_platform_lib
from github_common import deregister_runners

# Reuse manager utilities
from ci_variables import ci_workdir, ci_personal_api_token, ci_workflow_run_id, ci_azure_sub_id
sys.path.append(ci_workdir + "/deploy")
from ci_variables import ci_env
sys.path.append(ci_env['GITHUB_WORKSPACE'] + "/deploy")

# The number of hours a manager instance may exist since its initial launch time
INSTANCE_LIFETIME_LIMIT_HOURS = 8

def find_timed_out_resources(current_time: DateTime, resource_list: Iterable[Tuple]) -> list:
    """Return the resources whose lifetime exceeds INSTANCE_LIFETIME_LIMIT_HOURS.

    Because of the differences in how AWS and Azure store time tags,
    *resource_list* is an iterable of (resource, launch_datetime) tuples; the
    returned list contains only the resource elements.
    """
    limit_secs = INSTANCE_LIFETIME_LIMIT_HOURS * 3600
    return [resource
            for resource, launched in resource_list
            if (current_time - launched).total_seconds() > limit_secs]
# The number of hours an FPGA (run farm) instance may exist since its initial
# launch time -- deliberately much shorter than the manager limit above
FPGA_INSTANCE_LIFETIME_LIMIT_HOURS = 1

def cull_aws_instances(current_time: DateTime) -> None:
    """Terminate CI-tagged AWS instances that have outlived their limits.

    Run-farm (FPGA) instances are culled after FPGA_INSTANCE_LIFETIME_LIMIT_HOURS;
    manager instances after INSTANCE_LIFETIME_LIMIT_HOURS. Exits with status 1
    when anything was terminated, so the culling workflow is visibly flagged.

    NOTE(review): the scraped diff interleaved pre- and post-change lines here
    (two competing find_timed_out_resources calls and a stray print); this body
    reconstructs the coherent post-change version.
    """
    aws_platform_lib = get_platform_lib(Platform.AWS)
    client = boto3.client('ec2')

    # Cull run-farm (FPGA) instances first -- the limit argument appears to be
    # in minutes (hours * 60); confirm against platform_lib's signature
    run_farm_ci_instances = aws_platform_lib.find_run_farm_ci_instances()
    instances_to_terminate = find_timed_out_resources(FPGA_INSTANCE_LIFETIME_LIMIT_HOURS * 60, current_time, map(lambda x: (x, x['LaunchTime']), run_farm_ci_instances))
    run_farm_instances_to_terminate = list(set(instances_to_terminate))
    print("Terminated Run Farm Instances:")
    for inst in run_farm_instances_to_terminate:
        client.terminate_instances(InstanceIds=[inst['InstanceId']])
        print(" " + inst['InstanceId'])

    # Then cull manager instances (anything with a CI-generated tag),
    # deregistering their GHA runners before termination
    all_ci_instances = aws_platform_lib.find_all_ci_instances()
    instances_to_terminate = find_timed_out_resources(INSTANCE_LIFETIME_LIMIT_HOURS * 60, current_time, map(lambda x: (x, x['LaunchTime']), all_ci_instances))
    manager_instances_to_terminate = list(set(instances_to_terminate))
    print("Terminated Manager Instances:")
    for inst in manager_instances_to_terminate:
        deregister_runners(ci_env['PERSONAL_ACCESS_TOKEN'], f"aws-{ci_env['GITHUB_RUN_ID']}")
        client.terminate_instances(InstanceIds=[inst['InstanceId']])
        print(" " + inst['InstanceId'])

    # Non-zero exit marks the workflow as failed when leaked resources existed
    if len(manager_instances_to_terminate) > 0 or len(run_farm_instances_to_terminate) > 0:
        exit(1)

def cull_azure_resources(current_time: DateTime) -> None:
    """Terminate CI-tagged Azure VMs that have outlived their limits.

    Run-farm VMs are culled after FPGA_INSTANCE_LIFETIME_LIMIT_HOURS; all other
    CI VMs after INSTANCE_LIFETIME_LIMIT_HOURS. Exits with status 1 when
    anything was terminated, so the culling workflow is visibly flagged.

    NOTE(review): the scraped diff interleaved the old two-argument
    find_timed_out_resources call (with a trailing line continuation) into the
    new code, leaving the span syntactically broken; this body reconstructs the
    coherent post-change version.
    """
    azure_platform_lib = get_platform_lib(Platform.AZURE)
    all_azure_ci_vms = azure_platform_lib.find_all_ci_instances()
    run_farm_azure_ci_vms = azure_platform_lib.find_run_farm_ci_instances()

    # Azure stores launch time as a string tag, so parse it before comparing.
    # The limit argument appears to be in minutes (hours * 60) -- confirm.
    vms_to_terminate = find_timed_out_resources(FPGA_INSTANCE_LIFETIME_LIMIT_HOURS * 60, current_time, \
        map(lambda x: (x, datetime.datetime.strptime(x['LaunchTime'],'%Y-%m-%d %H:%M:%S.%f%z')), run_farm_azure_ci_vms))
    vms_to_terminate += find_timed_out_resources(INSTANCE_LIFETIME_LIMIT_HOURS * 60, current_time, \
        map(lambda x: (x, datetime.datetime.strptime(x['LaunchTime'],'%Y-%m-%d %H:%M:%S.%f%z')), all_azure_ci_vms))
    # Deduplicate: a run-farm VM exceeding both limits would otherwise appear twice
    vms_to_terminate = list(set(vms_to_terminate))

    print("Terminated VMs:")
    for vm in vms_to_terminate:
        deregister_runners(ci_env['PERSONAL_ACCESS_TOKEN'], f"azure-{ci_env['GITHUB_RUN_ID']}")
        azure_platform_lib.terminate_azure_vms([vm]) #prints are handled in here

    # Non-zero exit marks the workflow as failed when leaked resources existed
    if len(vms_to_terminate) > 0:
        exit(1)

def main():
# Get a timezone-aware datetime instance
current_time = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
Expand Down
6 changes: 3 additions & 3 deletions .github/scripts/cull-old-ci-runners.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@
# Runs periodically in it's own workflow in the CI/CD environment to teardown
# runners that are offline

from common import deregister_offline_runners
from github_common import deregister_offline_runners

# Reuse manager utilities
from ci_variables import ci_personal_api_token
from ci_variables import ci_env

def main():
    """Entry point: deregister every GHA runner currently reporting offline."""
    # deregister all offline runners
    deregister_offline_runners(ci_env['PERSONAL_ACCESS_TOKEN'])

if __name__ == "__main__":
    main()

0 comments on commit 35f4f5a

Please sign in to comment.