Skip to content

Commit

Permalink
Consolidate CI time handling
Browse files Browse the repository at this point in the history
  • Loading branch information
abejgonzalez committed Dec 9, 2022
1 parent 7015094 commit 36d8b7e
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 37 deletions.
41 changes: 16 additions & 25 deletions .github/scripts/cull-old-ci-instances.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,12 @@
# instances that have exceeded a lifetime limit

import datetime
from typing import Iterable, Tuple, Any
from xmlrpc.client import DateTime
import pytz
import boto3
import sys

from platform_lib import Platform
from platform_lib import Platform, find_timed_out_resources
from common import get_platform_lib
from github_common import deregister_runners

Expand All @@ -23,48 +22,40 @@
# The number of hours a fpga instance may exist since its initial launch time
FPGA_INSTANCE_LIFETIME_LIMIT_HOURS = 1

def find_timed_out_resources(
    hr_limit: int,
    current_time: datetime.datetime,
    resource_list: Iterable[Tuple[Any, datetime.datetime]],
) -> list:
    """
    Return the resources that have existed for longer than `hr_limit` hours.

    Because of the differences in how AWS and Azure store time tags, the
    resource_list in this case is an iterable of tuples with the 0 index being
    the instance/vm and the 1 index a datetime object corresponding to the time
    the lifetime is measured from (callers pass the launch time).

    Args:
        hr_limit: Maximum allowed lifetime, in hours.
        current_time: The reference "now" used to compute each lifetime.
        resource_list: Iterable of (resource, datetime) tuples.

    Returns:
        The 0-index element of every tuple whose lifetime is strictly greater
        than the limit, in input order.

    Raises:
        TypeError: If `current_time` and a tuple's datetime mix timezone-naive
            and timezone-aware values (datetime subtraction rejects that).
    """
    # Hoist the loop-invariant limit; timedelta comparison is exact.
    limit = datetime.timedelta(hours=hr_limit)
    return [
        entry[0]
        for entry in resource_list
        if current_time - entry[1] > limit
    ]

def cull_aws_instances(current_time: DateTime) -> None:
# Grab all instances with a CI-generated tag
aws_platform_lib = get_platform_lib(Platform.AWS)
all_ci_instances = aws_platform_lib.find_all_ci_instances()
run_farm_ci_instances = aws_platform_lib.find_run_farm_ci_instances()

client = boto3.client('ec2')

instances_to_terminate = find_timed_out_resources(FPGA_INSTANCE_LIFETIME_LIMIT_HOURS, current_time, map(lambda x: (x, x['LaunchTime']), run_farm_ci_instances))
instances_to_terminate += find_timed_out_resources(INSTANCE_LIFETIME_LIMIT_HOURS, current_time, map(lambda x: (x, x['LaunchTime']), all_ci_instances))
instances_to_terminate = list(set(instances_to_terminate))
run_farm_ci_instances = aws_platform_lib.find_run_farm_ci_instances()
instances_to_terminate = find_timed_out_resources(FPGA_INSTANCE_LIFETIME_LIMIT_HOURS * 60, current_time, map(lambda x: (x, x['LaunchTime']), run_farm_ci_instances))
run_farm_instances_to_terminate = list(set(instances_to_terminate))
print("Terminated Run Farm Instances:")
for inst in run_farm_instances_to_terminate:
client.terminate_instances(InstanceIds=[inst['InstanceId']])
print(" " + inst['InstanceId'])

print("Terminated Instances:")
for inst in instances_to_terminate:
all_ci_instances = aws_platform_lib.find_all_ci_instances()
instances_to_terminate = find_timed_out_resources(INSTANCE_LIFETIME_LIMIT_HOURS * 60, current_time, map(lambda x: (x, x['LaunchTime']), all_ci_instances))
manager_instances_to_terminate = list(set(instances_to_terminate))
print("Terminated Manager Instances:")
for inst in manager_instances_to_terminate:
deregister_runners(ci_env['PERSONAL_ACCESS_TOKEN'], f"aws-{ci_env['GITHUB_RUN_ID']}")
client.terminate_instances(InstanceIds=[inst['InstanceId']])
print(" " + inst['InstanceId'])

if len(instances_to_terminate) > 0:
if len(manager_instances_to_terminate) > 0 or len(run_farm_instances_to_terminate) > 0:
exit(1)

def cull_azure_resources(current_time: DateTime) -> None:
azure_platform_lib = get_platform_lib(Platform.AZURE)
all_azure_ci_vms = azure_platform_lib.find_all_ci_instances()
run_farm_azure_ci_vms = azure_platform_lib.find_run_farm_ci_instances()

vms_to_terminate = find_timed_out_resources(FPGA_INSTANCE_LIFETIME_LIMIT_HOURS, current_time, \
vms_to_terminate = find_timed_out_resources(FPGA_INSTANCE_LIFETIME_LIMIT_HOURS * 60, current_time, \
map(lambda x: (x, datetime.datetime.strptime(x['LaunchTime'],'%Y-%m-%d %H:%M:%S.%f%z')), run_farm_azure_ci_vms))
vms_to_terminate += find_timed_out_resources(INSTANCE_LIFETIME_LIMIT_HOURS, current_time, \
vms_to_terminate += find_timed_out_resources(INSTANCE_LIFETIME_LIMIT_HOURS * 60, current_time, \
map(lambda x: (x, datetime.datetime.strptime(x['LaunchTime'],'%Y-%m-%d %H:%M:%S.%f%z')), all_azure_ci_vms))
vms_to_terminate = list(set(vms_to_terminate))

Expand Down
37 changes: 25 additions & 12 deletions .github/scripts/platform_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import pytz
import datetime
import requests
from xmlrpc.client import DateTime

from ci_variables import ci_env
from github_common import issue_post
Expand All @@ -17,7 +18,7 @@
from azure.mgmt.compute import ComputeManagementClient
import azure.mgmt.resourcegraph as arg

from typing import Any, Callable, Dict, List
from typing import Any, Callable, Dict, List, Iterable, Tuple

# Reuse manager utilities
# Note: GITHUB_WORKSPACE must not be used here because the persistent clone may not be initialized yet.
Expand Down Expand Up @@ -51,6 +52,19 @@ def get_platform_enum(platform_string: str) -> Platform:
else:
raise Exception(f"Invalid platform string: '{platform_string}'")

def find_timed_out_resources(
    min_timeout: int,
    current_time: datetime.datetime,
    resource_list: Iterable[Tuple[Any, datetime.datetime]],
) -> list:
    """
    Return the resources that have existed for longer than `min_timeout` minutes.

    Because of the differences in how AWS and Azure store time tags, the
    resource_list in this case is an iterable of tuples with the 0 index being
    the instance/vm and the 1 index a datetime object corresponding to the time
    the lifetime is measured from (callers pass the launch time).

    Args:
        min_timeout: Maximum allowed lifetime, in minutes.
        current_time: The reference "now" used to compute each lifetime.
        resource_list: Iterable of (resource, datetime) tuples.

    Returns:
        The 0-index element of every tuple whose lifetime is strictly greater
        than the timeout, in input order.

    Raises:
        TypeError: If `current_time` and a tuple's datetime mix timezone-naive
            and timezone-aware values (datetime subtraction rejects that).
    """
    # Hoist the loop-invariant limit; timedelta comparison is exact.
    limit = datetime.timedelta(minutes=min_timeout)
    return [
        entry[0]
        for entry in resource_list
        if current_time - entry[1] > limit
    ]

class PlatformLib(metaclass=abc.ABCMeta):
"""
This is a class hierarchy to support multiple platforms in FireSim CI
Expand Down Expand Up @@ -273,21 +287,20 @@ def check_and_terminate_run_farm_instances(self, timeout: int, workflow_tag: str
self.client = boto3.client('ec2')

instances = self.find_run_farm_ci_instances(workflow_tag)
terminated_insts = False
for inst in instances:
if (datetime.datetime.now() - inst['LaunchTime']) >= datetime.timedelta(minutes=timeout):
print("Uncaught run farm instance shutdown detected")

instids = [ inst.instance_id ]
self.client.terminate_instances(InstanceIds=instids, DryRun=False)
instances_to_terminate = find_timed_out_resources(
timeout,
datetime.datetime.utcnow().replace(tzinfo=pytz.UTC),
map(lambda x: (x, x['LaunchTime']), instances))

print(f"Terminated run farm instance {instids}")
terminated_insts = True
for inst in instances_to_terminate:
print("Uncaught run farm instance shutdown detected")
self.client.terminate_instances(InstanceIds=[inst['InstanceId']])
print(f"Terminated run farm instance {inst['InstanceId']}")

# post comment after instances are terminated just in case there is an issue with posting
if terminated_insts:
if len(instances_to_terminate) > 0:
issue_post(ci_env['PERSONAL_ACCESS_TOKEN'],
f"Uncaught FPGA instance shutdown detected for CI run: {ci_env['GITHUB_RUN_ID']}. Verify CI state before submitting PR.")
f"Uncaught {len(instances_to_terminate)} FPGA instance shutdown(s) detected for CI run: {ci_env['GITHUB_RUN_ID']}. Verify CI state before submitting PR.")


class AzurePlatformLib(PlatformLib):
Expand Down

0 comments on commit 36d8b7e

Please sign in to comment.