In [85]:
import time
import json
import yaml
import boto3
import base64
import logging
import paramiko
from scp import SCPClient
from utils import *
from botocore.exceptions import NoCredentialsError, ClientError

instance_id_list = []


In [3]:
def load_yaml_file(file_path):
    """
    Load and parse a YAML file.

    Args:
        file_path (str): The path to the YAML file to be read.

    Returns:
        dict: Parsed content of the YAML file as a dictionary.
    """
    with open(file_path, "r") as file:
        try:
            data = yaml.safe_load(file)
            return data
        except yaml.YAMLError as error:
            print(f"Error reading the YAML file: {error}")
            return None


In [4]:
yaml_file_path = "config.yml"
config_data = load_yaml_file(yaml_file_path)


In [5]:
print(config_data)


{'aws': {'region': 'us-east-1', 'iam_instance_profile_arn': 'arn:aws:iam::471112568442:instance-profile/EC2'}, 'run_steps': {'security_group_creation': True, 'key_pair_generation': False, 'deploy_ec2_instance': True, 'delete_ec2_instance': True}, 'security_group': {'group_name': 'ec2_multi_deploy', 'description': 'MultiDeploy EC2 Security Group', 'vpc_id': None}, 'key_pair_gen': {'key_pair_name': 'ec2_multi_deploy_kp', 'key_pair_fpath': 'ec2_multi_deploy_kp.pem'}, 'instances': [{'instance_type': 'g5.2xlarge', 'ami_id': 'ami-05c3e698bd0cffe7e', 'startup_script': 'startup_scripts/gpu_ubuntu_startup.txt', 'command_to_run': 'fmbench --config-file src/fmbench/configs/llama3/8b/config-ec2-llama3-8b.yml --local-mode yes --write-bucket placeholder --tmp-dir /tmp > fmbench.log 2>&1'}]}


In [38]:
# Define a dictionary for common AMIs and their corresponding usernames
AMI_USERNAME_MAP = {
    'ami-': 'ec2-user',    # Amazon Linux AMIs start with 'ami-'
    'ubuntu': 'ubuntu',    # Ubuntu AMIs contain 'ubuntu' in their name
}


In [12]:
if config_data["run_steps"]["security_group_creation"]:
    GROUP_NAME = config_data["security_group"].get("group_name")
    DESCRIPTION = config_data["security_group"].get("description", " ")
    VPC_ID = config_data["security_group"].get("vpc_id", "")
    try:
        sg_id = create_security_group(GROUP_NAME, DESCRIPTION, VPC_ID)

        if sg_id:
            # Add inbound rules if security group was created successfully
            authorize_inbound_rules(sg_id)
    except ClientError as e:
        print(f"An error occurred while creating or getting the security group: {e}")


Security Group 'ec2_multi_deploy' already exists. Fetching existing security group ID.
Inbound rule already exists for Security Group sg-013c79f5ffe82fb52. Skipping...


In [13]:
if config_data["run_steps"]["key_pair_generation"]:
    KEY_PAIR_NAME = config_data["key_pair_gen"]["key_pair_name"]
    private_key = create_key_pair(KEY_PAIR_NAME)
elif config_data["run_steps"]["key_pair_generation"] == False:
    KEY_PAIR_NAME = config_data["key_pair_gen"]["key_pair_name"]
    PRIVATE_KEY_FNAME = config_data["key_pair_gen"]["key_pair_fpath"]
    try:
        with open(f"{PRIVATE_KEY_FNAME}", "r") as file:
            private_key = file.read()
    except FileNotFoundError:
        print(f"File not found: {PRIVATE_KEY_FNAME}")
    except IOError as e:
        print(f"Error reading file {PRIVATE_KEY_FNAME}: {e}")


In [14]:
for i in config_data["instances"]:
    print(i)


{'instance_type': 'g5.2xlarge', 'ami_id': 'ami-05c3e698bd0cffe7e', 'startup_script': 'startup_scripts/gpu_ubuntu_startup.txt', 'command_to_run': 'fmbench --config-file src/fmbench/configs/llama3/8b/config-ec2-llama3-8b.yml --local-mode yes --write-bucket placeholder --tmp-dir /tmp > fmbench.log 2>&1'}


In [66]:
if config_data["run_steps"]["deploy_ec2_instance"]:
    iam_arn = config_data["aws"]["iam_instance_profile_arn"]
    print(iam_arn)
    # WIP Parallelize This.
    for instance in config_data["instances"]:
        instance_type = instance["instance_type"]
        ami_id = instance["ami_id"]
        startup_script = instance["startup_script"]
        command_to_run = instance["command_to_run"]
        with open(f"{startup_script}", "r") as file:
            user_data_script = file.read()
        # user_data_script += command_to_run
        # Create an EC2 instance with the user data script
        instance_id = create_ec2_instance(
            KEY_PAIR_NAME,
            sg_id,
            user_data_script,
            ami_id,
            instance_type,
            iam_arn,
            
        )
        instance_id_list.append(instance_id)
    
    
    # check_and_retrieve_results_folders(instance_details, PRIVATE_KEY_FNAME, "results-output")


arn:aws:iam::471112568442:instance-profile/EC2
EC2 Instance 'i-04c4c6a522a4e97c1' created successfully with user data.


In [69]:
instance_details = generate_instance_details(instance_id_list, PRIVATE_KEY_FNAME)


In [70]:
instance_details


[{'hostname': 'ec2-54-166-132-170.compute-1.amazonaws.com',
  'username': 'ubuntu',
  'key_file_path': 'ec2_multi_deploy_kp.pem'}]

In [82]:
command = '''cd foundation-model-benchmarking-tool && 
            conda init &&
            source ~/.bashrc &&
            conda activate fmbench_python311 && 
            fmbench --config-file src/fmbench/configs/llama3/8b/config-ec2-llama3-8b.yml --local-mode yes --write-bucket placeholder --tmp-dir /tmp > fmbench.log 2>&1'''
results = run_command_on_instances(instance_details, PRIVATE_KEY_FNAME, command)
for hostname, result in results.items():
    print(f"Results for {hostname}:")
    print(f"Command Output:\n{result['stdout']}")
    if result['stderr']:
        print(f"Command Error:\n{result['stderr']}")
    print(f"Exit Status: {result['exit_status']}")


Running command on ec2-54-166-132-170.compute-1.amazonaws.com as ubuntu...
Connected to ec2-54-166-132-170.compute-1.amazonaws.com as ubuntu
Results for ec2-54-166-132-170.compute-1.amazonaws.com:
Command Output:
no change     /opt/conda/condabin/conda
no change     /opt/conda/bin/conda
no change     /opt/conda/bin/conda-env
no change     /opt/conda/bin/activate
no change     /opt/conda/bin/deactivate
no change     /opt/conda/etc/profile.d/conda.sh
no change     /opt/conda/etc/fish/conf.d/conda.fish
no change     /opt/conda/shell/condabin/Conda.psm1
no change     /opt/conda/shell/condabin/conda-hook.ps1
no change     /opt/conda/lib/python3.10/site-packages/xontrib/conda.xsh
no change     /opt/conda/etc/profile.d/conda.csh
no change     /home/ubuntu/.bashrc
No action taken.

Command Error:

CondaError: Run 'conda init' before 'conda activate'


Exit Status: 1


In [18]:
instance_id_list


[]

In [49]:
check_and_retrieve_results_folders(instance_details, PRIVATE_KEY_FNAME, "results-output")


Connected to ec2-3-239-112-152.compute-1.amazonaws.com as ubuntu
No matching folders found on ec2-3-239-112-152.compute-1.amazonaws.com: ls: cannot access '/home/ubuntu/foundation-model-benchmarking-tool/results-*': No such file or directory


In [84]:
if config_data["run_steps"]["delete_ec2_instance"]:
    for instance_id in instance_id_list:
        delete_ec2_instance(instance_id)
    instance_id_list = []
