In [30]:
import time
import json
import yaml
import boto3
import base64
import logging
import paramiko
from utils import *
from scp import SCPClient
from collections import defaultdict
from botocore.exceptions import NoCredentialsError, ClientError

# IDs of the EC2 instances launched by the deploy cell further down.
instance_id_list = []
# One {instance_id: fmbench_config_path} dict per launched instance.
fmbench_config_map = []


In [21]:
def load_yaml_file(file_path):
    """
    Read a YAML document from disk and return its parsed content.

    Args:
        file_path (str): Location of the YAML file.

    Returns:
        The object produced by ``yaml.safe_load`` (a dict for a mapping
        document), or None when the file content is not valid YAML.
    """
    with open(file_path, "r") as yaml_stream:
        try:
            return yaml.safe_load(yaml_stream)
        except yaml.YAMLError as parse_error:
            # Parsing problems are reported, not raised; callers get None.
            print(f"Error reading the YAML file: {parse_error}")
    return None


In [22]:
# Deployment configuration file; the relative path is resolved against the working directory.
yaml_file_path = "config.yml"
# Parsed configuration. load_yaml_file returns None when the YAML cannot be parsed,
# in which case the config_data[...] lookups in later cells will fail.
config_data = load_yaml_file(yaml_file_path)


In [23]:
# NOTE(review): the printed configuration includes the IAM instance-profile ARN, which
# embeds the AWS account ID — consider clearing this output before sharing the notebook.
print(config_data)


{'aws': {'region': 'us-east-1', 'iam_instance_profile_arn': 'arn:aws:iam::471112568442:instance-profile/EC2'}, 'run_steps': {'security_group_creation': True, 'key_pair_generation': False, 'deploy_ec2_instance': True, 'delete_ec2_instance': True}, 'security_group': {'group_name': 'ec2_multi_deploy', 'description': 'MultiDeploy EC2 Security Group', 'vpc_id': None}, 'key_pair_gen': {'key_pair_name': 'ec2_multi_deploy_kp', 'key_pair_fpath': 'ec2_multi_deploy_kp.pem'}, 'instances': [{'instance_type': 'g5.2xlarge', 'ami_id': 'ami-05c3e698bd0cffe7e', 'startup_script': 'startup_scripts/gpu_ubuntu_startup.txt', 'fmbench_config': 'src/fmbench/configs/llama3/8b/config-ec2-llama3-8b.yml'}]}


In [25]:
# Maps an AMI identifier fragment to the default SSH login user for that image.
# NOTE(review): this mapping is not referenced anywhere else in the visible notebook.
AMI_USERNAME_MAP = {
    # NOTE(review): the Ubuntu image configured for this notebook also has an ID that
    # starts with 'ami-', so this key alone cannot identify Amazon Linux — confirm how
    # the lookup is meant to work.
    "ami-": "ec2-user",
    "ubuntu": "ubuntu",  # presumably matched against the AMI name — TODO confirm
}


In [7]:
# Create the security group (or reuse the existing one) and open its inbound rules.
# NOTE: sg_id is only bound when this step is enabled and create_security_group
# returns without raising; later cells rely on it.
if config_data["run_steps"]["security_group_creation"]:
    security_group_cfg = config_data["security_group"]
    GROUP_NAME = security_group_cfg.get("group_name")
    DESCRIPTION = security_group_cfg.get("description", " ")
    VPC_ID = security_group_cfg.get("vpc_id", "")
    try:
        sg_id = create_security_group(GROUP_NAME, DESCRIPTION, VPC_ID)
        # Inbound rules are only added when a group ID was actually obtained.
        if sg_id:
            authorize_inbound_rules(sg_id)
    except ClientError as client_error:
        print(
            f"An error occurred while creating or getting the security group: {client_error}"
        )


Security Group 'ec2_multi_deploy' already exists. Fetching existing security group ID.
Inbound rule already exists for Security Group sg-013c79f5ffe82fb52. Skipping...


In [8]:
# Key-pair settings are read unconditionally: later cells use PRIVATE_KEY_FNAME
# (instance details, remote commands, result retrieval) regardless of whether the
# key pair is generated here or loaded from disk.
KEY_PAIR_NAME = config_data["key_pair_gen"]["key_pair_name"]
PRIVATE_KEY_FNAME = config_data["key_pair_gen"]["key_pair_fpath"]

# Stays None when the key material could not be obtained, so the name is always bound.
private_key = None

if config_data["run_steps"]["key_pair_generation"]:
    # NOTE(review): presumably create_key_pair also stores the key at
    # PRIVATE_KEY_FNAME — confirm against utils, since later cells read that path.
    private_key = create_key_pair(KEY_PAIR_NAME)
else:
    # Reuse an existing key pair: load the private key from the configured file.
    try:
        with open(PRIVATE_KEY_FNAME, "r") as file:
            private_key = file.read()
    except FileNotFoundError:
        print(f"File not found: {PRIVATE_KEY_FNAME}")
    except IOError as e:
        print(f"Error reading file {PRIVATE_KEY_FNAME}: {e}")


In [31]:
# Show each instance definition that the deploy step will iterate over.
for i in config_data["instances"]:
    print(i)


{'instance_type': 'g5.2xlarge', 'ami_id': 'ami-05c3e698bd0cffe7e', 'startup_script': 'startup_scripts/gpu_ubuntu_startup.txt', 'fmbench_config': 'src/fmbench/configs/llama3/8b/config-ec2-llama3-8b.yml'}


In [32]:
# Launch one EC2 instance per entry under "instances" and remember, for each new
# instance ID, which FMBench config file it should run.
if config_data["run_steps"]["deploy_ec2_instance"]:
    iam_arn = config_data["aws"]["iam_instance_profile_arn"]
    print(iam_arn)
    # TODO: launch the instances in parallel instead of one after another.
    for instance in config_data["instances"]:
        instance_type = instance["instance_type"]
        ami_id = instance["ami_id"]
        startup_script = instance["startup_script"]

        # The startup script content is handed to the instance as its user data.
        with open(startup_script, "r") as script_file:
            user_data_script = script_file.read()

        instance_id = create_ec2_instance(
            KEY_PAIR_NAME, sg_id, user_data_script, ami_id, instance_type, iam_arn
        )

        instance_id_list.append(instance_id)
        fmbench_config_map.append({instance_id: instance["fmbench_config"]})


arn:aws:iam::471112568442:instance-profile/EC2
EC2 Instance 'i-0e6aa43300da3a734' created successfully with user data.


In [33]:
# Display the IDs of the instances launched above.
instance_id_list


['i-0e6aa43300da3a734']

In [34]:
# Display the instance-ID -> FMBench config mapping built by the deploy cell.
fmbench_config_map


[{'i-0e6aa43300da3a734': 'src/fmbench/configs/llama3/8b/config-ec2-llama3-8b.yml'}]

In [35]:
# Build the per-instance connection records (hostname, username, key file, config
# file) used by the SSH cells below. The region comes from the loaded configuration
# so it always matches the region the rest of the notebook is configured for.
instance_details = generate_instance_details(
    instance_id_list,
    PRIVATE_KEY_FNAME,
    fmbench_config_map,
    region=config_data["aws"]["region"],
)


In [49]:
# Display the connection records produced by generate_instance_details.
instance_details


[{'hostname': 'ec2-34-232-62-253.compute-1.amazonaws.com',
  'username': 'ubuntu',
  'key_file_path': 'ec2_multi_deploy_kp.pem',
  'config_file': 'src/fmbench/configs/llama3/8b/config-ec2-llama3-8b.yml'}]

In [66]:
# Template of the launcher script uploaded to each instance. "{config_file}" is filled
# in with str.format, so the template must not contain any other curly braces.
#
# The script activates the fmbench_python311 conda environment and, if the activation
# took effect, starts FMBench in the background with its output redirected to
# fmbench.log. Because the process is detached with nohup/&, the final message only
# reports that the run was started, not that it finished.
bash_script = """cd /home/ubuntu/foundation-model-benchmarking-tool || exit 1;
                source activate fmbench_python311;
                 . ~/.bashrc;
                if [[ "$CONDA_DEFAULT_ENV" == "fmbench_python311" ]]; then
                    echo "The current environment is fmbench_python311. Running FMBench..."
                    nohup fmbench --config-file {config_file} --local-mode yes --write-bucket placeholder --tmp-dir /tmp > fmbench.log 2>&1 &
                    echo "FMBench started in the background. Output is written to fmbench.log."
                else
                    echo "Error: The current environment is not fmbench_python311. Exiting."
                    exit 1
                fi
                """


In [67]:
def upload_and_execute_script_invoke_shell(
    hostname, username, key_file_path, script_content, remote_script_path
):
    """
    Upload a bash script to a remote host over SFTP and run it in an interactive SSH shell.

    The script is made executable and started through ``bash -l -c``. Whatever the
    shell has produced shortly after the script is started is collected and returned.

    Args:
        hostname (str): The public IP or DNS of the EC2 instance.
        username (str): The SSH username (e.g., 'ubuntu').
        key_file_path (str): The path to the PEM file holding an RSA private key.
        script_content (str): The content of the bash script to upload.
        remote_script_path (str): The remote path where the script should be saved on the instance.

    Returns:
        str | None: The captured shell output, or None if any step failed (the error
        is printed). The SSH connection is closed in both cases.
    """
    ssh_client = None
    shell = None
    try:
        # Initialize the SSH client.
        # NOTE(review): AutoAddPolicy accepts unknown host keys without verification.
        ssh_client = paramiko.SSHClient()
        ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())

        # Load the private key
        private_key = paramiko.RSAKey.from_private_key_file(key_file_path)

        # Connect to the instance
        ssh_client.connect(hostname, username=username, pkey=private_key)
        print(f"Connected to {hostname} as {username}")

        # Upload the script; the SFTP session is closed even if the write fails.
        sftp = ssh_client.open_sftp()
        try:
            with sftp.open(remote_script_path, "w") as remote_file:
                remote_file.write(script_content)
        finally:
            sftp.close()
        print(f"Script uploaded to {remote_script_path}")

        # Open an interactive shell session
        shell = ssh_client.invoke_shell()
        time.sleep(1)  # Give the shell some time to initialize

        # Make the script executable
        shell.send(f"chmod +x {remote_script_path}\n")
        time.sleep(1)  # Wait for the command to complete

        # Execute the script in a login shell
        shell.send(f"bash -l -c '{remote_script_path}'\n")
        time.sleep(1)  # Wait for the command to complete

        # Collect raw bytes and decode once at the end: decoding each 1024-byte chunk
        # separately fails when a multi-byte UTF-8 character straddles two reads.
        received = bytearray()
        while shell.recv_ready():
            received.extend(shell.recv(1024))
            time.sleep(2)  # Allow time for more command output to arrive

        return received.decode("utf-8", errors="replace")

    except Exception as e:
        print(f"Error connecting via SSH to {hostname}: {e}")
        return None

    finally:
        # Release the shell channel and the connection on success and failure alike.
        if shell is not None:
            shell.close()
        if ssh_client is not None:
            ssh_client.close()


In [68]:
# Where the launcher script is stored on every instance (same path for all hosts).
remote_script_path = "/home/ubuntu/run_fmbench.sh"

# Captured shell output per hostname; None marks a host where the upload/run failed.
script_outputs = {}

for instance in instance_details:
    # Format the script with the specific config file
    formatted_script = bash_script.format(config_file=instance["config_file"])

    # Upload and execute the script on the instance
    script_output = upload_and_execute_script_invoke_shell(
        hostname=instance["hostname"],
        username=instance["username"],
        key_file_path=instance["key_file_path"],
        script_content=formatted_script,
        remote_script_path=remote_script_path,
    )
    script_outputs[instance["hostname"]] = script_output

    # The helper returns None on any failure; surface it instead of dropping it.
    if script_output is None:
        print(f"Failed to start FMBench on {instance['hostname']}")


Connected to ec2-34-232-62-253.compute-1.amazonaws.com as ubuntu
Script uploaded to /home/ubuntu/run_fmbench.sh


In [None]:
# Run a shell command on every instance and print what each host reported.
# NOTE(review): `command` is not assigned anywhere in the visible notebook — it must be
# defined (or provided by `from utils import *`) before this cell can run; confirm.
results = run_command_on_instances(instance_details, PRIVATE_KEY_FNAME, command)
# `results` is iterated as a mapping of hostname -> dict with the keys
# 'stdout', 'stderr' and 'exit_status'.
for hostname, result in results.items():
    print(f"Results for {hostname}:")
    print(f"Command Output:\n{result['stdout']}")
    # Only print the error section when the host actually produced stderr output.
    if result["stderr"]:
        print(f"Command Error:\n{result['stderr']}")
    print(f"Exit Status: {result['exit_status']}")


In [None]:
# Fetch the FMBench results from the instances.
# NOTE(review): "results-output" is presumably the local destination folder — confirm
# against the helper's definition in utils.
check_and_retrieve_results_folders(
    instance_details, PRIVATE_KEY_FNAME, "results-output"
)


In [13]:
# Tear down every instance launched by this notebook, then reset the bookkeeping.
if config_data["run_steps"]["delete_ec2_instance"]:
    for instance_id in instance_id_list:
        delete_ec2_instance(instance_id)
    # Reset both accumulators together: fmbench_config_map is keyed by the instance
    # IDs that were just deleted, so keeping it would leave stale entries that a
    # later deploy run would append to.
    instance_id_list = []
    fmbench_config_map = []
