In [2]:
import boto3
import re
import os
import json
import time
import base64

In [3]:
# establish clients: one per AWS service, created with boto3's default
# session settings (no explicit region or credentials are passed here)
ec2_client = boto3.client("ec2")
batch_client = boto3.client("batch")

# setup keypair for instance access
ec2KeyPair = "phoenixlogan"

In [30]:
# --- AWS resource names -----------------------------------------------------
# Names under which the notebook's resources are created / looked up.
launch_template = 'phoenixlogan-template-ska-2'
compute_environment = 'phoenixlogan-compute'
job_queue = 'phoenixlogan-queue'
job_definition = 'phoenixlogan-ska-def-2'

# --- Compute environment parameters -----------------------------------------
# Passed as the EBS "VolumeSize" of the launch template's root device
# (presumably GiB, per the EC2 API -- confirm).
root_volume_size = 2_000
instance_types = ['optimal']

# --- Batch job parameters ---------------------------------------------------
# Container image and per-job resources for the job definition
# (memory is presumably MiB, per the Batch API -- confirm).
job_image = "phoenixajalogan/kmer-ska-batch"
job_vcpus = 15
job_memory = 40_000


Creating job definition...
Finished creating job definition.


In [None]:
# Create the EC2 launch template that enlarges the root volume of the
# instances, then record the API response under logs/.
print("Creating launch template...")

# Call the API *before* opening the log file: opening with "w" truncates the
# file, so a failed request would otherwise leave an empty log behind.
launch_template_response = ec2_client.create_launch_template(
    LaunchTemplateName=launch_template,
    LaunchTemplateData={
        "BlockDeviceMappings": [
            {
                "DeviceName": "/dev/xvda",
                "Ebs": {
                    "DeleteOnTermination": True,
                    "VolumeSize": root_volume_size,
                    "VolumeType": "gp2"
                }
            }
        ],
    }
)

# The logs directory may not exist on a fresh checkout.
os.makedirs("logs", exist_ok=True)
with open("logs/launch_template.json", "w") as f:
    # default=str serialises values json cannot encode natively
    # (e.g. datetime objects in the response).
    json.dump(launch_template_response, f, default=str, indent=4)
print("Finished creating launch template.")

In [None]:
# Create the managed Batch compute environment that uses the launch template,
# then record the API response under logs/.
print("Creating compute environment...")

compute_resources = {
    'type': 'EC2',
    'minvCpus': 0,
    'maxvCpus': 256,
    'instanceTypes': instance_types,
    'subnets': [
        # subnets for us-west-2a, us-west-2b, us-west-2c
        "subnet-672e832e",
        "subnet-04119a63",
        "subnet-4347451b",
    ],
    'securityGroupIds': [
        'sg-3195a049',
    ],
    "ec2KeyPair": ec2KeyPair,
    'instanceRole': 'ecsInstanceRole2',
    # NOTE(review): bidPercentage / spotIamFleetRole look like Spot-only
    # settings; with type 'EC2' they presumably have no effect -- confirm.
    'bidPercentage': 100,
    'spotIamFleetRole': 'arn:aws:iam::423543210473:role/aws-ec2-spot-fleet-role',
    'launchTemplate': {
        'launchTemplateName': launch_template
    }
}

# Call the API *before* opening the log file: opening with "w" truncates the
# file, so a failed request would otherwise leave an empty log behind.
compute_environment_response = batch_client.create_compute_environment(
    computeEnvironmentName=compute_environment,
    type='MANAGED',
    state='ENABLED',
    computeResources=compute_resources,
    serviceRole='arn:aws:iam::423543210473:role/AWSBatchServiceRole'
)

# The logs directory may not exist on a fresh checkout.
os.makedirs("logs", exist_ok=True)
with open("logs/compute_environment.json", "w") as f:
    json.dump(compute_environment_response, f, default=str, indent=4)
print("Finished creating compute environment.")

In [None]:
# Wait for the compute environment to become VALID, create the job queue on
# top of it, then record the API response under logs/.
print("Creating job queue...")
n_tries = 5
sleep_time = 30
for i in range(n_tries):
    # Exactly one environment is expected; the tuple-unpacking raises
    # ValueError if the environment does not exist (empty list).
    desc, = batch_client.describe_compute_environments(
        computeEnvironments=[compute_environment]
    )["computeEnvironments"]
    status = desc['status']
    if status == 'VALID':
        break
    if status == 'INVALID':
        # Waiting will not help: surface the reason immediately.
        raise RuntimeError(
            f"Compute environment {compute_environment!r} is INVALID: "
            f"{desc.get('statusReason', 'no reason reported')}"
        )
    print("Waiting for compute environment...",
          f"(Try {i+1}/{n_tries})")
    # No point sleeping after the final check.
    if i + 1 < n_tries:
        time.sleep(sleep_time)
else:
    # The loop ran out of attempts without seeing VALID. Stop here rather
    # than attempt to attach a queue to an environment that is not ready.
    raise TimeoutError(
        f"Compute environment {compute_environment!r} did not become VALID "
        f"after {n_tries} checks (last status: {status})"
    )

# Call the API *before* opening the log file: opening with "w" truncates the
# file, so a failed request would otherwise leave an empty log behind.
job_queue_response = batch_client.create_job_queue(
    jobQueueName=job_queue,
    state='ENABLED',
    priority=5,
    computeEnvironmentOrder=[
        {
            'order': 5,
            'computeEnvironment': compute_environment
        },
    ]
)

# The logs directory may not exist on a fresh checkout.
os.makedirs("logs", exist_ok=True)
with open("logs/job_queue.json", "w") as f:
    json.dump(job_queue_response, f, default=str, indent=4)
print("Finished creating job queue.")



In [None]:
# Register the container job definition, then record the API response
# (including the job definition ARN) under logs/.
print("Creating job definition...")

# Call the API *before* opening the log file: opening with "w" truncates the
# file, so a failed request would otherwise leave an empty log behind.
job_definition_response = batch_client.register_job_definition(
    # Use the name from the configuration cell, like the other resources,
    # instead of a separate hard-coded literal.
    jobDefinitionName=job_definition,
    type='container',
    containerProperties={
        "image": job_image,
        "vcpus": job_vcpus,
        "memory": job_memory,
        # "Ref::name" entries are placeholders filled from the
        # `parameters` given at job submission.
        "command": [
            "prep_files.py",
            "Ref::bucket_name",
            "Ref::key1",
            "Ref::key2",
            "Ref::size",
            "Ref::outbucket"
        ],
        # Expose the host's /scratch directory inside the container.
        "volumes": [
            {"host": {"sourcePath": "/scratch"},
             "name": "scratch"},
        ],
        "mountPoints": [
            {"containerPath": "/scratch",
             "sourceVolume": "scratch"},
        ],
        "jobRoleArn": "arn:aws:iam::423543210473:role/simpleBatchJob",
        "privileged": True
    }
)

# The logs directory may not exist on a fresh checkout.
os.makedirs("logs", exist_ok=True)
with open("logs/job_definition.json", "w") as f:
    json.dump(job_definition_response, f, default=str, indent=4)
print("Finished creating job definition.")

In [52]:
# def files_list(keys):
#     """
#     Args:
#         keys (lst): file prefixes to pass.

#     Returns:
#         list : all sample names to pass to batch
    
#     """
    
#     # download fastq files from S3
#     s3_resource = boto3.resource("s3")

#     seq_bucket_name = "czbiohub-mosquito"
#     seq_bucket_prefixes = ["sequences/CMS001_fastq.gz", "sequences/CMS002_fastq.gz"]
#     first_reads = re.compile(r".+(R1).+")

#     sample_names = []
#     seq_bucket = s3_resource.Bucket(seq_bucket_name)
#     for obj in seq_bucket.objects.filter(Prefix=seq_bucket_prefixes[1]):
#         matched = first_reads.match(os.path.basename(obj.key))
#         if matched:
#             sample_names.append(matched.group(0))
            
#     return sample_names

# s = files_list(["sequences/CMS001_fastq.gz", "sequences/CMS002_fastq.gz"])

In [54]:
def files_list(keys, bucket_name="czbiohub-mosquito"):
    """List the first-read (R1) file names found under S3 key prefixes.

    Nothing is downloaded: the bucket is only listed, one prefix at a time,
    and each prefix is printed as it is processed.

    Args:
        keys (list): S3 key prefixes to search.
        bucket_name (str): name of the bucket to list. Defaults to
            "czbiohub-mosquito".

    Returns:
        list: base names (prefix stripped) of every object whose base name
            matches the pattern ``.+(R1).+``, in listing order, with the
            prefixes processed in the order given. Empty if `keys` is empty
            or nothing matches.
    """
    first_reads = re.compile(r".+(R1).+")

    # One bucket handle serves every prefix; no need to rebuild it per key.
    seq_bucket = boto3.resource("s3").Bucket(bucket_name)

    sample_names = []
    for key in keys:
        print(key)
        for obj in seq_bucket.objects.filter(Prefix=key):
            # Match on the file name only, not on the full key.
            matched = first_reads.match(os.path.basename(obj.key))
            if matched:
                sample_names.append(matched.group(0))

    return sample_names
    
# Gather the R1 file names found under both CMS prefixes.
samples = files_list(
    [
        "sequences/CMS001_fastq.gz",
        "sequences/CMS002_fastq.gz",
    ]
)

sequences/CMS001_fastq.gz
sequences/CMS002_fastq.gz


In [45]:
# load job definition and job queue ARNs (from the logged API responses)
# to map to job submission
with open("logs/job_definition.json") as f:
    jobDefinition = json.load(f)["jobDefinitionArn"]

with open("logs/job_queue.json") as f:
    jobQueue = json.load(f)["jobQueueArn"]

# Define every input this cell needs here, so it runs on a fresh kernel
# instead of relying on variables left over from cells that no longer exist.
# seq_bucket_name must be the bucket that files_list() lists.
seq_bucket_name = "czbiohub-mosquito"
seq_bucket_prefix = "sequences/CMS002_fastq.gz"
sample_names = files_list([seq_bucket_prefix])

# The per-job log directory may not exist on a fresh checkout.
os.makedirs("logs/jobs", exist_ok=True)

# submit batch jobs: one job per R1 file, paired with its R2 mate
for sample in sample_names:
    job_name = sample.replace(".fastq.gz", "")
    s3_fq = f"{seq_bucket_prefix}/{sample}"
    # Derive the mate from the file name only, so an "R1" occurring in the
    # prefix could never be rewritten by accident.
    s3_fq_mate = f"{seq_bucket_prefix}/{sample.replace('R1', 'R2')}"
    print(s3_fq)
    response = batch_client.submit_job(
        jobName=job_name,
        jobQueue=jobQueue,
        jobDefinition=jobDefinition,
        # These fill the "Ref::..." placeholders of the job definition's
        # command.
        parameters={
            "bucket_name": seq_bucket_name,
            "key1": s3_fq,
            "key2": s3_fq_mate,
            "size": "50000",
            "outbucket": "phoenixlogan-ska-sketches"
        }
    )
    with open(f"logs/jobs/{job_name}.json", "w") as f:
        json.dump(response, f, indent=4, default=str)

sequences/CMS002_fastq.gz/CMS_002_17b_Rb_S123_L004_R1_001.fastq.gz
sequences/CMS002_fastq.gz/CMS_002_17c_Rb_S124_L004_R1_001.fastq.gz
sequences/CMS002_fastq.gz/CMS_002_17d_Rb_S125_L004_R1_001.fastq.gz
sequences/CMS002_fastq.gz/CMS_002_17e_Rb_S126_L004_R1_001.fastq.gz
sequences/CMS002_fastq.gz/CMS_002_18a_Rb_S128_L004_R1_001.fastq.gz
sequences/CMS002_fastq.gz/CMS_002_18b_Rb_S129_L004_R1_001.fastq.gz
sequences/CMS002_fastq.gz/CMS_002_19a_Rb_S130_L004_R1_001.fastq.gz
sequences/CMS002_fastq.gz/CMS_002_1a_Rb_S116_L004_R1_001.fastq.gz
sequences/CMS002_fastq.gz/CMS_002_20a_Rb_S131_L004_R1_001.fastq.gz
sequences/CMS002_fastq.gz/CMS_002_20b_Rb_S132_L004_R1_001.fastq.gz
sequences/CMS002_fastq.gz/CMS_002_20c_Rb_S133_L004_R1_001.fastq.gz
sequences/CMS002_fastq.gz/CMS_002_20d_Rb_S134_L004_R1_001.fastq.gz
sequences/CMS002_fastq.gz/CMS_002_20e_Rb_S135_L004_R1_001.fastq.gz
sequences/CMS002_fastq.gz/CMS_002_21a_Rb_S136_L004_R1_001.fastq.gz
sequences/CMS002_fastq.gz/CMS_002_22a_Rb_S137_L004_R1_001.fastq