In [161]:
import json
import requests
from collections import namedtuple

import boto3

In [37]:
# Make sure the appropriate AWS_PROFILE is set in the terminal before launching
# the notebook; the clients below are created without explicit credentials.

# Initialize the Step Functions, S3, and CloudWatch Logs clients.
# These are module-level globals used by the helper functions in later cells.
sfn_client = boto3.client("stepfunctions")
s3_client = boto3.client('s3')
# NOTE(review): logs_client is not referenced by any cell visible in this notebook.
logs_client = boto3.client('logs')


In [167]:
def get_bucket_and_key(s3_path):
    """Split an S3 URI into its bucket name and object key.

    Args:
        s3_path: A string such as ``"s3://bucket/some/key"``. A path without
            the ``s3://`` scheme is accepted as well.

    Returns:
        A ``(bucket, key)`` tuple of strings. Leading/trailing slashes are
        removed, so ``"s3://bucket/dir/"`` yields ``("bucket", "dir")`` and a
        bare bucket yields an empty key.
    """
    scheme = "s3://"
    # str.strip("s3://") would treat its argument as a *set of characters* and
    # eat any leading/trailing 's', '3', ':' or '/' from the bucket or key
    # (e.g. bucket "samples" -> "amples"), so remove the scheme as a prefix.
    path = s3_path[len(scheme):] if s3_path.startswith(scheme) else s3_path
    # Keep the historical behavior of dropping surrounding slashes so callers
    # can safely append "/<file>" to the returned key.
    path = path.strip("/")
    bucket, _, key = path.partition("/")

    return bucket, key


def count_chunks(index_s3_path):
    """Count the distinct object file names listed under an S3 prefix.

    The path is split with ``get_bucket_and_key`` and every page of
    ``list_objects_v2`` results for that prefix is read. Objects are
    de-duplicated by the last ``/``-separated component of their key.

    Returns:
        The number of distinct file names, or ``None`` if any error occurred
        (the error is printed rather than raised).
    """
    try:
        bucket, prefix = get_bucket_and_key(index_s3_path)
        # Walk every result page so listings beyond one page are included.
        pages = s3_client.get_paginator('list_objects_v2').paginate(
            Bucket=bucket, Prefix=prefix
        )
        listed_pages = [page.get('Contents', []) for page in pages]

        unique_filenames = {
            entry['Key'].split("/")[-1]
            for page_entries in listed_pages
            for entry in page_entries
        }
        return len(unique_filenames)

    except Exception as e:
        # Best-effort: report the problem and fall through, returning None.
        print(f"An error occurred: {e}")
        return None
        
    

def get_execution_alignment_times(execution_arn):
    """Collect alignment timings and index details for one Step Functions run.

    Reads the execution's first history event to get the ``NonHostAlignment``
    inputs, counts the objects under the diamond and minimap2 index paths, and
    loads ``non_host_alignment_status2.json`` from the run's S3 working
    directory to compute how long each aligner took.

    Args:
        execution_arn: ARN of the Step Functions execution to inspect.

    Returns:
        An ``sfn_info`` namedtuple with fields, in order:
        ``diamond_time_in_mins``, ``minimap_time_in_mins``,
        ``diamond_index_path``, ``minimap_index_path``,
        ``diamond_chunks``, ``minimap_chunks``.
    """
    history = sfn_client.get_execution_history(executionArn=execution_arn)

    # The first history event carries the execution input as a JSON string.
    started_details = history["events"][0]["executionStartedEventDetails"]
    alignment_input = json.loads(started_details["input"])["Input"]["NonHostAlignment"]
    s3_wd_uri = alignment_input["s3_wd_uri"]
    diamond_index_path = alignment_input["diamond_db"]
    minimap_index_path = alignment_input["minimap2_db"]

    # Number of index chunks for each aligner.
    diamond_chunks = count_chunks(diamond_index_path)
    minimap_chunks = count_chunks(minimap_index_path)

    # Load the non-host alignment status file from the working directory.
    bucket_name, wd_key = get_bucket_and_key(s3_wd_uri)
    status_object = s3_client.get_object(
        Bucket=bucket_name,
        Key=f"{wd_key}/non_host_alignment_status2.json",
    )
    status = json.loads(status_object['Body'].read().decode('utf-8'))

    def elapsed_minutes(step_name):
        # end_time - start_time is in seconds; convert to minutes.
        timing = status[step_name]
        elapsed_seconds = float(timing["end_time"]) - float(timing["start_time"])
        return elapsed_seconds / 60

    diamond_time_in_mins = elapsed_minutes("diamond_out")
    minimap_time_in_mins = elapsed_minutes("minimap2_out")

    sfn_info = namedtuple(
        "sfn_info",
        [
            "diamond_time_in_mins",
            "minimap_time_in_mins",
            "diamond_index_path",
            "minimap_index_path",
            "diamond_chunks",
            "minimap_chunks",
        ],
    )

    return sfn_info(
        diamond_time_in_mins=diamond_time_in_mins,
        minimap_time_in_mins=minimap_time_in_mins,
        diamond_index_path=diamond_index_path,
        minimap_index_path=minimap_index_path,
        diamond_chunks=diamond_chunks,
        minimap_chunks=minimap_chunks,
    )


In [132]:
# Replace with the execution ARNs you want to compare (right now these are stubbed).
# In the visible cells they are only referenced by the commented-out
# compare_two_runs(...) call at the end of this cell.

execution_arn_no_compression = 'arn:aws:states:us-west-2:732052188396:execution:idseq-swipe-staging-short-read-mngs-wdl:idseq-staging-150-32175-35212-20240122120219'
execution_arn_w_compression = 'arn:aws:states:us-west-2:732052188396:execution:idseq-swipe-staging-short-read-mngs-wdl:idseq-staging-1166-32169-35209-20240122112059'

def compare_two_runs(run1_arn, run2_arn):
    """Print and return the run1/run2 ratio of alignment times.

    Usually run1 is the compressed-DB run.

    Args:
        run1_arn: Execution ARN whose times form the numerator.
        run2_arn: Execution ARN whose times form the denominator.

    Returns:
        ``(time_diff_diamond, time_diff_minimap)`` where each value is the
        run1 time divided by the run2 time (a ratio, despite the name).
    """
    # Only the first two fields (diamond and minimap minutes) are needed.
    run1_diamond, run1_minimap, *_ = get_execution_alignment_times(run1_arn)
    run2_diamond, run2_minimap, *_ = get_execution_alignment_times(run2_arn)

    ratios = (run1_diamond / run2_diamond, run1_minimap / run2_minimap)

    print(f"diamond: {ratios[0]}")
    print(f"minimap: {ratios[1]}")

    return ratios

# compare_two_runs(execution_arn_no_compression, execution_arn_w_compression)

In [124]:
# Cookie jar passed as `cookies=` to the requests.get calls made by
# get_execution_arn and get_sample_name_to_id_for_project.
# NOTE(review): 'x' looks like a placeholder — presumably a real session cookie
# has to be filled in before running; avoid committing the real value.
cookies = {
    'Cookie': 'x',  
}

def get_execution_arn(sample_id):
    """Scrape the Step Functions execution ARN for a sample.

    Fetches the sample's ``pipeline_runs`` page and returns the text of the
    first link whose markup mentions ``arn:aws``.

    Args:
        sample_id: Sample id interpolated into the pipeline_runs URL.

    Returns:
        The execution ARN string, or ``None`` when the page did not return
        HTTP 200.

    Raises:
        IndexError: if the page contains no token mentioning ``arn:aws`` (or
            the token is not of the expected ``...>ARN</a>`` form).
    """
    url = f'https://staging.czid.org/samples/{sample_id}/pipeline_runs'
    response = requests.get(url, cookies=cookies)
    if response.status_code != 200:
        return None

    arn_tokens = [token for token in response.text.split(" ") if "arn:aws" in token]
    if not arn_tokens:
        raise IndexError(
            f"no execution ARN found on the pipeline_runs page for sample {sample_id}"
        )

    # The token looks like '...">arn:aws:...</a>': take the text after the
    # first '>' and cut it at the next '<'. (rstrip("</a") would strip a
    # *character set* and could eat a trailing 'a' or '/' from the ARN itself.)
    link_text = arn_tokens[0].split(">")[1]
    return link_text.split("<")[0]

In [126]:
def get_sample_name_to_id_for_project(project_id):
    """Map sample names to sample ids for one project.

    Queries the ``index_v2.json`` samples endpoint for the given project and
    reads the ``"samples"`` list from the JSON response.

    Args:
        project_id: Project id interpolated into the request URL.

    Returns:
        A dict of ``{sample["name"]: sample["id"]}``; if several samples share
        a name, the last one listed wins.
    """
    project_url = f"https://staging.czid.org/samples/index_v2.json?projectId={project_id}&domain=all_data&offset=0&listAllIds=true&basic=false&workflow=short-read-mngs"
    payload = requests.get(project_url, cookies=cookies).json()

    name_to_id = {}
    for sample in payload["samples"]:
        name_to_id[sample["name"]] = sample["id"]

    return name_to_id


In [165]:
# Spot check on two hard-coded sample ids: look up each sample's execution ARN.
sample_1_arn = get_execution_arn(29045)
sample_2_arn = get_execution_arn(28408)

# Show the ARNs that were found (get_execution_arn gives None on a non-200 response).
print(sample_1_arn)
print(sample_2_arn)

# Fetch the alignment timings / index details for each execution.
sample_1_alignment_times = get_execution_alignment_times(sample_1_arn)
sample_2_alignment_times = get_execution_alignment_times(sample_2_arn)

In [154]:
def _fetch_sample_alignment(sample_name, sample_id):
    """Fetch ARN, alignment times and total times for one sample (placeholders on failure)."""
    sample_arn = None
    try:
        sample_arn = get_execution_arn(sample_id)
        alignment_times = get_execution_alignment_times(sample_arn)
        # total time = per-run alignment minutes * number of index chunks
        total_diamond = alignment_times[0] * alignment_times[4]
        total_minimap = alignment_times[1] * alignment_times[5]
    except Exception as e:
        print(f"error getting {sample_name} sample")
        print(e)
        alignment_times = ["ARN DNE"] * 6
        total_diamond = "NA"
        total_minimap = "NA"
    return sample_arn, alignment_times, total_diamond, total_minimap


def compare_projects(project_name_1, project_id_1, project_name_2, project_id_2):
    """Build one comparison record per sample name present in both projects.

    Project 1 is the primary (usually compressed) dataset. For every sample
    of project 1 whose name also exists in project 2, the alignment details
    of both runs are fetched and compared with ``compare_two_runs``.

    Args:
        project_name_1: Label written into each record for project 1.
        project_id_1: Id used to list project 1's samples.
        project_name_2: Label written into each record for project 2.
        project_id_2: Id used to list project 2's samples.

    Returns:
        A list of 21-item lists: both project names, the sample name, then for
        each sample its six alignment fields (diamond minutes, minimap minutes,
        diamond index path, minimap index path, diamond chunks, minimap chunks)
        followed by its total diamond and minimap times, and finally the two
        values returned by ``compare_two_runs``. When a sample's data cannot
        be fetched its six fields are ``"ARN DNE"``, its totals are ``"NA"``
        and both comparison values are ``"UNABLE TO COMPARE, ARN DNE"``.
    """
    project_1_samples = get_sample_name_to_id_for_project(project_id_1)
    project_2_samples = get_sample_name_to_id_for_project(project_id_2)

    records = []
    for name, sample_1_id in project_1_samples.items():
        if name not in project_2_samples:
            continue

        sample_1_arn, sample_1_times, sample_1_total_diamond, sample_1_total_minimap = (
            _fetch_sample_alignment(name, sample_1_id)
        )
        sample_2_arn, sample_2_times, sample_2_total_diamond, sample_2_total_minimap = (
            _fetch_sample_alignment(name, project_2_samples[name])
        )

        if "ARN DNE" in sample_1_times or "ARN DNE" in sample_2_times:
            diff = ["UNABLE TO COMPARE, ARN DNE"] * 2
        else:
            diff = compare_two_runs(sample_1_arn, sample_2_arn)

        # Unpack positionally: on failure the times are a plain list of
        # placeholders, so namedtuple attribute access would raise
        # AttributeError and abort the whole comparison.
        records.append(
            [
                project_name_1,
                project_name_2,
                name,
                *sample_1_times,
                sample_1_total_diamond,
                sample_1_total_minimap,
                *sample_2_times,
                sample_2_total_diamond,
                sample_2_total_minimap,
                diff[0],
                diff[1],
            ]
        )

    return records
        
    
# Compare the two hard-coded projects; arguments are
# (project_name_1, project_id_1, project_name_2, project_id_2).
# `records` is written to CSV by a later cell.
records = compare_projects(
    "20210122_unclobbered_0-9_1000_31", 
    1125, 
    "NCBI DB Compression -- uncompressed NT-NR baseline", 
    1051
)

diamond: 0.5219304184739638
minimap: 0.5593416247014229
error getting UnAmbiguouslyMapped_ds.soil sample
An error occurred (ExecutionDoesNotExist) when calling the GetExecutionHistory operation: Execution Does Not Exist: 'arn:aws:states:us-west-2:732052188396:execution:idseq-swipe-staging-short-read-mngs-wdl:idseq-staging-1051-27094-31587-20230828135316'
error getting UnAmbiguouslyMapped_ds.nycsm sample
An error occurred (ExecutionDoesNotExist) when calling the GetExecutionHistory operation: Execution Does Not Exist: 'arn:aws:states:us-west-2:732052188396:execution:idseq-swipe-staging-short-read-mngs-wdl:idseq-staging-1051-27095-31583-20230828135254'
error getting UnAmbiguouslyMapped_ds.hous2 sample
An error occurred (ExecutionDoesNotExist) when calling the GetExecutionHistory operation: Execution Does Not Exist: 'arn:aws:states:us-west-2:732052188396:execution:idseq-swipe-staging-short-read-mngs-wdl:idseq-staging-1051-27096-31581-20230828135251'
error getting UnAmbiguouslyMapped_ds.ho

In [156]:
import csv

# Output CSV path (relative to the notebook's working directory)
filename = "alignment_time_differences_in_minutes.csv"

# Write a header row followed by every record produced by compare_projects.
# The header order must match the order in which compare_projects builds
# each record.
with open(filename, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow([
        "project_name_1", 
        "project_name_2",
        "sample_name",
        "sample_1_alignment_time_diamond", 
        "sample_1_alignment_times_minimap",
        "sample_1_diamond_path",
        "sample_1_minimap_path",
        "sample_1_diamond_chunk_count",
        "sample_1_minimap_chunk_count",
        "sample_1_total_alignment_time_diamond",
        "sample_1_total_alignment_time_minimap",
        "sample_2_alignment_times_diamond",
        "sample_2_alignment_times_minimap",
        "sample_2_diamond_path",
        "sample_2_minimap_path",
        "sample_2_diamond_chunk_count",
        "sample_2_minimap_chunk_count",
        "sample_2_total_alignment_time_diamond",
        "sample_2_total_alignment_time_minimap",
        "time_diff_diamond",
        "time_diff_minimap"
    ])  
    # Writing the data into the file
    csvwriter.writerows(records)


In [160]:
import pandas as pd

# Reload the CSV written by the previous cell (same path as `filename` there).
c = pd.read_csv("alignment_time_differences_in_minutes.csv")
# NOTE(review): this is not the last expression of the cell, so its result is
# not displayed.
c.head()

# Keep only the project names and total alignment time columns and save them
# to a second CSV.
c[
    [
        "project_name_1", 
        "project_name_2", 
        "sample_1_total_alignment_time_minimap", 
        "sample_2_total_alignment_time_minimap",
        "sample_1_total_alignment_time_diamond",
        "sample_2_total_alignment_time_diamond"
    ]
].to_csv("total_alignment_times_in_mins.csv")