In [2]:
import pandas as pd
import numpy as np

# Read the table of inputs from S3 and flatten its `path` column into a plain
# Python list; the trailing expression displays how many paths were loaded.
paths_df = pd.read_csv(
    's3://daniel.le-work/MEL_project/DL20190110_subset2_paths.csv'
)
paths_vec = paths_df['path'].values.tolist()
len(paths_vec)


3015

In [2]:
import time

# Smoke test for the timing log: time ten 0.1 s sleeps and append each elapsed
# duration (seconds, one value per line) to the log file.
for i in range(10):
    start_time = time.time()
    time.sleep(0.1)
    etime = time.time() - start_time

    # Append mode: earlier entries in the log are kept.
    with open(
        '/home/ubuntu/data/DL20181011_melanocyte_test_data/DL20190110_outrigger_timelog.txt',
        'a',
    ) as f:
        f.write(f'{etime}\n')
        

In [None]:
# must establish outrigger environment
# create a docker image to run outrigger

from joblib import Parallel, delayed
from subprocess import run
import os
from shutil import copyfile,rmtree
import multiprocessing
num_cores = multiprocessing.cpu_count()

# Start from an empty results directory. Only "directory does not exist yet"
# is tolerated: any other failure (e.g. a permission problem) is surfaced here
# instead of being swallowed and reappearing as a confusing FileExistsError
# from the directory creation below.
try:
    rmtree('/GB100_1/outrigger_wkdir/results')
except FileNotFoundError:
    pass
# makedirs (rather than mkdir) also creates the parent working directory if it
# is missing.
os.makedirs('/GB100_1/outrigger_wkdir/results')

def myfun(s3path):
    """Run outrigger on one splice-junction table from S3 and collect its results.

    Steps:
      1. create a per-sample working directory under /GB100_1/outrigger_wkdir,
      2. ``aws s3 cp`` the input into it,
      3. run ``outrigger index`` and then ``outrigger validate`` there,
      4. copy the validated ``se`` / ``mxe`` ``events.csv`` files to
         ``results/<plate>/<prefix>_<subtype>.csv`` (an empty placeholder file
         is left when a table could not be copied),
      5. delete the working directory (also when a step raises),
      6. append the elapsed wall-clock seconds to the timing log.

    Parameters
    ----------
    s3path : str
        S3 URI of the input file. The first two underscore-separated fields of
        the file name (before its first dot) name the outputs; the second
        field is used as the plate sub-directory.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If the file name contains no underscore.
    OSError
        If the working directory already exists or cannot be created, or if
        the ``aws`` / ``outrigger`` executable cannot be launched for the
        download or index step.

    Notes
    -----
    Exit codes of the external commands are not checked; a failed command
    simply results in placeholder output files.
    """
    start_time = time.time()

    # Parse the file name only (not the whole URI), so that a dot in the bucket
    # or key prefix cannot truncate the sample name.
    file_name = s3path.split('/')[-1]
    file_prefix = file_name.split('.')[0]
    name_fields = file_prefix.split('_')
    prefix = '_'.join(name_fields[:2])
    plate = name_fields[1]

    wkdir = f'/GB100_1/outrigger_wkdir/{prefix}'
    output_dir = '/GB100_1/outrigger_wkdir/results'
    results_subdir = f'{output_dir}/{plate}'

    gtf_file = '/GB100_1/ref/HG38-PLUS/HG38-PLUS/genes/genes.gtf'
    fa_file = '/GB100_1/ref/HG38-PLUS/HG38-PLUS/fasta/genome.fa'

    # create dir structure
    os.mkdir(wkdir)
    try:
        # Creates output_dir and the plate sub-directory as needed; exist_ok
        # keeps this safe when several samples of one plate are processed.
        os.makedirs(results_subdir, exist_ok=True)

        # pull input from s3
        # `cwd=` is used instead of os.chdir so the notebook process never
        # ends up inside a directory that is deleted below.
        run(['aws', 's3', 'cp',
             s3path, f'{wkdir}/'],
            cwd='/home/ubuntu/')

        # run outrigger (long-running step); the downloaded file keeps its
        # original name inside wkdir
        run(['outrigger', 'index',
             '--sj-out-tab', file_name,
             '--gtf', gtf_file],
            cwd=wkdir)
        try:
            run(['outrigger', 'validate',
                 '--genome', 'hg38',
                 '--fasta', fa_file],
                cwd=wkdir)
        except Exception as err:
            # Validation stays best-effort, but the failure is reported and
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            print(f'outrigger validate failed for {prefix}: {err!r}')

        # compile results
        for subtype in ['se', 'mxe']:
            # e.g. /GB100_1/outrigger_wkdir/A10_B000873/outrigger_output/index/se/validated/events.csv
            events_csv = f'{wkdir}/outrigger_output/index/{subtype}/validated/events.csv'
            result_csv = f'{results_subdir}/{prefix}_{subtype}.csv'
            try:
                copyfile(events_csv, result_csv)
            except OSError:
                # No validated events to copy: leave an empty placeholder.
                # Append mode creates the file if needed and neither fails nor
                # truncates when it already exists (os.mknod would raise).
                with open(result_csv, 'a'):
                    pass
    finally:
        # remove subdir, even if a step above raised
        rmtree(wkdir)

    # record execution time
    etime = time.time() - start_time
    with open('/home/ubuntu/data/DL20181011_melanocyte_test_data/DL20190110_outrigger_timelog.txt', 'a') as f:
        f.write(f'{etime}\n')

# randomly sample 10 paths to time and process
# np.random.choice samples WITH replacement by default, which could hand the
# same sample to myfun more than once; replace=False keeps the paths distinct.
# The sample size is capped so a short path list does not raise.
matched_path = np.random.choice(paths_vec, min(10, len(paths_vec)), replace=False)

# n_jobs=1: the samples are processed one after another.
Parallel(n_jobs=1,
         backend="threading")(delayed(myfun)(sample_path) for sample_path in matched_path)


In [6]:
# Load the timing log (header-less, one elapsed time in seconds per line),
# add the same durations in minutes, and display summary statistics.
df = pd.read_csv(
    '/home/ubuntu/data/DL20181011_melanocyte_test_data/DL20190110_outrigger_timelog.txt',
    header=None,
)
df.columns = ['sec']
df['min'] = df['sec'] / 60

df.describe()

Unnamed: 0,sec,min
count,10.0,10.0
mean,1634.448382,27.240806
std,271.413223,4.523554
min,1239.832358,20.663873
25%,1469.529451,24.492158
50%,1597.518533,26.625309
75%,1801.916791,30.031947
max,2066.486255,34.441438


In [26]:
# Preview the first two entries of paths_vec.
paths_vec[:2]

['s3://czbiohub-seqbot/fastqs/180301_NB501961_0074_AH5HKKBGX5/homo_results/A10_B000873_S714.homo.SJ.out.tab',
 's3://czbiohub-seqbot/fastqs/180301_NB501961_0074_AH5HKKBGX5/homo_results/A11_B000873_S715.homo.SJ.out.tab']

In [28]:
# Build a two-row job queue pairing an EC2 instance id with the S3 path it
# should process, then save it locally as CSV (the row index is written too).
queue_columns = {
    'ec2_id': ['foo', 'i-0f95ea0e27dc6f375'],
    'path': paths_vec[:2],
}
jobs_queue = pd.DataFrame(queue_columns)
jobs_queue.to_csv('/home/ubuntu/data/DL20181011_melanocyte_test_data/jobs_queue.csv')


In [60]:
import subprocess

jobs_path = 's3://daniel.le-work/MEL_project/DL20190111_outrigger/jobs_queue.csv'

def pull_job(jobs_path, ec2_id=None):
    """Return the S3 path queued for an EC2 instance, or None if there is none.

    Parameters
    ----------
    jobs_path : str
        Location of the jobs queue CSV (anything ``pd.read_csv`` accepts). It
        must contain ``ec2_id`` and ``path`` columns.
    ec2_id : str, optional
        Instance id to look up. When omitted, the id of the current machine is
        obtained by running ``ec2metadata --instance-id``.

    Returns
    -------
    str or None
        The ``path`` of the first row whose ``ec2_id`` matches, or None when
        the instance id could not be determined, no row matches, or the queue
        is empty. A short message is printed in each None case.

    Raises
    ------
    FileNotFoundError
        If ``ec2_id`` is omitted and the ``ec2metadata`` executable is missing.
    """
    if ec2_id is None:
        # get instance id
        proc = subprocess.run(['ec2metadata', '--instance-id'],
                              encoding='utf-8',
                              stdout=subprocess.PIPE)
        # First output line, without surrounding whitespace.
        ec2_id = proc.stdout.split('\n')[0].strip()
        if proc.returncode != 0 or not ec2_id:
            # Do not compare an error message / empty string against the queue.
            print('Could not determine EC2 instance id')
            return None

    # pull jobs queue
    jobs_df = pd.read_csv(jobs_path)
    matching_paths = jobs_df.loc[jobs_df['ec2_id'] == ec2_id, 'path'].tolist()
    if matching_paths:
        return matching_paths[0]

    if len(jobs_df) > 0:
        print('No matching jobs')
    else:
        print('No jobs in queue')
    return None

# Look up the job for this instance; pull_job returns None when it finds no
# matching row, in which case the cell just reports the failure.
s3path = pull_job(jobs_path)
if s3path is None:
    print('failed')


No matching jobs
failed


In [52]:
# Display the first queued path whose ec2_id equals `ec2_id`.
# NOTE(review): relies on `jobs_df` and `ec2_id` already existing in the kernel;
# in the visible cells both are only assigned at top level further down, so
# confirm the execution order before re-running from a fresh kernel.
jobs_df[jobs_df.ec2_id == ec2_id].path.tolist()[0]

's3://czbiohub-seqbot/fastqs/180301_NB501961_0074_AH5HKKBGX5/homo_results/A11_B000873_S715.homo.SJ.out.tab'

In [62]:
# Write a one-row job file named after the target EC2 instance id; the file
# is CSV with a single `path` column (plus the row index).
ec2_id = 'i-0f95ea0e27dc6f375'
job_paths = [
    's3://czbiohub-seqbot/fastqs/180301_NB501961_0074_AH5HKKBGX5/homo_results/A10_B000873_S714.homo.SJ.out.tab'
]
df = pd.DataFrame({'path': job_paths})
df.to_csv(f'/home/ubuntu/data/DL20181011_melanocyte_test_data/{ec2_id}.job')


In [63]:
jobs_file = 's3://daniel.le-work/MEL_project/DL20190111_outrigger/queue/i-0f95ea0e27dc6f375.job'

# Reset first, so a failed read cannot leave a stale s3path from an earlier
# cell looking like a freshly pulled job.
s3path = None
try:
    jobs_df = pd.read_csv(jobs_file)
    s3path = jobs_df.path.values[0]
    print(s3path)
except Exception as err:
    # Still best-effort (the cell does not abort), but the reason is reported
    # and KeyboardInterrupt / SystemExit are no longer swallowed.
    print(f'Could not load a job from {jobs_file}: {err!r}')

s3://czbiohub-seqbot/fastqs/180301_NB501961_0074_AH5HKKBGX5/homo_results/A10_B000873_S714.homo.SJ.out.tab
