In [35]:
import pandas as pd
from tqdm import tqdm
import subprocess
import multiprocessing
import boto3
import botocore.exceptions
import re, os, shutil

def get_s3path_list(bucket, prefix, suffix):
    #     bucket = 'darmanis-group'
    #     prefix = 'singlecell_lungadeno/rawdata/fastqs'
    #     suffix = 'fastq.gz'
    
    client = boto3.client('s3')
    paginator = client.get_paginator('list_objects')

    operation_parameters = {'Bucket': bucket,
                            'Prefix': prefix}

    page_iterator = paginator.paginate(**operation_parameters)
    paths = ['s3://{}/{}'.format(bucket, key['Key']) for page in page_iterator for key in page['Contents'] if key['Key'].endswith(suffix)]
    return paths


def restore_file(k):
    obj = s3r.Object('czbiohub-seqbot', k)
    storage_class = obj.storage_class
    restore = obj.restore
    if not obj.restore:
        resp = bucket.meta.client.restore_object(
            Bucket='czbiohub-seqbot',
            Key=k,
            RestoreRequest={'Days': 3}
        )


def restore_files(file_list, n_proc=7):
    """Restore a list of files from czbiohub-seqbot in parallel"""

    global s3r
    s3r = boto3.resource('s3')
    global bucket
    bucket = s3r.Bucket('czbiohub-seqbot')

    print('creating pool')

    p = multiprocessing.Pool(processes=n_proc)

    try:
        print('restoring files...')
        p.map(restore_file, file_list, chunksize=100)
    finally:
        p.close()
        p.join()


def copy_file(k):
    key, new_key = k
    try:
        s3c.head_object(Bucket=new_bucket, Key=new_key)
    except botocore.exceptions.ClientError:
        s3c.copy(CopySource={'Bucket': bucket, 'Key': key},
                 Bucket=new_bucket,
                 Key=new_key)


def copy_files(src_list, dest_list, b, nb, n_proc=6):
    """
    Copy a list of files from src_list to dest_list.
    b - original bucket
    nb - destination bucket
    """

    global s3c
    s3c = boto3.client('s3')

    global bucket
    bucket = b
    global new_bucket
    new_bucket = nb

    try:
        p = multiprocessing.Pool(processes=n_proc)
        p.map(copy_file, zip(src_list, dest_list), chunksize=100)
    finally:
        p.close()
        p.join()

def remove_file(k):
    s3c.delete_object(Bucket=bucket, Key=k)


def remove_files(file_list, b, really=False, n_proc=6):
    """Remove a list of file keys from S3"""

    assert really

    print("Removing {} files!".format(len(file_list)))

    global s3c
    s3c = boto3.client('s3')
    global bucket
    bucket = b

    try:
        p = multiprocessing.Pool(processes=n_proc)
        p.map(remove_file, file_list, chunksize=100)
    finally:
        p.close()
        p.join()
        
def s3copy(args):
    """return failed paths"""
    src, dest = args
    s3cmd = f'aws s3 cp {src} {dest}'.split(' ')
    process = subprocess.run(s3cmd)
    if process.returncode == 0:
        path = None
    else:
        path = src
    return src

def s3copy_parallel(src_list, dest_list, n_proc = 2):
    try:
        p = multiprocessing.Pool(processes=n_proc)
        process = p.map(s3copy, zip(src_list, dest_list), chunksize=100)
    finally:
        p.close()
        p.join()
    return process

def s3move(args):
    """return failed paths"""
    src, dest = args
    s3cmd = f'aws s3 mv {src} {dest}'.split(' ')
    process = subprocess.run(s3cmd)
    if process.returncode == 0:
        path = None
    else:
        path = src
    return src

def s3move_parallel(src_list, dest_list, n_proc = 2):
    try:
        p = multiprocessing.Pool(processes=n_proc)
        process = p.map(s3move, zip(src_list, dest_list), chunksize=100)
    finally:
        p.close()
        p.join()
    return process

def s3rm(src):
    syscmd = f'aws s3 rm {src}'
    process = subprocess.run(syscmd.split(' '))
    if process.returncode == 0:
        path = None
    else:
        path = src
    return src

def s3rm_parallel(paths, n_proc = 2):
    try:
        p = multiprocessing.Pool(processes=n_proc)
        process = p.map(s3rm, paths, chunksize=100)
    finally:
        p.close()
        p.join()
    return process
    

In [None]:
### cells of interest 
coi_df = pd.read_csv('s3://darmanis-group/singlecell_lungadeno/rawdata/cell_list.csv', index_col=0)
coi_df.columns = ['id']

### pull files from putative complete file bucket
done_paths = get_s3path_list('darmanis-group', 'singlecell_lungadeno/rawdata/fastq_all/rename', '.fastq.gz')
R1_paths = [x for x in done_paths if x.endswith('_R1.fastq.gz')]
R2_paths = [x for x in done_paths if x.endswith('_R2.fastq.gz')]

file_ids = ['_'.join(x.split('/')[-1].split('_')[:2]) for x in done_paths]
R1_ids = ['_'.join(x.split('/')[-1].split('_')[:2]) for x in R1_paths]
R2_ids = ['_'.join(x.split('/')[-1].split('_')[:2]) for x in R2_paths]

# how many prefixes are missing from cells of interest?
display(len(set(coi_df['id'].tolist())-set(file_ids)))

# do all the file_ids have both R1 and R2?
display(len(set(coi_df['id'].tolist())-set(R1_ids)),
        len(set(coi_df['id'].tolist())-set(R2_ids))
       )

## output files csv
R1_paths = [f's3://darmanis-group/singlecell_lungadeno/rawdata/fastq_all/rename/{x}_R1.fastq.gz' for x in coi_df['id'].tolist()]
R2_paths = [f's3://darmanis-group/singlecell_lungadeno/rawdata/fastq_all/rename/{x}_R2.fastq.gz' for x in coi_df['id'].tolist()]

# output_csv = pd.DataFrame({'prefix':coi_df['id'].tolist(),
#                            'R1':R1_paths,
#                            'R2':R2_paths,
#                           })
# fn = '/home/ubuntu/data/lung_fastq_paths.csv'
# output_csv.to_csv(fn)
# ! aws s3 cp {fn} s3://daniel.le-work/

# get bam paths
bam_paths = get_s3path_list('darmanis-group', 'singlecell_lungadeno/rawdata/fastq_all/rename_results', '.bam')

# check if all bams are in place
bam_prefix = [(x
               .split('/')[-1]
               .split('.homo.Aligned.out.sorted.bam')[0]) for x in bam_paths]

display(len(bam_prefix), # all bams 
        len(coi_df['id'].tolist()), # bams of interest
        len(list(set(bam_prefix)&set(coi_df['id'].tolist()))) # intersect
       )

# check all files in place
if (len(set(coi_df['id'].tolist())-set(file_ids))==0) \
and (len(set(coi_df['id'].tolist())-set(R1_ids))==0) \
and (len(set(coi_df['id'].tolist())-set(R2_ids))==0) \
and (len(coi_df['id'].tolist())==len(list(set(bam_prefix)&set(coi_df['id'].tolist())))):
    print('all good...')
    
    # move files

    ## fastqs
    for src_list in [R1_paths, R2_paths]:
        print('start transfer...')

        keyhead_dest = 's3://ashley-sra/'
        dest_list = ['{}{}'.format(keyhead_dest, x.split('/')[-1]) for x in src_list]

        s3copy_parallel(src_list, dest_list, n_proc = 2)
        print('fastqs complete')

    ## bams
    print('start transfer...')
    src_list =[f's3://darmanis-group/singlecell_lungadeno/rawdata/fastq_all/rename_results/{x}.homo.Aligned.out.sorted.bam' for x in coi_df['id'].tolist()]
    keyhead_dest = 's3://ashley-sra/'
    dest_list = ['{}{}'.format(keyhead_dest, x.split('/')[-1]) for x in src_list]
    s3copy_parallel(src_list, dest_list, n_proc = 2)
    print('bams complete')

else:
    print('check files!!!')
     

27489

21409

21409

all good...
start transfer...


In [54]:
print('...')
done_fastqs = get_s3path_list('ashley-sra', '', '.fastq.gz')
done_bams = get_s3path_list('ashley-sra', '', '.bam')


...


In [55]:
display(len(done_fastqs)/2,
        len(done_bams),
        len(coi_df['id'].tolist())
       )

21409.0

21409

21409

In [None]:
# print('start transfer...')
# copyback_df = get_s3path_list('ashley-sra', '', '.fastq')
# src_list = copyback_df
# keyhead_dest = 's3://darmanis-group/singlecell_lungadeno/rawdata/fastq_all/rename/'
# dest_list = ['{}{}'.format(keyhead_dest, x.split('/')[-1]) for x in src_list]
# s3move_parallel(src_list, dest_list, n_proc = 4)
# print('bams complete')


start transfer...


In [29]:
# total number of files:
print('number of files:', len(done_df))

# number of file pairs
print('number of file pairs:', sum([x == 2 for x in done_df.groupby(['plate','well']).count().read_code]))

# number of file set != 2
print('number of file sets != 2:', sum([x != 2 for x in done_df.groupby(['plate','well']).count().read_code]))

# number of file pairs that DO NOT have R1+R2
print('number of file pairs that DO NOT have R1+R2:', sum([x!=3 for x in done_df.groupby(['plate','well']).sum().read_code]))

# plate_ids not found
print('plate ids not found:', [x for x in plates_list if x not in done_df.plate.values.tolist()])




number of files: 55204
number of file pairs: 27602
number of file sets != 2: 0
number of file pairs that DO NOT have R1+R2: 0
plate ids not found: ['1001000326', '1001000338', '1001000328']


In [28]:
[x for x in plates_list if x not in done_df.plate.values.tolist()]

['1001000326', '1001000338', '1001000328']

In [None]:
# rename and move files from fastq_all to renamed

done_paths = get_s3path_list('darmanis-group', 'singlecell_lungadeno/rawdata/fastq_all', '.fastq.gz')
done_fn = [x.split('/')[-1] for x in done_paths]
done_well = [re.split('-|_', x)[0] for x in done_fn]
done_plate = [re.split('-|_', x)[1] for x in done_fn]
done_df = pd.DataFrame({'fn':done_fn,
                       'well':done_well,
                       'plate':done_plate})
keyhead_src = 's3://darmanis-group/singlecell_lungadeno/rawdata/fastq_all/'
src_list = [f'{keyhead_src}{fn}' for fn in done_df.fn]

keyhead_dest = 's3://darmanis-group/singlecell_lungadeno/rawdata/fastq_all/rename/'
dest_list = [f'{keyhead_dest}{well}_{plate}_R1.fastq.gz' if '_R1_' in fn else f'{keyhead_dest}{well}_{plate}_R2.fastq.gz' for well,plate,fn in zip(done_df.well, done_df.plate, done_df.fn)]

s3move_parallel(src_list, dest_list, n_proc = 6)


In [26]:
# move files from yes_cat to renamed

done_paths = get_s3path_list('darmanis-group', 'singlecell_lungadeno/rawdata/fastq_all/yes_cat', '.fastq.gz')
done_fn = [x.split('/')[-1] for x in done_paths]
done_well = [re.split('-|_', x)[0] for x in done_fn]
done_plate = [re.split('-|_', x)[1] for x in done_fn]
done_df = pd.DataFrame({'fn':done_fn,
                       'well':done_well,
                       'plate':done_plate})
keyhead_src = 's3://darmanis-group/singlecell_lungadeno/rawdata/fastq_all/yes_cat/'
src_list = [f'{keyhead_src}{fn}' for fn in done_df.fn]

keyhead_dest = 's3://darmanis-group/singlecell_lungadeno/rawdata/fastq_all/rename/'
dest_list = [f'{keyhead_dest}{fn}' for fn in done_df.fn]
s3move_parallel(src_list, dest_list, n_proc = 6)

['s3://darmanis-group/singlecell_lungadeno/rawdata/fastq_all/yes_cat/A10_1001000372_R1.fastq.gz',
 's3://darmanis-group/singlecell_lungadeno/rawdata/fastq_all/yes_cat/A10_1001000372_R2.fastq.gz',
 's3://darmanis-group/singlecell_lungadeno/rawdata/fastq_all/yes_cat/A10_1001000374_R1.fastq.gz',
 's3://darmanis-group/singlecell_lungadeno/rawdata/fastq_all/yes_cat/A10_1001000374_R2.fastq.gz',
 's3://darmanis-group/singlecell_lungadeno/rawdata/fastq_all/yes_cat/A10_B003047_R1.fastq.gz',
 's3://darmanis-group/singlecell_lungadeno/rawdata/fastq_all/yes_cat/A10_B003047_R2.fastq.gz',
 's3://darmanis-group/singlecell_lungadeno/rawdata/fastq_all/yes_cat/A11_1001000372_R1.fastq.gz',
 's3://darmanis-group/singlecell_lungadeno/rawdata/fastq_all/yes_cat/A11_1001000372_R2.fastq.gz',
 's3://darmanis-group/singlecell_lungadeno/rawdata/fastq_all/yes_cat/A11_1001000374_R1.fastq.gz',
 's3://darmanis-group/singlecell_lungadeno/rawdata/fastq_all/yes_cat/A11_1001000374_R2.fastq.gz',
 's3://darmanis-group/sing

# above has been refactored

In [25]:
# all plates, user-defined
plates_df = pd.read_csv('s3://darmanis-group/singlecell_lungadeno/rawdata/190117_csLung_plates.csv')
plates_list = plates_df.plate.tolist()
print(plates_list)


['1001000302', '1001000301', '1001000292', '1001000293', '1001000294', '1001000295', '1001000314', '1001000316', '1001000317', '1001000320', '1001000322', '1001000326', '10001000325', '1001000338', '1001000315', '1001000327', '1001000328', '1001000329', '1001000339', '1001000340', '1001000362', '1001000363', '1001000365', '1001000366', '1001000367', '1001000372', '1001000373', '1001000374', '1001000375', '1001000376', '1001000377', '1001000378', '1001000380', '1001000407', '1001000408', '1001000409', '1001000410', '1001000411', '1001000412', '1001000413', 'B003044', 'B003046', 'B003048', 'B003047', 'B003049', 'B001607', 'B001608', 'B001617', 'B001621', 'B003067', 'B003070', 'B003071', 'B002495', 'B001625', 'B001626', 'B001627', 'B001788', 'B003472', 'B000278', 'B000279', 'B000281', 'B000280', 'B000429', 'B000430', 'B000260', 'B000261', 'B000262', 'B000883', 'B000887', 'B000894', 'B000901', 'B000870', 'B000872', 'B000860', 'B000861', 'B000862', 'B000863', 'B000569', 'B000573', 'B000575'

In [None]:
# all paths in original folder
darmanis_paths = get_s3path_list('darmanis-group', 'singlecell_lungadeno/rawdata/fastqs', '.fastq.gz')

# plate id in darmanis bucket
darmanis_paths_filtered = [pathOI for pathOI in darmanis_paths if any([plate in pathOI for plate in plates_list])]
darmanis_plates = [plate for plate in plates_list if any([plate in path for path in darmanis_paths_filtered])]
seqbot_plates = [plate for plate in plates_list if plate not in darmanis_plates]


In [None]:
# pull frozen seqbot file path index
s3index = ['daniel.le-work/MEL_project/DL20190114_czbiohubseqbot.txt',
          'daniel.le-work/MEL_project/DL20190114_czbseqbot.txt']
full_index = pd.DataFrame()
for path in s3index:
    df = pd.read_csv(f's3://{path}', header=None)
    full_index = full_index.append(df)


In [None]:
# filter based on file ext and inclusion in plate list
seqbot_paths = full_index.values[:,0].tolist()
seqbot_paths_filtered = [path for path in seqbot_paths if path.endswith('fastq.gz')]
seqbot_paths_filtered = [path for path in seqbot_paths_filtered if any([plate in path for plate in seqbot_plates])]
del full_index


In [None]:
print('seqbots:', len(seqbot_paths_filtered))
print('darmanis:', len(darmanis_paths_filtered))


In [None]:
all_paths = seqbot_paths_filtered + darmanis_paths_filtered
not_found_plates = [plate for plate in plates_list if not any([plate in x for x in all_paths])]
print('removed_plates:', not_found_plates)
# remove plates that were manually deleted after the s3 index freeze
all_paths = [x for x in all_paths if not any([y in x for y in not_found_plates])]
print('all:', len(all_paths))


In [None]:
# remove files without suffix
remove files from without suffix from provided keys
paths = []
file_heads = ['fastqs/171120_A00111_0085_AH57YYDMXX/rawdata',
'fastqs/171120_A00111_0086_BH55NVDMXX/rawdata']
for x in file_heads:
    path_batch = get_s3path_list('czbiohub-seqbot', x, '.fastq.gz')
    paths = paths + path_batch

fns = [x.split('/')[-1] for x in paths]
fns = list(set(fns))
keys_to_remove = [f's3://darmanis-group/singlecell_lungadeno/rawdata/fastq_all/{x}' for x in fns]
# process = s3rm_parallel(keys_to_remove, n_proc = 6)


In [22]:
# return number of files that have been transferred to destination
done_paths = get_s3path_list('darmanis-group', 'singlecell_lungadeno/rawdata/fastq_all', '.fastq.gz')
print('completed transfers:', len(done_paths))
done_fn = [x.split('/')[-1] for x in done_paths]
done_well = [re.split('-|_', x)[0] for x in done_fn]
done_plate = [re.split('-|_', x)[1] for x in done_fn]
done_df = pd.DataFrame({'fn':done_fn,
                       'well':done_well,
                       'plate':done_plate})

# get paths for cells with multiple lanes
done_counts = done_df.groupby(['well','plate']).count().reset_index()
multiple_counts = (done_counts[[x != 2 for x in done_counts.fn]]
                   .drop('fn', axis=1))
multiple_counts['idx'] = [x for x in range(len(multiple_counts))]
multiple_counts = pd.merge(done_df, multiple_counts, 'right', ['well','plate'])



completed transfers: 62494


In [None]:
src_list = [f's3://darmanis-group/singlecell_lungadeno/rawdata/fastq_all/{x}' for x in multiple_counts.fn]
dest_list = [f's3://darmanis-group/singlecell_lungadeno/rawdata/fastq_all/not_cat/{x}' for x in multiple_counts.fn]
process = s3move_parallel(src_list, dest_list, n_proc = 6)


In [75]:
def concat_fastq(args): 
    idx, well, plate, fns = args
    global wkdir
    global s3parent
    global s3return
    os.mkdir(f'{wkdir}{idx}/')
    for read in ['R1','R2']:
        fns_filtered = [x for x in fns if read in x]
        for fn in fns:
            # download from fastq_all into tmp
            copycmd = f'aws s3 cp s3://{s3parent}{fn} {wkdir}{idx}/{fn}'
            subprocess.run(copycmd.split(' '))
        # cat files that contain R1 or R2, respectively
        catargs = [f'{wkdir}{idx}/{x}' for x in fns_filtered]
        catcmd = ' '.join(['cat'] + catargs + ['>', f'{wkdir}{idx}/{well}_{plate}_{read}.fastq.gz'])
        ! {catcmd} 
#         subprocess.call(catcmd.split(' '), stdout=f) # could not figure how to use > operator inside subprocess
        # push back to s3 fastq_all
        pushcmd = f'aws s3 mv {wkdir}{idx}/{well}_{plate}_{read}.fastq.gz s3://{s3return}'
        subprocess.run(pushcmd.split(' ')) 
    # remove dir
    shutil.rmtree(f'{wkdir}{idx}/')

def concat_parallel(idxs, wells, plates, fns_list, n_proc = 6):
    try:
        p = multiprocessing.Pool(processes=n_proc)
        p.map(concat_fastq, zip(idxs, wells, plates, fns_list), chunksize=100)
    finally:
        p.close()
        p.join()

In [None]:
wkdir = '/home/ubuntu/data/DL20181011_melanocyte_test_data/tmp/'
s3parent = 'darmanis-group/singlecell_lungadeno/rawdata/fastq_all/not_cat/'
s3return = 'darmanis-group/singlecell_lungadeno/rawdata/fastq_all/yes_cat/'
idxs = list(set([idx for idx in multiple_counts.idx]))
wells = []
plates = []
fns_list = []
for idx in idxs:
    df_splice = multiple_counts[multiple_counts.idx == idx]
    wells.append(df_splice.well.values[0])
    plates.append(df_splice.plate.values[0])
    fns_list.append(df_splice.fn.values.tolist())

n = len(idxs)
# n = 6 # for testing
concat_parallel(idxs[:n], wells[:n], plates[:n], fns_list[:n], n_proc = 6)

In [None]:
# rename doubles:
old_bucket = 'czbiohub-seqbot'
new_bucket = 'darmanis-group'
new_key = 'singlecell_lungadeno/rawdata/fastq_all/'
file_ext = '.fastq.gz'

file_heads = ['fastqs/171120_A00111_0085_AH57YYDMXX/rawdata/',
'fastqs/171120_A00111_0086_BH55NVDMXX/rawdata/']

# remove old copy without suffix indexing
master_keys = []
for idx, file_head in enumerate(file_heads):
    key_paths = get_s3path_list(old_bucket, file_head, file_ext) # all runs on given date
    key_paths = [x for x in key_paths if any([plate in x for plate in plates_list])] # must have right plate id
    master_keys = master_keys + key_paths
    
master_keys = list(set(master_keys))
master_fn = [x.split('/')[-1] for x in master_keys]
key_list = [f'{new_key}{x}' for x in master_fn]
remove_files(key_list, new_bucket, really=False, n_proc=6)


In [None]:
# restore
for idx, file_head in enumerate(file_heads):
    key_paths = get_s3path_list(old_bucket, file_head, file_ext) # all runs on given date
    key_paths = [x for x in key_paths if any([plate in x for plate in plates_list])] # must have right plate id
    keys = [x.split(f's3://{old_bucket}/')[-1] for x in key_paths]
    restore_files(keys, n_proc=7)

In [None]:
# copy with suffix indexing
process = []
for idx, file_head in enumerate(file_heads):
    key_paths = get_s3path_list(old_bucket, file_head, file_ext) # all runs on given date
    key_paths = [x for x in key_paths if any([plate in x for plate in plates_list])] # must have right plate id
    
    src_key = key_paths
    src_paths_fn = [path.split('/')[-1] for path in key_paths]
    src_paths_fn_head = [x.split(f'{file_ext}')[0] for x in src_paths_fn]
    dest_key = [f's3://{new_bucket}/{new_key}{x}_{idx}{file_ext}' for x in src_paths_fn_head]
    
    process = process + s3copy_parallel(src_key, dest_key, n_proc = 7)


# above is non-refactored code to keep

In [None]:
# frequency of matching set
done_df = pd.DataFrame({'path':done_paths})
done_df['fn_head'] = ['_'.join(re.split('-|_', x)[:2]) for x in done_df.path]
done_df['fn_head'].value_counts().value_counts()

In [None]:
# all filenames end with either 'R1_001.fastq.gz' or'R2_001.fastq.gz'
print(len([x for x in done_df.path if x.endswith(('R1_001.fastq.gz','R2_001.fastq.gz'))]))
print(len(done_df))

In [None]:
# unit test to confirm paired sets contain both R1 and R2
match_freq_df = (done_df['fn_head']
                 .value_counts()
                 .reset_index())
match_freq_df = (match_freq_df[match_freq_df.fn_head == 2]
                 .iloc[:,0])
match_freq_df = pd.DataFrame(match_freq_df)
match_freq_df.columns = ['fn_head']

filtered_done_df = pd.merge(pd.DataFrame(match_freq_df),done_df,'left','fn_head')
# R1 match = 1, R2 match =2, no_match = 0
# sum of ends_score = 3 for contains both R1 and R2
# sum of ends_score = 2 for contains two R1
# sum of ends_score = 4 for contains two R2
# sum of ends_score = 0 for contains neither R1 nor R2

filtered_done_df['ends_score'] = [1 if x.endswith('R1_001.fastq.gz') else 2 if x.endswith('R2_001.fastq.gz') else 0 for x in filtered_done_df.path]
any([x!=3 for x in filtered_done_df.groupby(['fn_head']).sum().ends_score])
# all passed having both R1 and R2


In [None]:
match_freq_df = (done_df['fn_head']
                 .value_counts()
                 .reset_index())
match_freq_df = (match_freq_df[match_freq_df.fn_head != 2]
                 .iloc[:,0])
match_freq_df = pd.DataFrame(match_freq_df)
match_freq_df.columns = ['fn_head']
filtered_done_df = pd.merge(pd.DataFrame(match_freq_df),done_df,'left','fn_head')
for x in filtered_done_df.path:
    print(x)


In [None]:
def s3copy(path):
    global dest
    s3cmd = f'aws s3 cp {path} {dest}'.split(' ')
    process = subprocess.run(s3cmd)
    if process.returncode != 0:
        path = None
    return path

dest = '/home/ubuntu/data/DL20181011_melanocyte_test_data/tmp/'
file_heads = list(set([file_head for file_head in ['_'.join(re.split('-|_', x)[:2]) for x in filtered_done_df.path]]))

for file_head in file_heads:
    file_subset = [x for x in filtered_done_df.path if x.startswith(file_head)]
    key_list = file_subset


    p = mp.Pool(processes=4)
    try:
        print('copying files...')
        process = process + p.map(s3copy, key_list, chunksize=100)
    finally:
        p.close()
        p.join()


# dev

In [None]:
# very slow!
# check s3 glacier status
def s3status(path):
    try:
        bucket_name = path.split('/')[2]
        key = path.split(f'{bucket_name}/')[-1]

        s3 = boto3.resource('s3')
        bucket = s3.Bucket(bucket_name)
        obj = s3.Object(bucket_name, key)
        status = obj.storage_class
    except ClientError as e:
        status = e
    return status


res = s3status(failed_transfers.path.tolist()[0])
print(res)


In [None]:
print(sum([x==None for x in status_results]))
print(sum([x=='GLACIER' for x in status_results]))
remaining_defrost = [path for path,state in zip(remaining_paths, status_results) if state == 'GLACIER']

In [None]:
# restore s3 from glacier
def s3restore(path):
    try:
        bucket_name = path.split('/')[2]
        key = path.split(f'{bucket_name}/')[-1]
        
        s3 = boto3.resource('s3')
        bucket = s3.Bucket(bucket_name)
        resp = bucket.meta.client.restore_object(
                Bucket=bucket_name,
                Key=key,
                RestoreRequest={'Days': 1}
            )
    except Exception as e:
        resp = e
    return resp

result = []
for path in tqdm(remaining_paths):
    res = s3restore(path)
    result.append(res)


In [None]:
restore_results

In [None]:
path = remaining_paths[0]

bucket_name = path.split('/')[2]
key = path.split(f'{bucket_name}/')[-1]
print(path, bucket_name, key)

try:
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(bucket_name)
    resp = bucket.meta.client.restore_object(
            Bucket=bucket_name,
            Key=key,
            RestoreRequest={'Days': 1}
        )
except ClientError as e:
    resp = e
return e



In [None]:
path = remaining_paths[0]
bucket_name = path.split('/')[2]
key = path.split(f'{bucket_name}/')[-1]
s3 = boto3.client('s3')

s3.head_object(Bucket=bucket_name, Key=key)

In [None]:
def checkexist(path):
    bucket_name = path.split('/')[2]
    key = path.split(f'{bucket_name}/')[-1]
    s3 = boto3.client('s3')

    try:
        s3.head_object(Bucket=bucket_name, Key=key)
        value = True
    except:
        value = False
    return value

result = []
for path in tqdm(remaining_paths[0]):
    res = checkexist(path)
    result.append(res)
    

In [None]:
set(['/'.join(path.split('/')[2:5]) for path in results])

In [None]:
# takes too long

# bucket_list = ['czbiohub-seqbot', 'czb-seqbot']

# seqbot_paths = []
# for bucket in bucket_list:
#     suffix = 'fastq.gz'

#     client = boto3.client('s3')
#     paginator = client.get_paginator('list_objects')
#     operation_parameters = {'Bucket': bucket}
#     page_iterator = paginator.paginate(**operation_parameters)
#     seqbot_paths = seqbot_paths + ['s3://{}/{}'.format(bucket, key['Key']) for page in page_iterator for key in page['Contents'] if key['Key'].endswith(suffix)]
    


In [None]:
# s3 defrost from cli test
# aws s3api restore-object --bucket mybucket --key dir1/example.obj --restore-request '{"Days":25,"GlacierJobParameters":{"Tier":"Standard"}}'
def defrostcli(path):
    bucket_name = path.split('/')[2]
    key = path.split(f'{bucket_name}/')[-1]

    syscmd = f'aws s3api restore-object --bucket {bucket_name} --key {key} --restore-request'
    suffix = '{"Days":25,"GlacierJobParameters":{"Tier":"Standard"}}'
    full_syscmd = syscmd.split(' ') + [suffix] 
    process = subprocess.run(full_syscmd)

    return process

# restore_results = []
# for path in tqdm(failed_transfers.path.tolist()):
#     res = defrostcli(path)
#     restore_results.append(res)
    


In [1]:
test = 'test'
test[:-1]

'tes'