In [1]:
import glob
import multiprocessing as mp
import re, os, shutil, subprocess

import boto3
import botocore.exceptions
import numpy as np
import pandas as pd
from tqdm import tqdm

def get_s3path_list(bucket, prefix, suffix):
    """Return the ``s3://`` URIs under ``bucket``/``prefix`` whose keys end with ``suffix``.

    Parameters
    ----------
    bucket : str
        Bucket name, e.g. ``'darmanis-group'``.
    prefix : str
        Key prefix to list, e.g. ``'singlecell_lungadeno/rawdata/fastqs'``.
    suffix : str
        Only keys ending with this string are kept, e.g. ``'fastq.gz'``.

    Returns
    -------
    list of str
        One ``s3://<bucket>/<key>`` string per matching object, in listing
        order. Empty when nothing matches.
    """
    client = boto3.client('s3')
    paginator = client.get_paginator('list_objects')
    page_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix)

    # A page carries no 'Contents' entry when it lists no objects (for example
    # when nothing matches the prefix), so default to an empty list rather
    # than raising KeyError.
    return [
        's3://{}/{}'.format(bucket, obj['Key'])
        for page in page_iterator
        for obj in page.get('Contents', [])
        if obj['Key'].endswith(suffix)
    ]

def merge_counts(top_dir):
    """Merge the local count tables found under ``top_dir`` into one DataFrame.

    Every ``*.txt`` file under ``top_dir`` (searched recursively) is read as a
    headerless, tab-delimited table: the first column supplies the row names
    (taken from the first file only) and the second column supplies the
    values. Files are processed in sorted path order so the column order is
    reproducible.

    Parameters
    ----------
    top_dir : str
        Directory to search; a trailing path separator is optional.

    Returns
    -------
    pandas.DataFrame
        A ``gene`` column followed by one float column per file. Columns are
        named ``<p0>_<p1>_<n>``, where ``p0``/``p1`` are the first two
        underscore-separated parts of the file name and ``n`` is the number of
        earlier files sharing that prefix (0 for the first). Rows whose name
        contains ``__`` are dropped and the index is renumbered from 0.

    Raises
    ------
    FileNotFoundError
        If no ``*.txt`` file exists under ``top_dir``.
    ValueError
        If a file has a different number of rows than the first file.
    """
    # os.path.join makes the pattern correct with or without a trailing slash.
    pattern = os.path.join(top_dir, '**', '*.txt')
    file_list = sorted(glob.iglob(pattern, recursive=True))
    if not file_list:
        raise FileNotFoundError(f'no .txt count tables found under {top_dir!r}')

    first_df = pd.read_csv(file_list[0], header=None, delimiter='\t')
    rownames = first_df.iloc[:, 0].tolist()
    counts = np.zeros((len(rownames), len(file_list)))

    colnames = []
    prefix_uses = {}  # name prefix -> number of files already seen with it
    for idx, path in enumerate(tqdm(file_list)):
        values = pd.read_csv(
            path, header=None, delimiter='\t', usecols=[1]
        ).iloc[:, 0].to_numpy()
        if len(values) != len(rownames):
            raise ValueError(
                f'{path} has {len(values)} rows; expected {len(rownames)} '
                f'as in {file_list[0]}'
            )

        # Disambiguate files that share the same two-part prefix with a
        # running counter, so every column name is unique.
        name_prefix = '_'.join(os.path.basename(path).split('_')[:2])
        repeat = prefix_uses.get(name_prefix, 0)
        prefix_uses[name_prefix] = repeat + 1
        colnames.append(f'{name_prefix}_{repeat}')

        counts[:, idx] = values

    # convert numpy to pandas, with the row names as the leading column
    master_df = pd.DataFrame(counts, columns=colnames)
    master_df.insert(0, 'gene', rownames)

    # remove metadata rows (names containing '__'); astype(str) keeps the
    # filter working when the names were parsed as numbers
    is_metadata = master_df['gene'].astype(str).str.contains('__', regex=False)
    return master_df[~is_metadata].reset_index(drop=True)


In [2]:
# pull s3 paths
# iterate: pull and append to master
# save to disk and push to s3


In [3]:
# pull s3 paths: list every object under the results prefix whose key ends
# with the htseq-count suffix, then show how many were found
bucket, prefix, suffix = (
    'czb-seqbot',
    'fastqs/190412_A00111_0295_AHJCT7DSXX/rawdata/Ashley_Maynard/results',
    '.homo.htseq-count.txt',
)
paths = get_s3path_list(bucket=bucket, prefix=prefix, suffix=suffix)
len(paths)


1403

In [4]:
# iterate: pull and append to master
# The first table fixes the expected rows; rows whose first column starts
# with '__' are dropped before the matrix is sized.
null_df = pd.read_csv(paths[0], delimiter='\t', header=None)
null_df = null_df[[not x.startswith('__') for x in null_df.iloc[:,0]]]
expected_rows = null_df.iloc[:,0].values

master_array = np.zeros((len(null_df), len(paths)))
# enumerate(tqdm(paths)) lets tqdm read len(paths), so the bar shows a total
for idx, i in enumerate(tqdm(paths)):
    tmp_df = pd.read_csv(i, delimiter='\t', header=None)
    tmp_df = tmp_df[[not x.startswith('__') for x in tmp_df.iloc[:,0]]]
    # Values are copied by position, so every table must list the same row
    # names in the same order; fail loudly instead of silently misaligning.
    if not np.array_equal(tmp_df.iloc[:,0].values, expected_rows):
        raise ValueError(f'row names in {i} do not match those in {paths[0]}')
    tmp_vec = tmp_df.iloc[:,1].values
    master_array[:,idx] = tmp_vec
    

1403it [02:36,  8.95it/s]


In [8]:
# save to disk and push to s3
filename = 'DL20190417_adult_v2.csv'
master_df = pd.DataFrame(master_array).astype(int)
master_df['gene'] = tmp_df.iloc[:,0].values
master_df = master_df.set_index('gene')
colnames = ['_'.join(x.split('/')[-1].split('_')[:2]) for x in paths]
master_df.columns = colnames
master_df.to_csv(f'/home/ubuntu/data/DL20181011_melanocyte_test_data/{filename}')
! aws s3 cp /home/ubuntu/data/DL20181011_melanocyte_test_data/{filename} s3://daniel.le-work/MEL_project/
    

upload: ../../data/DL20181011_melanocyte_test_data/DL20190417_adult_v2.csv to s3://daniel.le-work/MEL_project/DL20190417_adult_v2.csv
