In [9]:
# Get trinity worm data from 0mM and 400mM N2
# Conny Lin | June 3, 2020
# connylin@doctor.com
#
# combine all trinity within a plate into pickle

In [10]:
# import libraries
import os
import pickle
import numpy as np
import pandas as pd
import subprocess
import glob



In [11]:
# Root data folder: holds file_summary_mwt.pickle and legend_trinity_worm.pickle.
# Set the MWT_DATA_DIR environment variable to run on another machine;
# when it is unset the original local path is used.
dir_save = os.environ.get(
    'MWT_DATA_DIR',
    '/Users/connylin/Dropbox/CA/ED _20200119 Brain Station Data Science Diploma/Capstone/data',
)

In [12]:
# get links to each plate --
# load the plate summary table
# NOTE: pickle.load can execute arbitrary code; only load pickles you created yourself.
with open(os.path.join(dir_save, 'file_summary_mwt.pickle'),'rb') as f:
    plateDB = pickle.load(f)
# keep the index entries (used below as plate folder paths) whose
# ('filepath','trinity.id.dat') entry is not missing
pMWT = plateDB.index[~plateDB[('filepath','trinity.id.dat')].isna()].values

# positions of the trinity file columns to keep
col_ind_keep = [0,3,5,6,8,9,10,11,12,13,14,15,16,17]
# load column names legend
p = os.path.join(dir_save, 'legend_trinity_worm.pickle')
with open(p,'rb') as f:
    cnames = pickle.load(f)
cnames = cnames['name'].values
# names of the kept columns, in the same order as col_ind_keep
cnames_keep = cnames[col_ind_keep].copy()

In [None]:
# go in each plate
def _concat_and_count_rows(source_paths, dest_path, chunk_size=1024**2):
    """Concatenate files into dest_path and count the rows each one contributes.

    Every source file is streamed once in binary chunks: the bytes are appended
    to dest_path and the newlines are counted (same count as `wc -l`).  A
    non-empty file that does not end with a newline gets one appended, and that
    last line is counted, so rows of consecutive files never merge.

    Doing this in Python instead of shelling out to `cat`/`wc` means paths with
    spaces or quotes are safe, there is no command-line length limit to batch
    around, and I/O failures raise instead of being ignored.

    Parameters
    ----------
    source_paths : list of str
        Files to concatenate, in output order.
    dest_path : str
        File to create (overwritten if it exists).
    chunk_size : int
        Bytes read per chunk.

    Returns
    -------
    numpy.ndarray of int
        Row count for each file, in the same order as source_paths.
    """
    row_counts = np.zeros(len(source_paths), dtype='int')
    with open(dest_path, 'wb') as dest:
        for i, source_path in enumerate(source_paths):
            # report process every 100 files
            if i % 100 == 0:
                print(f'\tprocessing #{i} files', end='\r')
            last_byte = b''
            with open(source_path, 'rb') as source:
                for chunk in iter(lambda: source.read(chunk_size), b''):
                    dest.write(chunk)
                    row_counts[i] += chunk.count(b'\n')
                    last_byte = chunk[-1:]
            # terminate an unterminated final line so it stays a separate row
            if last_byte not in (b'', b'\n'):
                dest.write(b'\n')
                row_counts[i] += 1
    return row_counts


# position in pMWT of the first plate to process; change it to resume an interrupted run
starting_plate = 425
for plate_count, pPlate in enumerate(pMWT[starting_plate:],starting_plate+1):
    print(f'processing {plate_count}/{pMWT.shape[0]} plate')
    # find all trinity files (escape the folder so glob characters in it are literal)
    ptri = glob.glob(os.path.join(glob.escape(pPlate), '*.trinity.*.dat'))
    # print number of files
    print(f'\t{len(ptri)} trinity files in this plate')
    if len(ptri) == 0:
        # nothing to combine; reading a missing temp file would fail
        print('\tno trinity files found, skip this plate')
        continue

    # get worm id from file path: the digits right after "trinity."
    df = pd.DataFrame({'paths':ptri})
    worm_number = df['paths'].str.extract(r'(?<=trinity[.])(\d{1,})')
    worm_number = worm_number.astype('int32').values.ravel()

    psave_trinity = os.path.join(pPlate, 'temp_trinity.dat')
    try:
        # combine all trinity files into one temp file and get rows per file
        rows_array = _concat_and_count_rows(ptri, psave_trinity)
        print('\tcombined trinity files into temp file')

        # create worm id array: each worm id repeated once per row of its file
        row_repeats_array = np.repeat(worm_number, rows_array).astype('int')
        if row_repeats_array.shape[0] == 0:
            print('\tall trinity files are empty, skip this plate')
            continue

        # load trinity concatenated
        df = pd.read_csv(psave_trinity, sep=r'\s+', header=None, usecols=col_ind_keep, names=cnames_keep)
        # check if wormid array are the same size as the concatenated array
        if df.shape[0] == row_repeats_array.shape[0]:
            # add wormid field
            df['wormid'] = row_repeats_array
            # into pickle
            psave_pickle = os.path.join(pPlate, 'trinity_all_worms.pickle')
        else:
            # print warning
            array_dff = df.shape[0] - row_repeats_array.shape[0]
            print(f'\t!!trinity data and wormid array differ by {array_dff} rows')
            print(f'\t!!save two data separately')
            # name the df as no worm id
            psave_pickle = os.path.join(pPlate, 'trinity_all_worms_nowormid.pickle')
            # save worm id file next to the plate data; a single fixed path
            # would be overwritten by every plate with a mismatch
            dfw = pd.DataFrame({'wormid': row_repeats_array})
            with open(os.path.join(pPlate,'trinity_all_worms_wormid_mismatch.pickle'),'wb') as f:
                pickle.dump(dfw, f)

        # save pickle
        with open(psave_pickle,'wb') as f:
            pickle.dump(df, f)

        # check if pickle file saved
        if os.path.exists(psave_pickle):
            # report size of the output
            size_trinity = os.path.getsize(psave_pickle)
            print(f'\tsize of pickle file: {size_trinity/1000**2:.0f} MB')
        else:
            print('\tpickle file did not save')
    finally:
        # clean up temp files, also when this plate failed or was skipped
        if os.path.exists(psave_trinity):
            print('\tclean up trinity temp file')
            os.remove(psave_trinity)
