In [77]:
# Get trinity worm data from 0mM and 400mM N2
# Conny Lin | June 2, 2020
# connylin@doctor.com
# 
# concat trinity.wormid.dat files into one csv file
# add worm id and ethanol groups (0 = no ethanol, 1 = 400mM ethanol)
# save the file

In [78]:
import os
import pickle
import numpy as np
import pandas as pd
import time

In [79]:
# set test sample size (if 0, will run the complete file list)
test_sample_size = 0
output_filename = 'trinity_data.csv'
output_temp_trinity = 'temp_trinity.csv'
output_temp_id = 'temp_wormid_eth.csv'
dir_save = '/Users/connylin/Dropbox/CA/ED _20200119 Brain Station Data Science Diploma/Capstone/data'
random_number = 318
start_time = time.time()

In [80]:
# get file info file --
p = os.path.join(dir_save, 'fileinfo_trinity_N2400mM.pickle')
fileinfoDB = pickle.load(open(p,'rb'))

In [81]:
# get a sample size or the full set
total_file_number = fileinfoDB.shape[0]
if test_sample_size > 0:
    df_target = fileinfoDB.sample(test_sample_size, random_state=random_number).copy()
else:
    df_target = fileinfoDB
del fileinfoDB

In [82]:
# concat --
# create bash command components
bashcommand_prefix = 'cat'
bashcommand_sourcepaths = ''
bashcommand_space = ' '
bashcommand_link = '>'
bashcommand_outputpath = '"'+os.path.join(dir_save, output_temp_trinity)+'"'

# link file paths
print('linking file path')
for p in df_target['path'].values:
    p = '"'+p+'"'
    bashcommand_sourcepaths += p
    bashcommand_sourcepaths += ' '
# create bash command
bashcommand = bashcommand_prefix + bashcommand_space + \
            bashcommand_sourcepaths + \
            bashcommand_link + bashcommand_space + \
            bashcommand_outputpath
# call bash command
os.system(bashcommand)


# create wormid array and ethanol array --
# get info
worm_id_list = df_target['worm_id'].values
ethanol_list = df_target['ethanol'].values
rows_list = df_target['row_number'].values
row_total = df_target['row_number'].sum()
# instantiate arrays
worm_id_array = np.empty([row_total],dtype='int')
ethanol_array = np.empty([row_total],dtype='int')

# create arrays
print('creating worm id and ethanol id array')
i_row_previous = 0
for worm_id, ethanol, rows in zip(worm_id_list, ethanol_list, rows_list):
    # get start row position
    i_start_row = i_row_previous
    # get end row position
    i_end_row = i_row_previous+rows-1
    # get worm id array x number of rows
    worm_id_array[i_start_row:i_end_row+1] = np.tile(worm_id, rows)
    # get ethanol array x number of rows
    ethanol_array[i_start_row:i_end_row+1] = np.tile(ethanol, rows)
    # create next starting position
    i_row_previous = rows+i_row_previous
    
    
# save to space deliminated without header --
print('putting worm id and ethanol id array into dataframe')
# create data frame
df = pd.DataFrame({'worm_id':worm_id_array, 
                   'ethanol':ethanol_array})
# save csv
df.to_csv(path_id_array, sep=' ',index=False, header=False)

# bash concat
print('bash concatenate worm id, ethanol id array with trinity.dat')
bashcommand = "paste '" + \
            os.path.join(dir_save, output_temp_id) + \
            "' '" + \
            os.path.join(dir_save, output_temp_trinity) + \
            "' > '" + \
            os.path.join(dir_save, output_filename) + \
            "'"
os.system(bashcommand)

# report estimate time
end_time = time.time()
process_time = end_time - start_time
print(f'processing time: {process_time:.2f}s')
if test_sample_size > 0:
    est_time = total_file_number / test_sample_size * process_time
    print(f'estimate time for full set: {est_time:.2f}s or {est_time/3660:.2f}hr')
start_time = time.time()

processing time: 38.62s
estimate time for full set: 4120.47s or 1.13hr
