In [5]:
# Get trinity worm data from 0mM and 400mM N2
# Conny Lin | June 2, 2020
# connylin@doctor.com
# 
# concat trinity.wormid.dat files into one csv file
# add worm id and ethanol groups (0 = no ethanol, 1 = 400mM ethanol)
# save the file
# !BUG! if sample size is set to 2000, save to csv doesn't seem to work

# import libraries
import os
import pickle
import numpy as np
import pandas as pd
import time

# set test sample size (if 0, will run the complete file list)
test_sample_size_random = 0 #!BUG! 
test_sample_size_sequential = 2000
test_sample_size = test_sample_size_random+test_sample_size_sequential
output_filename = 'test_trinity_data.csv'
output_temp_trinity = 'temp_trinity.csv'
output_temp_id = 'temp_wormid_eth.csv'
output_db = 'test_trinity_db.pickle'
dir_save = '/Users/connylin/Dropbox/CA/ED _20200119 Brain Station Data Science Diploma/Capstone/data'
random_number = 1


# delete temp files if exist
print('remove existing files')
temp_files = [output_filename, output_temp_trinity, output_temp_id]
for fn in temp_files:
    p = os.path.join(dir_save, fn)
    if os.path.exists(p):
        os.remove(p)
        
# get file info file --
p = os.path.join(dir_save, 'fileinfo_trinity_N2400mM.pickle')
fileinfoDB = pickle.load(open(p,'rb'))
filenumber_allfiles = fileinfoDB.shape[0]

# get a sample size or the full set
if test_sample_size_random > 0:
    print(f'taking random {test_sample_size_random} sample files for testing')
    df_target = fileinfoDB.sample(test_sample_size_random, random_state=random_number).copy()
elif test_sample_size_sequential > 0:
    print(f'taking first {test_sample_size_random} sample files for testing')
    df_target = fileinfoDB.iloc[:test_sample_size_sequential+1,:].copy()
else:
    print(f'processing the full set of files')
    df_target = fileinfoDB
print(f'{df_target.shape[0]} files to process')
row_total = df_target['row_number'].sum()
print(f'{row_total} rows for the {output_filename} output')
del fileinfoDB

# write database
p = os.path.join(dir_save, output_db)
fileinfoDB = pickle.dump(df_target, open(p,'wb'))

# start timer
print('start timer')
start_time = time.time()

remove existing files
taking first 0 sample files for testing
2001 files to process
4300854 rows for the test_trinity_data.csv output
start timer


In [6]:
# -- create wormid array and ethanol array --
# get info
worm_id_list = df_target['worm_id'].values
ethanol_list = df_target['ethanol'].values
rows_list = df_target['row_number'].values
# instantiate arrays
worm_id_array = np.empty([row_total],dtype='int')
ethanol_array = np.empty([row_total],dtype='int')
# create arrays
print(f'creating worm id and ethanol id array\t\t\t\t\t{(time.time() - start_time)/60:.2f}min elapsed')
i_row_previous = 0
for worm_id, ethanol, rows in zip(worm_id_list, ethanol_list, rows_list):
    # get start row position
    i_start_row = i_row_previous
    # get end row position
    i_end_row = i_row_previous+rows-1
    # get worm id array x number of rows
    worm_id_array[i_start_row:i_end_row+1] = np.tile(worm_id, rows)
    # get ethanol array x number of rows
    ethanol_array[i_start_row:i_end_row+1] = np.tile(ethanol, rows)
    # create next starting position
    i_row_previous = rows+i_row_previous
# check output
print(f'shapes of worm id: {worm_id_array.shape[0]}')
# save to space deliminated without header --
print(f'saving worm id and ethanol id array into csv\t\t\t\t{(time.time() - start_time)/60:.2f}min elapsed')
# create data frame
df = pd.DataFrame({'worm_id':worm_id_array, 
                   'ethanol':ethanol_array})
# save csv #!BUG! 
df.to_csv(os.path.join(dir_save, output_temp_id), 
          sep=' ',index=False, header=False)
# get size of output
size_id_array_mb = (os.path.getsize(os.path.join(dir_save, output_temp_id))) 
print(f'id array size:\t\t\t\t\t\t\t\t{size_id_array_mb/1000**2:.0f} MB')
print(f'estimate trinity array size:\t\t\t\t\t\t{(size_id_array_mb/1000**2)*df_target.shape[1]:.0f} MB')

creating worm id and ethanol id array					0.00min elapsed
shapes of worm id: 4300854
saving worm id and ethanol id array into csv				0.01min elapsed
id array size:								29 MB
estimate trinity array size:						371 MB


In [7]:
# -- concat all trinity files --
# link file paths
print('linking file path')
bashcommand_sourcepaths = ''
for p in df_target['path'].values:
    p = '"'+p+'"'
    bashcommand_sourcepaths += p
    bashcommand_sourcepaths += ' '
# create bash command components
bashcommand_prefix = 'cat'
bashcommand_space = ' '
bashcommand_link = '>'
bashcommand_outputpath = '"'+os.path.join(dir_save, output_temp_trinity)+'"'
# create bash command
bashcommand = 'cat ' + bashcommand_sourcepaths + '> ' + bashcommand_outputpath
# call bash command
os.system(bashcommand)
# report output size
size_trinity = (os.path.getsize(os.path.join(dir_save, output_temp_trinity)))
print(f'trinity array size:\t\t\t\t\t\t\t{size_trinity/1000**2:.0f} MB')

linking file path
trinity array size:							0 MB


In [8]:
# -- bash concat id array and trinity array --
print(f'bash concatenate worm id, ethanol id array with trinity.dat\t\t{(time.time() - start_time)/60:.2f}min elapsed')
bashcommand = "paste '" + \
            os.path.join(dir_save, output_temp_id) + \
            "' '" + \
            os.path.join(dir_save, output_temp_trinity) + \
            "' > '" + \
            os.path.join(dir_save, output_filename) + \
            "'"
os.system(bashcommand)

# report estimate time
processing_time =(time.time() - start_time)/60
print(f'-------\ntotal processing time: \t\t\t\t\t\t\t{processing_time:.2f}min elapsed')
print(f'estimate processing time for full set:\t\t\t\t\t{filenumber_allfiles/test_sample_size*processing_time:.0f}min')
print(f'estimate trinity file size for full set:\t\t\t\t{(filenumber_allfiles/test_sample_size)*(size_trinity/1000**3):.0f}GB')

bash concatenate worm id, ethanol id array with trinity.dat		0.18min elapsed
-------
total processing time: 							0.28min elapsed
estimate processing time for full set:					29min
estimate trinity file size for full set:				0GB


In [9]:
# test load
col_keep_ind_legend = np.array([0,3,5,6,8,9,10,11,12,13,14,15,16,17])
col_keep_ind_output = np.hstack((np.array([0,1]), col_keep_ind_legend+2))

In [10]:
# make coluumn name
p = os.path.join(dir_save, 'legend_trinity_worm.pickle')
cnames = pickle.load(open(p,'rb'))
cnames = cnames.loc[col_keep_ind_legend,'name'].values
cnames = np.hstack((np.array(['worm_id','ethanol']),cnames)) 

In [11]:
df = pd.read_csv(os.path.join(dir_save, output_filename), 
                 delim_whitespace=True,
                 header=None,
                 usecols=col_keep_ind_output, 
                 names=cnames)
df.info()

ParserError: Too many columns specified: expected 16 and found 2