In [None]:
## ETL trinity for predicting ethanol behavior before first tap
# Conny Lin | June 5, 2020

* ETL_trinity_predictetoh_before_tap: extract worm behavior data from each plate's trinity_cleaned.csv
    * only from wildtype (N2) plates
    * only from before the first tap
    * only from the 400mM and 0mM groups

In [57]:
import pandas as pd
import numpy as np
import os
import sys
import re
# import functions
# Folders holding project-local modules; they are put on sys.path so the
# import of BrainStationLib below can be resolved.
dir_function_collection = [
'/Users/connylin/Dropbox/Code/language/python_lib/localpackage/mwt',
'/Users/connylin/Dropbox/CA/ED _20200119 Brain Station Data Science Diploma/Capstone/brainstation_capstone/0_lib'
]
# Every folder is inserted at position 1, so a folder listed later ends up
# ahead of the ones listed before it on the search path.
for p in dir_function_collection:
    sys.path.insert(1, p)
# NOTE(review): `bs` is not referenced in the cells visible here - confirm it is still needed.
import BrainStationLib as bs

In [None]:
# define local function 
def extract_MWT_feature_preplate(pmwt):
    """Return the preplate time encoded in an MWT experiment folder name.

    The experiment folder is found by position in the path: it is the 4th
    component from the end when ``pmwt`` is a file and the 3rd from the end
    when ``pmwt`` is a directory. The first run of digits that directly
    follows an underscore in that folder name is returned.
    (Assumes an <experiment>/<group>/<plate>/<file> layout - TODO confirm.)

    Parameters
    ----------
    pmwt : str
        Path to an existing file (e.g. a trinity_cleaned.csv) or to the
        directory that contains it.

    Returns
    -------
    int
        The number parsed from the experiment folder name.

    Raises
    ------
    FileNotFoundError
        If ``pmwt`` is neither an existing file nor an existing directory.
    ValueError
        If the experiment folder name has no "_<digits>" token.
    """
    # normalise first so a trailing separator does not shift the component positions
    pmwt = os.path.normpath(pmwt)
    p_parsed = pmwt.split(os.sep)
    # decide the offset from the argument itself, never from a notebook global
    if os.path.isfile(pmwt):
        expname = p_parsed[-4]
    elif os.path.isdir(pmwt):
        expname = p_parsed[-3]
    else:
        raise FileNotFoundError(f'path does not exist: {pmwt}')
    a = re.search(r'(?<=_)\d{1,}', expname)
    if a is None:
        raise ValueError(f'no preplate time found in experiment name: {expname}')
    preplate_time = int(a.group(0))
    return preplate_time

In [106]:
# define local variable
# Input/output locations. Each one defaults to the original Dropbox path and
# can be redirected through an environment variable, so the notebook can run
# on another machine without editing the code.
# plate database CSV (override with MWTDB_CSV)
pMWTDB = os.environ.get(
    'MWTDB_CSV',
    '/Users/connylin/Dropbox/MWT/db/MWTDB.csv')
# folder that receives the combined output CSV (override with CAPSTONE_DATA_DIR)
pCapstone = os.environ.get(
    'CAPSTONE_DATA_DIR',
    '/Users/connylin/Dropbox/CA/ED _20200119 Brain Station Data Science Diploma/Capstone/data')

In [8]:
# read the full plate database from the path configured in pMWTDB
MWTDB = pd.read_csv(pMWTDB)
# flag the plates whose group is N2 or N2_400mM
groups_wanted = ['N2','N2_400mM']
i = MWTDB['groupname'].isin(groups_wanted)
print(f'{sum(i)} plates found')
# drop every other plate and continue with an independent copy
MWTDB = MWTDB[i].copy()

2008 plates found


In [9]:
# sanity check: the row count should equal the number of plates reported by the filter cell
MWTDB.shape

(2008, 5)

In [88]:
# build the expected trinity_cleaned.csv path for every selected plate
ptrinity_list = [os.path.join(plate_dir, 'trinity_cleaned.csv')
                 for plate_dir in MWTDB['mwtpath']]
# one flag per path: is the file actually on disk?
ptrinity_exist = [os.path.isfile(candidate) for candidate in ptrinity_list]
print(f'{sum(ptrinity_exist)} trinity_cleaned.csv exist')
# keep only the existing files, as a numpy array of paths
ptrinity_list = np.array(ptrinity_list)[ptrinity_exist].copy()
nfiles = ptrinity_list.shape[0]
print(nfiles)

261 trinity_cleaned.csv exist
261


In [89]:
# Load a single file and keep only the rows before the first tap, to
# estimate how much memory the full data set will need.
ptrinity_first = ptrinity_list[0]
df = pd.read_csv(ptrinity_first)
# get preplate time from the path of the file just read, so this cell does
# not depend on a leftover `pmwt` variable in the kernel
preplate_time = extract_MWT_feature_preplate(ptrinity_first)
# reduce data to rows recorded before the preplate time
df = df.loc[df['time'] < preplate_time,:].copy()
# get size of the reduced frame as reported by sys.getsizeof
bytesize = sys.getsizeof(df)
# extrapolate: one file's size times the number of files, to see if we can concat
print(f'estimated total {bytesize/(1000**2)*nfiles:.1f} MB')

estimated total 571.7 MB


In [104]:
# Read every trinity_cleaned.csv, keep only the rows before that plate's own
# preplate time (first tap), and stack the pieces into one DataFrame.
df_parts = []
for i, ptrinity in enumerate(ptrinity_list):
    # '\r' rewrites the same output line instead of printing one line per file
    print(f'processing {i} file', end='\r')
    df = pd.read_csv(ptrinity)
    # preplate time comes from the path of the file being processed, so each
    # plate is cut at the value encoded in its own experiment folder name
    preplate_time = extract_MWT_feature_preplate(ptrinity)
    # get only the sub set recorded before the preplate time
    df = df.loc[df['time'] < preplate_time,:].copy()
    df_parts.append(df)
# a single concat at the end; ignore_index gives the result a fresh 0..n-1 index
df_collection = pd.concat(df_parts, ignore_index=True)

processing 260 file

In [105]:
# rows x columns of the combined pre-tap data
df_collection.shape

(11413375, 17)

In [110]:
# get size of the combined frame as reported by sys.getsizeof
bytesize = sys.getsizeof(df_collection)
# report that size in GB; this is a measurement of the loaded data, although
# the message still uses the word "estimated"
print(f'estimated total {bytesize/(1000**3):.1f} GB')

estimated total 1.6 GB


In [112]:
# write the combined pre-tap data into the Capstone data folder
# NOTE(review): to_csv is called with its defaults, so the row index is
# written as the first column - confirm downstream readers expect that
pout = os.path.join(pCapstone, 'trinity_cleaned_N2_etoh_b4tap.csv')
df_collection.to_csv(pout)

In [113]:
# display the combined frame for a visual check
df_collection

Unnamed: 0,mwtid,etoh,time,speed,bias,tap,loc_x,loc_y,morphwidth,midline,area,angular,aspect,kink,curve,crab,wormid
0,17,1,1.0,1.0,0.0,1.0,0.0,16.2008,21.3477,0.1037,1.2937,0.164025,4.6,0.122,11.3,11.3,
1,17,1,1.0,1.0,0.0,1.0,0.0,16.2237,21.3484,0.1041,1.2947,0.167670,4.0,0.124,7.9,11.4,
2,17,1,1.0,1.0,0.0,1.0,0.0,16.2465,21.3505,0.1034,1.3011,0.166941,2.4,0.106,4.4,12.2,
3,17,1,1.0,1.0,0.0,1.0,0.0,16.2667,21.3508,0.1038,1.2984,0.165483,1.5,0.106,8.2,14.2,
4,17,1,1.0,1.0,0.0,1.0,0.0,16.2869,21.3517,0.1029,1.3325,0.164754,1.1,0.106,12.5,14.2,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11413370,975,0,98.0,1.0,0.0,1.0,0.0,29.4001,48.7551,0.0932,0.4305,0.054675,2.4,0.260,19.5,8.2,
11413371,975,0,98.0,1.0,0.0,1.0,0.0,29.4002,48.7621,0.0917,0.4401,0.051759,1.2,0.135,4.7,3.8,
11413372,975,0,98.0,1.0,0.0,1.0,0.0,29.3963,48.7737,0.0866,0.4179,0.046656,5.9,0.164,19.0,7.1,
11413373,975,0,98.0,1.0,0.0,1.0,0.0,29.3928,48.7798,0.0890,0.3925,0.045927,8.7,0.206,7.4,2.8,
