In [1]:
# ETL trinity data
# Conny Lin | June 4, 2020

# Prepare the data in a machine-learning-ready format.
# Extract data from individual plates, and label each with its plate index in the plate db and its group name.

In [1]:
# load library
import os
import pickle
import numpy as np
import pandas as pd
import subprocess
import glob

In [2]:
# define global variables/paths
# Root folder for the pickled inputs/outputs of this notebook. The original
# location is kept as the default; set the MWT_DIR_SAVE environment variable
# to point the notebook at a copy of the data on another machine.
dir_save = os.environ.get(
    'MWT_DIR_SAVE',
    '/Users/connylin/Dropbox/CA/ED _20200119 Brain Station Data Science Diploma/Capstone/data',
)

# make data db

In [5]:
# load file pickle (the plate summary table)
# NOTE: pickle.load can execute arbitrary code; only open pickles you created yourself.
# The `with` block closes the file handle as soon as the load finishes.
with open(os.path.join(dir_save, 'file_summary_mwt.pickle'), 'rb') as fh:
    plateDB = pickle.load(fh)
# get paths with trinity.id.dat: keep the index entries whose
# ('filepath', 'trinity.id.dat') value is not missing
pMWT = plateDB.index[~plateDB[('filepath','trinity.id.dat')].isna()].values
# the summary table is not used again in this section; release it
del plateDB

In [27]:
# start the plate table: a single 'mwtpath' column, one row per plate path
MWTDB = pd.DataFrame(data=pMWT, columns=['mwtpath'])

In [30]:
# take a look at the db to see if any trinity pickle is missing
# report_capture[k] is True when the k-th row of MWTDB has its
# 'trinity_all_worms.pickle' on disk. The mask is built from MWTDB itself so
# its length always matches the table it is later applied to.
report_capture = np.array(
    [os.path.exists(os.path.join(pPlate, 'trinity_all_worms.pickle'))
     for pPlate in MWTDB['mwtpath']],
    dtype='bool',
)

# List the missing plates one per line. Printing these with end='\r' let the
# summary line below overwrite the message, so the missing plate id was never
# actually visible in the cell output.
for plateid in np.flatnonzero(~report_capture):
    print(f'{plateid} does not exist')

# report result
print(f'{np.sum(report_capture)}/{len(report_capture)} files exist')

870/871 files exist


In [34]:
# remove the plates whose trinity pickle was not found (trinity concatenation failed)
missing_rows = MWTDB.index[np.logical_not(report_capture)]
MWTDB.drop(index=missing_rows, inplace=True)

# Create MWTDB

In [42]:
# add paths to trinity files: <plate folder>/trinity_all_worms.pickle
MWTDB['trinity_path'] = [
    os.path.join(plate_dir, 'trinity_all_worms.pickle')
    for plate_dir in MWTDB['mwtpath']
]

In [62]:
# renumber the rows 0..n-1, discarding the old index labels
MWTDB.index = pd.RangeIndex(len(MWTDB))

In [85]:
# extract experiment features from the plate path
# Splitting on '/' puts an empty string at position 0 for absolute paths, so
# positions 4, 5 and 6 are the 4th-6th folder names.
# assumes every path looks like /<a>/<b>/<c>/<expname>/<groupname>/<platename> — TODO confirm
path_parts = MWTDB['mwtpath'].str.split(pat='/', expand=True)
for column, position in (('expname', 4), ('groupname', 5), ('platename', 6)):
    MWTDB[column] = path_parts.iloc[:, position]


In [None]:
# get number of rows per pickle file (each file is read in full just for its shape)
# note: some trinity files may not have been converted to pickle files. Rather than
# checking availability, the plan is to randomly choose 1.1M row numbers and then
# use only the first 1M rows that have files.
row_counts = []
for i, p in enumerate(MWTDB['trinity_path']):
    # progress message on every 5th file, overwritten in place via '\r'
    if i % 5 == 0:
        print(f'getting row numbers from {i}th file', end='\r')
    row_counts.append(pd.read_pickle(p).shape[0])
MWTDB['rows'] = np.array(row_counts, dtype='int')

In [None]:
# define the dropbox save folder and the path prefix it stands in for
# (only path strings are rewritten here; no directory is created by this cell)
pDropbox_home = '/Users/connylin/Dropbox/MWT/db'
pReplace = '/Volumes/COBOLT'
# replace path: add a '<column>_dropbox' twin for each path column
for src_col in ('mwtpath', 'trinity_path'):
    MWTDB[src_col + '_dropbox'] = [
        p.replace(pReplace, pDropbox_home) for p in MWTDB[src_col]
    ]

In [86]:
# save database
# The `with` block flushes and closes the file on exit; the bare open() left
# the write handle open, so the pickle could stay incomplete on disk.
with open(os.path.join(dir_save, 'MWTDB_trinity_N2400mM.pickle'), 'wb') as fh:
    pickle.dump(MWTDB, fh)

# Data wrangling

## add labels to individual plate data

In [109]:
# Clean and label every plate's trinity pickle, rewriting each file in place.
# A file that already has the 'mwtid_trdb' column was processed by an earlier
# run and is skipped, so the cell can be re-run without double-labelling.
for ind in MWTDB.index:
    # get path -- look rows up by index label throughout; mixing .iloc
    # positions with labels only works while the index happens to be 0..n-1
    ptrinity = MWTDB.loc[ind, 'trinity_path']
    # load to dataframe; the handle is closed as soon as the load finishes
    with open(ptrinity, 'rb') as fh:
        df = pickle.load(fh)
    row_n_original = df.shape[0]
    # check if the data already been cleaned
    if 'mwtid_trdb' in df.columns:
        continue
    # clean nan data: drop every row that contains a missing value
    df.dropna(axis=0, inplace=True)
    row_n_after = df.shape[0]
    print(f'plateid {ind} dropped {row_n_original - row_n_after} rows to {row_n_after} rows', end='\r')
    # add plate id (the plate's index label in MWTDB) as the first column
    df.insert(0, 'mwtid_trdb', np.full(row_n_after, ind))
    # add group id (ethanol=1 vs no ethanol=0); every group other than 'N2' is labelled 1
    etoh = 0 if MWTDB.loc[ind, 'groupname'] == 'N2' else 1
    df.insert(1, 'etoh', np.full(row_n_after, etoh))
    # save the file: write to a temporary sibling and swap it in, so an
    # interrupted run cannot leave a half-written pickle in place of the data
    ptmp = ptrinity + '.tmp'
    with open(ptmp, 'wb') as fh:
        pickle.dump(df, fh)
    os.replace(ptmp, ptrinity)
    

plateid 869 dropped 7112 rows to 162496 rowssss

## concat all trinity data
https://stackoverflow.com/questions/56012595/how-to-pickle-multiple-pandas-dataframes-and-concatenate-all-of-them-i

```
df = pd.concat([pd.read_pickle('/PATH/df/{}/{}.F.K.df'.format('train', f)).iloc[:, :100] 
                for f in Files], 
               axis=1)
```

`a = [pd.read_pickle(p) for p in MWTDB['trinity_path'][:10]]`

Issues:

* Each csv is ~100MB, so ~800 files * 100MB = ~80GB of csv in total. My computer won't be able to open a file of that size.


# Can I predict which tap number the worm is reacting to by its behavior before and after the tap?
* for wildtype
* for ethanol vs non ethanol
* for mutants?

In [24]:
# look at behavior and see if can predict which tap it is
# reload the plate database saved earlier; `with` closes the file handle after loading
with open(os.path.join(dir_save, 'MWTDB_trinity_N2400mM.pickle'), 'rb') as fh:
    MWTDB = pickle.load(fh)

## approach 1 random 10 plates

In [25]:
# choose 10 plates of 0mM and 10 plates of 400mM to look at
n_plates = 10  # plates drawn per group
np.random.seed(318)  # fixed seed so the same plates are drawn on every run
ind_0mM = np.random.choice(MWTDB.index[MWTDB['groupname']=='N2'].values, n_plates, replace=False)
ind_400mM = np.random.choice(MWTDB.index[MWTDB['groupname']=='N2_400mM'].values, n_plates, replace=False)
# combine index from 0mM and 400mM
ind_sample = np.hstack((ind_0mM, ind_400mM))
# get trinity file paths from random samples
# The sampled values are index labels, so select with .loc; .iloc is positional
# and only gives the same rows while the index is exactly 0..n-1.
ptrinity = MWTDB['trinity_path'].loc[ind_sample].values

In [36]:
# load data: read each sampled plate's pickle, then stack them row-wise
# (each plate keeps its own row labels, so labels repeat across plates)
plate_frames = []
for p in ptrinity:
    plate_frames.append(pd.read_pickle(p))
df = pd.concat(plate_frames)

In [38]:
# summarize the combined sample: columns, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11894033 entries, 1 to 1579776
Data columns (total 17 columns):
 #   Column      Dtype  
---  ------      -----  
 0   mwtid_trdb  int64  
 1   etoh        int64  
 2   time        float64
 3   speed       float64
 4   bias        float64
 5   tap         float64
 6   loc_x       float64
 7   loc_y       float64
 8   morphwidth  float64
 9   midline     float64
 10  area        float64
 11  angular     float64
 12  aspect      float64
 13  kink        float64
 14  curve       float64
 15  crab        float64
 16  wormid      int64  
dtypes: float64(14), int64(3)
memory usage: 1.6 GB


## approach 2, random 1 million rows from each group

* 20 plates give 11,894,033 rows (~12M) of data. With ~800 plates: 800/20 = 40, and 40 * 12M = ~480M rows of data.
* 300s * 20 frames per sec = 6000 time points. 1M rows would have 1,000,000/6000 = ~166 samples per time point. Will start with this and see how it goes.

In [48]:
# get number of rows per trinity file from the saved file-info table
# NOTE(review): this pickle is not written by any cell visible here — confirm where it is produced
pfileinfo = os.path.join(dir_save, 'fileinfo_trinity_N2400mM.pickle')
df = pd.read_pickle(pfileinfo)

In [48]:
# get sum: total number of rows across all trinity files
row_counts_per_file = df['row_number']
row_total = row_counts_per_file.sum()
summary_line = f'total number of rows: {row_total}'
print(summary_line)
# TODO: randomly choose between those numbers
# TODO: get the data

total number of rows: 202718463


In [49]:
# display the trinity pickle path stored for each plate
MWTDB['trinity_path']

0      /Volumes/COBOLT/MWT/20110906B_CL_100s30x60s10s...
1      /Volumes/COBOLT/MWT/20110906B_CL_100s30x60s10s...
2      /Volumes/COBOLT/MWT/20110906B_CL_100s30x60s10s...
3      /Volumes/COBOLT/MWT/20110907B_SS_100s30x10s10s...
4      /Volumes/COBOLT/MWT/20110907B_SS_100s30x10s10s...
                             ...                        
865    /Volumes/COBOLT/MWT/20170605X_CR_100s30x10s10s...
866    /Volumes/COBOLT/MWT/20170605X_CR_100s30x10s10s...
867    /Volumes/COBOLT/MWT/20170605X_CR_100s30x10s10s...
868    /Volumes/COBOLT/MWT/20170605X_CR_100s30x10s10s...
869    /Volumes/COBOLT/MWT/20170605X_CR_100s30x10s10s...
Name: trinity_path, Length: 870, dtype: object