In [1]:
## ETL nutcracker
# Conny Lin | June 6, 2020
# transform data from raw to ML ready data
# the code below has been added to BrainStationLib.py
# also saved in ETL_nutcracker.py

In [2]:
# local variable setting: machine-specific locations
pCobolt = '/Volumes/COBOLT'
pDropboxdb = '/Users/connylin/Dropbox/MWT/db'
pCapstone = '/Users/connylin/Dropbox/CA/ED _20200119 Brain Station Data Science Diploma/Capstone/data'

# file names of the cached MWT-path listings, one per storage location
mwtpath_csv_name_dropbox = 'mwtpath_dropbox.csv'
mwtpath_csv_name_cobolt = 'mwtpath_cobolt.csv'

# UPDATE THESE SETTINGS
# (source and destination of the database currently point at the same Dropbox copy)
sourcedir_db = savedir_db = pDropboxdb
savedir = pCapstone
mwtpath_csv_name = mwtpath_csv_name_dropbox

In [3]:
# import libraries
import os, sys, glob, pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# import local functions
sys.path.insert(1, '/Users/connylin/Dropbox/CA/ED _20200119 Brain Station Data Science Diploma/Capstone/brainstation_capstone/0_lib')
import BrainStationLib as bs

In [4]:
# Load the cached list of MWT plate folders, or build it by scanning the source db.
# The cache path is built from `savedir` (not a hard-coded location) so that it
# follows whatever is chosen under "UPDATE THESE SETTINGS".
pathcsv = os.path.join(savedir, mwtpath_csv_name)
if os.path.isfile(pathcsv):
    print(f'loading mwtpath.csv from \n\t{pathcsv}')
    df = pd.read_csv(pathcsv)
    mwtpaths = df['mwtpath'].values
else:
    # plate folders sit three levels below the db root and are named
    # <8 digits>_<6 digits>, e.g. 20111114_141722
    # glob returns matches in arbitrary order; sort so the cached list is reproducible
    mwtpaths = sorted(glob.glob(sourcedir_db+'/*/*/*/[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]_[0-9][0-9][0-9][0-9][0-9][0-9]'))
    print(f'saving mwtpath found to \n\t{pathcsv}')
    df = pd.DataFrame({'mwtpath':mwtpaths})
    # index=False: do not store the row index as an extra unnamed column
    df.to_csv(pathcsv, index=False)
print(f'{len(mwtpaths)} mwt folders found')

loading mwtpath.csv from 
	/Users/connylin/Dropbox/CA/ED _20200119 Brain Station Data Science Diploma/Capstone/data/mwtpath_dropbox.csv
7294 mwt folders found


In [5]:
# combine individual nutcracker per plate
_, nutcracker_filepaths = bs.nutcracker_process_perplate(mwtpaths, sourcedir_db, savedir_db)

/Users/connylin/Dropbox/MWT/db/MWT/20111114C_CL_100s30x10s10s/N2_400mM/20111114_141722
	processing 192
	66924 rows
	saved nutcracker_100s.csv
/Users/connylin/Dropbox/MWT/db/MWT/20111114C_CL_100s30x10s10s/N2/20111114_142532
	processing 295
	83976 rows
	saved nutcracker_100s.csv
/Users/connylin/Dropbox/MWT/db/MWT/20111114C_CL_100s30x10s10s/N2/20111114_140917
	processing 143
	89038 rows
	saved nutcracker_100s.csv


In [6]:
# combine all combined nutcracker data from each plate (memory intensive!)
data = bs.nutcracker_combineall(nutcracker_filepaths)

In [7]:
# split into X y data set and save to dropbox (large file! 20GB expected)
bs.nutcracker_split_Xy(data, savedir)

saving done


In [9]:
print('\there:'+savedir)

	here:/Users/connylin/Dropbox/CA/ED _20200119 Brain Station Data Science Diploma/Capstone/data


In [8]:
assert False, 'code end here'
# below are development notes

AssertionError: code end here

In [None]:
# make legend
pchorelegend = os.path.join(pCapstone, 'legend_choreography.csv')
chorjavacall = 'tDfpemMwWlLaAkcsSbpdxyuvor1234'
chorelegend = bs.make_chor_output_legend(pchorelegend, chorjavacall)

In [None]:
len(chorjavacall)

Persistence data is duplicated. 

In [None]:
chorelegend['name']

In [None]:
# preselect columns to load
column_index_keep = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,19,22,23,24,25]
column_names = chorelegend['name'][column_index_keep]

In [None]:
# TODO: look for data in db
# TODO: change pDropboxdb to pCobolt
# plate folders: three levels below the db root, named <8 digits>_<6 digits>
mwtpaths_db = glob.glob(
    pDropboxdb+'/*/*/*/[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]_[0-9][0-9][0-9][0-9][0-9][0-9]'
)
n_found = len(mwtpaths_db)
print(f'{n_found} mwt folders found')
# keep the listing on disk so the folder scan does not have to be repeated
df = pd.DataFrame({'mwtpath':mwtpaths_db})
df.to_csv(os.path.join(pCapstone, 'mwtpath.csv'))

In [None]:
def nutcracker_process_rawdata(pdata, mwtid):
    """Load one raw nutcracker .dat file and return its cleaned rows.

    Parameters
    ----------
    pdata : str
        Path to a space-delimited, header-less nutcracker output file.
        The path is also searched for '/N2_400mM/' to set the 'etoh' label.
    mwtid : int
        Plate identifier written to every row in a new 'mwtid' column.

    Returns
    -------
    pandas.DataFrame
        Columns 'etoh', 'mwtid', followed by the kept measurement columns
        (all float64). Rows with time > 100 or with any NaN are removed;
        the surviving rows keep their original index labels.
    """
    # names of all 30 columns in the raw file, in file order
    # ('persistence' appears twice in the raw output; only the first copy is kept)
    column_names_raw = ['time','id','frame','persistence','area','midline','morphwidth',
                        'width','relwidth','length','rellength','aspect','relaspect',
                        'kink','curve','speed','angular','bias','persistence','dir',
                        'loc_x','loc_y','vel_x','vel_y','orient','crab','tap','puff',
                        'stim3','stim4']
    column_index_keep = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,19,22,23,24,25]
    # derive the kept names locally so the function does not depend on a
    # notebook-level `column_names` variable being defined beforehand
    column_names_keep = [column_names_raw[i] for i in column_index_keep]
    # load data put in data frame
    df = pd.read_csv(pdata, delimiter=' ', header=None, usecols=column_index_keep,
                     names=column_names_keep, dtype=np.float64, engine='c')
    # remove data after 100s (rows with time > 100), then remove rows containing nan
    df = df.loc[~(df['time'] > 100)].dropna(axis=0).copy()
    # add mwtid column
    df.insert(0, 'mwtid', mwtid)
    # add etoh column: 1 when the file sits under an N2_400mM group folder, else 0
    df.insert(0, 'etoh', int('/N2_400mM/' in pdata))
    return df

In [None]:
def nutcracker_pick_consolidate_data(mwtpaths_db, pCobolt, pDropboxdb):
    """Consolidate the nutcracker files of each plate into one CSV per plate.

    For every plate folder, each '*.nutcracker.*.dat' file that contains at
    least one row with time < 100 is cleaned with nutcracker_process_rawdata
    and the results are concatenated and saved as 'nutcracker_100s.csv'.

    Parameters
    ----------
    mwtpaths_db : iterable of str
        Plate folder paths; a plate's position in this iterable is used as its mwtid.
    pCobolt : str
        Path prefix that is replaced in each plate path to build the save location.
    pDropboxdb : str
        Replacement prefix; the CSV is written under the resulting folder,
        which must already exist.

    Returns
    -------
    (pandas.DataFrame or None, list of str)
        The combined frame of the LAST plate that produced data (None if no
        plate did) and the paths of all CSV files written.
    """
    nutcracker_filelist = []
    # stays None when no plate yields data, so the return below is always defined
    df_mwt = None
    for imwt, pmwt in enumerate(mwtpaths_db):
        # look for nutcracker files in this plate
        pnutcracker = glob.glob(pmwt+'/*.nutcracker.*.dat')
        if len(pnutcracker) == 0:
            continue
        print(pmwt)
        # make storage for df
        df_store = []
        for ifile, pdata in enumerate(pnutcracker):
            print(f'\tprocessing {ifile}', end='\r')
            # read only the time column first: cheap check before the full load
            df_time = pd.read_csv(pdata, delimiter=' ', header=None, usecols=[0],
                                  names=['time'], dtype=np.float64, engine='c')
            # see if data has time before 100s
            if (df_time['time'] < 100).any():
                df_store.append(nutcracker_process_rawdata(pdata, imwt))
        if len(df_store) == 0:
            # pd.concat raises on an empty list; skip plates without early data
            print('\n\tno data before 100s, skipped')
            continue
        # combine multiple nutcracker files (just before tap and only non NAN)
        df_mwt = pd.concat(df_store, ignore_index=True)
        print(f'\n\t{df_mwt.shape[0]} rows')
        # save csv in dropbox
        pmwt_dropbox = pmwt.replace(pCobolt, pDropboxdb)
        pdata_save_dropbox = os.path.join(pmwt_dropbox, 'nutcracker_100s.csv')
        nutcracker_filelist.append(pdata_save_dropbox)
        df_mwt.to_csv(pdata_save_dropbox, index=False)
        print(f'\tsaved nutcracker_100s.csv')
    return df_mwt, nutcracker_filelist

estimate data size: 33.1*1500/1000 = 50GB - more reasonable

In [None]:
df_mwt, nutcracker_filelist = nutcracker_pick_consolidate_data(mwtpaths_db, pCobolt, pDropboxdb)

In [None]:
# estimate number of rows at the end
# NOTE(review): presumably 239938 rows were observed over 3 plates and are scaled
# to 1400 plates; the extra factors for the unreduced case are not explained
# here. TODO confirm
rows_with_reduction = (239938 / 3 * 1400)
rows_without_reduction = rows_with_reduction * (5*60/90) * (8000/1400)
print('estimate of total rows without data reduction')
print(int(rows_without_reduction))
print('estimate of total rows with data reduction')
print(int(rows_with_reduction))


In [None]:
def nutcracker_combine(nutcracker_filepaths):
    """Load every per-plate CSV and stack them into a single DataFrame.

    Parameters
    ----------
    nutcracker_filepaths : iterable of str
        Paths of CSV files with a header row; every column is read as float64.

    Returns
    -------
    pandas.DataFrame
        All rows of all files, in the order given, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        From pandas.concat when nutcracker_filepaths is empty.
    """
    # load and combine nutcracker_filelist
    df_store = [pd.read_csv(filepath, dtype=np.float64, engine='c')
                for filepath in nutcracker_filepaths]
    data = pd.concat(df_store, ignore_index=True)
    return data

data = nutcracker_combine(nutcracker_filelist)

In [None]:
data.info()
# Extrapolate the size of the combined data: size reported by sys.getsizeof
# (converted to GB), divided by the number of plate files combined so far,
# times 1400.
# NOTE(review): 1400 is presumably the expected total number of plates - confirm.
# The count comes from nutcracker_filelist; the df_store list used while
# combining is local to nutcracker_combine and is not visible in this cell.
size_file = sys.getsizeof(data)/1000**3/len(nutcracker_filelist)*1400
print(f'estimate end size {size_file:.2f} GB')

In [None]:
# EDA
data.describe().transpose()

In [None]:
def nutcracker_split_Xy(data, dir_save):
    """Split the combined data into y, identifier and X tables and save each as CSV.

    Writes three files into dir_save (all without the row index):
    'nutcracker_y.csv' (column 'etoh'), 'nutcracker_identifier.csv'
    (columns 'id', 'mwtid') and 'nutcracker_X.csv' (every remaining column).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'etoh', 'id' and 'mwtid'. It is not modified.
    dir_save : str
        Existing directory that receives the three CSV files.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If one of the required columns is missing from data.
    """
    # y column
    y_column = ['etoh']
    data[y_column].to_csv(os.path.join(dir_save, 'nutcracker_y.csv'), index=False)
    # identifier column
    identifier_column = ['id','mwtid']
    data[identifier_column].to_csv(os.path.join(dir_save, 'nutcracker_identifier.csv'), index=False)
    # save X: everything that is neither target nor identifier; the columns are
    # selected at write time so the caller's frame is left untouched
    excluded = set(y_column + identifier_column)
    x_columns = [name for name in data.columns if name not in excluded]
    data.to_csv(os.path.join(dir_save, 'nutcracker_X.csv'), columns=x_columns, index=False)
    print('saving done')
    