# Capstone Report | BrainStation
Conny Lin | June 2020

## set up

### functions

In [1]:
# set local path settings based on computer host
# NOTE: `sys` has to be imported before the branch below touches `sys.path`;
# with the import placed after this block, a fresh kernel on the
# 'Angular-Gyrus' host fails with NameError.
import socket
import sys

# short host name: everything after the first '.' is dropped
hostname = socket.gethostname().split('.')[0]
if hostname == 'Angular-Gyrus':
    # project code directory on this machine (presumably where the local
    # `toolbox` package lives -- confirm)
    path_py_library = '/Users/connylin/Code/proj/brainstation_capstone'
    # only insert once so re-running the cell does not duplicate the entry
    if path_py_library not in sys.path:
        sys.path.insert(1, path_py_library)
    
# import libraries
import sys, time, datetime, os, pickle, glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
# project-local helper that maps a host name to its path settings
from toolbox import host_paths

# look up the paths for this host, then derive the capstone data folder
LOCAL_PATHS = host_paths.get(hostname)
capstone_root = LOCAL_PATHS['Capstone']
DATA_DIR = os.path.join(capstone_root, 'data')

# stamp the notebook with the time of this run
run_timestamp = datetime.datetime.now()
print(f'last ran on: {run_timestamp} PT')

getting host computer specific paths
last ran on: 2020-06-28 16:51:12.139399 PT


## Data Acquisition

Data are stored in a .blob data lake scheme. An example of the file structure is illustrated below. 

```
Multiworm tracker data (MWT)/
│
├── experiment folder/
│   ├── group folder (e.g. alcohol or no alcohol)
│   │   ├── plate folder
│   │   │   ├── xxx.blobs
│   │   │   ├── xxx.blobs
│   │   │   ├── xxx.summary
│   │   │   ├── xxx.png
│   │   │   └── xxx.dat
│   │   ├── plate folder
│   │   │   ├── xxx.blobs
│   │   │   ├── xxx.blobs
│   │   │   ├── xxx.summary
│   │   │   ├── xxx.png
│   │   │   └── xxx.dat
│   │   ├── plate folder
│   │   │   └── ...
│   └── alcohol group ...
│       ├── plate folder
│       ├── plate folder
│       ├── plate folder
│       └── plate folder
└── ...
```

In [17]:
# check how many experiment, group and plate folders are in the data lake
DATALAKE_ROOT = '/Volumes/COBOLT'


def count_datalake_folders(root):
    """Count the entries found at each level of the data lake under `root`.

    Levels follow the layout <root>/<any>/<20* experiment>/<group>/<20* plate>.

    Parameters
    ----------
    root : str
        Top-level directory of the data lake.

    Returns
    -------
    dict
        Maps 'experiment', 'group' and 'plate' (in that order) to the number
        of glob matches at that level; all zeros when `root` does not exist.
    """
    level_patterns = {
        'experiment': os.path.join(root, '*', '20*'),
        # '*' matches any non-hidden entry, so stray files sitting next to the
        # group folders are counted as well
        'group': os.path.join(root, '*', '20*', '*'),
        'plate': os.path.join(root, '*', '20*', '*', '20*'),
    }
    return {level: len(glob.glob(pattern))
            for level, pattern in level_patterns.items()}


# `n` ends up holding the plate count, as before
for level, n in count_datalake_folders(DATALAKE_ROOT).items():
    print(f'data lake has {n} {level} folder')

data lake has 464 experiment folder
data lake has 2187 group folder
data lake has 7354 plate folder


In [5]:
# refresh the data lake index (MWTDB.csv) with the project-local toolbox
from toolbox import database

# settings for the update
datalake_drive = '/Volumes/COBOLT'
search_style = 'structured'
fullpath_to_database_false = False
# the index file sits at the top of the data lake drive
database_filename = os.path.join(datalake_drive, 'MWTDB.csv')

# positional order expected by the call: database file, drive, search style, flag
updater_args = (database_filename,
                datalake_drive,
                search_style,
                fullpath_to_database_false)
MWTDB = database.updateMWTDB(*updater_args)

Searching for MWT folders in dir: /Volumes/COBOLT

	This will take a while...
		done
		7294 MWT folders found
load existing MWTDB.csv
7981 MWT files found in MWTDB.csv
nothing new to update


Check the database to see how many normal and alcohol plates are available to extract data from:

# check database to see what data are available

In [1]:
import numpy as np
import pandas as pd

In [55]:
# both database files live in the same capstone data folder
capstone_data_dir = '/Users/connylin/Dropbox/CA/ED _20200119 Brain Station Data Science Diploma/Capstone/data'
# full database, and the destination for its trimmed copy
path_db = f'{capstone_data_dir}/MWTDB_BS.csv'
path_dbtrimsave = f'{capstone_data_dir}/MWTDB_BS_trim.csv'


In [3]:
# load the plate database from the CSV file at path_db
# NOTE(review): the rendered output shows 'Unnamed: 0.1' and 'Unnamed: 0'
# columns -- presumably row indexes written by earlier to_csv calls; confirm
# before relying on them
MWTDB = pd.read_csv(path_db)
# bare last expression: the notebook renders the frame as the cell output
MWTDB

Unnamed: 0.1,Unnamed: 0,mwtid,mwtname,mwtpath,expname,exp_date,tracker,expter,groupname,strain,...,postrec,tapN,genotype,note,png_path,shanespark_path,trinity_path,mwtfolder_exist,drunkposture_path,drunkposture2_path
0,0,1,20100114_131538,/Volumes/COBOLT/MWT/20100114X_JC_0s0x0s0s_prac...,20100114X_JC_0s0x0s0s_practice,20100114,X,JC,N2_EE0,N2,...,0,0,wildtype(bristol),,,,,True,,
1,1,2,20100114_135002,/Volumes/COBOLT/MWT/20100114X_JC_0s0x0s0s_prac...,20100114X_JC_0s0x0s0s_practice,20100114,X,JC,N2_EE0,N2,...,0,0,wildtype(bristol),,/Volumes/COBOLT/MWT/20100114X_JC_0s0x0s0s_prac...,,,True,,/Volumes/COBOLT/MWT/20100114X_JC_0s0x0s0s_prac...
2,2,3,20100114_133238,/Volumes/COBOLT/MWT/20100114X_JC_0s0x0s0s_prac...,20100114X_JC_0s0x0s0s_practice,20100114,X,JC,N2_EE10,N2,...,0,0,wildtype(bristol),,,,,True,,
3,3,4,20100114_141301,/Volumes/COBOLT/MWT/20100114X_JC_0s0x0s0s_prac...,20100114X_JC_0s0x0s0s_practice,20100114,X,JC,N2_EE10,N2,...,0,0,wildtype(bristol),,/Volumes/COBOLT/MWT/20100114X_JC_0s0x0s0s_prac...,,,True,,/Volumes/COBOLT/MWT/20100114X_JC_0s0x0s0s_prac...
4,4,5,20100201_131356,/Volumes/COBOLT/MWT/20100201X_JC_0s0x0s0s_EE01...,20100201X_JC_0s0x0s0s_EE01013,20100201,X,JC,N2_EE0,N2,...,0,0,wildtype(bristol),,/Volumes/COBOLT/MWT/20100201X_JC_0s0x0s0s_EE01...,,,True,,/Volumes/COBOLT/MWT/20100201X_JC_0s0x0s0s_EE01...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6347,6347,6348,20160325_121107,/Volumes/COBOLT/MWT/20160325A_JS_3600s0x0s0s_N...,20160325A_JS_3600s0x0s0s_N2_3day,20160325,A,JS,N2_200mM_3d,N2,...,0,0,wildtype(bristol),,/Volumes/COBOLT/MWT/20160325A_JS_3600s0x0s0s_N...,,,True,/Volumes/COBOLT/MWT/20160325A_JS_3600s0x0s0s_N...,/Volumes/COBOLT/MWT/20160325A_JS_3600s0x0s0s_N...
6348,6348,6349,20160325_141310,/Volumes/COBOLT/MWT/20160325A_JS_3600s0x0s0s_N...,20160325A_JS_3600s0x0s0s_N2_3day,20160325,A,JS,N2_200mM_3d,N2,...,0,0,wildtype(bristol),,/Volumes/COBOLT/MWT/20160325A_JS_3600s0x0s0s_N...,,,True,/Volumes/COBOLT/MWT/20160325A_JS_3600s0x0s0s_N...,/Volumes/COBOLT/MWT/20160325A_JS_3600s0x0s0s_N...
6349,6349,6350,20160325_085820,/Volumes/COBOLT/MWT/20160325A_JS_3600s0x0s0s_N...,20160325A_JS_3600s0x0s0s_N2_3day,20160325,A,JS,N2_3d,N2,...,0,0,wildtype(bristol),,/Volumes/COBOLT/MWT/20160325A_JS_3600s0x0s0s_N...,,,True,/Volumes/COBOLT/MWT/20160325A_JS_3600s0x0s0s_N...,/Volumes/COBOLT/MWT/20160325A_JS_3600s0x0s0s_N...
6350,6350,6351,20160325_100600,/Volumes/COBOLT/MWT/20160325A_JS_3600s0x0s0s_N...,20160325A_JS_3600s0x0s0s_N2_3day,20160325,A,JS,N2_3d,N2,...,0,0,wildtype(bristol),,/Volumes/COBOLT/MWT/20160325A_JS_3600s0x0s0s_N...,,,True,/Volumes/COBOLT/MWT/20160325A_JS_3600s0x0s0s_N...,/Volumes/COBOLT/MWT/20160325A_JS_3600s0x0s0s_N...


In [20]:
# number of plates (rows) in the database
dbsize = len(MWTDB)

# tally the missing (NaN) entries of every path column in one pass
path_columns = ['png_path', 'drunkposture_path', 'drunkposture2_path', 'shanespark_path']
missing_per_column = MWTDB[path_columns].isna().sum()

# plates without a png
miss_png = missing_per_column['png_path']
# plates without drunkposture 1 / drunkposture 2 output
miss_drunkposture = missing_per_column['drunkposture_path']
miss_drunkposture2 = missing_per_column['drunkposture2_path']
# plates without shanespark output
miss_shanespark = missing_per_column['shanespark_path']

In [21]:
# report how many plates lack each kind of file, as a count and a percentage
print(f'database has {dbsize} plates')
missing_report = (
    (miss_png, 'png'),
    (miss_drunkposture, 'drunkposture.dat'),
    (miss_drunkposture2, 'drunkposture2.dat'),
    (miss_shanespark, 'shanespark.dat'),
)
for n_missing, file_label in missing_report:
    percent_missing = (n_missing / dbsize) * 100
    print(f'{n_missing} missing {file_label} ({percent_missing:.2f}%)')

database has 6352 plates
27 missing png (0.43%)
3307 missing drunkposture.dat (52.06%)
1656 missing drunkposture2.dat (26.07%)
806 missing shanespark.dat (12.69%)


In [51]:
# a plate has a full data set when it has a png, a shanespark file,
# and at least one of the two drunkposture files
has_png = MWTDB['png_path'].notna()
has_posture = MWTDB['drunkposture_path'].notna() | MWTDB['drunkposture2_path'].notna()
has_shanespark = MWTDB['shanespark_path'].notna()

# boolean row mask and the number of plates it selects
i = has_png & has_posture & has_shanespark
n = int(i.sum())

print(f'{n}/{dbsize} ({(n/dbsize)*100:.2f}%) plates have full data set')

4215/6352 (66.36%) plates have full data set


In [57]:
# trim dataset: keep only the rows selected by the boolean mask `i`
# (.copy() so later edits to the trimmed frame never touch MWTDB)
MWTDB_trim = MWTDB.loc[i, :].copy()
# index=False: do not write the row index as an extra column; otherwise a
# plain pd.read_csv of this file brings it back as an 'Unnamed: 0' column
MWTDB_trim.to_csv(path_dbtrimsave, index=False)
# last expression of the cell so the trimmed shape is actually displayed
MWTDB_trim.shape

In [61]:
# see how many plates in the trimmed set are alcohol vs no alcohol
# a group name containing "mM" (e.g. 'N2_200mM_3d') is counted as an alcohol
# plate; every other plate is counted as normal
# regex=False: plain substring match; na=False: a missing group name is not
# counted as alcohol (str.find(...) != -1 evaluated to True for NaN)
i = MWTDB_trim['groupname'].str.contains('mM', regex=False, na=False)
n_alcohol = int(i.sum())
n_normal = int((~i).sum())
print(f'trimmed dataset has {n_normal} normal plates and {n_alcohol} alcohol plates')

trimmed dataset has 2470 normal plates and 1745 alohol plates


In [None]:
# TODO: if this data set is not big enough, the other ~2000 plates could be processed to get more data