# Construct MWT DB

In [39]:
import numpy as np
import pandas as pd
import os
from pathlib import Path
from pathlib import PurePath
import pathlib
import glob
import pickle

In [251]:
# Root of the external drive that holds the MWT* folders, and the folder where
# this notebook writes its pickle files.
# Both default to the original machine-specific locations; set the
# MWT_DIR_DRIVE / MWT_DIR_SAVE environment variables to run the notebook on
# another machine without editing this cell.
dir_drive = os.environ.get('MWT_DIR_DRIVE', '/Volumes/COBOLT')
dir_save = os.environ.get(
    'MWT_DIR_SAVE',
    '/Users/connylin/Dropbox/CA/ED _20200119 Brain Station Data Science Diploma/Capstone/data',
)

Get the files under all MWT folders in `dir_drive`.

example path: /Volumes/COBOLT/MWT/20190418X_XX_100s30x10s10s_slo1/VG903_400mM/20190418_141335/VG903_OH_15x3_t96h20C_100s30x10s10s_A_0418_jv410014.png

parse the file names into:
* extension
* filename prefix
* filename suffix (e.g. shanespark)
* mwt name
* group name
* expname
    * exp date
    * tracker
    * experimenter
    * exp condition
        * pre-plate
        * taps
        * ISI
        * post-tap
    * exp name tag
* MWT DB source (e.g. MWT, MWT bad)

In [16]:
# wrap the drive root in a Path object so that its contents can be listed with pathlib
# (this cell only builds the Path; nothing is read from the drive yet)
p = Path(dir_drive)

In [18]:
# show every sub-directory that sits directly under the drive root
[entry for entry in p.iterdir() if entry.is_dir()]

[PosixPath('/Volumes/COBOLT/MWT'),
 PosixPath('/Volumes/COBOLT/.Trashes'),
 PosixPath('/Volumes/COBOLT/.fseventsd'),
 PosixPath('/Volumes/COBOLT/.Spotlight-V100'),
 PosixPath('/Volumes/COBOLT/.TemporaryItems'),
 PosixPath('/Volumes/COBOLT/.bzvol'),
 PosixPath('/Volumes/COBOLT/MWT DISCARD'),
 PosixPath('/Volumes/COBOLT/MWT_New'),
 PosixPath('/Volumes/COBOLT/RL Pub PhD Dissertation'),
 PosixPath('/Volumes/COBOLT/MWT_edit'),
 PosixPath('/Volumes/COBOLT/MWT to check')]

In [31]:
# get all MWT folders: every top-level entry on the drive whose name starts with "MWT"
# The pattern is built from dir_drive rather than repeating the drive path, so
# changing the config cell is enough to point the notebook at another drive.
# glob.escape keeps any glob metacharacters inside dir_drive itself literal.
mwt_folders = glob.glob(os.path.join(glob.escape(dir_drive), 'MWT*'))

**Note**: glob [documentation](https://docs.python.org/3/library/glob.html)

In [35]:
# Collect every path exactly four levels below an MWT* folder:
#   <dir_drive>/<MWT db>/<expname>/<groupname>/<mwtname>/<filename>
# The pattern is built from dir_drive rather than repeating the drive path;
# glob.escape keeps any glob metacharacters inside dir_drive itself literal.
allfiles = glob.glob(os.path.join(glob.escape(dir_drive), 'MWT*', '*', '*', '*', '*'))


['/Volumes/COBOLT/MWT/20100201X_JC_0s0x0s0s_EE01013/N2_EE0/20100201_131356/0%_EtOH.dat',
 '/Volumes/COBOLT/MWT/20100201X_JC_0s0x0s0s_EE01013/N2_EE0/20100201_131356/0%_EtOH.png',
 '/Volumes/COBOLT/MWT/20100201X_JC_0s0x0s0s_EE01013/N2_EE0/20100201_131356/0%_EtOH.summary',
 '/Volumes/COBOLT/MWT/20100201X_JC_0s0x0s0s_EE01013/N2_EE0/20100201_131356/0%_EtOH.evan.dat',
 '/Volumes/COBOLT/MWT/20100201X_JC_0s0x0s0s_EE01013/N2_EE0/20100201_131356/0%_EtOH.drunkposture2.dat',
 '/Volumes/COBOLT/MWT/20100201X_JC_0s0x0s0s_EE01013/N2_EE0/20100201_131356/0%_EtOH_00000k.blobs',
 '/Volumes/COBOLT/MWT/20100201X_JC_0s0x0s0s_EE01013/N2_EE0/20100201_131356/0%_EtOH_00001k.blobs',
 '/Volumes/COBOLT/MWT/20100201X_JC_0s0x0s0s_EE01013/N2_EE0/20100201_131356/0%_EtOH_every60sec.trig',
 '/Volumes/COBOLT/MWT/20100201X_JC_0s0x0s0s_EE01013/N2_EE0/20100201_131356/0%_EtOH_s1.trig',
 '/Volumes/COBOLT/MWT/20100201X_JC_0s0x0s0s_EE01013/N2_EE0/20100201_131356/0%_EtOH_s5.trig',
 '/Volumes/COBOLT/MWT/20100201X_JC_0s0x0s0s_EE010

Save the list of file paths (`allfiles`) to a pickle file. Reference: https://www.jessicayung.com/how-to-use-pickle-to-save-and-load-variables-in-python/

to load:

`with open(filename, 'rb') as f:`

`var_you_want_to_load_into = pickle.load(f)`

In [252]:
# write the full list of file paths to <dir_save>/allfilepaths.pickle
# ('wb': pickle needs the file opened in binary mode)
path_save = os.path.join(dir_save, 'allfilepaths.pickle')
with open(path_save,'wb') as f:
    pickle.dump(allfiles,f)

In [37]:
# sanity check: how many paths did the glob return?
len(allfiles)
# 1,318,724 paths when this cell was last run

1318724

## Create database (seems to use a lot of memory; maybe just search the file list when needed instead)

* use `PurePath.parts` to access all the parts of a path; see the docs: https://docs.python.org/3/library/pathlib.html
* apply to all paths using `map`. See doc: https://chrisalbon.com/python/basics/applying_functions_to_list_items/
    * example:`regimentNamesCapitalized_m = list(map(capitalizer, regimentNames)); regimentNamesCapitalized_m`
* glob: https://docs.python.org/3/library/glob.html

In [None]:
# Select the .png paths out of the full file listing.
# Hold the listing in a numpy array so that a boolean mask can pick rows from it.
path_allfiles_array = np.array(allfiles)
# wrap every path in a PurePath to get access to its components
path_obj = [PurePath(one_path) for one_path in path_allfiles_array]
# final extension of every path, in the same order as path_allfiles_array
file_ext = np.array([one_obj.suffix for one_obj in path_obj])
# keep only the paths whose final extension is exactly '.png'
path_png = path_allfiles_array[file_ext == '.png']

In [265]:
print(f'{len(path_png)} png files') # report how many .png paths were selected

8883 png files


In [266]:
# make sure there aren't any duplicates:
# the number of unique paths should equal the png count reported by the previous cell
len(set(path_png))

8883

In [268]:
# from here on only the png paths are parsed
test_paths = path_png
# wrap each png path in a PurePath (this overwrites the all-files path_obj built earlier)
path_obj = [PurePath(png_path) for png_path in test_paths]

In [277]:
# Build the png dataframe: one row per png path, one column per path component.
# Only paths with exactly 8 components fit the column schema below
# ('/', 'Volumes', drive, db folder, experiment, group, plate folder, file name).
path_columns = ['remove','volume','drivename','dbname','expname','groupname','mwtname','filename']

# get parts
path_parts = [np.array(one_obj.parts) for one_obj in path_obj]
# get length of each parts
part_numbers = [len(parts) for parts in path_parts]
# check which paths have 8 parts
part_number_is_8 = np.array(part_numbers, dtype=int) == 8

# Apply the 8-part mask BEFORE stacking the parts: stacking ragged part arrays
# either raises or yields a 1-D object array, depending on the numpy version.
# The same mask is applied to the path objects so that every column added
# below lines up row-for-row with the dataframe.
kept_parts = [parts for parts, keep in zip(path_parts, part_number_is_8) if keep]
kept_path_obj = [one_obj for one_obj, keep in zip(path_obj, part_number_is_8) if keep]

# convert the kept path parts to a 2-D numpy array
# (the reshape keeps the (n, 8) shape even when nothing was kept)
file_parts_nparray = np.array(kept_parts).reshape(-1, len(path_columns))
# get only file parts with 8 parts into dataframe
df = pd.DataFrame(file_parts_nparray, columns=path_columns)

# add absolute path
test_paths_array = np.array(test_paths)
df['path'] = test_paths_array[part_number_is_8]
# add file name (stem) -- kept paths only, so the length matches df
file_names = [one_obj.stem for one_obj in kept_path_obj]
df['fname'] = file_names
# add extension name
df['ext'] = [one_obj.suffix for one_obj in kept_path_obj]
# search for suffixes: every suffix except the final extension
# (np.setdiff1d returns them sorted and de-duplicated)
df['analysis_tag'] = [np.setdiff1d(one_obj.suffixes, one_obj.suffix) for one_obj in kept_path_obj]
# remove the components that carry no information ('/', 'Volumes', drive name)
df = df.drop(columns=['remove','volume','drivename'])


In [275]:
# save png file dataframe to <dir_save>/path_png.pickle
# ('wb': pickle needs the file opened in binary mode)
path_save = os.path.join(dir_save, 'path_png.pickle')
with open(path_save,'wb') as f:
    pickle.dump(df,f)

In [None]:
# NOTE(review): this looks like a to-do list of fields still to be parsed
# out of the experiment name -- confirm; nothing is computed in this cell yet
# experimenter
# tracker
# expdate
# preplate
# tap
# ISI
# posttap
# exp_description

In [276]:
# get only file parts with 8 parts into dataframe
