# Parsing PlateMicroscopy directory

__October 2019__

This notebook organizes the parsing and validation of the existing `'PlateMicroscopy'` directory. 

This directory contains all raw and processed pipeline microscopy image data from Plates 1-19 and thawed Plates 1-5. This data was acquired between October 2018 and August 2019. 

In [None]:
import os
import re
import sys
import glob
import json
import dask
import shutil
import pickle
import hashlib
import skimage
import datetime
import tifffile
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [None]:
# append the parent directory to the module search path
# (presumably so that the local `pipeline_process` package can be imported from there;
# note that '../' is resolved relative to the kernel's working directory)
sys.path.append('../')
from pipeline_process.imaging import plate_microscopy_api, utils, viz

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# Flexo 'PlateMicroscopy' directory
flexo_root = '/Volumes/MicroscopyData/ML_group/Plate_Microscopy/'
# displays True only if this path exists and is a directory on the current machine
os.path.isdir(flexo_root)

In [None]:
# ESS 'PlateMicroscopy' directory
ess_root = '/Volumes/ml_group/PlateMicroscopy/'
# displays True only if this path exists and is a directory on the current machine
os.path.isdir(ess_root)

In [None]:
# ESS on cap
# NOTE: this reassigns `ess_root`, replacing the '/Volumes/...' path set in the previous cell;
# presumably only the cell that matches the current machine is meant to be run - TODO confirm
ess_root = '/gpfsML/ML_group/PlateMicroscopy/'

### ML experiment metadata

In [None]:
# Load the experiment metadata CSV, normalize the column names
# (spaces -> underscores, lowercase), and discard the columns whose
# normalized name starts with 'unnamed'.
# NOTE: the CSV is read from an absolute path on a local machine.
exp_md = (
    pd.read_csv('/Users/keith.cheveralls/Downloads/Microscopy-Master-Key.csv')
    .rename(columns=lambda name: name.replace(' ', '_').lower())
    .pipe(lambda md: md.drop(columns=[name for name in md.columns if name.startswith('unnamed')]))
)

In [None]:
# rows of the experiment metadata whose `id` matches an `exp_dir` found in the raw data
# NOTE: `d_raw` is only defined in the 'Raw data sanity checks' section further down,
# so this cell fails on a fresh kernel unless that section has been run first
exp_md.loc[exp_md.id.isin(d_raw.exp_dir)]

In [None]:
# check that all exp_ids appear in the exp metadata
# (displays the values present in the raw data but absent from `exp_md.id`; an empty set means none are missing)
# NOTE(review): the comparison uses `d_raw.exp_dir`, although the raw metadata also has an `exp_id` column -
# confirm which of the two is meant to match `exp_md.id`
# NOTE: `d_raw` is only defined in the 'Raw data sanity checks' section further down
set(d_raw.exp_dir).difference(exp_md.id)

### Instance of the PlateMicroscopy API

In [None]:
# create the API instance from the ESS root directory and a relative cache directory
# (argument meanings inferred from the values passed - TODO confirm against PlateMicroscopyAPI)
api = plate_microscopy_api.PlateMicroscopyAPI(ess_root, '../plate-microscopy-cache/20191025-ess/')

In [None]:
# number of os.walk entries, shape of the metadata dataframe, and number of rows flagged as raw
# NOTE(review): `api.md` is read here before `construct_metadata()` is called in the next section -
# presumably it is loaded from the cache directory; confirm
len(api.os_walk), api.md.shape, api.md.is_raw.sum()

### Construct metadata dataframe from the os.walk results

In [None]:
# build the metadata dataframe (`api.md`) and display its shape
# the os.walk caching call is left commented out - presumably it only needs to be run
# when the cache has to be (re)generated; TODO confirm
# api.cache_os_walk()
api.construct_metadata()
api.md.shape

In [None]:
# append file info to the metadata
# (presumably this adds the `filesize` column used in the size summaries below - TODO confirm)
api.append_file_info()

In [None]:
# cache the metadata dataframe
# The original call, `cache_metadata(overwrite=)`, had no value for `overwrite` and was a SyntaxError.
# `overwrite=False` is the conservative choice: presumably it leaves an existing cache untouched
# (TODO confirm against PlateMicroscopyAPI); switch to `overwrite=True` only to deliberately replace it.
api.cache_metadata(overwrite=False)

### Basic sanity checks

In [None]:
# work on a copy of the metadata so that the checks below do not touch `api.md` itself
d = api.md.copy()

In [None]:
# number of targets: shape of the array of unique target names (left)
# next to the reference value 19 * (96 - 4) = 1748 (right)
# (presumably 19 plates of 96 wells, four of which hold no target - TODO confirm)
d.target_name.unique().shape, 19 * (96-4)

In [None]:
# unique plate numbers, converted to int and sorted
sorted(map(int, d.plate_num.unique()))

In [None]:
# unparsable filenames (rows with an empty target_name), counted per plate_dir
# (count() reports the number of non-null values in every column)
d.loc[d.target_name==''].groupby('plate_dir').count()

In [None]:
# Filenames in 'Jin' format (rows whose target_name is 'Jin'), counted per plate_dir
d.loc[d.target_name=='Jin'].groupby('plate_dir').count()

In [None]:
# the 'Jin' rows with plate number 7
d.loc[(d.target_name=='Jin') & (d.plate_num==7)]

In [None]:
# all rows for well E6 with plate number 6
d.loc[(d.well_id=='E6') & (d.plate_num==6)]

In [None]:
# count exp_subdirs in each exp_dir (requires paths_only=True)
# count() tallies the non-null rows per column within each exp_dir;
# the exp_dirs with the largest exp_subdir count are listed first
d.groupby('exp_dir').count().sort_values(by='exp_subdir', ascending=False)

In [None]:
# paths without an exp_subdir, i.e. rows whose exp_subdir has length zero (requires paths_only=True)
d.loc[(d.exp_subdir.apply(len) == 0)]

In [None]:
# there's one 'temp' exp_dir - in thawed plate3
# (displays every row whose exp_dir is exactly 'temp')
d.loc[(d.exp_dir=='temp')]

In [None]:
# example of logical indexing that uses startswith:
# rows from plate_dir 'mNG96wp5' whose exp_dir starts with 'Pub'
d.loc[(d.plate_dir=='mNG96wp5') & (d.exp_dir.apply(lambda s: s.startswith('Pub')))]

In [None]:
# unusual experiment directory names (not beginning with ML0, Publication, or Updated)
# re.match only matches at the start of the string, so all three alternatives are anchored there
d.loc[d.exp_dir.apply(lambda s: re.match('^ML0|Publication|Updated', s) is None)].copy()

### Raw data sanity checks

In [None]:
# subset of the metadata flagged as raw data (rows where `is_raw` is True), and its shape
d_raw = api.md.loc[api.md.is_raw]
d_raw.shape

In [None]:
# for each (target_name, well_id) pair, count the distinct values in every other column;
# show the ten pairs that span the most distinct plate numbers
d_raw.groupby(['target_name', 'well_id']).nunique().sort_values(by='plate_num', ascending=False).iloc[:10]

In [None]:
# for each (target_name, plate_dir) pair, count the distinct values in every other column;
# show the ten pairs that span the most distinct wells
d_raw.groupby(['target_name', 'plate_dir']).nunique().sort_values(by='well_id', ascending=False).iloc[:10]

In [None]:
# IMPAD1 appears on plate2 and plate6 (in all lists)
# (displays every raw row for this target)
d_raw.loc[d_raw.target_name=='IMPAD1']

In [None]:
# unique exp_ids (the first ten, in sorted order)
sorted(d_raw.exp_id.unique())[:10]

In [None]:
# number of unique exp_dirs (left) and number of unique exp_ids (right)
# (presumably the two are expected to be equal, i.e. one exp_dir per exp_id - TODO confirm)
len(set(d_raw.exp_dir)), len(set(d_raw.exp_id))

In [None]:
# unique exp subdirs among the raw data
d_raw.exp_subdir.unique()

In [None]:
# Build one file_id per raw file from the zero-padded plate number,
# the exp_id, and the filename: 'P<plate_num>_<exp_id>_<filename>'
file_ids = [
    'P%04d_%s_%s' % (plate_num, exp_id, filename)
    for plate_num, exp_id, filename in zip(d_raw.plate_num, d_raw.exp_id, d_raw.filename)
]

In [None]:
# check that this file_id is unique
# (the number of file_ids and the number of distinct file_ids must be equal)
len(file_ids), len(set(file_ids))

In [None]:
# the first ten file_ids, as a spot check of the format
file_ids[:10]

In [None]:
# check that there's never more than one exp_subdir in an exp_dir
# (nunique() counts distinct values per column within each exp_dir and max() takes the largest
# count per column, so the `exp_subdir` entry of the result should be 1)
d_raw.groupby(['exp_dir']).nunique().max()

In [None]:
# check that every exp_dir appears in only one plate_dir
# Map each exp_dir to the plate_dir of the first row it was seen in, and
# print the exp_dir of every later row that has a different plate_dir.
# No printed output means every exp_dir belongs to exactly one plate_dir.
exp_dirs = {}
for _, record in d_raw.iterrows():
    known_plate_dir = exp_dirs.get(record.exp_dir)
    if known_plate_dir is None:
        # first occurrence of this exp_dir
        exp_dirs[record.exp_dir] = record.plate_dir
    elif known_plate_dir != record.plate_dir:
        print(record.exp_dir)

In [None]:
# count raw FOVs per plate
# (rows are ordered by plate_dir name, descending - not by the counts)
d_raw.groupby('plate_dir').count().sort_values(by='plate_dir', ascending=False)

In [None]:
# count raw FOVs per target, with the targets that have the most files first
d_raw.groupby('target_name').count().sort_values(by='filename', ascending=False)

In [None]:
# distribution of FOV counts per target
# the bin edges run from 0 to 39 in steps of one, so targets with more than 39 FOVs are not drawn;
# assigning to `_` suppresses the repr of the value returned by plt.hist
_ = plt.hist(d_raw.groupby('target_name').count().filename, bins=np.arange(0, 40))

In [None]:
# total raw data size by plate in gigabytes, largest first
# (filesize is divided by 1024**3, i.e. binary gigabytes, and then truncated to an integer;
# note that the cell below uses decimal units instead)
(d_raw.groupby('plate_num').filesize.sum().sort_values(ascending=False)/1024/1024/1024).astype(int)

In [None]:
# total data size and total raw data size in terabytes
# (decimal terabytes: filesize sums divided by 1e12)
d.filesize.sum()/1e12, d_raw.filesize.sum()/1e12

In [None]:
# distribution of file sizes over all files, in units of 1e9 bytes, with bins 0.01 wide;
# the bin edges stop just below 0.6, so larger files are not drawn
_ = plt.hist(d.filesize/1e9, bins=np.arange(0, .6, .01))

### Designing the directory structure for a new pipeline microscopy data directory

In [None]:
# plate directory naming scheme
# (the bare strings in this cell are design notes; nothing is assigned)
'{master_line}-{plate_design_id}-{plate_instance}-R{imaging_round}'

# plate1, first electroporation, imaged without freezing (Round 0)
'mNG-P0001-E01-R00'

# Plate1, first electroporation, imaged again after freeze-thaw (Round 1)
'mNG-P0001-E01-R01'

# a re-sorted plate or a second electroporation, first time imaged
'mNG-P0001-E02-R00'

# clones from plate1
# NOTE(review): 'CLONES' is an extra field that the template at the top of this cell does not include
'mNG-P0001-E01-CLONES-R00'

In [None]:
# Proposed file naming scheme, illustrated with one raw stack
# (the bare strings in this cell are design notes; nothing is assigned)

# Raw stack (as a row in the metadata dataframe)
'mNG96wp19, ML0137_20190528, mNG96wp19_sortday1, A9_1_BAG6.ome.tif'

# Rename stack `{exp_id}_{plate_id}-{well_id}-{fov_num}-{target_name}.tif`
# NOTE(review): the example below separates the plate_id from the well_id with '_' rather than the '-'
# in the template, and the later examples use the short 'P0019' instead of the full plate directory name -
# confirm the intended format
'ML0137_mNG-P0019-E01-R01_A09-01-BAG6.tif'

# Make raw (uint16) projections (for FOV scoring)
'ML0137_P0019-A09-01-BAG6_DAPI-PROJX.tif'
'ML0137_P0019-A09-01-BAG6_DAPI-PROJY.tif'
'ML0137_P0019-A09-01-BAG6_DAPI-PROJZ.tif'

# Make autoscaled (uint8) projections (for convenient manual inspection)
# need to pick a percentile threshold ('AS99' stands for `autoscale(im, percentile=99)`)
'ML0137_P0019-A09-01-BAG6_GFP-PROJX-AS99-UINT8.tif'


# --- FOV scoring and selection --- #
# (using a trained regression model)

# crop in z around the cell layer
'ML0137_P0019-A09-01-BAG6_CROPZ.tif'

# Make 2x-downsampled and autoscaled (uint8) stacks
# (for convenient manual inspection; these will be 8x smaller)
'ML0137_P0019-A09-01-BAG6_CROPZ-2XDS-AS99-UINT8.tif'


# --- 600x600 ROI scoring and selection (for the website) --- #
# (score 'all' possible ROIs created from the z-projections)
# (requires somehow training an ROI regression model)

# Crop and autoscale the best-scoring ROIs
'ML0137_P0019-A09-01-BAG6_CROPZ-CROPXY-01-AS99-UINT8.tif'
'ML0137_P0019-A09-01-BAG6_CROPZ-CROPXY-02-AS99-UINT8.tif'
'...'

# Create tiled PNGs from the cropped stacks (one per channel)
'ML0137_P0019-A09-01-BAG6_CROPZ-CROPXY-01-AS99-UINT8-DAPI-TILE.png'
'ML0137_P0019-A09-01-BAG6_CROPZ-CROPXY-01-AS99-UINT8-GFP-TILE.png'

In [None]:
# directory structure
# general principle: the plate directories should be the deepest directory level

# organize stacks into plate and experiment subdirectories to make moving/copying the data easier
# do not organize projections into subdirectories - to make visual inspection easier

# Future considerations - what could change and what other dimensions we should anticipate
#
# - thawing and imaging the same plate multiple times -> a plate directory for each time - 'P0001-01', 'P0001-02'
# - electroporating the same plate design/instance multiple times
# - electroporating the same plate with different parental cell lines
# - 