# Parsing PlateMicroscopy directory

__October 2019__

This notebook organizes the parsing and validation of the existing `'PlateMicroscopy'` directory. 

This directory contains all raw and processed pipeline microscopy image data from Plates 1-19 and thawed Plates 1-5. This data was acquired between October 2018 and August 2019. 

In [None]:
import os
import re
import sys
import glob
import json
import dask
import shutil
import pickle
import hashlib
import skimage
import datetime
import tifffile
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [None]:
sys.path.append('../')
from opencell import file_utils
from opencell.imaging import utils, viz
from opencell.imaging.managers import PlateMicroscopyManager

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# Flexo 'PlateMicroscopy' directory
# (hard-coded absolute mount point; the cell displays False
# when this path is not an existing directory, e.g. when the volume is not mounted)
flexo_root = '/Volumes/MicroscopyData/ML_group/Plate_Microscopy/'
os.path.isdir(flexo_root)

In [None]:
# ESS 'PlateMicroscopy' directory
# (hard-coded absolute mount point; the cell displays False
# when this path is not an existing directory, e.g. when the volume is not mounted)
ess_root = '/Volumes/ml_group/PlateMicroscopy/'
os.path.isdir(ess_root)

### ML experiment metadata

In [None]:
# load the ML experiment metadata with file_utils.load_microscopy_master_key
# and preview the result with .head()
exp_md = file_utils.load_microscopy_master_key()
exp_md.head()

In [None]:
# check that all exp_ids appear in the exp metadata
# (the result is the set of exp_ids in d_raw that are absent from exp_md.id)
# NOTE(review): `d_raw` is first assigned in the 'Raw data sanity checks' section further down,
# so this cell raises a NameError on a fresh kernel with Run All - it should be moved there
set(d_raw.exp_id).difference(exp_md.id)

### Create PlateMicroscopy Manager

In [None]:
# create a manager for the ESS 'PlateMicroscopy' directory
# (the second argument is presumably the cache directory, since the same path is passed
# as `cache_dir` when the manager is later re-created from the cache - TODO confirm)
pm = PlateMicroscopyManager(ess_root, '../plate-microscopy-cache/20191114-ess/')

In [None]:
# NOTE(review): presumably walks the root directory and saves the result in the cache directory
# (this populates pm.os_walk, which is read below?) - confirm against PlateMicroscopyManager
pm.cache_os_walk()

In [None]:
# construct the metadata (pm.md) and the raw metadata (pm.md_raw), then cache them
# (overwrite=True presumably replaces any previously cached copy - TODO confirm)
# NOTE(review): append_file_info is commented out, but later cells read a `filesize` column -
# confirm where that column comes from
pm.construct_metadata()
# pm.append_file_info()
pm.construct_raw_metadata()
pm.cache_metadata(overwrite=True)

In [None]:
# sizes of the freshly constructed tables, displayed as
# (entries in os_walk, rows in md, sum of md.is_raw, rows in md_raw)
n_walk_entries = len(pm.os_walk)
n_md_rows = pm.md.shape[0]
n_md_is_raw = pm.md.is_raw.sum()
n_md_raw_rows = pm.md_raw.shape[0]
n_walk_entries, n_md_rows, n_md_is_raw, n_md_raw_rows

### Instance of a PlateMicroscopyManager from cache

In [None]:
# re-create the manager from the cache directory alone (no root directory is passed)
# and display (entries in os_walk, rows in md, sum of md.is_raw, rows in md_raw)
pm = PlateMicroscopyManager(cache_dir='../plate-microscopy-cache/20191114-ess/')
len(pm.os_walk), pm.md.shape[0], pm.md.is_raw.sum(), pm.md_raw.shape[0]

In [None]:
# NOTE(review): pm.md was already readable in the previous cell (manager loaded from the cache) -
# confirm whether the metadata really needs to be re-constructed here
pm.construct_metadata()

### Basic sanity checks

In [None]:
# work on a copy of the manager's metadata
d = pm.md.copy()

In [None]:
# number of distinct target names, displayed next to 19 * (96 - 4) for comparison
# (presumably 19 plates of 96 wells, less 4 wells per plate without a target - TODO confirm)
unique_target_names = d.target_name.unique()
unique_target_names.shape, 19 * (96-4)

In [None]:
# the distinct plate numbers, cast to int and listed in ascending order
sorted(int(plate_num) for plate_num in d.plate_num.unique())

In [None]:
# unparsable filenames: rows whose target_name is the empty string,
# counted (non-null entries per column) for each plate_dir
has_empty_target = d.target_name.eq('')
d.loc[has_empty_target].groupby('plate_dir').count()

In [None]:
# non-null entries per column for each exp_dir,
# ordered by the exp_subdir count, largest first (requires paths_only=True)
counts_by_exp_dir = d.groupby('exp_dir').count()
counts_by_exp_dir.sort_values(by='exp_subdir', ascending=False)

In [None]:
# rows whose exp_subdir has length zero, i.e. paths without an exp_subdir
# (requires paths_only=True)
exp_subdir_is_empty = d.exp_subdir.map(len).eq(0)
d.loc[exp_subdir_is_empty]

In [None]:
# rows from the single exp_dir named 'temp' (it is in thawed plate3)
is_temp_dir = d.exp_dir.eq('temp')
d.loc[is_temp_dir]

In [None]:
# example of logical indexing that combines an equality mask with a startswith mask
in_plate_dir = d.plate_dir.eq('mNG96wp5')
exp_dir_is_pub = d.exp_dir.map(lambda name: name.startswith('Pub'))
d.loc[in_plate_dir & exp_dir_is_pub]

In [None]:
# unusual experiment directory names: those that do not begin with ML0, Publication, or Updated
# (match() only matches at the start of the string, so all three alternatives act as prefixes)
known_prefix = re.compile('^ML0|Publication|Updated')
has_unusual_name = d.exp_dir.map(lambda name: known_prefix.match(name) is None)
d.loc[has_unusual_name].copy()

In [None]:
# rows whose exp_dir name ends with 'Quality', counted per (plate_dir, exp_dir)
ends_with_quality = d.exp_dir.map(lambda name: name.endswith('Quality'))
d.loc[ends_with_quality].groupby(['plate_dir', 'exp_dir']).count()

### Raw data sanity checks

In [None]:
# work on copies of the manager's metadata (pm.md) and raw metadata (pm.md_raw),
# and list the columns of the raw metadata
d = pm.md.copy()
d_raw = pm.md_raw.copy()
d_raw.columns

In [None]:
# the first ten unique exp_ids in sorted order
exp_ids_sorted = sorted(d_raw.exp_id.unique())
exp_ids_sorted[:10]

In [None]:
# (number of distinct exp_dirs, number of distinct exp_ids) in the raw metadata
n_exp_dirs = len(set(d_raw.exp_dir))
n_exp_ids = len(set(d_raw.exp_id))
n_exp_dirs, n_exp_ids

In [None]:
# the distinct exp_subdir values in the raw metadata
d_raw.exp_subdir.unique()

In [None]:
# rows for experiment ML0084, well H12, ordered by filename
# these rows correspond to the anomalous ACTBt0,1,2,3 target names
# (could these be from some kind of timelapse?)
from_ml0084 = d_raw.exp_id.eq('ML0084')
from_well_h12 = d_raw.well_id.eq('H12')
d_raw.loc[from_ml0084 & from_well_h12].sort_values(by='filename')

In [None]:
# check that there's never more than one exp_subdir in an exp_dir:
# the largest number of distinct exp_subdirs found in any exp_dir (should be 1)
# (only the exp_subdir column is aggregated, rather than every column of d_raw)
d_raw.groupby('exp_dir').exp_subdir.nunique().max()

In [None]:
# check that every exp_dir appears in only one plate_dir:
# count the distinct plate_dirs for each exp_dir and show the exp_dirs that have more than one
# (an empty result means that the check passed)
plate_dirs_per_exp_dir = d_raw.groupby('exp_dir').plate_dir.nunique()
plate_dirs_per_exp_dir.loc[plate_dirs_per_exp_dir > 1]

In [None]:
# check the well_ids by eye: the distinct well_id values in the raw metadata
d_raw.well_id.unique()

In [None]:
# rows with a missing well_id (there should be none, so this should display an empty table)
well_id_is_missing = d_raw.well_id.isna()
d_raw.loc[well_id_is_missing]

In [None]:
# number of raw filenames in each plate_dir, largest first, divided by 96
# (i.e. the mean number of raw FOVs per well, assuming 96 wells per plate - TODO confirm)
counts_by_plate_dir = d_raw.groupby('plate_dir').count()
filenames_per_plate_dir = counts_by_plate_dir.sort_values(by='filename', ascending=False).filename
filenames_per_plate_dir/96

In [None]:
# non-null entries per column for each target_name, ordered by the filename count, largest first
# (the filename column gives the number of raw FOVs per target)
counts_by_target = d_raw.groupby('target_name').count()
counts_by_target.sort_values(by='filename', ascending=False)

In [None]:
# distribution of FOV counts per target
# (the number of raw filenames for each target_name, in unit-width bins from 0 to 39)
fov_counts_per_target = d_raw.groupby('target_name').count().filename
fig, ax = plt.subplots()
ax.hist(fov_counts_per_target, bins=np.arange(0, 40))
ax.set_xlabel('Number of raw FOVs per target')
ax.set_ylabel('Number of targets');

In [None]:
# total raw data size by plate, largest first, in gigabytes
# (bytes divided by 1024 three times, then truncated to an integer)
raw_bytes_by_plate = d_raw.groupby('plate_num').filesize.sum().sort_values(ascending=False)
(raw_bytes_by_plate/1024/1024/1024).astype(int)

In [None]:
# (total data size, total raw data size) in terabytes (bytes divided by 1e12)
total_terabytes = d.filesize.sum()/1e12
raw_terabytes = d_raw.filesize.sum()/1e12
total_terabytes, raw_terabytes

In [None]:
# distribution of the filesize column in gigabytes (bytes divided by 1e9),
# in bins of 0.01 GB between 0 and 0.6 GB
# (presumably one row of `d` per file - TODO confirm)
fig, ax = plt.subplots()
ax.hist(d.filesize/1e9, bins=np.arange(0, .6, .01))
ax.set_xlabel('File size (GB)')
ax.set_ylabel('Number of files');

### Jin samples and samples that are in the wrong plate directories

In [None]:
# filenames in 'Jin' format: rows whose target_name is 'Jin', counted for each plate_dir
is_jin_format = d.target_name.eq('Jin')
d.loc[is_jin_format].groupby('plate_dir').count()

In [None]:
# raw rows for well E06 with plate_num 7:
# a Jin sample from Plate6 that was in a 'manual redo' experiment that appears in Plate7 directory
in_well = d_raw.well_id.eq('E06')
on_plate = d_raw.plate_num.eq(7)
d_raw.loc[in_well & on_plate]

In [None]:
# raw rows for well C12 with plate_num 3:
# another Jin sample that is in the wrong plate directory
in_well = d_raw.well_id.eq('C12')
on_plate = d_raw.plate_num.eq(3)
d_raw.loc[in_well & on_plate]

In [None]:
# raw rows for well H12 with plate_num 9:
# this well has target_names ACTBt0,1,2,3
# (but it gets dropped in construct_raw_metadata when degenerate fov_ids are eliminated)
in_well = d_raw.well_id.eq('H12')
on_plate = d_raw.plate_num.eq(9)
d_raw.loc[in_well & on_plate]

In [None]:
# raw rows for well D01 with plate_num 9:
# this well is target 'ARHGAP11A/B' which is denoted both 'ARHGAP11A' and 'ARHGAP11AB'
in_well = d_raw.well_id.eq('D01')
on_plate = d_raw.plate_num.eq(9)
d_raw.loc[in_well & on_plate]

In [None]:
# raw rows for target IMPAD1, which appears on plate2 and plate6 (in all lists)
is_impad1 = d_raw.target_name.eq('IMPAD1')
d_raw.loc[is_impad1]

In [None]:
# the number of plate_num, well_id combinations associated with more than one target
# (only the target_name column is aggregated, rather than every column of d_raw)
targets_per_well = d_raw.groupby(['plate_num', 'well_id']).target_name.nunique()
(targets_per_well > 1).sum()

In [None]:
# the 15 (plate_num, well_id) combinations with the most distinct target_names
# (the table shows the number of distinct values in every column for each combination)
# combinations with more than one target_name should all be mistakes of some kind -
# usually manual redo files in the wrong plate directory
distinct_values_per_well = d_raw.groupby(['plate_num', 'well_id']).nunique()
distinct_values_per_well.sort_values(by='target_name', ascending=False).iloc[:15]

### Designing the `oc-plate-microscopy` directory structure

In [None]:
# Design notes for the proposed directory layout, written as bare string literals:
# nothing is assigned, and only the last string is displayed as the cell output.

# plate directory name format
# cell_line_type is 'sorted', 'resorted', or 'clonal'
# data_type is 'proj', 'crop', 'ijclean'
'{cell_line_type}/{data_type}/{progenitor_line}-{plate_design_id}/'

# filename format (appended to the plate directory)
'{progenitor_line}-{plate_design_id}-{well_id}-{pml_id}-{site_num}-{target_name}'

# example filepath (for resorted lines, 'sorted' would be 'resorted')
# here pml_id is 'PML0001', site_num is 'S01' and target_name is 'ATL2'
'sorted/proj/czML0383-P0001/czML0383-P0001-A01-PML0001-S01-ATL2'

# proposed filepath for a clonal line (clone 'CL01')
# the clone id is inserted between the well_id and the pml_id
'clonal/proj/czML0383-P0001/czML0383-P0001-A01-CL01-PML0001-S01-ATL2'

In [None]:
# tags for processed files

# raw (uint16) projections
'PROJ-L405'
'PROJ-L488'

# autoscaled (uint8) projections
# ('AS01' stands for `autoscale(im, percentile=1)`)
'PROJ-AS01-L405'

# 2x-downsampled and autogained (8x smaller) 
'PROJ-DS2X-AS01'

# an xy crop (saved as a PNG tile)
'CROP-0000-0424-0600-0600-L405.png'