In [None]:
import os
import re
import sys
import glob
import json
import shutil
import pickle
import hashlib
import skimage
import datetime
import tifffile
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [None]:
# add the parent directory to the import path so that the local `pipeline_process`
# package can be imported (assumes the notebook runs from a subdirectory of the repo — TODO confirm)
sys.path.append('../')
from pipeline_process.imaging import plate_microscopy_api, utils, viz

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# Flexo 'PlateMicroscopy' directory
flexo_root = '/Volumes/MicroscopyData/ML_group/Plate_Microscopy/'
# True if this path currently exists as a directory (presumably i.e. the share is mounted)
os.path.isdir(flexo_root)

In [None]:
# ESS 'PlateMicroscopy' directory
ess_root = '/Volumes/ml_group/PlateMicroscopy/'
# check the ESS path defined in this cell (not the Flexo path)
os.path.isdir(ess_root)

### `os.walk` the 'PlateMicroscopy' directory 

In [None]:
# create an API instance and cache the os.walk results (presumably into the given cache directory)
# NOTE(review): this call passes the cache directory as the first positional argument plus a
# `partition` keyword, whereas the following cell constructs the API as (ess_root, cache directory);
# one of the two was presumably written against a different constructor signature — confirm which
api = plate_microscopy_api.PlateMicroscopyAPI(
    '../cache/20191025-ess', partition='ess')
api.cache_os_walk()

In [None]:
# the API instance used by all later cells: constructed from the ESS root and the cache directory
ess_api = plate_microscopy_api.PlateMicroscopyAPI(ess_root, '../cache/20191025-ess/')

In [None]:
# number of os.walk entries and shape of the metadata dataframe
# NOTE(review): `md` is read here before construct_metadata() is called in a later cell —
# presumably it is populated from the cache by the constructor; confirm
len(ess_api.os_walk), ess_api.md.shape

### Construct metadata dataframe from the os.walk results

In [None]:
# (re)build the metadata and display the shape of `ess_api.md` afterwards
ess_api.construct_metadata()
ess_api.md.shape

In [None]:
# presumably adds per-file columns (e.g. the `filesize` column used further down) to the metadata — confirm
ess_api.append_file_info()

In [None]:
# cache the metadata; overwrite=False presumably leaves an already-existing cache file untouched — confirm
ess_api.cache_metadata(overwrite=False)

### Basic sanity checks

In [None]:
# work on a copy so that the sanity checks below cannot modify ess_api.md
d = ess_api.md.copy()

In [None]:
# number of targets
# (left: number of distinct target_name values; right: a reference count, 19 * (96-4) —
# presumably 19 plates of 96 wells with 4 non-target wells each; TODO confirm)
d.target_name.unique().shape, 19 * (96-4)

In [None]:
# distinct plate numbers, cast to int and listed in ascending order
sorted(int(plate_num) for plate_num in d.plate_num.unique())

In [None]:
# unparsable filenames
# (rows whose target_name is the empty string, counted per plate_dir)
d.loc[d.target_name==''].groupby('plate_dir').count()

In [None]:
# Filenames in 'Jin' format
# (rows whose target_name equals 'Jin'; shows the first such row of each plate_dir)
d.loc[d.target_name=='Jin'].groupby('plate_dir').first()

In [None]:
# count exp_subdirs in each exp_dir (requires paths_only=True)
# (count() tallies non-null values per column; rows are ordered by the exp_subdir tally, largest first)
d.groupby('exp_dir').count().sort_values(by='exp_subdir', ascending=False)

In [None]:
# paths without an exp_subdir (requires paths_only=True)
# (selects rows whose exp_subdir has length zero)
d.loc[(d.exp_subdir.apply(len) == 0)]

In [None]:
# there's one 'temp' exp_dir - in thawed plate3
# (selects every row whose exp_dir is exactly 'temp')
d.loc[(d.exp_dir=='temp')]

In [None]:
# example of logical indexing that uses startswith
# (rows in plate_dir 'mNG96wp5' whose exp_dir begins with 'Pub')
d.loc[(d.plate_dir=='mNG96wp5') & (d.exp_dir.apply(lambda s: s.startswith('Pub')))]

In [None]:
# experiment directories with unusual names, i.e. names that do not start with
# 'ML0', 'Publication', or 'Updated' (re.match only ever matches at the start of the string)
usual_prefix = re.compile('^ML0|Publication|Updated')
is_unusual = d.exp_dir.apply(lambda name: usual_prefix.match(name) is None)
d.loc[is_unusual].copy()

### Raw data

In [None]:
# metadata restricted to raw data (per the API method name), and its shape
d_raw = ess_api.raw_metadata()
d_raw.shape

In [None]:
# unique exp_ids
# (the exp_dir column is what this notebook treats as the experiment ID)
sorted(d_raw.exp_dir.unique())

In [None]:
# unique exp subdirs, sorted
sorted(d_raw.exp_subdir.unique())

In [None]:
# construct a file_id for every raw file by joining the zero-padded plate number,
# the exp_dir (experiment ID), and the filename
# (zipping the three columns avoids building a Series per row as iterrows does;
# int() lets plate_num be either an integer or a numeric string)
file_ids = [
    'P%04d_%s_%s' % (int(plate_num), exp_dir, filename)
    for plate_num, exp_dir, filename
    in zip(d_raw.plate_num, d_raw.exp_dir, d_raw.filename)
]

In [None]:
# check that this file_id is unique
# (the two numbers are equal exactly when there are no duplicate file_ids)
len(file_ids), len(set(file_ids))

In [None]:
# check that there's never more than one exp_subdir in an exp_dir
# (nunique() is computed for every column, so the output holds one maximum per column;
# the `exp_subdir` entry is the one this check is about and should be 1)
d_raw.groupby(['exp_dir']).nunique().max()

In [None]:
# verify that no exp_dir is shared between plate_dirs:
# remember the plate_dir first seen for each exp_dir and print the exp_dir
# for every later row that disagrees (no output means the check passed)
exp_dirs = {}
for ind, row in d_raw.iterrows():
    plate_dir = exp_dirs.get(row.exp_dir)
    if plate_dir is None:
        exp_dirs[row.exp_dir] = row.plate_dir
    elif plate_dir != row.plate_dir:
        print(row.exp_dir)

In [None]:
# count raw FOVs per plate
# (non-null counts per column for each plate_dir, ordered by plate_dir descending)
d_raw.groupby('plate_dir').count().sort_values(by='plate_dir', ascending=False)

In [None]:
# for every plate that has a projections directory, print the TIFF count divided by 6
# (presumably six projection files are written per FOV — TODO confirm)
for plate_dir in d_raw.plate_dir.unique():
    if not os.path.isdir('/Volumes/ml_group/PlateMicroscopy-projections/%s' % plate_dir):
        continue
    tiff_paths = glob.glob('/Volumes/ml_group/PlateMicroscopy-projections/%s/*.tif' % (plate_dir))
    n = len(tiff_paths)
    print('%s: %s' % (plate_dir, n/6))

In [None]:
# count raw FOVs per target
# (ordered by the number of filenames per target_name, largest first)
d_raw.groupby('target_name').count().sort_values(by='filename', ascending=False)

In [None]:
# distribution of FOV counts per target
# (unit-width bins from 0 to 39; assigning to `_` suppresses the return-value repr)
_ = plt.hist(d_raw.groupby('target_name').count().filename, bins=np.arange(0, 40))

In [None]:
# total raw data size by plate in gigabytes
# (binary units: the byte total is divided by 1024**3 and truncated to int; note that the
# terabyte figures computed elsewhere in this notebook use decimal units, 1e12)
(d_raw.groupby('plate_num').filesize.sum().sort_values(ascending=False)/1024/1024/1024).astype(int)

In [None]:
# total data size and total raw data size in terabytes
# (decimal terabytes: byte totals divided by 1e12)
d.filesize.sum()/1e12, d_raw.filesize.sum()/1e12

In [None]:
# distribution of individual file sizes in (decimal) gigabytes, in 0.01 GB bins up to 0.6 GB
_ = plt.hist(d.filesize/1e9, bins=np.arange(0, .6, .01))

### Generating projections

In [None]:
# example tiff that yields tifffile warnings
'mNG96wp1 H1_1_RABGGTB.ome.tif'

# example of TIFF that tifffile can't load (but skimage.external.tifffile can)
# NOTE(review): the two adjacent literals below are implicitly concatenated by Python into
# 'mNG96wp1_ThawedE7_9_RAB14.ome.tif' (no separator) — a space or comma was probably intended
'mNG96wp1_Thawed' 'E7_9_RAB14.ome.tif'

In [None]:
# first raw-metadata row with this filename (raises IndexError if there is none)
row = d_raw.loc[d_raw.filename=='E7_9_RAB14.ome.tif'].iloc[0]

In [None]:
# try the projections on the single row selected above, writing to a local test directory
# (presumably the arguments are: metadata row, source root, destination root — confirm)
shape = ess_api.make_projections(row, ess_api.root_dir, '/Users/keith.cheveralls/image-data/PM-test/')

In [None]:
# generate projections for every raw-metadata row, continuing past failures:
# each failure is printed and recorded in `errors` together with its metadata row
errors = []
for ind, row in d_raw.iterrows():
    print('%s %s' % (row.plate_dir, row.filename))
        
    try:
        shape = ess_api.make_projections(row, ess_api.root_dir, '/gpfsML/ML_group/PlateMicroscopy-projections/')
    except Exception as error:
        # broad catch so that a failure on one row does not stop the loop
        print('---------- Error ----------\n%s' % row)
        print(error)
        errors.append({'error': str(error), 'row': row})

### ML experiment metadata

In [None]:
# load the experiment master key, normalize the column names
# (spaces become underscores, everything lowercased), and discard the 'unnamed*' columns
exp_md = pd.read_csv('/Users/keith.cheveralls/Downloads/Microscopy-Master-Key.csv')
exp_md = exp_md.rename(columns=lambda name: name.replace(' ', '_').lower())
unnamed_columns = [name for name in exp_md.columns if name.startswith('unnamed')]
exp_md = exp_md.drop(columns=unnamed_columns)

In [None]:
# experiment-metadata rows whose id matches an exp_dir present in the raw metadata
exp_md.loc[exp_md.id.isin(d_raw.exp_dir)]

In [None]:
# check that all exp_ids appear in the exp metadata
# (the result is the set of exp_dirs missing from exp_md.id; an empty set means the check passed)
set(d_raw.exp_dir).difference(exp_md.id)

### Raw TIFF metadata

In [None]:
# a TIFF on the local disk
# NOTE: the following cell reassigns `filepath`, so when cells run top to bottom this value is replaced
filepath = '/Users/keith.cheveralls/image-data/MMStack_14-B9-14.ome.tif'

In [None]:
# a processed TIFF under the Flexo root (replaces any earlier value of `filepath`)
filepath = os.path.join(
    flexo_root, 'mNG96wp19/PublicationQuality/p19E12_3_AP2S1_PyProcessed_IJClean.tif')

In [None]:
# open the TIFF so that its pages and tags can be inspected in the following cells
# NOTE(review): no stack.close() appears in this notebook, and `stack` is later rebound to an image array
stack = tifffile.TiffFile(filepath)

In [None]:
# all metadata tags (of the first page)
stack.pages[0].tags

In [None]:
# this tag should be appended to any processed stacks
# (value of the first page's 'MicroManagerMetadata' tag; raises KeyError if the tag is absent)
stack.pages[0].tags['MicroManagerMetadata'].value

In [None]:
# calculate the SHA1 hash of the file at `filepath`, reading it in 1 MiB chunks
# so that a large TIFF stack is never held in memory all at once
sha1 = hashlib.sha1()
with open(filepath, 'rb') as file:
    # iter(callable, sentinel) keeps calling file.read until it returns b'' at end of file
    for chunk in iter(lambda: file.read(1024 * 1024), b''):
        sha1.update(chunk)
sha1.hexdigest()

### Refactoring Nathan's method to select in-focus stacks

In [None]:
# a raw stack
# (note: `stack` is rebound here from the TiffFile object used above to the image array itself)
stack = tifffile.imread('/Users/keith.cheveralls/image-data/MMStack_601-E2-1.ome.tif')
# first 131 planes along axis 0 — presumably the DAPI z-slices; TODO confirm the channel layout
dapi_stack = stack[:131, :, :]
stack.shape

In [None]:
# shape of the maximum-intensity projection along axis 1
dapi_stack.max(axis=1).shape

In [None]:
# a stack from Nathan (replaces the `stack` and `dapi_stack` loaded in the previous cells)
stack = tifffile.imread('/Users/keith.cheveralls/image-data/A9_1_BAG6.ome.tif')
# first 131 planes along axis 0 — presumably the DAPI z-slices; TODO confirm the channel layout
dapi_stack = stack[:131, :, :]
stack.shape

In [None]:
# display a single plane (index 25 along axis 0)
viz.imshow(dapi_stack[25, :, :])

In [None]:
# display the maximum-intensity projection along axis 2
viz.imshow(dapi_stack.max(axis=2))

In [None]:
# blur_vals = np.array([cv2.Laplacian(zslice, cv2.CV_64F).var() for zslice in dapi_stack])
# mean intensity of each plane along axis 0 (despite the name, these are means, not sums)
sum_vals = np.array([zslice.mean() for zslice in dapi_stack]).astype(float)

In [None]:
# suppose one z-slice is underexposed by a factor of two
# sum_vals[30] = sum_vals[30]/2
# plot the per-slice intensity profile (uncomment the line above to simulate the underexposed slice)
plt.plot((sum_vals))

In [None]:
# check derivative for spikes due to isolated unexposed z-slices
# (largest absolute difference between consecutive entries of the profile)
np.abs(np.diff(sum_vals)).max()

In [None]:
# calculate the mean (xm) and standard deviation (xs) of the intensity profile in z,
# treating the min-subtracted, sum-normalized profile as a distribution over slice index
# (the weights go into a new array so that `sum_vals` keeps its per-slice values and the
# cells that plot or differentiate it give the same result whenever they are re-run)
z_weights = sum_vals - sum_vals.min()
z_weights = z_weights / z_weights.sum()
x = np.arange(len(z_weights))
xm = (x * z_weights).sum()       # first moment
xv = (x * x * z_weights).sum()   # second (raw) moment
xs = np.sqrt(xv - xm**2)         # standard deviation = sqrt(E[x^2] - E[x]^2)
xm, xs

In [None]:
# slice-index window spanning two standard deviations either side of the profile mean
xm - 2*xs, xm + 2*xs

### Designing plate directory schema

In [None]:
# plate directory naming scheme
'{master_line}-{plate_design_id}-{plate_instance}-R{imaging_round}'

# plate1, first electroporation, first time imaged
'mNG-P0001-01-R01'

# thawed plate imaged again has a different imaging round
'mNG-P0001-01-R02'

# a re-sorted plate or a second electroporation, first time imaged
'mNG-P0001-02-R01'

# clones from plate1
'mNG-P0001-01-CLONES-R01'

In [None]:
# Raw stack (as a row in the metadata dataframe)
'mNG96wp19, ML0137_20190528, mNG96wp19_sortday1, A9_1_BAG6.ome.tif'

# Rename stack `{exp_id}_{plate_id}-{well_id}-{fov_num}-{target_name}.tif`
'ML0137-20190528_mNG-P0019-01-R01_A09-01-BAG6.tif'

# Make raw (uint16) projections (for FOV scoring)
'ML0137-20190528_P0019-A09-01-BAG6_DAPI-PROJX.tif'
'ML0137-20190528_P0019-A09-01-BAG6_DAPI-PROJY.tif'
'ML0137-20190528_P0019-A09-01-BAG6_DAPI-PROJZ.tif'
'ML0137-20190528_P0019-A09-01-BAG6_GFP-PROJX.tif'
'ML0137-20190528_P0019-A09-01-BAG6_GFP-PROJY.tif'
'ML0137-20190528_P0019-A09-01-BAG6_GFP-PROJZ.tif'

# Make autoscaled (uint8) projections (for convenient manual inspection)
# need to pick a percentile threshold ('AS99' stands for `autoscale(im, percentile=99)`)
'ML0137-20190528_P0019-A09-01-BAG6_GFP-PROJX-AS99-UINT8.tif'
'...'


# --- FOV scoring and selection --- #
# (using a trained regression model)

# crop in z around the cell layer
'ML0137-20190528_P0019-A09-01-BAG6_CROPZ.tif'

# Make 2x-downsampled and autoscaled (uint8) stacks 
# (for convenient manual inspection; these will be 8x smaller) 
'ML0137-20190528_P0019-A09-01-BAG6_CROPZ-2XDS-AS99-UINT8.tif'


# --- 600x600 ROI scoring and selection (for the website) --- #
# (score 'all' possible ROIs created from the z-projections)
# (requires somehow training an ROI regression model)

# Crop and autoscale the best-scoring ROIs 
'ML0137-20190528_P0019-A09-01-BAG6_CROPZ-CROPXY-01-AS99-UINT8.tif'
'ML0137-20190528_P0019-A09-01-BAG6_CROPZ-CROPXY-02-AS99-UINT8.tif'
'...'

# Create tiled PNGs from the cropped stacks
'ML0137-20190528_P0019-A09-01-BAG6_CROPZ-CROPXY-01-AS99-UINT8-DAPI-TILE.png'
'ML0137-20190528_P0019-A09-01-BAG6_CROPZ-CROPXY-01-AS99-UINT8-GFP-TILE.png'

In [None]:
# directory structure
# organize stacks into plate and experiment subdirectories to make moving/copying the data easier
# do not organize projections into subdirectories - to make visual inspection easier

# PlateMicroscopy
# |
# |--raw-stacks
# |----mNG-P0001-01
# |------ML0001-20190101
# |
# |--processed-stacks
# |----mNG-P0001-01
# |------ML0001-20190101
# |
# |--projections
# |----raw-z-projections
# |----raw-xy-projections
# |----uint8-z-projections
# |----uint8-xy-projections

# |------ML0001-20190101

In [None]:
# Future considerations - what could change and what other dimensions we should anticipate
#
# - thawing and imaging the same plate multiple times -> a plate directory for each time - 'P0001-01', 'P0001-02'
# - electroporating the same plate design/instance multiple times -> 
# - electroporating the same plate with different parental cell lines -> a 
# - 