In [None]:
import os
import re
import sys
import glob
import json
import dask
import shutil
import pickle
import hashlib
import skimage
import datetime
import tifffile
import numpy as np
import pandas as pd

import dask.diagnostics
from matplotlib import pyplot as plt

sys.path.append('..')
from pipeline_process.imaging import image, plate_microscopy_api, utils, viz

In [None]:
# enable IPython's autoreload extension in mode 2, which reloads modules
# before each cell executes so that edits to project code are picked up
%load_ext autoreload
%autoreload 2

In [None]:
# ESS 'PlateMicroscopy' directory
# (hard-coded mount point; the last line displays whether it currently exists as a directory)
ess_root = '/Volumes/ml_group/PlateMicroscopy/'
os.path.isdir(ess_root)

In [None]:
# times to hash
# with dask on ess: 1200 rows in 30 seconds
# without dask on ess: 50 rows in 30 seconds

### Instance of a PlateMicroscopy API

In [None]:
# construct the API from the ESS root and a second path argument
# (presumably a local cache directory, judging by its name - TODO confirm)
api = plate_microscopy_api.PlateMicroscopyAPI(ess_root, '../plate-microscopy-cache/20191025-ess/')
# display: length of api.os_walk, shape of api.md, sum of api.md.is_raw, shape of api.md_raw
len(api.os_walk), api.md.shape, api.md.is_raw.sum(), api.md_raw.shape

### Parsing the metadata text files

As far as I can tell, there's nothing in these text files (which are actually JSON files) that's not also in the IJMetadata and MicroManagerMetadata TIFF tags.

In [None]:
# Load the '_metadata.txt' file (JSON content) that sits next to the first raw TIFF.
# The row is taken from `api.md_raw`, as in the other cells of this notebook;
# the previous name `d_raw` was never defined here and only worked via leftover kernel state.
raw_row = api.md_raw.iloc[0]
metadata_filepath = api.src_filepath(raw_row).replace('.ome.tif', '_metadata.txt')
with open(metadata_filepath, 'r') as file:
    d = json.load(file)

In [None]:
# (key, value) pairs of the first frame's metadata entry, ordered by key
sorted(d['FrameKey-0--1-0'].items())

### Parsing raw TIFF metadata


In [None]:
# test parsing a raw file
# (uses the first row of api.md_raw; dst_root is a hard-coded path under the author's home directory)
api.parse_raw_tiff_metadata(api.md_raw.iloc[0], src_root=ess_root, dst_root='/Users/keith.cheveralls/image-data/oc-pm-test')

In [None]:
# open a raw stack from a hard-coded local path and call parse_micromanager_metadata on it
# NOTE: this rebinds `d`, which the metadata-text-file cells above use for the loaded JSON dict
d = image.RawPipelineImage('/Users/keith.cheveralls/image-data/H1_1_RABGGTB.ome.tif')
d.parse_micromanager_metadata()

In [None]:
# from plate1
# (same steps as the previous cell, on a different hard-coded local file)
d = image.RawPipelineImage('/Users/keith.cheveralls/image-data/A1_1_ATL2.ome.tif')
d.parse_micromanager_metadata()

In [None]:
# from plate1 thawed
d = image.RawPipelineImage('/Users/keith.cheveralls/image-data/E7_9_RAB14.ome.tif')
d.parse_micromanager_metadata()
# NOTE(review): a later cell calls `validate_micromanager_metadata()` on the same class;
# confirm which of the two method names currently exists
d.validate_mm_metadata()

In [None]:
# problematic file from plate14 with an extra and tag-less page
# (loaded from a hard-coded local path)
d = image.RawPipelineImage('/Users/keith.cheveralls/image-data/A1_1_CTRL1.ome.tif')
d.parse_micromanager_metadata()

In [None]:
# validate the metadata parsed in the previous cell
# NOTE(review): a later cell calls `validate_micromanager_metadata()` on the same class;
# confirm which of the two method names currently exists
d.validate_mm_metadata()

### Observing status of processing on `cap`

In [None]:
# root directory whose per-plate subdirectories are inspected below for '*.json' and '*_events.csv' files
# (hard-coded mount point; the last line displays whether it currently exists as a directory)
projections_root = '/Volumes/ml_group/PlateMicroscopy-metadata-2019-10-30/'
os.path.isdir(projections_root)

In [None]:
# number of raw files per plate directory, ordered by plate directory
counts = (
    api.md_raw
    .groupby('plate_dir')
    .count()
    .sort_values(by='plate_dir', ascending=True)
    .filename
)

# for each plate that has an output directory, print
# '<plate_dir> <n json files>/<n raw files> (<n events files>)'
total = 0
for plate_dir in counts.index:
    plate_path = os.path.join(projections_root, plate_dir)
    if not os.path.isdir(plate_path):
        # plates without an output directory are not listed
        continue
    n = len(glob.glob(os.path.join(plate_path, '*.json')))
    n_err = len(glob.glob(os.path.join(plate_path, '*_events.csv')))
    total += n
    print(f'{plate_dir:<20}{n}/{counts.loc[plate_dir]} ({n_err})')
print('Total: %s' % total)

### Observations of metadata consistencies and anomalies

__Loading tiffs with `tifffile.TiffFile`__<br>
The stand-alone tifffile package works to load all raw TIFFs. There are 14754 stacks in 'v1' metadata format and 5243 in 'v2' format.


__Inconsistent number of slices per channel__
- 'P0014_ML0118_E2_1_RPS6KA4_events' page 50 - last page missing tags and the GFP channel is missing completely
- 'P0014_ML0120_H5_12_VRK3_events' page 76 - last page is missing tags and half of the GFP channel is missing
- 'P0018_ML0132_F4_4_GOLT1B_events' page 192 - last page missing tags and uneven number of slices in DAPI and GFP
 
__Inconsistent exposure times__<br>
There are three TIFFs with inconsistent exposure times: 'G5_22_TRIM24.ome.tif', 'G12_13_ANLN.ome.tif', 'F9_9_JAK1.ome.tif'. In all cases, the exposure time from the GFP channel seems to have been prematurely assigned to some of the DAPI slices. TODO: determine whether this is true only of the metadata or also of the actual acquisitions.


__Other issues__<br>
- some raw TIFFs have a negative DAPI channel index (indices are -1 and 0 for DAPI and GFP)
- some raw TIFFs have extra pages at the beginning with no metadata
- some raw TIFFs may have extra pages at the end, possibly with valid metadata (according to Nathan)

__disentangled stacks for Plate16,17,18__<br>
At least one raw TIFF from Plate18 that was disentangled has only one channel_ind in the MMmetadata and a constant (rather than contiguously incrementing) slice_ind. It turns out that the metadata in the 'entangled' (truly raw) stacks is also missing; the channel index is always -1 and the other columns always correspond to the 405 channel. However, at least some raw (entangled) stacks seem to have valid slice_inds.


### Load all metadata parsing events

In [None]:
# Collect the per-file '*_events.csv' logs written under each plate directory of
# `projections_root` (the same layout that the processing-status cell globs).
# `paths` is defined here so that this cell runs on a fresh kernel.
paths = sorted(glob.glob(os.path.join(projections_root, '*', '*_events.csv')))

# Read each events file and tag its rows with the file name and the plate directory,
# both taken from the last two components of the path.
dfs = []
for path in paths:
    df = pd.read_csv(path)
    df['filename'] = os.path.basename(path)
    df['plate_dir'] = os.path.basename(os.path.dirname(path))
    dfs.append(df)

In [None]:
# stack the per-file event tables row-wise into one table and display its shape
# (the original per-file row indices are kept, so index values repeat across files)
ev = pd.concat(dfs, axis=0)
ev.shape

In [None]:
# save all events to a CSV in the current working directory
# (the index is written as the first column, which is the to_csv default)
ev.to_csv('2019-10-31-raw-metadata-parsing-events.csv')

In [None]:
# Drop events whose message mentions 'IJMetadata', then count the remaining events
# per (plate_dir, message).
# The vectorized string test with na=False keeps rows whose message is missing,
# where a Python-level `'IJMetadata' not in m` would raise a TypeError on NaN.
is_ijmetadata_event = ev.message.str.contains('IJMetadata', regex=False, na=False)
ev = ev.loc[~is_ijmetadata_event]
ev.groupby(['plate_dir', 'message']).count()

In [None]:
# debugging: pick one raw TIFF by filename and wrap it for parsing
is_target = api.md_raw.filename == 'A7_1_SRP72.ome.tif'
target_row = api.md_raw.loc[is_target].iloc[0]
src_path = api.src_filepath(target_row)
t = image.RawPipelineImage(src_path)

In [None]:
# parse, then validate, the metadata of the example TIFF opened in the previous cell
t.parse_micromanager_metadata()
# NOTE(review): earlier cells call `validate_mm_metadata()` on the same class;
# confirm which of the two method names currently exists
t.validate_micromanager_metadata()

In [None]:
# test path aggregation
# NOTE(review): this rebinds `paths`, the same name that the events-loading cell
# above iterates over; the meaning of the empty-string argument is not visible here
paths = api.aggregate_filepaths('')

### Refactoring Nathan's method to select in-focus stacks

Still in development.

In [None]:
# a raw stack
# (read from a hard-coded local path)
stack = tifffile.imread('/Users/keith.cheveralls/image-data/MMStack_601-E2-1.ome.tif')
# keep the first 131 entries along axis 0
# (presumably the DAPI z-slices - TODO confirm where 131 comes from)
dapi_stack = stack[:131, :, :]
stack.shape

In [None]:
# shape of the maximum projection of dapi_stack along axis 1
dapi_stack.max(axis=1).shape

In [None]:
# a stack from nathan
# (read from a hard-coded local path; rebinds `stack` and `dapi_stack` from the raw-stack cell above)
stack = tifffile.imread('/Users/keith.cheveralls/image-data/A9_1_BAG6.ome.tif')
# keep the first 131 entries along axis 0
# (presumably the DAPI z-slices - TODO confirm where 131 comes from)
dapi_stack = stack[:131, :, :]
stack.shape

In [None]:
# show the entry at index 25 along axis 0 of dapi_stack
viz.imshow(dapi_stack[25, :, :])

In [None]:
# show the maximum projection of dapi_stack along axis 2
viz.imshow(dapi_stack.max(axis=2))

In [None]:
# (disabled alternative: per-slice variance of the Laplacian; cv2 is not among the imports at the top)
# blur_vals = np.array([cv2.Laplacian(zslice, cv2.CV_64F).var() for zslice in dapi_stack])
# despite its name, `sum_vals` holds the mean of each entry along axis 0 of dapi_stack, as floats
sum_vals = np.array([zslice.mean() for zslice in dapi_stack]).astype(float)

In [None]:
# optional experiment: pretend one z-slice is underexposed by a factor of two
# sum_vals[30] = sum_vals[30]/2
# plot the per-slice values against slice index
plt.plot(sum_vals)

In [None]:
# check derivative for spikes due to isolated unexposed z-slices
# (largest absolute difference between consecutive entries of sum_vals)
np.abs(np.diff(sum_vals)).max()

In [None]:
# calculate the mean and standard deviation (in slice-index units) of the intensity profile in z
# The normalization is done on a copy instead of modifying `sum_vals` in place, so that
# re-running this cell, or the plotting/derivative cells that read `sum_vals`,
# always starts from the raw per-slice values.
z_weights = sum_vals - sum_vals.min()       # subtract the baseline so the minimum gets zero weight
z_weights = z_weights / z_weights.sum()     # normalize so that the weights sum to one
x = np.arange(len(z_weights))               # slice indices
xm = (x * z_weights).sum()                  # first moment: weighted mean slice index
xv = (x * x * z_weights).sum()              # second raw moment (not yet the variance)
xs = np.sqrt(xv - xm**2)                    # standard deviation
xm, xs

In [None]:
# interval extending two standard deviations (xs) either side of the mean slice index (xm)
xm - 2*xs, xm + 2*xs