In [None]:
import os
import re
import sys
import glob
import json
import shutil
import pickle
import hashlib
import skimage
import datetime
import tifffile
import numpy as np
import pandas as pd

import dask
import dask.diagnostics

from matplotlib import pyplot as plt

sys.path.append('..')
from pipeline_process.imaging import image, plate_microscopy_api, utils, viz
from pipeline_process.cli import imaging_tasks

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# ESS 'PlateMicroscopy' directory: root path handed to the PlateMicroscopy API and
# used as `src_root` when processing raw TIFFs further down.
ess_root = '/Volumes/ml_group/PlateMicroscopy/'
# Sanity check: displays False when this path is not an existing directory.
os.path.isdir(ess_root)

In [None]:
# times to hash
# with dask on ess: 1200 rows in 30 seconds
# without dask on ess: 50 rows in 30 seconds

### Instance of a PlateMicroscopy API

In [None]:
# Instantiate the API on the ESS root.
# NOTE(review): the second argument looks like a cache directory - confirm against
# the PlateMicroscopyAPI signature.
api = plate_microscopy_api.PlateMicroscopyAPI(
    ess_root,
    '../plate-microscopy-cache/20191025-ess/',
)

# Quick size check: os_walk entries, metadata rows, rows flagged raw, rows in md_raw.
(
    len(api.os_walk),
    api.md.shape[0],
    api.md.is_raw.sum(),
    api.md_raw.shape[0],
)

### Parsing the metadata text files

As far as I can tell, there's nothing in these text files (which are actually JSON files) that's not also in the IJMetadata and MicroManagerMetadata TIFF tags.

In [None]:
# Load the '_metadata.txt' file (JSON content) that sits next to the first raw TIFF.
# The row is taken from `api.md_raw`: the name `d_raw` used here previously is never
# defined in this notebook, so the cell failed with a NameError on a fresh kernel.
metadata_filepath = api.src_filepath(api.md_raw.iloc[0]).replace('.ome.tif', '_metadata.txt')
with open(metadata_filepath, 'r') as file:
    d = json.load(file)

In [None]:
# (key, value) pairs of one frame's metadata entry, in sorted order for easier reading.
sorted(d['FrameKey-0--1-0'].items())

### Parsing raw TIFF metadata


In [None]:
# Destination plate directory derived from the first raw row's `plate_dir`
# (the status loop further down joins this value into paths under `dst_root`).
api.dst_plate_dir(api.md_raw.iloc[0].plate_dir)

In [None]:
# test dst path generation: destination path of kind 'metadata' for the second raw row
api.dst_filepath(api.md_raw.iloc[1], kind='metadata')

In [None]:
# test path aggregation
# NOTE(review): the meaning of the empty-string argument is not visible here -
# confirm against `PlateMicroscopyAPI.aggregate_filepaths`.
paths = api.aggregate_filepaths('')

In [None]:
# Smoke test: process the last raw TIFF row, with the ESS root as `src_root`
# and a local directory as `dst_root`.
t = api.process_raw_tiff(
    api.md_raw.iloc[-1],
    src_root=ess_root,
    dst_root='/Users/keith.cheveralls/image-data/oc-pm-test',
)

In [None]:
# Parse the MicroManager metadata of `t`, then run the validation step on it.
t.parse_micromanager_metadata()
t.validate_micromanager_metadata()

In [None]:
# Run the channel-splitting step on the same TIFF object.
t.split_channels()

### Observing status of processing on `cap`

In [None]:
# Root directory that the status and aggregation cells below read from
# ('metadata' and 'projections' subdirectories, aggregated CSV files).
dst_root = '/Volumes/ml_group/oc-plate-microscopy/'
# Sanity check: displays False when this path is not an existing directory.
os.path.isdir(dst_root)

In [None]:
# Per-plate processing status. For every raw plate directory, count the metadata
# JSON files, '*-events.csv' files and DAPI z-projections found under `dst_root`,
# and print them next to the number of raw rows listed in `api.md_raw`.
counts = api.md_raw.groupby('plate_dir').count().sort_values(by='plate_dir', ascending=True).filename

total = 0
for plate_dir in counts.index:
    # Reset every counter on each iteration. `n_err` used to be assigned only when
    # the metadata directory existed, so the print below either raised a NameError
    # (first plate without a metadata directory) or silently reported the value
    # left over from the previous plate.
    n_md = 0
    n_err = 0
    n_proj = 0
    dst_plate_dir = api.dst_plate_dir(plate_dir)

    path = os.path.join(dst_root, 'metadata', dst_plate_dir)
    if os.path.isdir(path):
        n_md = len(glob.glob(os.path.join(path, '*.json')))
        n_err = len(glob.glob(os.path.join(path, '*-events.csv')))
        total += n_md

    path = os.path.join(dst_root, 'projections', 'DAPI', 'PROJZ', dst_plate_dir)
    if os.path.isdir(path):
        n_proj = len(glob.glob(os.path.join(path, '*PROJZ.tif')))

    # columns: plate dir, metadata JSON files, DAPI projections, raw rows, (event files)
    print(f'{dst_plate_dir:<20}{n_md:<4} {n_proj:<4} {counts.loc[plate_dir]:<6} ({n_err})')
print('Total: %s' % total)

### Observations of metadata consistencies and anomalies

__Loading tiffs with `tifffile.TiffFile`__<br>
The stand-alone tifffile package (v0.15.1) works to load all raw TIFFs. There are 14754 stacks in 'v1' metadata format and 5243 in 'v2' format.


__Inconsistent number of slices per channel__
- 'P0014_ML0118_E2_1_RPS6KA4_events' page 50 - last page missing tags and the GFP channel is missing completely
- 'P0014_ML0120_H5_12_VRK3_events' page 76 - last page is missing tags and half of the GFP channel is missing
- 'P0018_ML0132_F4_4_GOLT1B_events' page 192 - last page missing tags and uneven number of slices in DAPI and GFP
 
__Inconsistent exposure times__<br>
There are three TIFFs with inconsistent exposure times: 'G5_22_TRIM24.ome.tif', 'G12_13_ANLN.ome.tif', 'F9_9_JAK1.ome.tif'. In all cases, the exposure time from the GFP channel seems to have been prematurely assigned to some of the DAPI slices. TODO: determine whether this is true for the metadata or the actual acquisitions.


__Other issues__<br>
- some raw TIFFs have a negative DAPI channel index (indices are -1 and 0 for DAPI and GFP)
- some raw TIFFs have an extra page with no metadata or data
- some raw TIFFs may have extra pages at the end, possibly with valid metadata (according to Nathan)

__Missing metadata in disentangled stacks for Plate16,17,18__<br>
The raw TIFFs from Plates 16,17,18 that were disentangled from 'giant' stacks in the `_compressed` subdirectories using Nathan's `stackDisentangle.py` script all have invalid MM metadata tags. Due to a bug, the MM metadata tag from the first page of the disentangled stack appears on every page. Retrieving the true MM metadata for each page will require re-disentangling the stacks. 

### Load all metadata parsing events

In [None]:
# Load the aggregated processing-events table from the destination root.
ev = pd.read_csv(os.path.join(dst_root, 'aggregated-processing-events.csv'))

In [None]:
# Drop events whose message mentions 'IJMetadata' or 'Inconsistent values'.
# Vectorised literal substring matching replaces the per-row lambdas; `na=False`
# keeps rows with a missing message instead of raising TypeError on NaN.
ev = ev.loc[
    ~ev.message.str.contains('IJMetadata', regex=False, na=False)
    & ~ev.message.str.contains('Inconsistent values', regex=False, na=False)
]

# For each (plate, message) pair: number of occurrences and the first value of
# every remaining column.
ev.groupby(['plate_dir', 'message']).agg(['count', 'first'])

In [None]:
# Load the aggregated raw-TIFF metadata table. The path is built with os.path.join,
# as for the events table, instead of '%s/...' formatting: `dst_root` already ends
# with '/', so the formatted path contained a doubled slash.
df = pd.read_csv(os.path.join(dst_root, 'aggregated-raw-tiff-metadata.csv'))

In [None]:
# Histogram of GFP exposure times with unit-width bins (bin edges 0, 1, ..., 399).
# The return value is bound to `_` only to keep it out of the cell output.
_ = plt.hist(df.gfp_exposure_time, bins=np.arange(0, 400, 1))

In [None]:
# Distribution of GFP max intensities in bins of width 100, drawn as a line at the
# right-hand bin edges. Every bin count is offset by one before plotting.
# NOTE(review): this rebinds `counts`, which the processing-status cell also uses.
counts, edges = np.histogram(
    df.gfp_max_intensity,
    bins=np.arange(0, 65535, 100),
)
plt.plot(edges[1:], counts + 1)

In [None]:
# GFP exposure time against GFP max intensity; the low alpha makes overlapping
# points show up as darker regions.
plt.scatter(df.gfp_exposure_time, df.gfp_max_intensity, alpha=.1)

In [None]:
# Number of rows whose GFP max intensity equals 65535 (the largest 16-bit unsigned
# value) - presumably saturated acquisitions; confirm the pixel dtype.
(df.gfp_max_intensity==65535).sum()

In [None]:
# Number of rows with a missing GFP exposure time.
df.gfp_exposure_time.isna().sum()

### Recapitulate Nathan's stack disentangling script

We see that TiffWriter.save does not actually save the MM metadata tag for each page; instead, it saves the MM tag from the first page with every subsequent page.

In [None]:
# Example stack from a local 'plate18-ex-compressed' directory.
# NOTE(review): the next cell rebinds `t` to a plate17 stack - run only one of the two.
t = image.RawPipelineTIFF('/Users/keith.cheveralls/image-data/plate18-ex-compressed/MMStack_31.ome.tif')

In [None]:
# Example stack from a local 'plate17-ex-compressed' directory (replaces any `t`
# bound by an earlier cell).
t = image.RawPipelineTIFF('/Users/keith.cheveralls/image-data/plate17-ex-compressed/MMStack_0.ome.tif')

In [None]:
# Parse the MicroManager metadata of the stack loaded into `t`, then validate it.
t.parse_micromanager_metadata()
t.validate_micromanager_metadata()

In [None]:
# Slice index from the parsed MM metadata, plotted in row order
# (assumes one row per TIFF page - TODO confirm).
plt.plot(t.mm_metadata.slice_ind)

In [None]:
# Re-create the read half of the disentangling step: collect the pixel data of the
# first 222 pages of an entangled stack, each paired with an extra-tag tuple built
# from that page's own MicroManagerMetadata value.
new_pages = []
new_tags = []

# Open the file in a `with` block so its handle is released even if reading a page
# fails; `entangled_tiff` is therefore closed once this cell has run.
with tifffile.TiffFile('/Users/keith.cheveralls/image-data/plate17-ex-compressed/MMStack_0.ome.tif') as entangled_tiff:
    for ind in range(222):
        page = entangled_tiff.pages[ind]
        new_pages.append(page.asarray())
        mm_metadata = json.dumps(page.tags['MicroManagerMetadata'].value)
        # extratags entry layout: (code, dtype, count, value, writeonce)
        mm_tag = ('MicroManagerMetadata', 's', 0, mm_metadata, False)
        new_tags.append(mm_tag)

In [None]:
# Write the collected pages to a new TIFF, attaching each page's own tag via
# `extratags`. The writer is used as a context manager so the output file is closed
# even when one of the `save` calls raises.
with tifffile.TiffWriter('/Users/keith.cheveralls/image-data/plate17-ex-compressed/test-disentangle-first-222-pages.tif') as t_out:
    for page, tag in zip(new_pages, new_tags):
        # contiguous=False, as in the original cell: each page is saved separately
        t_out.save(page, extratags=[tag], contiguous=False)

In [None]:
# Re-open the file written by the disentangling test so its metadata can be checked.
t = image.RawPipelineTIFF('/Users/keith.cheveralls/image-data/plate17-ex-compressed/test-disentangle-first-222-pages.tif')

In [None]:
# Parse and validate the MicroManager metadata of the re-written stack.
t.parse_micromanager_metadata()
t.validate_micromanager_metadata()

### Refactoring Nathan's method to select in-focus stacks

Still in development.

In [None]:
# a raw stack
stack = tifffile.imread('/Users/keith.cheveralls/image-data/MMStack_601-E2-1.ome.tif')
# Keep the first 131 entries along axis 0.
# NOTE(review): presumably these are the DAPI z-slices - confirm the channel layout
# and where the 131 comes from.
dapi_stack = stack[:131, :, :]
stack.shape

In [None]:
# Shape of the maximum projection along axis 1.
dapi_stack.max(axis=1).shape

In [None]:
# a stack from nathan (rebinds `stack` and `dapi_stack` from the raw-stack cell)
stack = tifffile.imread('/Users/keith.cheveralls/image-data/A9_1_BAG6.ome.tif')
# Keep the first 131 entries along axis 0.
# NOTE(review): presumably these are the DAPI z-slices - confirm the channel layout.
dapi_stack = stack[:131, :, :]
stack.shape

In [None]:
# Show the slice at index 25 along axis 0.
viz.imshow(dapi_stack[25, :, :])

In [None]:
# Show the maximum projection along the last axis (axis 2).
viz.imshow(dapi_stack.max(axis=2))

In [None]:
# Alternative focus measure kept for reference (cv2 is not imported in this notebook):
# blur_vals = np.array([cv2.Laplacian(zslice, cv2.CV_64F).var() for zslice in dapi_stack])

# Mean intensity of every slice along axis 0, as a float array.
# Despite the name, `sum_vals` holds means, not sums.
sum_vals = np.array([np.mean(zslice) for zslice in dapi_stack], dtype=float)

In [None]:
# Mean-intensity profile along z.
# To simulate one z-slice underexposed by a factor of two, uncomment:
# sum_vals[30] = sum_vals[30]/2
plt.plot(sum_vals)

In [None]:
# check derivative for spikes due to isolated unexposed z-slices:
# largest absolute difference between consecutive entries of the profile
np.abs(np.diff(sum_vals)).max()

In [None]:
# calculate the mean and variance of the intensity profile in z
# Shift the profile so its minimum is zero and scale it to unit sum. Both steps
# modify `sum_vals` in place, so cells that plot `sum_vals` show the normalised
# profile when run after this one.
sum_vals -= sum_vals.min()
sum_vals /= sum_vals.sum()
# slice indices 0 .. len-1, weighted by the normalised profile below
x = np.arange(len(sum_vals))
# weighted mean of the slice index
xm = (x * sum_vals).sum()
# weighted second moment of the slice index (not yet the variance)
xv = (x * x * sum_vals).sum()
# standard deviation: sqrt(second moment - mean**2)
xs = np.sqrt(xv - xm**2)
xm, xs

In [None]:
# Bounds two standard deviations below and above the weighted mean slice index.
xm - 2*xs, xm + 2*xs