In [None]:
import os
import re
import sys
import glob
import json
import shutil
import pickle
import hashlib
import skimage
import datetime
import tifffile
import numpy as np
import pandas as pd

import imageio
import dask
import dask.diagnostics

from matplotlib import pyplot as plt

sys.path.append('..')
from pipeline_process.imaging import image, plate_microscopy_api, utils, viz
from pipeline_process.cli import imaging_tasks

sys.path.append('/Users/keith.cheveralls/projects/dragonfly-automation/')
import dragonfly_automation.utils
from dragonfly_automation.fov_models import PipelineFOVScorer

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# root of the ESS 'PlateMicroscopy' directory
ess_root = '/Volumes/ml_group/PlateMicroscopy/'

# root of the 'oc-plate-microscopy' directory (processing outputs are read from it below)
dst_root = '/Volumes/ml_group/oc-plate-microscopy/'

# sanity check that both directories are reachable; only the last expression of a cell
# is displayed, so the two checks are returned together as one tuple
os.path.isdir(ess_root), os.path.isdir(dst_root)

### Instance of a PlateMicroscopy API

In [None]:
api = plate_microscopy_api.PlateMicroscopyAPI(ess_root, '../plate-microscopy-cache/20191114-ess/')
len(api.os_walk), api.md.shape[0], api.md.is_raw.sum(), api.md_raw.shape[0]

In [None]:
api.md_raw

### Parsing the metadata text files

As far as I can tell, there's nothing in these text files (which are actually JSON files) that's not also in the IJMetadata and MicroManagerMetadata TIFF tags.

In [None]:
# the path of the metadata text file is derived from the raw TIFF's path by swapping the suffix
# (this cell previously indexed an undefined `d_raw`; `api.md_raw` is the raw-TIFF
# metadata table that the rest of this notebook indexes)
metadata_filepath = api.src_filepath(api.md_raw.iloc[0]).replace('.ome.tif', '_metadata.txt')
with open(metadata_filepath, 'r') as file:
    d = json.load(file)

In [None]:
sorted([(key, val) for key, val in d['FrameKey-0--1-0'].items()])

### Parsing raw TIFF metadata


In [None]:
api.dst_plate_dir(api.md_raw.iloc[0].plate_dir)

In [None]:
# test dst path generation
api.dst_filepath(api.md_raw.iloc[1], kind='metadata')

In [None]:
# test path aggregation
paths = api.aggregate_filepaths('')

In [None]:
# test parsing a raw file
t = api.process_raw_tiff(api.md_raw.iloc[-1], src_root=ess_root, dst_root='/Users/keith.cheveralls/image-data/oc-pm-test')

In [None]:
t.parse_micromanager_metadata()
t.validate_micromanager_metadata()

In [None]:
t.split_channels()

### Observing status of processing on `cap`

In [None]:
dst_root = '/Volumes/ml_group/oc-plate-microscopy/'
os.path.isdir(dst_root)

In [None]:
# unique destination plate directory names over all rows of the raw metadata, sorted
# NOTE(review): another cell in this notebook calls api.dst_plate_dir with `row.plate_dir`
# rather than with the row itself - confirm which form the API expects
dst_plate_dirs = sorted({api.dst_plate_dir(row) for _, row in api.md_raw.iterrows()})

In [None]:
# number of raw-TIFF rows per (plate_num, imaging_round_num) in the API metadata
counts = api.md_raw.groupby(['plate_num', 'imaging_round_num']).count().sort_values(by='plate_dir', ascending=True).filename

# for each destination plate directory, print:
#   <plate dir>  <# z-projections found>  <# raw TIFFs in the metadata>  (<# '*-events.csv' files>)
total = 0
for dst_plate_dir in dst_plate_dirs:
    # plate and imaging-round numbers are parsed from the directory name
    # (second and last '-'-separated fields, each with a one-letter prefix stripped)
    plate_num = int(dst_plate_dir.split('-')[1][1:])
    imaging_num = int(dst_plate_dir.split('-')[-1][1:])

    # reset both counters for every plate, so that a missing directory is reported as 0
    # instead of raising a NameError or re-using the previous plate's count
    n_err = 0
    n_proj = 0

    path = os.path.join(dst_root, 'metadata', dst_plate_dir)
    if os.path.isdir(path):
        n_err = len(glob.glob(os.path.join(path, '*-events.csv')))

    path = os.path.join(dst_root, 'projections', 'dapi', 'z', dst_plate_dir)
    if os.path.isdir(path):
        n_proj = len(glob.glob(os.path.join(path, '*PROJ-Z.tif')))
        total += n_proj

    print(f'{dst_plate_dir:<20} {n_proj:<4} {counts.loc[(plate_num, imaging_num)]:<6} ({n_err})')
print('Total: %s' % total)

### Summary of metadata issues

__Loading tiffs with `tifffile.TiffFile`__<br>
The stand-alone tifffile package (v0.15.1) works to load all raw TIFFs. There are 14754 stacks in 'v1' metadata format and 5243 in 'v2' format.


__Inconsistent number of slices per channel__
- 'P0014_ML0118_E2_1_RPS6KA4_events' page 50 - last page missing tags and the GFP channel is missing completely
- 'P0014_ML0120_H5_12_VRK3_events' page 76 - last page is missing tags and half of the GFP channel is missing
- 'P0018_ML0132_F4_4_GOLT1B_events' page 192 - last page missing tags and uneven number of slices in DAPI and GFP
 
__Inconsistent exposure times__<br>
There are three TIFFs with inconsistent exposure times: 'G5_22_TRIM24.ome.tif', 'G12_13_ANLN.ome.tif', 'F9_9_JAK1.ome.tif'. In all cases, the exposure time from the GFP seems to have been prematurely assigned to some of the DAPI slices. TODO: determine whether this is true for the metadata or the actual acquisitions.  


__Other issues__<br>
- some raw TIFFs have a negative DAPI channel index (indices are -1 and 0 for DAPI and GFP)
- some raw TIFFs have an extra page with no metadata or data
- some raw TIFFs may have extra pages at the end, possibly with valid metadata (according to Nathan)

__Missing metadata in disentangled stacks for Plate16,17,18__<br>
The raw TIFFs from Plates 16,17,18 that were disentangled from 'giant' stacks in the `_compressed` subdirectories using Nathan's `stackDisentangle.py` script all have invalid MM metadata tags. Due to a bug, the MM metadata tag from the first page of the disentangled stack appears on every page. Retrieving the true MM metadata for each page will require re-disentangling the stacks. 

### Load all parsing events and the parsed metadata

In [None]:
ev = pd.read_csv(os.path.join(dst_root, 'aggregated-processing-events.csv'))

In [None]:
# hide the events whose message mentions 'IJMetadata' or 'Inconsistent values'
is_relevant = ev.message.apply(
    lambda message: not ('IJMetadata' in message or 'Inconsistent values' in message))
ev = ev.loc[is_relevant]

# count of each remaining message per plate directory, with the first row as an example
ev.groupby(['plate_dir', 'message']).agg(['count', 'first'])

In [None]:
df = pd.read_csv('%s/aggregated-raw-tiff-metadata.csv' % dst_root)

In [None]:
_ = plt.hist(df.gfp_exposure_time, bins=np.arange(0, 400, 1))

In [None]:
counts, edges = np.histogram(df.gfp_max_intensity, bins=np.arange(0, 65535, 100))
plt.plot(edges[1:], (counts + 1))

In [None]:
plt.scatter(df.gfp_exposure_time, df.gfp_max_intensity, alpha=.1)

In [None]:
(df.gfp_max_intensity==65535).sum()

In [None]:
df.gfp_exposure_time.isna().sum()

### Predict scores for all raw FOVs

This is after calculating features from the z-projections of all raw FOVs using the `imaging_tasks` CLI on `cap`. The command for this is copied below.

In [None]:
scorer = PipelineFOVScorer(mode='training', model_type='regression')
scorer.load('/Users/keith.cheveralls/projects/dragonfly-automation/models/2019-10-08/')
scorer.train()

In [None]:
all_md = pd.read_csv('/Volumes/ml_group/oc-plate-microscopy/aggregated-raw-tiff-metadata.csv')

# patch fov_id: the last '-'-separated field becomes 'S' + a two-digit zero-padded number
# ('ML0125-C11-4' -> 'ML0125-C11-S04')
split_ids = (fov_id.split('-') for fov_id in all_md.fov_id)
all_md['fov_id'] = [
    '-'.join(parts[:-1] + ['S%02d' % int(parts[-1])]) for parts in split_ids]
all_md.shape

In [None]:
len(set(all_md.fov_id))

In [None]:
all_features = pd.read_csv('/Volumes/ml_group/oc-plate-microscopy/aggregated-fov-features.csv')

# patch fov_id ('ML0125-C11-4' -> 'ML0125-C11-S04'):
# split on '-', rewrite the trailing number as 'S%02d', and re-join
patched_fov_ids = []
for fov_id in all_features.fov_id:
    fields = fov_id.split('-')
    patched_fov_ids.append('-'.join(fields[:-1] + ['S%02d' % int(fields[-1])]))
all_features['fov_id'] = patched_fov_ids
all_features.shape

In [None]:
# merge data and metadata on fov_id (inner join: only fov_ids present in both tables are kept);
# the overlapping 'filename' columns are renamed to say which table they came from
data = (
    pd.merge(api.md_raw, all_features, on='fov_id', how='inner')
    .rename(columns={'filename_x': 'src_filename', 'filename_y': 'dst_filename'}))
data.shape, api.md_raw.shape

In [None]:
data.fov_id

In [None]:
data.groupby('error').count().is_raw

In [None]:
# keep only the rows without an 'error' entry, then drop the (now all-NaN) column
data = data.loc[data.error.isna()].drop(labels='error', axis=1)
data.shape

In [None]:
# dropna does not treat +/-inf as missing, so map them to NaN first,
# then drop every FOV that lacks any of the features the scorer uses
data = (
    data
    .replace([np.inf, -np.inf], np.nan)
    .dropna(axis=0, how='any', subset=scorer.feature_order))
data.shape

In [None]:
# predicted scores for unsorted FOVs
# the feature matrix uses the scorer's own column order (scorer.feature_order);
# the predictions are stored alongside the features in a new 'yp' column
X = data[list(scorer.feature_order)].values
yp = scorer.model.predict(X)
data['yp'] = yp

In [None]:
def construct_filepath(filename, dst_root):
    """
    Build the path '<dst_root>/<plate_dir>/<filename>', where the plate directory name
    consists of the first four '-'-separated fields of the filename.
    """
    plate_dir_fields = filename.split('-', 4)[:4]
    return os.path.join(dst_root, '-'.join(plate_dir_fields), filename)

In [None]:
# local directory of DAPI z-projections; it gets its own name (instead of rebinding `dst_root`)
# so that the earlier cells that read from the ESS `dst_root` still work when re-run
projections_root = '/Users/keith.cheveralls/image-data/oc-plate-microscopy/projections/DAPI/PROJZ'

# path to the projection of each FOV, built from its destination filename
data['filepath'] = [
    construct_filepath(dst_filename, projections_root) for dst_filename in data.dst_filename]

In [None]:
@dask.delayed
def calc_hash(filepath, chunk_size=2**20):
    """
    Calculate the SHA-1 hex digest of a file's content (as a dask delayed task).

    filepath : path of the file to hash
    chunk_size : number of bytes read at a time; the file is streamed in chunks
        so that a large file is never held in memory in its entirety
        (the digest does not depend on the chunk size)
    """
    sha1 = hashlib.sha1()
    with open(filepath, 'rb') as file:
        # iter() with a sentinel stops at the first empty read, i.e. at end-of-file
        for chunk in iter(lambda: file.read(chunk_size), b''):
            sha1.update(chunk)
    return sha1.hexdigest()

In [None]:
# one delayed hash task per row of `data`, all evaluated together behind a progress bar
hashes = [calc_hash(filepath) for filepath in data.filepath]
with dask.diagnostics.ProgressBar():
    hashes = dask.compute(*hashes)

In [None]:
len(hashes), len(set(hashes))

In [None]:
# FOVs from a particular plate and imaging round, ordered from highest to lowest score
d = (
    data.loc[(data.plate_num==15) & (data.imaging_round_num==1)]
    .copy()
    .sort_values(by='yp', ascending=False))

d.shape, 25*25

In [None]:
# per-target summary: the first plate_num, the max score, and the number of scores;
# the grouping is done once and shared by all three columns
by_target = data.groupby('target_name')
target_summary = pd.DataFrame(data={
    'plate_num': by_target.plate_num.first(),
    'score': by_target.yp.max(),
    'count': by_target.yp.count()})
target_summary.to_csv('/Users/keith.cheveralls/image-data/oc-plate-microscopy-max-scores.csv')

In [None]:
plt.hist(data.groupby('target_name').yp.max().values, bins=20)

In [None]:
# sanity check - manually verify that the target_names for the selected plate 
# match the 'mNG11 HEK Library' google sheet
sorted(d.target_name.unique())

In [None]:
# sanity check - manually verify the ML experiment IDs are correct
sorted(d.exp_id.unique())

In [None]:
# boolean masks that split the selected FOVs by predicted score;
# all comparisons are strict, so a score of exactly -0.5 or 0.5 (or a NaN)
# belongs to none of the three masks and is therefore also excluded from `alll`
bad = (d.yp < -.5)
neutral = (d.yp > -.5) & (d.yp < .5)
good = d.yp > .5
alll = bad | neutral | good

# tile of the selected FOVs, each labeled with its score formatted to two decimals
# (swap `alll` for `bad`, `neutral` or `good` to tile only one class)
tile = viz.build_tile(
    d.loc[alll], 
    shape=(30, 30), 
    figsize=25, 
    offset=0,
    show_labels=True, 
    label_column='yp', 
    label_format='%0.2f')

In [None]:
# NOTE(review): the filename says 'plate7', but the selection cell earlier in this notebook
# sets `d` to plate_num 15 / imaging round 1 - confirm which plate this tile actually shows
tifffile.imsave('/Users/keith.cheveralls/image-data/all-raw-plate7-ordered-by-score-30x30.tif', tile)

In [None]:
# make tiles for all plates: one 30x30 tile per plate_id, FOVs ordered by descending score
for plate_id in data.plate_id.unique():
    print(plate_id)
    d = data.loc[data.plate_id==plate_id].copy().sort_values(by='yp', ascending=False)
    tile = viz.build_tile(d, shape=(30, 30), figsize=25, offset=0, plot=False)
    tile_filepath = '/Users/keith.cheveralls/image-data/FOV-tile-all-raw-plate%s-30x30.tif' % plate_id
    tifffile.imsave(tile_filepath, tile)

In [None]:
# (plate_num, imaging_round_num) pairs: round 1 of plates 1-20, then round 2 of plates 1-5
plate_ids = [(n, 1) for n in range(1, 21)] + [(n, 2) for n in range(1, 6)]

bin_width = 0.2
plot_neg = True
if plot_neg:
    bin_min = -1.0
    ymax = 4
else:
    bin_min = -.8
    ymax = 1.5

# explicit tick positions, so that the tick labels set below always line up with real ticks
# (labels set without fixed positions are assigned to whatever ticks the locator picks)
xticks = [-1, -.5, 0, .5, 1]
yticks = list(range(int(np.ceil(ymax))))

# one histogram of predicted scores per (plate, round), on a 5x5 grid
n = 0
fig, axs = plt.subplots(5, 5, figsize=(16, 12))
for rind, row in enumerate(axs):
    for cind, ax in enumerate(row):
        plate_num, r_num = plate_ids[n]
        n += 1
        # plate 20 is skipped, which leaves its panel empty
        if plate_num > 19:
            continue

        values = data.loc[(data.plate_num==plate_num) & (data.imaging_round_num==r_num)].yp.values
        ax.hist(values, bins=np.arange(bin_min, 1 + bin_width, bin_width), density=True)
        ax.set_title('Plate %s-%s (n = %d)' % (plate_num, r_num, len(values)))

        ax.set_ylim([0, ymax])
        ax.set_xticks(xticks)
        ax.set_yticks(yticks)

        # tick labels are shown only on the left column and the bottom row
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        if cind==0:
            ax.set_ylabel('Density')
            ax.set_yticklabels(yticks)
        if rind==len(axs)-1:
            ax.set_xlabel('Score')
            ax.set_xticklabels(xticks)

In [None]:
# fraction of FOVs per (plate, round) with a predicted score above 0.7 / below -0.7
pbad, pgood = [], []
for plate_num, r_num in plate_ids:
    in_plate_round = (data.plate_num==plate_num) & (data.imaging_round_num==r_num)
    values = data.loc[in_plate_round].yp.values
    n_fovs = len(values)
    pgood.append((values > .7).sum() / n_fovs)
    pbad.append((values < -.7).sum() / n_fovs)

In [None]:
fig, ax = plt.subplots(figsize=(12, 3))
width = 1/3

# paired bars per plate: 'good' shifted left and 'bad' shifted right of each tick
x = np.arange(len(plate_ids))
rects1, rects2 = (
    ax.bar(x + shift, np.array(fractions)*100, width, label=label)
    for shift, fractions, label in (
        (-width/2, pgood, 'Predicted good'),
        (width/2, pbad, 'Predicted bad')))

ax.set_ylabel('Percent')
ax.set_title('')
ax.set_xticks(x)
ax.set_xticklabels([str(plate_id) for plate_id in plate_ids])
ax.legend()

In [None]:
data.iloc[0]

In [None]:
# max score by target_name: fraction of targets per plate whose best FOV scores
# above 0.5 / below -0.5
pbad, pgood = [], []
for plate_id in plate_ids:
    plate_num, r_num = plate_id
    # NOTE(review): r_num is not used in the selection below, so the two imaging rounds of a
    # plate yield identical values - confirm whether the rounds should be pooled like this
    # only the 'yp' column is aggregated (rather than taking the max of every column)
    values = data.loc[data.plate_num==plate_num].groupby('target_name').yp.max().values
    pgood.append((values > .5).sum() / len(values))
    pbad.append((values < -.5).sum() / len(values))

In [None]:
fig, ax = plt.subplots(figsize=(12, 3))
width = 1/3

# bar positions: one tick per (plate, round), with the two bars centered around it
x = np.arange(len(plate_ids))
percent_good = np.array(pgood)*100
percent_bad = np.array(pbad)*100
rects1 = ax.bar(x - width/2, percent_good, width, label='Predicted good')
rects2 = ax.bar(x + width/2, percent_bad, width, label='Predicted bad')

ax.set_ylabel('Percent')
ax.set_title('')
ax.set_xticks(x)
tick_labels = ['%s-%s' % plate_id for plate_id in plate_ids]
ax.set_xticklabels(tick_labels)
ax.legend()

In [None]:
list(zip(plate_ids, pbad))

### Recapitulate Nathan's stack disentangling script

We see that TiffWriter.save does not actually save the MM metadata tag for each page; instead, it saves the MM tag from the first page with every subsequent page.

In [None]:
t = image.RawPipelineTIFF('/Users/keith.cheveralls/image-data/plate18-ex-compressed/MMStack_31.ome.tif')

In [None]:
t = image.RawPipelineTIFF('/Users/keith.cheveralls/image-data/plate17-ex-compressed/MMStack_0.ome.tif')

In [None]:
t.parse_micromanager_metadata()
t.validate_micromanager_metadata()

In [None]:
plt.plot(t.mm_metadata.slice_ind)

In [None]:
# number of leading pages to copy out of the entangled stack
n_pages = 222

# collect the image data and the MicroManagerMetadata tag of each page;
# the context manager closes the file handle once the pages have been read
new_pages = []
new_tags = []
with tifffile.TiffFile('/Users/keith.cheveralls/image-data/plate17-ex-compressed/MMStack_0.ome.tif') as entangled_tiff:
    for ind in range(n_pages):
        page = entangled_tiff.pages[ind]
        new_pages.append(page.asarray())

        # re-serialize the page's own MM metadata as an extra-tag tuple for TiffWriter
        mm_metadata = json.dumps(page.tags['MicroManagerMetadata'].value)
        mm_tag = ('MicroManagerMetadata', 's', 0, mm_metadata, False)
        new_tags.append(mm_tag)

In [None]:
# write each page together with its own MM metadata tag;
# the context manager closes the output file even if one of the saves raises
with tifffile.TiffWriter('/Users/keith.cheveralls/image-data/plate17-ex-compressed/test-disentangle-first-222-pages.tif') as t_out:
    for page, tag in zip(new_pages, new_tags):
        t_out.save(page, extratags=[tag], contiguous=False)

In [None]:
t = image.RawPipelineTIFF('/Users/keith.cheveralls/image-data/plate17-ex-compressed/test-disentangle-first-222-pages.tif')

In [None]:
t.parse_micromanager_metadata()
t.validate_micromanager_metadata()

### Refactoring Nathan's method to select in-focus stacks

Still in development.

In [None]:
# a raw stack
stack = tifffile.imread('/Users/keith.cheveralls/image-data/MMStack_601-E2-1.ome.tif')
dapi_stack = stack[:131, :, :]
stack.shape

In [None]:
dapi_stack.max(axis=1).shape

In [None]:
# a stack from nathan
stack = tifffile.imread('/Users/keith.cheveralls/image-data/A9_1_BAG6.ome.tif')
dapi_stack = stack[:131, :, :]
stack.shape

In [None]:
viz.imshow(dapi_stack[25, :, :])

In [None]:
viz.imshow(dapi_stack.max(axis=2))

In [None]:
# blur_vals = np.array([cv2.Laplacian(zslice, cv2.CV_64F).var() for zslice in dapi_stack])
sum_vals = np.array([zslice.mean() for zslice in dapi_stack]).astype(float)

In [None]:
# suppose one z-slice is underexposed by a factor of two
# sum_vals[30] = sum_vals[30]/2
plt.plot((sum_vals))

In [None]:
# check derivative for spikes due to isolated unexposed z-slices
np.abs(np.diff(sum_vals)).max()

In [None]:
# calculate the mean and variance of the intensity profile in z
# the profile is shifted so that its minimum is zero and scaled to sum to one (both in place),
# so that it can be used as a set of weights over the z-slice index
sum_vals -= sum_vals.min()
sum_vals /= sum_vals.sum()
x = np.arange(len(sum_vals))
# first moment: the intensity-weighted mean slice index
xm = (x * sum_vals).sum()
# second raw moment
xv = (x * x * sum_vals).sum()
# standard deviation of the profile, from E[x^2] - E[x]^2
xs = np.sqrt(xv - xm**2)
xm, xs

In [None]:
xm - 2*xs, xm + 2*xs

### Nathan's QC CSVs

In [None]:
def load_qc_flags(qc_dir):
    """
    Load all of the header-less QC CSVs in a directory into one dataframe
    with the columns 'target' and 'flag'.

    The files are read in sorted order so that the row order is reproducible.
    If the directory contains no CSVs (or does not exist), an empty dataframe
    with the same two columns is returned.
    """
    columns = ['target', 'flag']
    filepaths = sorted(glob.glob(os.path.join(qc_dir, '*.csv')))
    if not filepaths:
        return pd.DataFrame(columns=columns)
    return pd.concat([
        pd.read_csv(filepath, header=None, names=columns) for filepath in filepaths])


qc14 = load_qc_flags('/Users/keith.cheveralls/Box/AutomatedImageQC/QCFiles/Plate14')
qc15 = load_qc_flags('/Users/keith.cheveralls/Box/AutomatedImageQC/QCFiles/Plate15')

In [None]:
qc14.loc[qc14.flag=='GoodStack'].shape, qc15.loc[qc15.flag=='GoodStack'].shape

In [None]:
qc14.shape, qc15.shape