In [None]:
import os
import re
import git
import sys
import glob
import json
import joblib
import sklearn
import skimage
import tifffile
import imageio
import numpy as np
import pandas as pd
import seaborn as sns

import dask
import dask.diagnostics

import sklearn.cluster
import sklearn.ensemble
import sklearn.model_selection

from scipy import ndimage
from skimage import feature
from skimage import morphology

import matplotlib
from matplotlib import pyplot as plt
from matplotlib import colors as mplcolors

In [None]:
def printr(s):
    """Overwrite the current console line with `s`.

    Writes a carriage return followed by the string form of `s` (no newline), so
    successive calls replace each other; used as a lightweight progress indicator.

    Parameters
    ----------
    s : any object; it is converted with str().
    """
    # str(s) instead of '%s' % s: the %-operator treats a tuple as the argument list,
    # so printr((1, 10)) used to raise a TypeError
    sys.stdout.write('\r' + str(s))
    # flush so the progress text shows up immediately even when stdout is buffered
    sys.stdout.flush()

In [None]:
# make the local checkouts of the two project repos importable
# NOTE(review): these absolute paths are specific to one machine; they must be edited
# (or the packages installed) before the notebook can run anywhere else
sys.path.append('/Users/keith.cheveralls/projects/opencell-process/')
from pipeline_process.imaging import utils, viz

sys.path.append('/Users/keith.cheveralls/projects/dragonfly-automation/')
import dragonfly_automation.utils
from dragonfly_automation.fov_models import PipelineFOVScorer

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# global plot styling: no forced patch edges, seaborn 'whitegrid' style with the grid turned off
plt.rcParams["patch.force_edgecolor"] = False
sns.set_style("whitegrid", {'axes.grid' : False})

### Directories of manually-sorted FOVs 

In [None]:
# directories of FOV images; the cells below look for *.tif files in
# 'good', 'bad', 'neutral' and 'unsorted' subdirectories of each one
# NOTE(review): absolute, machine-specific paths
training_dirpaths = [
    '/Users/keith.cheveralls/image-data/dragonfly-automation-tests/20190910-snaps-sorted/',
    '/Users/keith.cheveralls/image-data/dragonfly-automation-tests/20191003-snaps-sorted/',
    '/Users/keith.cheveralls/image-data/PlateMicroscopyProjections-uint8-all-DAPI-sorted/'
]

### Dataframes of labels from the subdirectory location of each FOV

These must be re-generated whenever the sorting has been updated or edited. 

In [None]:
# generate label dataframes: one row per .tif file, labeled by the subdirectory it sits in
# NOTE: the to_csv call is commented out, so running this cell only rebuilds `d` in memory
# (and `d` holds the labels of the last directory once the loop finishes)
for dirpath in training_dirpaths:

    dfs = []
    for label in ['good', 'bad', 'neutral', 'unsorted']:
        # os.path.join avoids the doubled separator that '%s/%s/*.tif' produced for
        # directory paths that already end in a slash; sorting makes the row order
        # reproducible, since glob returns files in arbitrary order
        filepaths = sorted(glob.glob(os.path.join(dirpath, label, '*.tif')))
        filenames = [os.path.basename(path) for path in filepaths]

        df = pd.DataFrame(
            data={
                'filepath': filepaths,
                'filename': filenames,
                'subdirectory': [label]*len(filepaths)
            }
        )
        dfs.append(df)

    # ignore_index yields a clean 0..n-1 index without a leftover 'index' column
    d = pd.concat(dfs, axis=0, ignore_index=True)[['filepath', 'filename', 'subdirectory']]
    # d.to_csv('%s/2019-10-07-labels.csv' % dirpath, index=False)

### Calculate features for all FOVs in each directory

In [None]:
# classifier instance to access feature calculation methods
# (only fc.process_existing_fov is used in the feature-calculation cells below)
fc = PipelineFOVScorer(mode='training')

In [None]:
# collect the .tif files of one training directory, across all four label subdirectories
# (this cell and the feature-calculation cell that follows were run by hand,
# once per entry of training_dirpaths)
dirpath = training_dirpaths[1]

filepaths = []
for label in ('good', 'bad', 'neutral', 'unsorted'):
    filepaths += glob.glob('%s/%s/*.tif' % (dirpath, label))

# number of FOVs found
len(filepaths)

In [None]:
# run the scorer's feature calculation on every file, one at a time
data = []
for ind, filepath in enumerate(filepaths):
    # progress indicator; ind is zero-based, so the last value shown is (n-1)/n
    printr('%s/%s' % (ind, len(filepaths)))
    data.append(fc.process_existing_fov(filepath))
# the results are only kept in memory unless the line below is re-enabled
# pd.DataFrame(data=data).to_csv('%s/2019-10-07-features.csv' % dirpath, index=False)

In [None]:
# try using dask instead (about 10 FOVs/sec instead of 3)
# builds one delayed task per file and evaluates them all at once;
# this overwrites the `data` list produced by the serial loop
tasks = [dask.delayed(fc.process_existing_fov)(filepath) for filepath in filepaths]
with dask.diagnostics.ProgressBar():
    data = dask.compute(*tasks)

### Calculate features for 'thawed' plates

This is just to predict scores and assess FOV quality.

In [None]:
# every entry directly under the thawed-plate root is treated as one plate directory
# NOTE(review): absolute, machine-specific path
root = '/Users/keith.cheveralls/image-data/PlateMicroscopyProjections-uint16-thawed/'
thawed_dirpaths = glob.glob('%s/*' % root)
(thawed_dirpaths)

In [None]:
# calculate features for the DAPI images of each thawed plate
for dirpath in thawed_dirpaths:
    print(dirpath)
    filepaths = glob.glob('%s/*.tif' % (dirpath))
    # keep only the files whose path contains '_DAPI_'
    filepaths = [name for name in filepaths if '_DAPI_' in name]
    data = []
    for filepath in filepaths:
        data.append(fc.process_existing_fov(filepath))
    # `data` is rebuilt for every directory; with the line below commented out,
    # only the last directory's features remain in memory after the loop
    # pd.DataFrame(data=data).to_csv('%s/2019-10-08-features.csv' % dirpath, index=False)

### Merge features and labels and concatenate

In [None]:
# replaces the earlier `fc`; used below for its feature_order attribute
fc = PipelineFOVScorer(mode='training', model_type='regression')

In [None]:
def load_and_merge(dirpath, labels_timestamp, features_timestamp):
    """Read the labels and features CSVs of one directory and join them on 'filename'.

    The files are expected at '<dirpath>/<labels_timestamp>-labels.csv' and
    '<dirpath>/<features_timestamp>-features.csv'. The join uses pandas' default
    (inner) merge, so only filenames present in both files are kept. The shapes of
    the labels, features and merged frames are printed as a quick sanity check.

    Returns the merged DataFrame.
    """
    labels_path = '%s/%s-labels.csv' % (dirpath, labels_timestamp)
    features_path = '%s/%s-features.csv' % (dirpath, features_timestamp)

    labels = pd.read_csv(labels_path)
    features = pd.read_csv(features_path)

    merged = labels.merge(features, left_on='filename', right_on='filename')
    print((labels.shape, features.shape, merged.shape))
    return merged

In [None]:
# merge labels and features for every training directory and stack the results
# (both CSVs are read with the '2019-10-07' timestamp)
dfs = []
for dirpath in training_dirpaths[:]:
    dfs.append(load_and_merge(dirpath, '2019-10-07', '2019-10-07'))
# sort=True orders the columns alphabetically in the concatenated frame
data = pd.concat(dfs, axis=0, sort=True)
data.shape

In [None]:
# force inf to nan because dropna does not drop np.infs
data = data.replace([np.inf, -np.inf], np.nan)

# drop FOVs with missing features
# (only the columns listed in fc.feature_order are checked)
data = data.dropna(axis=0, how='any', subset=fc.feature_order)
data.shape

In [None]:
# drop unsorted FOVs (they are kept separately in data_unsorted for later prediction)
# NOTE: this cell reads the 'subdirectory' column, which a later cell renames to 'score',
# so it cannot be re-run after that cell without rebuilding `data` first
all_data = data.copy()

# explicit copies: both frames get modified later on (score conversion, added 'yp' column),
# and modifying a .loc slice triggers pandas' SettingWithCopyWarning
data_unsorted = data.loc[data.subdirectory == 'unsorted'].copy()
data = data.loc[data.subdirectory != 'unsorted'].copy()
data.shape

In [None]:
# create scores: turn the manual label of each FOV into a numeric regression target
score_by_label = {'good': 1, 'neutral': 0, 'bad': -1}

# Only the label column is converted. The previous whole-frame replace would also have
# rewritten any other column containing these strings, and it mutated `data` in place.
# The guard makes the cell safe to re-run once 'subdirectory' has been renamed.
if 'subdirectory' in data.columns:
    unknown_labels = set(data['subdirectory']) - set(score_by_label)
    if unknown_labels:
        # fail early with a clear message instead of silently producing NaN scores
        raise ValueError(
            'Unexpected labels in the subdirectory column: %s'
            % sorted(map(str, unknown_labels))
        )
    data = data.rename(columns={'subdirectory': 'score'})
    data['score'] = data['score'].map(score_by_label)

In [None]:
# count labels
# (number of rows per score; num_nuclei is just a column to count non-null values in)
data.groupby('score').num_nuclei.count()

In [None]:
# basic EDA - histograms of one feature by score
# (the column name lives in `feature_name` so that the `feature` module imported
# from skimage at the top of the notebook is not overwritten)
feature_name = 'num_unclustered'
bins = np.arange(0, 40, 2)
for score in [-1, 0, 1]:
    plt.hist(
        data.loc[data.score == score, feature_name],
        bins=bins,
        density=True,
        alpha=.5,
        label='score = %d' % score)

# label the figure so it can be read on its own
plt.xlabel(feature_name)
plt.ylabel('Density')
plt.legend();

In [None]:
# feature matrix (columns in the order given by fc.feature_order) and float targets
X = data[list(fc.feature_order)].values
y = data['score'].values.astype(float)
X.shape

In [None]:
# train a standalone random forest regressor
# (independent of PipelineFOVScorer; gives a quick out-of-bag R^2 for the feature set)
model = sklearn.ensemble.RandomForestRegressor(
    n_estimators=300,
    # 1.0 = consider all features at each split; this is what 'auto' meant for
    # regressors, but 'auto' is deprecated and rejected by recent scikit-learn releases
    max_features=1.0,
    oob_score=True,
    # fixed seed so the out-of-bag score is reproducible between runs
    random_state=0)

model.fit(X, y)
model.oob_score_

### Train a regression model

In [None]:
# fresh scorer instance for the training run below
fc = PipelineFOVScorer(mode='training', model_type='regression')

In [None]:
# hand the scorer its own copy of the labeled data, so it cannot modify `data`
fc.training_data = data.copy()

In [None]:
# presumably fits the scorer's model on fc.training_data - TODO confirm in PipelineFOVScorer
fc.train()

In [None]:
# display the metadata the scorer holds for the current training run
fc.current_training_metadata

In [None]:
# fc.save('../models/2019-10-08')

In [None]:
# reload the scorer state saved under ../models/2019-10-08/, then train and validate
# NOTE(review): train() is called right after load(); presumably load() restores the
# training data and train() re-fits the model from it - confirm in PipelineFOVScorer
fc.load('../models/2019-10-08/')
fc.train()
fc.validate()

In [None]:
# attach the model's out-of-bag predictions to the labeled data
# NOTE(review): assigned by position, so this assumes the model was trained on the
# rows of `data` in the same order - confirm
data['yp'] = fc.model.oob_prediction_

### Plot predicted scores and tile FOVs by score

In [None]:
# the distribution of oob-predicted scores by actual score
# plt.figure(figsize=(12, 6))
yp = fc.model.oob_prediction_
labels = {-1: 'Bad', 0: 'Mediocre', 1: 'Good'}
for label in [-1, 0, 1]:
    # predictions of exactly -1 (or below) are left out of the histograms
    plt.hist(yp[(yp > -1) & (y==label)], bins=15, density=True, alpha=.5, edgecolor=None, label=labels[label])

# label the axes and build the legend once, after all three histograms are drawn
plt.xlabel('Out-of-bag predicted score')
plt.ylabel('Density')
plt.legend();

# plt.savefig('/Users/keith.cheveralls/Box/KCC-slides/2019-10-23_lab-meeting/fov-score-distribution.pdf')

In [None]:
# annotated FOVs for a given actual score, ordered/thresholded by predicted score
# (e.g., all annotated 'bad' FOVs with a positive predicted score)
# as written, the filter selects FOVs annotated 'good' (score == 1) whose predicted
# score is negative
d = data.sort_values(by='yp', ascending=False)
tile = viz.build_tile(
    d.loc[(d.yp < 0) & (d.score==1)], 
    shape=(10, 5), 
    figsize=16, 
    offset=0,
    show_labels=True, label_column='yp', label_format='%0.2f')

In [None]:
# imageio.imsave('/Users/keith.cheveralls/Box/KCC-slides/2019-10-23_lab-meeting/FOV-tile_actual-score-good_pred-score-le-zero.png', tile)

In [None]:
# one-dimension array of FOVs from bad to good (for lab meeting)
# NOTE(review): the frame is sorted by descending yp, so the rows passed to build_tile run
# from the highest to the lowest predicted score; how build_tile orders them is not visible here
d = data.sort_values(by='yp', ascending=False)
tile = viz.build_tile(
    # every 200th row, starting at the fourth
    d.iloc[3::200],#.sort_values(by='yp'), 
    shape=(1, 16), 
    figsize=16, 
    offset=0,
    show_labels=False, label_column='yp', label_format='%0.2f')

In [None]:
# tifffile.imsave('/Users/keith.cheveralls/Box/KCC-slides/2019-10-23_lab-meeting/FOVs-bad-to-good.tif', tile)

### Predict scores for unsorted FOVs

In [None]:
# predicted scores for unsorted FOVs
# NOTE: this rebinds the notebook-level X and yp (previously the training matrix and
# the out-of-bag predictions)
X = data_unsorted[list(fc.feature_order)].values
yp = fc.model.predict(X)

# assign() returns a new frame, so the new column is never written into a slice of
# another DataFrame (which triggers pandas' SettingWithCopyWarning)
data_unsorted = data_unsorted.assign(yp=yp)

In [None]:
# unsorted FOVs ordered by predicted score
# (only those with a predicted score below -0.5 are passed to the tile)
d = data_unsorted.sort_values(by='yp', ascending=False)
tile = viz.build_tile(
    d.loc[(d.yp < -.50)], 
    shape=(16, 16), 
    figsize=16, 
    offset=0,
    show_labels=True, label_column='yp', label_format='%0.2f')

In [None]:
# unsorted FOVs ordered by predicted score
d = data_unsorted.sort_values(by='yp', ascending=False)

# boolean masks for three predicted-score ranges; swap the mask passed to
# build_tile below to view a different range (only `good` is used as written)
bad = (d.yp < -.7)
neutral = (d.yp > -.25) & (d.yp < 0)
good = d.yp > .7

tile = viz.build_tile(
    d.loc[good], 
    shape=(10, 10), 
    figsize=16, 
    offset=0,
    show_labels=True, label_column='yp', label_format='%0.2f')

### Distribution of scores by plate

In [None]:
# all data (where yp is from fc.model.oob_prediction_ for the training data,
# and from fc.model.predict for the unsorted data)
all_data = pd.concat((data, data_unsorted), axis=0, sort=True)
# reset_index() without drop=True keeps the old index values as an 'index' column
all_data = all_data.reset_index()

In [None]:
# list the .tif filenames found in each plate directory
# (the plate id is the directory name with the 'mNG96wp' prefix removed)
plate_dirs = glob.glob('/Users/keith.cheveralls/image-data/PlateMicroscopyProjections-uint16/*')
plate_filenames = {}
for plate_dir in plate_dirs:
    plate_id = int(plate_dir.split(os.sep)[-1].replace('mNG96wp', ''))
    filepaths = glob.glob('%s/*.tif' % plate_dir)
    filenames = [f.split(os.sep)[-1] for f in filepaths]
    plate_filenames[plate_id] = filenames

# Invert to filename -> plate_id so each row is a single dict lookup. The previous
# version scanned every filename list for every row, and its `continue` (where `break`
# was intended) did nothing. As before, a filename present in several plates gets the
# plate that comes last in plate_filenames.
plate_id_by_filename = {
    filename: plate_id
    for plate_id, filenames in plate_filenames.items()
    for filename in filenames
}

# dtype=object keeps integer ids as ints and unmatched rows as None
# (a numeric column would turn the ids into floats as soon as one row is unmatched)
all_data['plate_id'] = pd.Series(
    [plate_id_by_filename.get(filename) for filename in all_data['filename']],
    index=all_data.index,
    dtype=object,
)

In [None]:
# sanity check: the distinct plate ids that were assigned
all_data.plate_id.unique()

In [None]:
# number of FOVs whose filename was not found in any plate directory
all_data.plate_id.isna().sum()

In [None]:
# non-null counts of every column, per plate
all_data.groupby('plate_id').count()

### Distribution of scores for all plates (1-19 plus 1-5 thawed)

In [None]:
# load the saved features of each thawed plate and tag the rows with a plate id
dfs = []
for dirpath in thawed_dirpaths:
    d = pd.read_csv('%s/2019-10-08-features.csv' % dirpath)
    # plate id is '<number>-thawed', with the number parsed from the directory name
    # after removing the '_Thawed' and 'mNG96wp' parts
    d['plate_id'] = '%d-thawed' % int(dirpath.split(os.sep)[-1].replace('_Thawed', '').replace('mNG96wp', ''))
    dfs.append(d)
    
data_thawed = pd.concat(dfs, axis=0, sort=True)
data_thawed.shape

In [None]:
# same cleaning as for the training data: turn infs into nans,
# then drop rows with a missing value in any of the model's features
data_thawed = data_thawed.replace([np.inf, -np.inf], np.nan)
data_thawed = data_thawed.dropna(axis=0, how='any', subset=fc.feature_order)
data_thawed.shape

In [None]:
# predicted scores for the FOVs from the thawed plates
# (this rebinds the notebook-level X and yp)
X = data_thawed[list(fc.feature_order)].values
yp = fc.model.predict(X)
data_thawed['yp'] = yp

In [None]:
# merge with 'all_data', which is from plates 1-19
# (rows are stacked; the index is not reset, so index values can repeat in `d`)
d = pd.concat((all_data, data_thawed), axis=0, sort=True)

In [None]:
# make every plate id a string (e.g. '15', '1-thawed'); missing ids become the string 'None'
d['plate_id'] = list(map(str, d.plate_id))

In [None]:
# sanity check: the distinct plate ids after merging and string conversion
d.plate_id.unique()

In [None]:
# distribution of the predicted scores over all plates together
# (assigned to _ to suppress the histogram's return value in the cell output)
_ = plt.hist(d.yp, bins=np.arange(-1, 1.2, .2), density=True)

In [None]:
# layout of the 5x5 grid of histograms: one plate per panel, None = empty panel
plate_ids = [
    [1,2,3,4,5],
    [6,7,8,9,10],
    [11,12,13,14,15],
    [16,17,18,19, None],
    ['1-thawed', '2-thawed', '3-thawed', '4-thawed', '5-thawed'],
]

# ten bins of width 0.2 spanning the full score range [-1, 1]
# (np.arange(-1, 1, .2) stopped at 0.8, so scores above 0.8 were left out of the histograms)
score_bins = np.linspace(-1, 1, 11)

fig, axs = plt.subplots(5, 5, figsize=(16, 12))
for rind, row in enumerate(axs):
    for cind, ax in enumerate(row):
        plate_id = plate_ids[rind][cind]
        if plate_id is not None:
            # plate ids are stored as strings in d.plate_id
            plate_id = str(plate_id)
            values = d.loc[d.plate_id==plate_id].yp.values
            ax.hist(values, bins=score_bins, density=True)
            ax.set_title('Plate %s (n = %d)' % (plate_id.replace('-thawed', ' (thawed)'), len(values)))

        # identical axes on every panel so the panels are directly comparable;
        # tick positions are fixed explicitly so the tick labels set below match them
        ax.set_ylim([0, 3])
        ax.set_xticks([-1, -.5, 0, .5, 1])
        ax.set_yticks([0, 1, 2, 3])
        ax.set_xticklabels([])
        ax.set_yticklabels([])

        # tick labels and axis labels only on the left column and the bottom row
        if cind==0:
            ax.set_ylabel('Density')
            ax.set_yticklabels([0, 1, 2, 3])
        if rind==len(axs)-1:
            ax.set_xlabel('Score')
            ax.set_xticklabels([-1, -.5, 0, .5, 1])

In [None]:
# fraction of FOVs per plate with a predicted score above 0.5 ('good') or below -0.5 ('bad')
# (the values are fractions; they are multiplied by 100 where they are plotted)
plate_ids = list(range(1, 20)) + ['1-thawed', '2-thawed', '3-thawed', '4-thawed', '5-thawed']
plate_id_labels = list(range(1, 20)) + ['1T', '2T', '3T', '4T', '5T']
pbad, pgood = [], []
for plate_id in plate_ids:
    # plate ids are stored as strings in d.plate_id
    values = d.loc[d.plate_id == str(plate_id)].yp.values

    if len(values) == 0:
        # no FOVs for this plate: record NaN explicitly instead of dividing zero by zero
        pgood.append(np.nan)
        pbad.append(np.nan)
        continue

    # the mean of a boolean mask is the fraction of True entries
    pgood.append((values > .5).mean())
    pbad.append((values < -.5).mean())

In [None]:
# grouped bar chart: percent of FOVs predicted good and predicted bad, one pair per plate
width = 1/3
x = np.arange(len(plate_ids))

fig, ax = plt.subplots(figsize=(12, 3))
# the two bars of a plate sit side by side, centered on the plate's tick
rects1 = ax.bar(x - width/2, 100*np.array(pgood), width, label='Predicted good')
rects2 = ax.bar(x + width/2, 100*np.array(pbad), width, label='Predicted bad')

ax.set_xticks(x)
ax.set_xticklabels([str(label) for label in plate_id_labels])
ax.set_ylabel('Percent')
ax.set_title('')
ax.legend()
# plt.savefig('/Users/keith.cheveralls/Box/KCC-slides/2019-10-23_lab-meeting/good-bad-by-plate.pdf')

In [None]:
# all FOVs from plate 15, best predicted score first
dd = d.loc[d.plate_id=='15'].copy()
dd = dd.sort_values(by='yp', ascending=False)
# compare the number of FOVs with the capacity of a 25x25 tile
# NOTE(review): the tile built in the next cell uses shape=(20, 20), i.e. 400 positions
dd.shape, 25*25

In [None]:
# tile the FOVs from one plate
# (`dd` is sorted by descending predicted score; each FOV is labeled with its score)
tile = viz.build_tile(
    dd,
    shape=(20, 20),
    figsize=25,
    offset=0,
    show_labels=True,
    label_column='yp',
    label_format='%0.2f')

# tifffile renamed imsave to imwrite and deprecated the old name;
# use imwrite when it exists and fall back to imsave on older installs
write_tiff = getattr(tifffile, 'imwrite', None) or tifffile.imsave
write_tiff('/Users/keith.cheveralls/image-data/all-PQ-plate15-ordered-by-score-20x20.tif', tile)