In [None]:
import os
import re
import io
import sys
import glob
import enum
import json
import dask
import xlrd
import base64
import time
import shutil
import imageio
import requests
import datetime
import psycopg2
import numpy as np
import pandas as pd
import skimage

import dask.diagnostics
import sqlalchemy as db
import sqlalchemy.orm
import sqlalchemy.ext.declarative
from matplotlib import pyplot as plt

%load_ext autoreload
%autoreload 1

sys.path.append('../..')
%aimport opencell.imaging.managers
%aimport opencell.imaging.processors
%aimport opencell.file_utils
# %aimport opencell.database.operations

from opencell import constants, file_utils
from opencell.cli import database as db_cli
from opencell.cli import imaging as imaging_cli
from opencell.database import models
from opencell.database import operations
from opencell.database import utils as db_utils
from opencell.imaging import utils as im_utils
from opencell.imaging import images, managers, processors, viz

In [None]:
def timeit(fn):
    '''
    Decorator that prints the wall-clock duration of each call to `fn`

    The wrapper forwards all positional and keyword arguments to `fn`,
    prints the elapsed time in seconds (to two decimal places),
    and returns whatever `fn` returned. Nothing is printed if `fn` raises.
    '''
    # imported locally because functools is not among the notebook's top-level imports
    import functools

    # preserve the name and docstring of the wrapped function
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        # perf_counter is a monotonic clock intended for measuring intervals
        # (time.time can jump if the system clock is adjusted)
        start = time.perf_counter()
        result = fn(*args, **kwargs)
        end = time.perf_counter()
        print('%0.2f s' % (end - start))
        return result
    return wrapper

In [None]:
# build the database URL from the dev credentials file
# (swap in one of the commented-out lines to target a different credentials file)
# url = db_utils.url_from_credentials('../../db-credentials-test.json')
url = db_utils.url_from_credentials('../../db-credentials-dev.json')
# url = db_utils.url_from_credentials('../../db-credentials-cap.json')

engine = db.create_engine(url)
session_factory = db.orm.sessionmaker(bind=engine)
Session = db.orm.scoped_session(session_factory)

# display the engine's URL object rather than the raw `url`: its repr masks the password,
# so a password taken from the credentials file is not echoed into the saved cell output
engine.url

In [None]:
models.Base.metadata.create_all(engine)

In [None]:
# copy a stack from ESS
shutil.copy2(
    '/Volumes/ml_group/raw-pipeline-microscopy/PML0205/raw_data/MMStack_1016-G2-16.ome.tif',
    '/Users/keith.cheveralls/image-data/MMStack_1016.tif'
)

In [None]:
# query the cell_line_metadata view
# NOTE(review): `clm` is not defined in any cell visible above; presumably it is
# a table object for the cell_line_metadata view (it must expose `.columns`) - confirm,
# and define it before this cell so that the notebook runs from a fresh kernel
md = pd.DataFrame(
    data=Session.query(clm).all(),
    columns=[c.name for c in clm.columns]
)
md.shape

In [None]:
# visualize the schema
# NOTE(review): `render_er` is not imported in the notebook's import cell;
# presumably it comes from the eralchemy package - confirm and add the import
render_er(models.Base.metadata, '../test-schema.png')

### Sanity checks

In [None]:
Session.rollback()

In [None]:
operations.PlateOperations.from_id(Session, 'P0001').plate_design

In [None]:
ops = operations.PolyclonalLineOperations.from_plate_well(Session, 'P0001', 'A02')
ops.line

In [None]:
ops = operations.PolyclonalLineOperations.from_target_name(Session, 'VAPA')
ops.line

In [None]:
Session.query(models.CellLine).filter(models.CellLine.id == 1).all()

In [None]:
cds = Session.query(models.CrisprDesign).all()
cd = cds[-1]
len(cds)

In [None]:
cds[1].cell_lines

In [None]:
# all cell lines
query = Session.query(models.CellLine)
lines = query.all()
len(lines)

In [None]:
rows = [(line.id, line.crispr_design.target_name if line.crispr_design else None) for line in lines]

In [None]:
# check for lines without target names
[row for row in rows if not row[1]]

In [None]:
# pd.DataFrame(data=rows, columns=['id', 'target']).to_csv('id-targets-new.csv')

In [None]:
# all lines with FOVs eager-loaded
query = Session.query(models.CellLine)
query = query.options(
    db.orm.joinedload(models.CellLine.fovs, innerjoin=True)
    .joinedload(models.MicroscopyFOV.results, innerjoin=True)
)

lines = query.all()
len(lines)

In [None]:
# time various cell_line methods
def get_fovs(n):
    '''
    Call get_top_scoring_fovs (with ntop=2) on each of the first n entries of `lines`
    '''
    for line in lines[:n]:
        line.get_top_scoring_fovs(ntop=2)

# time the function defined above over the first 100 lines
timeit(get_fovs)(100)

In [None]:
# the number of crispr designs
designs = Session.query(models.CrisprDesign).all()
len(designs)

In [None]:
# all target_names
names = [row.target_name for row in Session.query(models.CrisprDesign).all()]
len(names), len(set(names))

In [None]:
[
    row.as_dict() 
    for row in (
        Session.query(models.CrisprDesign)
        .filter(models.CrisprDesign.target_name == 'c12orf66')
        .all()
    )
]

In [None]:
# count the number of CrisprDesign rows per target_name
d = pd.DataFrame(data=[names, np.ones((len(names),))]).transpose()
d.columns = ['name', 'num']
dn = d.groupby('name').count().reset_index().sort_values(by='num', ascending=False)
dn

In [None]:
# number of facs and sequencing datasets
facs = Session.query(models.FACSDataset).all()
seq = Session.query(models.SequencingDataset).all()
len(facs), len(seq)

### Microscopy sanity checks

In [None]:
# number of microscopy datasets
ds = Session.query(models.MicroscopyDataset).all()
len(ds)

In [None]:
# number of fovs
fovs = Session.query(models.MicroscopyFOV).all()
len(fovs)

In [None]:
# fovs with manual annotations
fovs = Session.query(models.MicroscopyFOV).filter(models.MicroscopyFOV.annotation.has()).all()
len(fovs)

In [None]:
fovs[1].dataset

In [None]:
# number of fov results
len(Session.query(models.MicroscopyFOVResult).all())

In [None]:
# lines with FOVs
lines = Session.query(models.CellLine).filter(models.CellLine.fovs.any()).all()
len(lines)

### Mass spec sanity checks

In [None]:
# lines with pulldowns
lines = Session.query(models.CellLine).filter(models.CellLine.mass_spec_pulldowns.any()).all()
len(lines)

In [None]:
pds = lines[0].mass_spec_pulldowns
pds

In [None]:
len(pds[0].hits)

In [None]:
d = [hit.as_dict() for hit in pds[0].hits]

In [None]:
d[:10]

In [None]:
Session.close()

In [None]:
Session.rollback()

In [None]:
Session.commit()

### Determine how many FOVs were inserted (from the PlateMicroscopy directory only)

In [None]:
# compare the number of rows of raw metadata to the number of FOVs in the database
# NOTE(review): `pm` is not defined in any visible cell - confirm where it is created
pm.md_raw.shape, len(Session.query(models.MicroscopyFOV).all())

In [None]:
# FOVs for controls are not inserted
num_controls = pm.md_raw.loc[pm.md_raw.well_id.isin(['A01', 'H12'])].shape[0]
num_controls

In [None]:
# FOVs from PML0084 and PML0108 are not inserted (because these acquisitions were not truly pipeline)
# NOTE(review): `pm` is not defined in any visible cell - confirm where it is created
pml_ids = [row.pml_id for row in Session.query(models.MicroscopyDataset).all()]
num_nonpipeline = pm.md_raw.loc[~pm.md_raw.pml_id.isin(pml_ids)].shape[0]
num_nonpipeline

In [None]:
# the uninserted FOVs are likely the Jin samples that are not yet in the database
# (these are mostly in Plate6 column E)
pm.md_raw.shape[0] - num_controls - num_nonpipeline, len(Session.query(models.MicroscopyFOV).all())

In [None]:
# the first FOV of the line in well H11 of plate P0019
# (uses the `operations` module, the scoped `Session`, and the `.line` and `.fovs` attributes
# that the other cells of this notebook use; `ops` is an operations instance, not the module)
fov = operations.PolyclonalLineOperations.from_plate_well(Session, 'P0019', 'H11').line.fovs[0]

### Inserting FOVs from new raw-pipeline-microscopy datasets

In [None]:
Session.rollback()

In [None]:
pml_ids = [
    'PML0241', 'PML0242', 'PML0243', 'PML0245', 'PML0246', 'PML0247',
    'PML0248', 'PML0249', 'PML0250', 'PML0251', 'PML0252', 'PML0253', 'PML0254',
    'PML0255', 'PML0256', 'PML0257', 'PML0258', 'PML0259', 'PML0261', 'PML0262',
    'PML0263', 'PML0264', 'PML0265'
]

for pml_id in pml_ids:
    imaging_cli.insert_raw_pipeline_microscopy_fovs(
        Session, '/Volumes/ml_group/raw-pipeline-microscopy', pml_id=pml_id, errors='warn')

### FOVs from raw-pipeline-microscopy datasets

In [None]:
dataset = Session.query(models.MicroscopyDataset).filter(models.MicroscopyDataset.pml_id == 'PML0265').first()
len(dataset.fovs)

### Debugging FOVProcessor

In [None]:
src_root = '/Users/keith.cheveralls/opencell-test/data/PlateMicroscopy/'
dst_root = '/Users/keith.cheveralls/opencell-test/output/opencell-microscopy/'

In [None]:
src_root = '/Volumes/ml_group/PlateMicroscopy/'
dst_root = '/Users/keith.cheveralls/image-data/'

In [None]:
# construct a FOVProcessor from the first FOV in the database
fovs = Session.query(models.MicroscopyFOV).all()
p = processors.FOVProcessor.from_database(fovs[0])
len(fovs), p.target_name

In [None]:
# FOVProcessor with an fov_id
fov = Session.query(models.MicroscopyFOV).filter(models.MicroscopyFOV.id == 24108).first()

# the source root must be set after the processor is constructed,
# so that it applies to this processor rather than to a previously-created `p`
p = processors.FOVProcessor.from_database(fov)
p.set_src_roots(plate_microscopy_dir='/Volumes/ml_group/PlateMicroscopy/')

In [None]:
# FOVProcessor given an roi_id
roi_id = 92236 
roi_id = 103070

roi = Session.query(models.MicroscopyFOVROI).filter(models.MicroscopyFOVROI.id == roi_id).first()
p = processors.FOVProcessor.from_database(roi.fov)
p.set_src_roots(plate_microscopy_dir='/Volumes/ml_group/PlateMicroscopy/')

# path to the ROI file
filepath = p.dst_filepath(
    dst_root='',
    roi_id=roi_id,
    channel='405',
    kind='crop',
    ext='jpg')

filepath

In [None]:
tiff = p.load_raw_tiff()

In [None]:
plt.imshow(tiff.stacks['488'][0, :, :])

In [None]:
aligned_stacks, result = tiff.align_cell_layer(-5, 6, 0.5, 1)

In [None]:
plt.imshow(aligned_stacks['488'][0, :, :])

In [None]:
# shutil.copy('/Volumes/ml_group/opencell-microscopy/%s' % filepath, 'tmp.jpg')
im = imageio.imread('tmp.jpg')
im.shape, im.shape[0]/600

In [None]:
z = 54
plt.imshow(im[600*z:600*(z+1), :])

In [None]:
roi_props = {
    'shape': [600, 600, 22],
    'position': [130, 256, 0],
    'xy_coords': [130, 256, 600, 600],
    'target_step_size': 0.2,
    'original_step_size': 0.5,
    'required_num_slices': 55,
}

roi_props = p.crop_and_save_roi(
    roi_props, aligned_stacks, dst_root='/Users/keith.cheveralls/image-data/')

In [None]:
num_rows, num_cols, num_z = roi_props['shape']
row_ind, col_ind, z_ind = roi_props['position']

stack = aligned_stacks['405']
cropped_stack = stack[
    z_ind:(z_ind + num_z),
    row_ind:(row_ind + num_rows),
    col_ind:(col_ind + num_cols)
].copy()

In [None]:
plt.imshow(cropped_stack[0, :, :])

In [None]:
cropped_stack, did_resample_stack = p.maybe_resample_stack(
    np.moveaxis(cropped_stack, 0, -1),
    original_step_size=0.5,
    target_step_size=0.2,
    required_num_slices=55)

In [None]:
plt.imshow(cropped_stack[:, :, 0])

In [None]:
cropped_stack, min_intensity, max_intensity = p.stack_to_uint8(
    cropped_stack, percentile=0.01)

In [None]:
cropped_stack = np.moveaxis(cropped_stack, -1, 0)
tile = np.concatenate([zslice for zslice in cropped_stack], axis=0)
imageio.imsave('tmp-new.jpg', tile, format='jpg', quality=90)

In [None]:
im = imageio.imread('tmp-new.jpg')
z = 0
plt.imshow(im[600*z:600*(z+1), :])

### Top-scoring FOVs

In [None]:
d = pd.read_sql('''
    select fov.cell_line_id, fov.id as fov_id, (data::json ->> 'score')::float as score
    from microscopy_fov fov
    left join microscopy_fov_result result on fov.id = result.fov_id
    where result.kind = 'fov-features';''',
    engine)

In [None]:
# the index of the top-scoring FOV for each cell line
inds = d.groupby('cell_line_id').score.idxmax(axis=0)

In [None]:
# idxmax returns index labels (not positions), so select the rows by label with .loc;
# groups with no valid index are dropped, and the remaining labels are cast back to int
# (the presence of NaNs upcasts them to float)
top_fovs = d.loc[inds.dropna().astype(int)]

In [None]:
plt.plot(top_fovs.sort_values(by='score').score.values)

In [None]:
plt.figure(figsize=(8, 6))
_ = plt.hist(d.groupby('cell_line_id').fov_id.count(), bins=np.arange(0, 40, 2))
plt.gca().set_xlabel('Number of FOVs per target', fontsize=18)
plt.gca().set_ylabel('Number of targets', fontsize=18)
plt.savefig('/Users/keith.cheveralls/Downloads/fovs_per_target.pdf')

### Inspect aggregated FOV results

In [None]:
Session.rollback()

In [None]:
def all_processing_events():
    '''
    Aggregate all 'raw-tiff-processing-events' results into a single dataframe
    with one row per processing event

    This method is specific to aggregating processing events
    because the JSON in the data column for processing events is a list, not a dict.
    Each event in that list is unpacked into the columns of its own row,
    alongside the fov_id, line_id, and pml_id of the FOV the result belongs to.
    '''
    results = (
        Session.query(models.MicroscopyFOVResult)
        .filter(models.MicroscopyFOVResult.kind == 'raw-tiff-processing-events')
        .all()
    )

    # flatten the per-result lists of events directly into one list of rows
    rows = [
        {
            'fov_id': result.fov.id,
            'line_id': result.fov.cell_line_id,
            'pml_id': result.fov.dataset.pml_id,
            **event,
        }
        for result in results
        for event in result.data
    ]
    return pd.DataFrame(data=rows)

In [None]:
def all_results(kind):
    '''
    Collect every MicroscopyFOVResult of the given kind into one dataframe

    Only valid for result kinds whose data column is a dict (not a list),
    because each result's data is unpacked into the columns of its row.
    Each row also carries the fov_id, line_id, and pml_id of the result's FOV.
    '''
    query = Session.query(models.MicroscopyFOVResult).filter(
        models.MicroscopyFOVResult.kind == kind
    )

    records = []
    for result in query.all():
        record = {
            'fov_id': result.fov.id,
            'line_id': result.fov.cell_line_id,
            'pml_id': result.fov.dataset.pml_id,
        }
        # keys in the result's data take precedence over the identifier columns
        record.update(result.data)
        records.append(record)

    return pd.DataFrame(data=records)

In [None]:
def all_results_fast(kind):
    '''
    Load every FOV, left-joined to its result of the given kind, in a single SQL query

    Returns a dataframe with all of the microscopy_fov columns plus the `kind` and `data`
    columns of the matching microscopy_fov_result row; because of the left join,
    FOVs without a result of this kind are retained with null `kind` and `data`.
    '''
    # the kind is passed as a bound parameter, rather than interpolated into the SQL string,
    # so that quotes or percent signs in it cannot break (or inject into) the query
    query = db.text('''
        select fov.*, res.kind as kind, res.data as data from microscopy_fov fov
        left join (select * from microscopy_fov_result where kind = :kind) res
        on fov.id = res.fov_id;''')
    df = pd.read_sql(query, engine, params={'kind': kind})
    return df

In [None]:
# all processing events grouped by message
df = all_processing_events()
df.shape

In [None]:
df.groupby('message').count()

### List of FOVs without nuclei for Hiro

In [None]:
df = all_results('fov-features')
df.shape

In [None]:
fov_ids = list(df.loc[(df.num_nuclei.isna())].fov_id)
fovs = Session.query(models.MicroscopyFOV).filter(models.MicroscopyFOV.id.in_(fov_ids))
ps = [processors.FOVProcessor.from_database(fov) for fov in fovs]

In [None]:
d = pd.Series([p.dst_filepath(kind='clean', ext='tif').split(os.sep)[-1] for p in ps])
d.to_csv('/Volumes/ml_group/opencell-microscopy/2020-02-10_clean-tiffs-without-nuclei.csv', header=True, index=False)

### Inspect z-profiles, clean-tiff-metadata

In [None]:
df = all_results('z-profiles')
df.shape

In [None]:
df = all_results_fast('clean-tiff-metadata')
df.shape

In [None]:
# merge the results JSON column into the dataframe
df = df.merge(pd.DataFrame(data=list(df.data)), left_index=True, right_index=True)

In [None]:
# count the kinds of errors
df.loc[~df.error.isna()].groupby('error').count().id

In [None]:
# all FOV features
df = all_results_fast('fov-features')

# the left join in all_results_fast leaves `data` null for FOVs without fov-features results,
# so only look up the score when the data is actually a dict
df['score'] = [data.get('score') if isinstance(data, dict) else None for data in df.data]
df.shape, len(set(df.cell_line_id)), df.groupby('cell_line_id').score.nlargest(1).shape

In [None]:
df.head()

In [None]:
# number of cell lines with no score-able FOVs
df.groupby('cell_line_id').score.max().isna().sum()

In [None]:
# the two highest-scoring FOVs for each cell_line
# (rows are sorted by descending score within each cell_line_id, then the first two are kept)
top = df.sort_values(by=['cell_line_id', 'score'], ascending=False).groupby('cell_line_id').head(2)
top.shape

In [None]:
# lines with updated top-two FOVs 
# (22778 is the minimum fov_id in PML0236, 
# which was the first of the pml_ids added (in order) in a large update on 2020-03-27)
max_fov_id = top.groupby('cell_line_id').max().id
updated_lines = max_fov_id[max_fov_id > 22778]
updated_lines.shape

In [None]:
Session.rollback()

In [None]:
# query the rows of `clm` into a dataframe whose columns are named after the columns of `clm`
# NOTE(review): `clm` is not defined in any visible cell; presumably it is a table object
# for the cell_line_metadata view - confirm, and define it before this cell
md = pd.DataFrame(
    data=Session.query(clm).all(),
    columns=[c.name for c in clm.columns]
)
md.shape

In [None]:
mdd = md.loc[md.cell_line_id.isin(updated_lines.index)]
mdd.to_csv('2020-03-28_lines-with-updated-fovs.csv')

In [None]:
mdd.groupby('plate_id').well_id.count()