In [None]:
import os
import re
import io
import sys
import glob
import enum
import json
import time
import shutil
import base64
import urllib
import datetime
import functools
from contextlib import contextmanager

import dask
import dask.diagnostics
import xlrd
import imageio
import requests
import psycopg2
import numpy as np
import pandas as pd
import skimage
import skimage.color
import skimage.measure
import tifffile
import seaborn as sns
import matplotlib as mpl
import sqlalchemy as db
import sqlalchemy.orm
import sqlalchemy.ext.declarative
from matplotlib import pyplot as plt

%load_ext autoreload
%autoreload 1

sys.path.append('../..')
%aimport opencell.imaging.managers
%aimport opencell.imaging.processors
%aimport opencell.file_utils
# %aimport opencell.database.operations

from opencell import constants, file_utils
from opencell.cli import database_cli
from opencell.cli import fov_cli
from opencell.database import models
from opencell.database import operations
from opencell.database import utils as db_utils
from opencell.imaging import utils as im_utils
from opencell.imaging import images, managers, processors, viz

In [None]:
def timeit(fn):
    '''
    Decorator that prints the wall-clock duration of each call to `fn`

    The elapsed time is printed in seconds (two decimal places) after `fn` returns,
    and the return value of `fn` is passed through unchanged.
    If `fn` raises, nothing is printed and the exception propagates.
    '''
    # preserve the name and docstring of the wrapped function
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the difference cannot be skewed
        # by adjustments to the system clock
        start = time.perf_counter()
        result = fn(*args, **kwargs)
        end = time.perf_counter()
        print('%0.2f s' % (end - start))
        return result
    return wrapper

In [None]:
@contextmanager
def session_scope(url, echo=False):
    '''
    Context manager that yields a session bound to a new engine created from `url`

    The session is committed if the with-block exits normally,
    and rolled back (with the exception re-raised) if it raises.
    In both cases the session is closed and the engine's connection pool is disposed.

    url : the database URL passed to create_engine
    echo : passed to create_engine
    '''
    engine = db.create_engine(url, echo=echo)
    session = db.orm.sessionmaker(bind=engine)()
    try:
        yield session
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()
        # the engine is private to this scope, so release its pooled connections
        # explicitly instead of leaving them open until garbage collection
        engine.dispose()

In [None]:
# choose exactly one set of credentials by uncommenting it (dev is the active one)
# url = db_utils.url_from_credentials('../../db-credentials-test.json')
url = db_utils.url_from_credentials('../../db-credentials-dev.json')
# url = db_utils.url_from_credentials('../../db-credentials-cap.json')

engine = db.create_engine(url)
session_factory = db.orm.sessionmaker(bind=engine)
Session = db.orm.scoped_session(session_factory)

# display the engine's URL object instead of the raw url:
# its repr masks the password, so credentials are not saved in the cell output
engine.url

In [None]:
models.Base.metadata.create_all(engine)

In [None]:
# copy a stack from ESS
# (copy2 copies the file contents and attempts to preserve the file metadata)
# NOTE(review): both paths are absolute and machine-specific,
# so this cell only runs where the ml_group volume is mounted and this user directory exists
shutil.copy2(
    '/Volumes/ml_group/raw-pipeline-microscopy/PML0205/raw_data/MMStack_1016-G2-16.ome.tif',
    '/Users/keith.cheveralls/image-data/MMStack_1016.tif'
)

In [None]:
# query the cell_line_metadata view
# NOTE(review): `clm` is not defined or imported anywhere in the visible notebook,
# so this cell raises a NameError on a fresh kernel;
# presumably it is a table object for the cell_line_metadata view - TODO confirm and define it
md = pd.DataFrame(
    data=Session.query(clm).all(),
    columns=[c.name for c in clm.columns]
)
md.shape

In [None]:
# visualize the schema
# NOTE(review): `render_er` is not imported in the visible notebook
# (presumably it comes from the eralchemy package - TODO confirm and add the import)
render_er(models.Base.metadata, '../2020-08-28-schema.png')

### Sanity checks

In [None]:
Session.rollback()

In [None]:
Session.connection

In [None]:
operations.get_or_create_plate_design(Session, 'P0001')

In [None]:
ops = operations.PolyclonalLineOperations.from_plate_well(Session, 'P0001', 'A02', 1)
ops.line

In [None]:
ops = operations.PolyclonalLineOperations.from_target_name(Session, 'VAPA')
ops.line

In [None]:
resorted_lines = Session.query(models.CellLine).filter(models.CellLine.parent_id > 1).all()
len(resorted_lines)

In [None]:
cds = Session.query(models.CrisprDesign).all()
cd = cds[-1]
len(cds)

In [None]:
cd.target_name, cd.cell_lines

In [None]:
# all cell lines
query = Session.query(models.CellLine)
lines = query.all()
len(lines)

In [None]:
rows = [(line.id, line.crispr_design.target_name if line.crispr_design else None) for line in lines]

In [None]:
# check for lines without target names
[row for row in rows if not row[1]]

In [None]:
# pd.DataFrame(data=rows, columns=['id', 'target']).to_csv('id-targets-new-prod.csv')

In [None]:
# cell lines with their FOVs, and the results of those FOVs, loaded eagerly
# (both levels of the eager load use inner joins)
query = Session.query(models.CellLine).options(
    db.orm.joinedload(models.CellLine.fovs, innerjoin=True)
    .joinedload(models.MicroscopyFOV.results, innerjoin=True)
)
lines = query.all()
len(lines)

In [None]:
# time various cell_line methods
def get_fovs(n):
    '''
    Call get_top_scoring_fovs (with ntop=2) on each of the first n entries of `lines`
    '''
    for line in lines[:n]:
        line.get_top_scoring_fovs(ntop=2)

timeit(get_fovs)(100)

In [None]:
# the number of crispr designs
designs = Session.query(models.CrisprDesign).all()
len(designs)

In [None]:
# all target_names
names = [row.target_name for row in Session.query(models.CrisprDesign).all()]
len(names), len(set(names))

In [None]:
# count the number of crispr_design rows per target_name, most frequent first
# (the frame is built directly from the names, so the counts are integers
# and an empty list of names yields an empty result instead of an error)
d = pd.DataFrame({'name': names})
dn = (
    d.groupby('name').size().reset_index(name='num')
    .sort_values(by='num', ascending=False)
)
dn

In [None]:
# number of facs and sequencing datasets
facs = Session.query(models.FACSDataset).all()
seq = Session.query(models.SequencingDataset).all()
len(facs), len(seq)

### Inspect manual target annotations

In [None]:
df = pd.read_sql(
    '''
    select cell_line_id, well_id, plate_design_id as plate_id, target_name, categories, comment
    from cell_line line
    left join crispr_design cd on cd.id = line.crispr_design_id
    left join cell_line_annotation ant on ant.cell_line_id = line.id;
    ''',
    engine
)
df.shape
df = df.sort_values(by=['plate_id', 'well_id'])

In [None]:
# number of no-gfp and publication-ready wells per plate
# (rows with null or empty categories count as False,
# so that both columns are boolean and sum() yields a count)
df['no_gfp'] = df.categories.apply(lambda d: bool(d) and 'no_gfp' in d)
df['publication_ready'] = df.categories.apply(lambda d: bool(d) and 'publication_ready' in d)
pd.concat(
    (df.groupby('plate_id').no_gfp.sum(), df.groupby('plate_id').publication_ready.sum()),
    axis=1
)

In [None]:
# number of no-gfp wells per plate
# (rows with null or empty categories count as False, so that the column is boolean)
df['no_gfp'] = df.categories.apply(lambda d: bool(d) and 'no_gfp' in d)
df.groupby('plate_id').no_gfp.sum()

In [None]:
# flatten the per-row category lists into one array (rows with null categories are skipped)
cats = np.array([
    name
    for categories in df.categories if categories is not None
    for name in categories
])
cats.shape

In [None]:
# all unique category names
np.array(sorted(set(cats)))

In [None]:
# categories and counts as JSON (copied to opencell-vis gallery page)
counts = [{'category': cat, 'num': (cats == cat).sum()} for cat in set(cats)]
json.loads(pd.DataFrame(data=counts).sort_values(by='num', ascending=False).to_json(orient='records'))

In [None]:
# all crispr_design families
d = pd.read_sql(
    '''
    select target_family, count(target_family) as num from crispr_design
    group by target_family
    ''',
    engine
)

In [None]:
d = d.sort_values(by='num', ascending=False)
d.head()

In [None]:
json.loads(d.iloc[:20].to_json(orient='records'))

### Microscopy sanity checks

In [None]:
# number of microscopy datasets
ds = Session.query(models.MicroscopyDataset).all()
len(ds)

In [None]:
# number of fovs
fovs = Session.query(models.MicroscopyFOV).all()
len(fovs)

In [None]:
# fovs with manual annotations
fovs = Session.query(models.MicroscopyFOV).filter(models.MicroscopyFOV.annotation.has()).all()
len(fovs)

In [None]:
fovs[1].dataset

In [None]:
# number of fov results
len(Session.query(models.MicroscopyFOVResult).all())

In [None]:
# lines with FOVs
lines = Session.query(models.CellLine).filter(models.CellLine.fovs.any()).all()
len(lines)

In [None]:
fovs = Session.query(models.MicroscopyFOV).filter(models.MicroscopyFOV.pml_id == 'PML0332').all()
len(fovs)

In [None]:
# annotated FOVs without cropped ROIs
fovs = (
    Session.query(models.MicroscopyFOV)
    .filter(models.MicroscopyFOV.annotation.has())
    .filter(~models.MicroscopyFOV.rois.any())
    .all()
)
len(fovs)

### View the z-projection and segmentation for a given fov_id

In [None]:
# the FOV to inspect, and a processor constructed from it
fovs = (
    Session.query(models.MicroscopyFOV)
    .filter(models.MicroscopyFOV.id == 32851)
    .all()
)
p = processors.FOVProcessor.from_database(fovs[0])

# the filepaths of the 405 z-projection and of the segmentation mask,
# each joined onto the opencell-microscopy directory
im_filepath, mask_filepath = (
    os.path.join('/Volumes/ml_group/opencell-microscopy', p.dst_filepath(**kwargs))
    for kwargs in (
        dict(kind='proj', channel='405', ext='tif'),
        dict(kind='segmentation', ext='tif'),
    )
)

im, mask = tifffile.imread(im_filepath), tifffile.imread(mask_filepath)

In [None]:
# label the connected regions of the segmentation mask
maskl = skimage.measure.label(mask, connectivity=1)

# cycle through the first seven colors of the 'bright' palette (one entry per label value),
# with black prepended to the list of colors
cmap = np.array(sns.color_palette('bright', 8))[:-1, :]
order = [np.mod(ind, cmap.shape[0]) for ind in range(maskl.max())]
colors = [(0, 0, 0)] + list(cmap[order])

# overlay the colored labels on the autogained z-projection
im_rgb = skimage.color.label2rgb(maskl, image=im_utils.autogain(im, p=.3), colors=colors)
viz.imshow(im_rgb, figsize=8)

### Mass spec sanity checks

In [None]:
# lines with pulldowns
lines = Session.query(models.CellLine).filter(models.CellLine.pulldowns.any()).all()
len(lines)

In [None]:
# lines without any pulldowns
lines = (
    Session.query(models.CellLine)
    .filter(models.CellLine.line_type == 'POLYCLONAL')
    .filter(~models.CellLine.pulldowns.any())
    .options(
        db.orm.joinedload(models.CellLine.crispr_design, innerjoin=True)
    )
    .all()
)
targets_wo_ms = [line.crispr_design.target_name for line in lines]
len(lines)

In [None]:
# one line
line = Session.query(models.CellLine).filter(models.CellLine.id == 460).one()

In [None]:
line.pulldowns

In [None]:
pulldown = line.pulldowns[2]
pulldown.id, len(pulldown.hits)

In [None]:
# the hits of the pulldown selected above
# (`pd` is the pandas module, which has no `hits` attribute)
d = [hit.as_dict() for hit in pulldown.hits]
d[:10]

In [None]:
line = operations.PolyclonalLineOperations.from_target_name(Session, 'CLTA').line
line.id

### Mass spec cluster heatmaps

In [None]:
def get_cluster_ids(pulldown):
    '''
    The distinct heatmap cluster_ids among the hits of a pulldown

    pulldown : an object whose `id` is matched against MassSpecPulldown.id
    Returns a list with one entry per distinct cluster_id
    '''
    query = (
        Session.query(db.distinct(models.MassSpecClusterHeatmap.cluster_id))
        .join(models.MassSpecClusterHeatmap.hit)
        .join(models.MassSpecHit.pulldown)
        .filter(models.MassSpecPulldown.id == pulldown.id)
    )
    # each row contains only the cluster_id column
    return [cluster_id for (cluster_id,) in query.all()]

In [None]:
get_cluster_ids(line.pulldowns[1])

In [None]:
d = pd.read_sql('select * from mass_spec_cluster_heatmap heatmap', engine)

In [None]:
# sort clusters by number of rows
# NOTE(review): the sort key is the max col_index per cluster, not row_index
# (a later cell uses row_index.max() as the number of rows) - confirm which is intended
d.groupby(['cluster_id']).max().reset_index().sort_values(by='col_index', ascending=False).head(11)

In [None]:
# the heatmap rows joined to their hits, pulldowns, cell lines, and crispr designs
d = pd.read_sql(
    '''
    select * from mass_spec_cluster_heatmap heatmap
    left join mass_spec_hit hit on hit.id = heatmap.hit_id
    left join mass_spec_pulldown pd on pd.id = hit.pulldown_id
    left join cell_line on cell_line.id = pd.cell_line_id
    left join crispr_design cd on cd.id = cell_line.crispr_design_id
    ''',
    engine
)
# `select *` returns an `id` column from every joined table that has one;
# dropping by label removes all of these ambiguous columns at once
d = d.drop(labels='id', axis=1)

In [None]:
# target names and clusters with num rows
cluster_targets = pd.merge(
    d.groupby(['cluster_id', 'crispr_design_id']).target_name.first().reset_index(),
    d.groupby(['cluster_id']).row_index.max().reset_index(),
    on='cluster_id'
).sort_values(by='row_index')

In [None]:
cluster_targets.to_csv('clusters-crispr-designs.csv')

### Determine how many FOVs were inserted (from the PlateMicroscopy directory only)

In [None]:
# the number of raw FOVs vs the number of FOVs in the database
# (the scoped session defined in this notebook is `Session`)
# NOTE(review): `pm` is not defined in the visible notebook
# (presumably a PlateMicroscopy manager with an md_raw dataframe - TODO confirm and define it)
pm.md_raw.shape, len(Session.query(models.MicroscopyFOV).all())

In [None]:
# FOVs for controls are not inserted
num_controls = pm.md_raw.loc[pm.md_raw.well_id.isin(['A01', 'H12'])].shape[0]
num_controls

In [None]:
# FOVs from PML0084 and PML0108 are not inserted (because these acquisitions were not truly pipeline)
# so count the raw FOVs whose pml_id does not correspond to a dataset in the database
# (the scoped session defined in this notebook is `Session`)
pml_ids = [row.pml_id for row in Session.query(models.MicroscopyDataset).all()]
num_nonpipeline = pm.md_raw.loc[~pm.md_raw.pml_id.isin(pml_ids)].shape[0]
num_nonpipeline

In [None]:
# the number of raw FOVs expected to be inserted vs the number actually in the database
# the uninserted FOVs are likely the Jin samples that are not yet in the database
# (these are mostly in Plate6 column E)
pm.md_raw.shape[0] - num_controls - num_nonpipeline, len(Session.query(models.MicroscopyFOV).all())

In [None]:
# the first FOV of the line in well H11 of P0019
# (`operations` is the module; `ops` is an instance created earlier in this notebook,
# and the line and its FOVs are accessed as `.line` and `.fovs` elsewhere in this notebook)
# NOTE(review): from_plate_well is called with a fourth positional argument
# in an earlier cell - confirm whether it is required here
fov = operations.PolyclonalLineOperations.from_plate_well(Session, 'P0019', 'H11').line.fovs[0]

### FOVs from raw-pipeline-microscopy datasets

In [None]:
dataset = (
    Session.query(models.MicroscopyDataset)
    .filter(models.MicroscopyDataset.pml_id == 'PML0265')
    .first()
)
len(dataset.fovs)

### Top-scoring FOVs

In [None]:
# the score of each FOV, extracted from the JSON data of its fov-features result
# (the where clause on result.kind discards FOVs without a fov-features result,
# so the left join behaves like an inner join here)
d = pd.read_sql('''
    select fov.cell_line_id, fov.id as fov_id, (data::json ->> 'score')::float as score
    from microscopy_fov fov
    left join microscopy_fov_result result on fov.id = result.fov_id
    where result.kind = 'fov-features';''',
    engine)

In [None]:
# the index of the top-scoring FOV for each cell line
inds = d.groupby('cell_line_id').score.idxmax(axis=0)

In [None]:
# idxmax returns index labels, and the result may contain NaN (which makes it float),
# so drop the missing entries, restore the integer dtype, and select by label
top_fovs = d.loc[inds.dropna().astype(int)]

In [None]:
plt.plot(top_fovs.sort_values(by='score').score.values)

In [None]:
# histogram of the number of scored FOVs per cell line, saved as a PDF
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(d.groupby('cell_line_id').fov_id.count(), bins=np.arange(0, 40, 2))
ax.set_xlabel('Number of FOVs per target', fontsize=18)
ax.set_ylabel('Number of targets', fontsize=18)
fig.savefig('/Users/keith.cheveralls/Downloads/fovs_per_target.pdf')

### Inspect aggregated FOV results

In [None]:
Session.rollback()

In [None]:
def all_processing_events():
    '''
    Aggregate the raw-tiff-processing-events results into a dataframe with one row per event

    A dedicated method is needed for this kind of result
    because its data column contains a list of events rather than a single dict.
    Each row consists of the fov_id, line_id, and pml_id of the FOV
    to which the event belongs, merged with the keys of the event itself.
    '''
    results = (
        Session.query(models.MicroscopyFOVResult)
        .filter(models.MicroscopyFOVResult.kind == 'raw-tiff-processing-events')
        .all()
    )

    # flatten the per-result lists of events into a single list of rows
    rows = [
        {
            'fov_id': result.fov.id,
            'line_id': result.fov.cell_line_id,
            'pml_id': result.fov.dataset.pml_id,
            **event,
        }
        for result in results
        for event in result.data
    ]
    return pd.DataFrame(data=rows)

In [None]:
def all_results(kind):
    '''
    Aggregate the results of one kind into a dataframe with one row per result

    This is only valid for kinds whose data column is a dict (not a list),
    because the keys of the data are merged into each row
    alongside the fov_id, line_id, and pml_id of the result's FOV.

    kind : the value of MicroscopyFOVResult.kind to select
    '''
    query = Session.query(models.MicroscopyFOVResult).filter(
        models.MicroscopyFOVResult.kind == kind
    )

    rows = []
    for result in query.all():
        fov = result.fov
        row = {
            'fov_id': fov.id,
            'line_id': fov.cell_line_id,
            'pml_id': fov.dataset.pml_id,
        }
        # keys in the result data override the FOV-level keys of the same name
        row.update(result.data)
        rows.append(row)

    return pd.DataFrame(data=rows)

In [None]:
def all_results_fast(kind):
    '''
    Load every FOV, left-joined to its results of the given kind, in a single SQL query

    The result data is not expanded into columns: it is returned as-is in the `data` column.
    Because of the left join, FOVs without a result of this kind are retained,
    with null values in the `kind` and `data` columns.

    kind : the value of microscopy_fov_result.kind to select
    '''
    # the kind is passed as a bound parameter, instead of being interpolated into the SQL,
    # so that it is quoted by the database driver
    # (the %(name)s placeholder is the paramstyle of psycopg2)
    query = '''
        select fov.*, res.kind as kind, res.data as data from microscopy_fov fov
        left join (select * from microscopy_fov_result where kind = %(kind)s) res
        on fov.id = res.fov_id;'''
    return pd.read_sql(query, engine, params={'kind': kind})

In [None]:
# all processing events grouped by message
df = all_processing_events()
df.shape

In [None]:
df.groupby('message').count()

### Inspect z-profiles and clean-tiff-metadata

In [None]:
df = all_results('z-profiles')
df.shape

In [None]:
df = all_results_fast('clean-tiff-metadata')
df.shape

In [None]:
# merge the results JSON column into the dataframe
# (the entries of the data column are expanded into their own columns and aligned on the index)
# NOTE(review): this overwrites df, so re-running the cell merges the same columns a second time
df = df.merge(pd.DataFrame(data=list(df.data)), left_index=True, right_index=True)

In [None]:
# count the kinds of errors
df.loc[~df.error.isna()].groupby('error').count().id

In [None]:
# all FOV features
df = all_results_fast('fov-features')

# the data is null for FOVs without a fov-features result (all_results_fast uses a left join),
# so only look up the score when the data is a dict, and otherwise leave it missing
df['score'] = [data.get('score') if isinstance(data, dict) else None for data in df.data]
df.shape, len(set(df.cell_line_id)), df.groupby('cell_line_id').score.nlargest(1).shape

In [None]:
df.head()

In [None]:
# number of cell lines with no score-able FOVs
df.groupby('cell_line_id').score.max().isna().sum()

In [None]:
# the two highest-scoring FOVs for each cell_line
# (rows are sorted by descending score within each line, then the first two per line are kept)
top = df.sort_values(by=['cell_line_id', 'score'], ascending=False).groupby('cell_line_id').head(2)
top.shape

In [None]:
# lines with updated top-two FOVs 
# (22778 is the minimum fov_id in PML0236, 
# which was the first of the pml_ids added (in order) in a large update on 2020-03-27)
max_fov_id = top.groupby('cell_line_id').max().id
updated_lines = max_fov_id[max_fov_id > 22778]
updated_lines.shape

In [None]:
Session.rollback()

In [None]:
# query the cell line metadata (one column per column of `clm`)
# NOTE(review): `clm` is not defined or imported anywhere in the visible notebook,
# so this cell raises a NameError on a fresh kernel - TODO confirm and define it
md = pd.DataFrame(
    data=Session.query(clm).all(),
    columns=[c.name for c in clm.columns]
)
md.shape

In [None]:
mdd = md.loc[md.cell_line_id.isin(updated_lines.index)]
mdd.to_csv('2020-03-28_lines-with-updated-fovs.csv')

In [None]:
mdd.groupby('plate_id').well_id.count()