In [None]:
import os
import re
import io
import sys
import glob
import enum
import json
import dask
import xlrd
import base64
import time
import imageio
import requests
import datetime
import psycopg2
import numpy as np
import pandas as pd
import skimage

import dask.diagnostics
import sqlalchemy as db
import sqlalchemy.orm
import sqlalchemy.ext.declarative
from matplotlib import pyplot as plt

%load_ext autoreload
%autoreload 1

sys.path.append('../..')
%aimport opencell.imaging.managers
%aimport opencell.imaging.processors
%aimport opencell.file_utils
# %aimport opencell.database.operations

from opencell import constants, file_utils
from opencell.cli import database as db_cli
from opencell.cli import imaging as imaging_cli
from opencell.database import models
from opencell.database import operations
from opencell.database import utils as db_utils
from opencell.imaging import utils as im_utils
from opencell.imaging import images, managers, processors, viz

In [None]:
def timeit(fn):
    '''
    Decorator that prints the wall-clock duration of each call to `fn`

    The elapsed time is printed in seconds with two decimal places (e.g., '0.25 s')
    after `fn` returns, and the return value of `fn` is passed through unchanged.
    If `fn` raises, nothing is printed and the exception propagates.
    '''
    import functools

    # functools.wraps preserves the wrapped function's __name__ and __doc__,
    # so that timed functions remain identifiable when inspected
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measured interval cannot be distorted
        # by system clock adjustments the way differences of time.time() can
        start = time.perf_counter()
        result = fn(*args, **kwargs)
        end = time.perf_counter()
        print('%0.2f s' % (end - start))
        return result
    return wrapper

In [None]:
# build the database URL from the dev credentials file
url = db_utils.url_from_credentials('../../db-credentials-dev.json')
# url = db_utils.url_from_credentials('../../db-credentials-cap.json')

# display the URL with its password masked (the repr of a SQLAlchemy URL hides it),
# so that the credentials are not saved in the notebook's cell output
db.engine.url.make_url(url)

### Sanity checks

In [None]:
# create the engine and a scoped session registry bound to it
engine = db.create_engine(url)
session_factory = db.orm.sessionmaker(bind=engine)
Session = db.orm.scoped_session(session_factory)
session = Session()

# reflect cell_line_metadata (a view) and fov_rank
# (autoload_with alone triggers reflection; the separate autoload=True flag
# is deprecated in SQLAlchemy 1.4 and was removed in 2.0)
clm = db.Table("cell_line_metadata", models.Base.metadata, autoload_with=engine)
fov_rank = db.Table("fov_rank", models.Base.metadata, autoload_with=engine)

In [None]:
ops = operations.PolyclonalLineOperations.from_plate_well(Session, 'P0001', 'A01')

In [None]:
# query the cell_line_metadata view
md = pd.DataFrame(
    data=Session.query(clm).all(),
    columns=[c.name for c in clm.columns]
)
md.shape

In [None]:
Session.query(models.CellLine).filter(models.CellLine.line_type=='PROGENITOR').first().name

In [None]:
# all cell lines
query = Session.query(models.CellLine)
lines = query.all()
len(lines)

In [None]:
# all lines with FOVs eager-loaded
query = Session.query(models.CellLine)
query = query.options(
    db.orm.joinedload(models.CellLine.fovs, innerjoin=True)
    .joinedload(models.MicroscopyFOV.results, innerjoin=True)
)

lines = query.all()
len(lines)

In [None]:
# timing helpers for cell_line methods (both iterate over the global `lines` list)

def get_designs(n):
    '''
    Call get_crispr_design on each of the first n entries of `lines`
    '''
    subset = lines[:n]
    for cell_line in subset:
        cell_line.get_crispr_design()


def get_fovs(n):
    '''
    Call get_top_scoring_fovs(ntop=2) on each of the first n entries of `lines`
    '''
    subset = lines[:n]
    for cell_line in subset:
        cell_line.get_top_scoring_fovs(ntop=2)


# time the retrieval of the crispr designs for the first 100 lines
timeit(get_designs)(100)

In [None]:
# the number of crispr designs
# (this previously reported len(lines), the number of cell lines, by mistake)
designs = session.query(models.CrisprDesign).all()
len(designs)

In [None]:
# all target_names
names = [row.target_name for row in session.query(models.CrisprDesign).all()]
len(names), len(set(names))

In [None]:
[row.as_dict() for row in session.query(models.CrisprDesign).filter(models.CrisprDesign.target_name == 'c12orf66').all()]

In [None]:
# count the number of CrisprDesign rows per target_name
# (the frame is built column-wise, so that 'num' keeps a numeric dtype
# instead of being coerced to object by transposing a two-row frame)
d = pd.DataFrame({'name': names, 'num': np.ones(len(names))})

# one row per name, with the most frequent names first
dn = d.groupby('name').count().reset_index().sort_values(by='num', ascending=False)
dn

In [None]:
# number of facs and sequencing datasets
facs = Session.query(models.FACSDataset).all()
seq = Session.query(models.SequencingDataset).all()
len(facs), len(seq)

In [None]:
# number of fovs
fovs = Session.query(models.MicroscopyFOV).all()
len(fovs)

In [None]:
# number of fov results
# (counted by the database, rather than by loading every row into the session)
Session.query(models.MicroscopyFOVResult).count()

In [None]:
# lines with FOVs
lines = [line for line in Session.query(models.CellLine).all() if line.fovs]

# the data of the second result of the top-scoring FOV of the third such line
# (`ops` is bound to the return value of from_plate_well in an earlier cell,
# so the class must be accessed from the `operations` module instead)
operations.PolyclonalLineOperations(lines[2]).get_top_scoring_fovs(session, ntop=1)[0].results[1].data

In [None]:
session.close()

In [None]:
session.rollback()

In [None]:
session.commit()

In [None]:
# visualize the schema
# NOTE(review): render_er is not imported anywhere in the visible notebook
# (presumably eralchemy's render_er - confirm, and add the import to the first cell)
render_er(models.Base.metadata, '../test-schema.png')

### Determine how many FOVs were inserted (from the PlateMicroscopy directory only)

In [None]:
# compare the shape of the raw metadata to the number of FOVs in the database
# NOTE(review): `pm` is not defined in the visible notebook - presumably
# a PlateMicroscopy manager from opencell.imaging.managers; confirm and define it above
pm.md_raw.shape, len(session.query(models.MicroscopyFOV).all())

In [None]:
# FOVs for controls are not inserted
num_controls = pm.md_raw.loc[pm.md_raw.well_id.isin(['A01', 'H12'])].shape[0]
num_controls

In [None]:
# FOVs from PML0084 and PML0108 are not inserted (because these acquisitions were not truly pipeline)
pml_ids = [row.pml_id for row in session.query(models.MicroscopyDataset).all()]
num_nonpipeline = pm.md_raw.loc[~pm.md_raw.pml_id.isin(pml_ids)].shape[0]
num_nonpipeline

In [None]:
# the uninserted FOVs are likely the Jin samples that are not yet in the database
# (these are mostly in Plate6 column E)
pm.md_raw.shape[0] - num_controls - num_nonpipeline, len(session.query(models.MicroscopyFOV).all())

In [None]:
# the first FOV of the line in plate P0019, well H11
# (`ops` is bound to the return value of from_plate_well in an earlier cell,
# so the class must be accessed from the `operations` module instead)
# NOTE(review): other cells access a line's FOVs via `.fovs`;
# confirm that `microscopy_fovs` still exists on the cell line model
fov = operations.PolyclonalLineOperations.from_plate_well(session, 'P0019', 'H11').cell_line.microscopy_fovs[0]

### FOVs from raw-pipeline-microscopy datasets

In [None]:
dataset = Session.query(models.MicroscopyDataset).filter(models.MicroscopyDataset.pml_id == 'PML0235').first()
len(dataset.fovs)

### Testing inserting FOV ROIs

In [None]:
src_root = '/Users/keith.cheveralls/opencell-test/data/PlateMicroscopy/'
dst_root = '/Users/keith.cheveralls/opencell-test/output/opencell-microscopy/'

In [None]:
fovs = session.query(models.MicroscopyFOV).all()
p = processors.FOVProcessor.from_database(fovs[0])
len(fovs), p.target_name

In [None]:
# call crop_corner_rois with the src_root and dst_root defined in the cell above,
# rather than repeating the same hard-coded paths
p.crop_corner_rois(src_root, dst_root)

In [None]:
session.close()

### Top-scoring FOVs

In [None]:
# the score of each FOV that has a 'fov-features' result, with its cell_line_id
# (the where clause on result.kind discards FOVs without a matching result,
# so the left join behaves like an inner join here)
d = pd.read_sql('''
    select fov.cell_line_id, fov.id as fov_id, (data::json ->> 'score')::float as score
    from microscopy_fov fov
    left join microscopy_fov_result result on fov.id = result.fov_id
    where result.kind = 'fov-features';''',
    engine)

In [None]:
# the index of the top-scoring FOV for each cell line
inds = d.groupby('cell_line_id').score.idxmax(axis=0)

In [None]:
# idxmax returns index labels (not positions), so the rows are selected with .loc;
# lines without any non-null score are dropped, and the integer dtype is restored
# because the presence of NaNs makes the series of labels float-valued
top_fovs = d.loc[inds.dropna().astype(int)]

In [None]:
plt.plot(top_fovs.sort_values(by='score').score.values)

In [None]:
_ = plt.hist(d.groupby('cell_line_id').fov_id.count(), bins=np.arange(4, 30, 1))

### Inspect aggregated FOV results

In [None]:
Session.rollback()

In [None]:
def all_processing_events():
    '''
    Aggregate all 'raw-tiff-processing-events' results into a dataframe
    with one row per processing event

    This function is specific to aggregating processing events
    because the JSON in the data column for processing events is a list, not a dict

    Returns a dataframe in which each row consists of the keys of one event,
    plus the fov_id, line_id, and pml_id of the FOV to which the event belongs
    '''
    results = (
        Session.query(models.MicroscopyFOVResult)
        .filter(models.MicroscopyFOVResult.kind == 'raw-tiff-processing-events')
        .all()
    )

    # flatten the per-result lists of events directly into a single list of rows
    # (rather than extending a list from a comprehension used only for its side effects)
    rows = [
        {
            'fov_id': result.fov.id,
            'line_id': result.fov.cell_line_id,
            'pml_id': result.fov.dataset.pml_id,
            **row
        }
        for result in results
        for row in result.data
    ]
    return pd.DataFrame(data=rows)

In [None]:
def all_results(kind):
    '''
    Build a dataframe from all FOV results of the given kind,
    for kinds whose data column is a dict (not a list)

    Each row combines the fov_id, line_id, and pml_id of the result's FOV
    with the key-value pairs of the result's data dict
    '''
    matching_results = (
        Session.query(models.MicroscopyFOVResult)
        .filter(models.MicroscopyFOVResult.kind == kind)
        .all()
    )

    records = []
    for result in matching_results:
        record = {
            'fov_id': result.fov.id,
            'line_id': result.fov.cell_line_id,
            'pml_id': result.fov.dataset.pml_id,
        }
        # keys in the data dict take precedence over the FOV-level keys
        record.update(result.data)
        records.append(record)

    return pd.DataFrame(data=records)

In [None]:
def all_results_fast(kind):
    '''
    Load every row of microscopy_fov, left-joined to its results of the given kind,
    into a dataframe using a single SQL query

    Because this is a left join, FOVs without a result of this kind are retained,
    with nulls in the `kind` and `data` columns

    kind : the value of microscopy_fov_result.kind to select
    '''
    # the kind is passed as a bound parameter, rather than interpolated into the SQL string,
    # so that quotes in the value can neither break nor alter the query
    query = db.text('''
        select fov.*, res.kind as kind, res.data as data from microscopy_fov fov
        left join (select * from microscopy_fov_result where kind = :kind) res
        on fov.id = res.fov_id;''')
    df = pd.read_sql(query, engine, params={'kind': kind})
    return df

In [None]:
# all processing events grouped by message
df = all_processing_events()
df.shape

In [None]:
df.groupby('message').count()

### List of FOVs without nuclei for Hiro

In [None]:
df = all_results('fov-features')
df.shape

In [None]:
fov_ids = list(df.loc[(df.num_nuclei.isna())].fov_id)
fovs = Session.query(models.MicroscopyFOV).filter(models.MicroscopyFOV.id.in_(fov_ids))
ps = [processors.FOVProcessor.from_database(fov) for fov in fovs]

In [None]:
# the filename (without its directory) of the 'clean' tif of each processor in `ps`
# (os.path.basename replaces the hand-rolled split on os.sep)
d = pd.Series([os.path.basename(p.dst_filepath(kind='clean', ext='tif')) for p in ps])
d.to_csv('/Volumes/ml_group/opencell-microscopy/2020-02-10_clean-tiffs-without-nuclei.csv', header=True, index=False)

### Inspect z-profiles, clean-tiff-metadata

In [None]:
df = all_results('z-profiles')
df.shape

In [None]:
df = all_results_fast('clean-tiff-metadata')
df.shape

In [None]:
# merge the results JSON column into the dataframe
# (all_results_fast uses a left join, so FOVs without a result have a null `data` value;
# these are expanded from an empty dict, which yields nulls in every JSON-derived column)
df = df.merge(
    pd.DataFrame(
        data=[row if isinstance(row, dict) else {} for row in df.data],
        index=df.index,
    ),
    left_index=True,
    right_index=True,
)

In [None]:
# count the kinds of errors
df.loc[~df.error.isna()].groupby('error').count().id

In [None]:
# all FOV features
df = all_results('fov-features')
df.shape, len(set(df.line_id)), df.groupby('line_id').score.nlargest(1).shape

In [None]:
# number of cell lines with no score-able FOVs
df.groupby('line_id').score.max().isna().sum()

In [None]:
# the four highest-scoring FOVs for each cell_line
top4 = df.sort_values(by=['line_id', 'score'], ascending=False).groupby('line_id').head(4)
top4.shape