In [None]:
import os
import re
import io
import sys
import glob
import enum
import json
import dask
import xlrd
import base64
import imageio
import requests
import datetime
import numpy as np
import pandas as pd
# from eralchemy import render_er

import dask.diagnostics
import sqlalchemy as db
import sqlalchemy.orm
import sqlalchemy.ext.declarative
from matplotlib import pyplot as plt

%load_ext autoreload
%autoreload 1

sys.path.append('..')
%aimport opencell.imaging
%aimport opencell.imaging.managers
%aimport opencell.imaging.processors
%aimport opencell.file_utils
%aimport opencell.database.operations

from opencell import constants, file_utils
from opencell.cli import database as db_cli
from opencell.cli import imaging as imaging_cli
from opencell.database import models
from opencell.database import operations as ops
from opencell.database import utils as db_utils
from opencell.imaging import utils as im_utils
from opencell.imaging import images, managers, processors, viz

In [None]:
# sanity check: format several spellings of plate 123 and collect the results in a set
# (presumably they should all normalize to one ID, leaving a single element — confirm)
set(map(db_utils.format_plate_design_id, [123, 'plate123', 'Plate 123', 'P0123']))

In [None]:
# database url built from the local-test credentials file
# NOTE: the next cell rebinds `url`, so run only the cell for the database you intend to use
url = db_utils.url_from_credentials('../db-credentials-local-test.json')

In [None]:
# database url built from the 'cap' credentials file (replaces any `url` set by the cell above)
# NOTE(review): displaying `url` may write credentials into the saved notebook output — confirm
url = db_utils.url_from_credentials('../db-credentials-cap.json')
url

In [None]:
# request the /lines endpoint on cap.czbiohub.org and display the response;
# the timeout keeps the cell from hanging indefinitely if the host is unreachable
requests.get('http://cap.czbiohub.org:5001/lines', timeout=10)

### Create and maybe populate the database

In [None]:
# manually drop and create the schema
engine = db.create_engine(url)

# safety switch: the tables are dropped only if this is manually set to True
im_sure = False
if im_sure:
    print('Dropping all tables')
    models.Base.metadata.drop_all(engine)

# NOTE: this runs whether or not the tables were dropped above
print('Creating all tables')
models.Base.metadata.create_all(engine)

In [None]:
# populate the database (only if im_sure is manually set to True)
# NOTE: populate is called with drop_all=False, so presumably nothing is dropped first — confirm
# TODO: fewer and less verbose warnings when drop_all=False
im_sure = False
if im_sure:
    db_cli.populate(url, drop_all=False, errors='ignore')

### Sanity checks

In [None]:
# an engine, a session factory bound to it, and a scoped-session registry for the database at `url`
engine = db.create_engine(url)
session_factory = db.orm.sessionmaker(bind=engine)
Session = db.orm.scoped_session(session_factory)

In [None]:
# a session obtained from the scoped-session registry
# (the cells below issue queries through both `session` and `Session`)
session = Session()

In [None]:
# name of the first cell line whose line_type is 'PROGENITOR'
# NOTE: .first() returns None when no row matches, in which case .name raises an AttributeError
Session.query(models.CellLine).filter(models.CellLine.line_type=='PROGENITOR').first().name

In [None]:
# number of cell lines
lines = session.query(models.CellLine).all()
len(lines)

In [None]:
# all target_names
names = [row.target_name for row in session.query(models.CrisprDesign).all()]
len(names), len(set(names))

In [None]:
# every CrisprDesign row for one example target_name, each converted with as_dict()
[row.as_dict() for row in session.query(models.CrisprDesign).filter(models.CrisprDesign.target_name == 'c12orf66').all()]

In [None]:
# count the number of CrisprDesign rows per target_name
# (the frame is built column-wise so that each column keeps its own dtype,
# rather than from a list of rows that is then transposed, which makes every column object-typed)
d = pd.DataFrame({'name': names, 'num': np.ones(len(names))})
dn = d.groupby('name').count().reset_index().sort_values(by='num', ascending=False)
dn

In [None]:
# number of facs and sequencing datasets
facs = Session.query(models.FACSDataset).all()
seq = Session.query(models.SequencingDataset).all()
len(facs), len(seq)

In [None]:
# number of fovs
fovs = Session.query(models.MicroscopyFOV).all()
len(fovs)

In [None]:
# construct a processor for the last FOV loaded above and show its source filepath
# NOTE(review): the second root is a hard-coded '/Volumes/...' path, so this cell presumably
# only works on a machine where that volume is mounted — confirm
p = processors.FOVProcessor.from_database(fovs[-1])
p.set_src_roots('', '/Volumes/ml_group/raw-pipeline-microscopy/')
p.src_filepath()

In [None]:
# z-profiles calculated by the processor `p`, passed through ops.to_jsonable
profiles = ops.to_jsonable(p.calculate_z_profiles())

In [None]:
# number of fov results
# (counted by the database, rather than loading every row just to measure the list)
Session.query(models.MicroscopyFOVResult).count()

In [None]:
# lines with FOVs
lines = [line for line in Session.query(models.CellLine).all() if line.fovs]
# example: the data of the second result of the top-scoring FOV for the third such line
# NOTE: the hard-coded indices (lines[2], [0], results[1]) raise an IndexError if there are fewer entries
ops.PolyclonalLineOperations(lines[2]).get_top_scoring_fovs(session, ntop=1)[0].results[1].data

### Group FOV results by pml_id or line_id

In [None]:
# the kind of FOV result to load
# (the alternative is 'raw-tiff-metadata'; edit `kind` here to switch)
kind = 'raw-tiff-processing-events'
results = (
    Session.query(models.MicroscopyFOVResult)
    .filter(models.MicroscopyFOVResult.kind == kind)
    .all()
)
len(results)

In [None]:
# for each result, one dict per entry of result.data, labeled with the ids
# of the FOV, cell line, and dataset to which the result belongs
# (`data` is a list of lists; each entry of result.data must be a mapping for `**row` to work,
# and its keys take precedence over the three id keys if they collide)
data = [[{
    'fov_id': result.fov.id, 
    'line_id': result.fov.cell_line_id, 
    'pml_id': result.fov.dataset.pml_id,
    **row
} for row in result.data] for result in results]

In [None]:
# flatten the per-result lists of row dicts into a single list, one dict per dataframe row
# (a nested comprehension, rather than a comprehension used only for its side effect)
rows = [row for result_rows in data for row in result_rows]
df = pd.DataFrame(data=rows)
df.shape

In [None]:
df.groupby('message').count()

In [None]:
# number of distinct line_ids, alongside the shape of the single largest score per line_id
len(set(df.line_id)), df.groupby('line_id').score.nlargest(1).shape

In [None]:
# number of cell lines with no score-able FOVs
df.groupby('line_id').score.max().isna().sum()

In [None]:
# the four highest-scoring FOVs for each cell_line
top4 = df.sort_values(by=['line_id', 'score'], ascending=False).groupby('line_id').head(4)
top4.shape

In [None]:
df.groupby('line_id').count()

In [None]:
# housekeeping cell (run manually when needed): close the session
session.close()

In [None]:
# housekeeping cell (run manually when needed): roll back the current transaction
session.rollback()

In [None]:
# housekeeping cell (run manually when needed): commit the current transaction
session.commit()

In [None]:
# visualize the schema
# NOTE: `render_er` is not defined in this notebook as written, because the eralchemy import
# in the first cell is commented out; uncomment that import before running this cell
render_er(models.Base.metadata, '../test-schema.png')

### Download and cache metadata from UniprotKB

In [None]:
# all distinct target_names
# (the same query as in the sanity checks, but de-duplicated; the order of `names` is arbitrary)
names = [row.target_name for row in session.query(models.CrisprDesign).all()]
names = list(set(names))
len(names)

In [None]:
# url to retrieve the top hit for a given search string from human proteins in tab-delimited format
# (note the explicit list of column names)
url = (
    'https://www.uniprot.org/uniprot/?'
    'query=reviewed:yes+AND+organism:9606+AND+%s&sort=score&format=tab&limit=1&'
    'columns=id,entry name,reviewed,protein names,genes,organism,length,comment(FUNCTION),families'
)

In [None]:
def get_uniprot(name):
    response = requests.get(url % name)
    if response.text:
        return pd.read_csv(io.StringIO(response.text), sep='\t')
    else:
        print('No result for %s' % name)
        return None

In [None]:
# one delayed get_uniprot call per target name, computed together with a progress bar
# (`rows` is a tuple with one entry per name; an entry is None where get_uniprot returned None)
tasks = [dask.delayed(get_uniprot)(name) for name in names]
with dask.diagnostics.ProgressBar():
    rows = dask.compute(*tasks)

In [None]:
# concatenate the per-target results and cache them as a CSV
# (pd.concat silently drops the None entries, but raises if every entry is None)
# NOTE: the output path is hard-coded to one user's Downloads directory
d = pd.concat(tuple(rows), axis=0)
d.to_csv('/Users/keith.cheveralls/Downloads/2019-12-16_top-uniprotKB-hit-for-all-targets.csv')

### Insert FACS

In [None]:
db_cli.insert_facs(session, facs_results_dir='../../opencell-off-git/results/')

### Insert microscopy datasets

These are the pipeline-related ML IDs from the 'Microscopy Master Key' google sheet.

In [None]:
db_cli.insert_microscopy_datasets(session)

### Insert microscopy images

Note that there are two MLs - ML0084 and ML0108 - that are not pipeline-related acquisitions but from which there are images in the PlateMicroscopy directory (and therefore in pm.md_raw).

In [None]:
# insert the PlateMicroscopy FOVs, pointing db_cli at a cache directory
# (the path is relative to the directory from which this notebook is run)
cache_dir = '../plate-microscopy-cache/20191114-ess/'
db_cli.insert_plate_microscopy_fovs(session, cache_dir=cache_dir)

### Determine how many FOVs were inserted (from the PlateMicroscopy directory only)

In [None]:
# shape of the raw metadata alongside the number of FOVs in the database
# NOTE: `pm` is never defined in this notebook as written; it presumably refers to a
# PlateMicroscopy manager instance that must be created before this section is run — confirm
pm.md_raw.shape, len(session.query(models.MicroscopyFOV).all())

In [None]:
# FOVs for controls are not inserted
num_controls = pm.md_raw.loc[pm.md_raw.well_id.isin(['A01', 'H12'])].shape[0]
num_controls

In [None]:
# FOVs from PML0084 and PML0108 are not inserted (because these acquisitions were not truly pipeline)
# (counted here as the rows of pm.md_raw whose pml_id has no MicroscopyDataset in the database)
pml_ids = [row.pml_id for row in session.query(models.MicroscopyDataset).all()]
num_nonpipeline = pm.md_raw.loc[~pm.md_raw.pml_id.isin(pml_ids)].shape[0]
num_nonpipeline

In [None]:
# the uninserted FOVs are likely the Jin samples that are not yet in the database
# (these are mostly in Plate6 column E)
pm.md_raw.shape[0] - num_controls - num_nonpipeline, len(session.query(models.MicroscopyFOV).all())

In [None]:
# the first microscopy FOV of the cell line retrieved for plate 'P0019', well 'H11'
fov = ops.PolyclonalLineOperations.from_plate_well(session, 'P0019', 'H11').cell_line.microscopy_fovs[0]

### FOVs from raw-pipeline-microscopy datasets

In [None]:
# number of FOVs in one example dataset
# NOTE: .first() returns None if no dataset has this pml_id, in which case .fovs raises an AttributeError
dataset = Session.query(models.MicroscopyDataset).filter(models.MicroscopyDataset.pml_id == 'PML0123').first()
len(dataset.fovs)

In [None]:
# the kinds of the results attached to the last FOV of the dataset
[r.kind for r in dataset.fovs[-1].results]

In [None]:
# the data of the second result of the first FOV (raises an IndexError if there are fewer than two)
dataset.fovs[0].results[1].data

### Insert FOV ROIs

In [None]:
# source and destination roots
# NOTE: these are absolute paths in one user's home directory; edit them for your machine
src_root = '/Users/keith.cheveralls/opencell-test/data/PlateMicroscopy/'
dst_root = '/Users/keith.cheveralls/opencell-test/output/opencell-microscopy/'

In [None]:
fovs = session.query(models.MicroscopyFOV).all()
len(fovs)

In [None]:
# a processor for the first FOV; display its target name
p = processors.FOVProcessor.from_database(fovs[0])
p.target_name

In [None]:
# crop the corner ROIs using the src_root and dst_root defined above
# (the same two paths that were previously repeated here as literals)
p.crop_corner_rois(src_root, dst_root)

In [None]:
session.close()

### Insert FOV thumbnails

In [None]:
# load an example z-projection from a hard-coded local path
im = imageio.imread(
    '/Users/keith.cheveralls/image-data/hoechst-examples/PML0223-6h30m/MMStack_1713-G9-21_C0-PROJ-Z.tif')
# crop: rows from 412 onward and the first 600 columns
im_raw = im[412:, :600]

In [None]:
# save every sixth pixel of the crop (in both dimensions), with intensities multiplied by 4, as a PNG
# NOTE(review): the multiplication happens in the image's own dtype, so bright pixels may overflow — check im.dtype
imageio.imsave('/Users/keith.cheveralls/image-data/tmp2.png', im_raw[::6, ::6]*4)

In [None]:
im.shape, im.dtype

In [None]:
def b64encode_as_png(im):
    """
    Encode an image as a PNG and return the PNG bytes as a base64 string.

    im : the image array to encode (anything imageio.imsave accepts for the 'png' format)

    Returns the base64-encoded PNG as a utf-8 str.
    """
    buffer = io.BytesIO()
    try:
        # write the PNG into the in-memory buffer rather than to disk
        imageio.imsave(buffer, im, format='png')
        png_bytes = buffer.getvalue()
    finally:
        buffer.close()
    return base64.b64encode(png_bytes).decode('utf-8')

In [None]:
# round-trip check: encode the crop, decode it again, and display the decoded image
# (the decoded array gets its own name so that `im`, from which `im_raw` is cropped,
# is not overwritten, and the cells above can be re-run safely)
s = b64encode_as_png(im_raw)
with io.BytesIO(base64.b64decode(s)) as file:
    im_decoded = imageio.imread(file)
plt.imshow(im_decoded);

### Insert sequencing results

In [None]:
# directory containing the sequencing QC spreadsheets listed below
# NOTE: this is an absolute path in one user's home directory; edit it for your machine
SEQ_ROOT = '/Users/keith.cheveralls/Box/PipelineSequencing/CRISPRessoOUT_QC_Spreadsheets/'

seq_sheet_filenames = {
    1: "mNGplate1REDO_sorted_CRISPResso_QC.xlsx", 
    2: "mNGplate2REDO_CRISPResso_QC.xlsx", 
    3: "mNGplate3REDO_HC_CRISPResso_QC.xlsx", 
    4: "mNGplate4REDO_sorted_CRISPResso_QC.xlsx", 
    5: "mNGplate5_CRISPResso_QC.xlsx", 
    6: "mNGplate6_CRISPResso_QC_HC.xlsx", 
    7: "mNGplate7_sorted_1to100_CRISPResso_QC.xlsx",
    8: "mNGplate8_CRISPResso_QC.xlsx",
    9: "mNGplate9_CRISPResso_QC.xlsx",
    10: "mNGplate10_CRISPResso_QC.xlsx",
    11: "mNGplate11_CRISPResso_QC.xlsx",
    12: "mNGplate12_sorted_CRISPResso_QC.xlsx",
    13: "mNGplate13sorted_CRISPResso_QC.xlsx",
    14: "mNGplate14_CRISPResso_QC.xlsx",
    15: "mNGplate15_CRISPResso_QC.xlsx",
    16: "mNGplate16_CRISPResso_QC.xlsx",
    17: "mNGplate17_CRISPResso_QC.xlsx",
}

In [None]:
def to_float(value):
    """
    Coerce a value to a float, returning None if it cannot be converted.

    Strings that do not parse as numbers (including the empty string) raise a ValueError
    and values of an unsupported type (e.g. None) raise a TypeError;
    both cases are treated as missing.
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        return None

In [None]:
# TODO: refactor this to use pandas to read the excel files
def read_sequencing_sheet(filepath):
    """
    Load the per-well HDR ratios from the first sheet of a sequencing QC spreadsheet.

    filepath : path to the spreadsheet (opened with xlrd)
        NOTE(review): xlrd 2.0+ reportedly reads only .xls files — confirm that
        the installed version still opens these .xlsx spreadsheets

    Returns a dataframe with one row per spreadsheet row from START_ROW_INDEX onward
    and the columns 'well_id', 'hdr_all', and 'hdr_modified'. The two HDR columns are floats
    in which missing, unparseable, error-valued, or >= 1 entries are NaN.
    """
    sheet = xlrd.open_workbook(filepath).sheet_by_index(0)

    # HACK: hard-coded columns corresponding to final HDR ratios (HDR/all and HDR/modified)
    WELL_ID_COLUMN_INDEX = 0
    HDR_ALL_COLUMN_INDEX = sheet.ncols - 2
    HDR_MODIFIED_COLUMN_INDEX = sheet.ncols - 1

    START_ROW_INDEX = 4
    row_inds = range(START_ROW_INDEX, sheet.nrows)

    def _read_ratio_column(column_ind):
        # read one HDR column as a float series, with NaN for missing values
        values = []
        for row_ind in row_inds:
            # xlrd reports Excel error cells as XL_CELL_ERROR, with the integer error code as
            # the cell value; presumably this is why missing values used to appear as 7 or 15
            if sheet.cell_type(row_ind, column_ind) == xlrd.XL_CELL_ERROR:
                values.append(None)
            else:
                values.append(to_float(sheet.cell_value(row_ind, column_ind)))

        # an explicit float dtype turns None into NaN, even when every value is missing
        ratios = pd.Series(values, dtype=float)

        # HACK: retained from the original handling of missing values: anything >= 1 is discarded
        # (the comparison is False for NaN, so missing values stay missing)
        return ratios.where(ratios < 1)

    return pd.DataFrame({
        'well_id': [sheet.cell_value(row_ind, WELL_ID_COLUMN_INDEX) for row_ind in row_inds],
        # overall percent HDR
        'hdr_all': _read_ratio_column(HDR_ALL_COLUMN_INDEX),
        # percent HDR of non-unmodified sequences
        'hdr_modified': _read_ratio_column(HDR_MODIFIED_COLUMN_INDEX),
    })

In [None]:
# load each spreadsheet, label its rows with the plate number, and concat all of them
# (the original per-sheet row indices are kept, so the index of the result is not unique)
sequencing_sheets = [
    read_sequencing_sheet(os.path.join(SEQ_ROOT, filename)).assign(plate_num=plate_num)
    for plate_num, filename in seq_sheet_filenames.items()
]
sequencing_data = pd.concat(sequencing_sheets, axis=0)
sequencing_data.shape

In [None]:
# for the rows in which hdr_all is missing, the per-plate count of non-null values in each column
sequencing_data.loc[sequencing_data.hdr_all.isna()].groupby('plate_num').count()

In [None]:
engine.url

In [None]:
# insert into the database
for ind, row in sequencing_data.iterrows():
    # skip the rows without an overall HDR ratio
    if pd.isna(row.hdr_all):
        continue
        
    plate_id = db_utils.format_plate_design_id(row.plate_num)
    well_id = db_utils.format_well_id(row.well_id)
    # report and skip the rows for which from_plate_well raises
    # NOTE: this catches every Exception, so unrelated errors (e.g. a lost connection) are also skipped
    try:
        pcl_ops = ops.PolyclonalLineOperations.from_plate_well(Session, plate_id, well_id)
    except Exception as error:
        print(str(error))
        continue
    
    # the two HDR ratios are passed as a pandas Series indexed by column name
    pcl_ops.insert_sequencing_dataset(Session, row[['hdr_all', 'hdr_modified']], errors='warn')

In [None]:
# number of sequencing datasets now in the database
# (counted by the database, rather than loading every row just to measure the list)
Session.query(models.SequencingDataset).count()