In [None]:
import os
import re
import sys
import enum
import json
import datetime

import numpy as np
import pandas as pd
# from eralchemy import render_er

import sqlalchemy as db
import sqlalchemy.orm
import sqlalchemy.ext.declarative

In [None]:
# enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell is executed, so edits to the opencell package are picked up
%load_ext autoreload
%autoreload 2

In [None]:
# add the parent directory to the import path
# (presumably so that the `opencell` package can be imported from there)
sys.path.append('..')
from opencell import constants, file_utils
from opencell.database import operations as ops
# NOTE(review): `utils` is imported from both opencell.database (next line)
# and opencell.imaging (the line after); the second import rebinds the name,
# so every later `utils.` reference resolves to opencell.imaging.utils
from opencell.database import models, populate, utils
from opencell.imaging import image, plate_microscopy_api, utils, viz

In [None]:
# sanity check: format several spellings of the same plate id
# and collect the distinct results
{
    utils.format_plate_design_id(raw_id)
    for raw_id in [123, 'plate123', 'Plate 123', 'P0123']
}

In [None]:
# docker test db: get the database url from a local credentials file
url = utils.url_from_credentials('../test_credentials.json')

In [None]:
# NOTE(review): this cell rebinds `url`, so when it is run after the docker test db
# cell, every later cell (including the ones that drop tables) targets this server;
# the url also embeds a username and password in plain text
url = 'postgresql://postgres:password@cap.czbiohub.org:5433/pipeline_db'

## Creating and populating the database

In [None]:
# manually drop and create the schema
# (runs against whatever database `url` currently points to)
engine = db.create_engine(url)

# safety switch: the tables are only dropped when this flag is True
im_sure = True
if im_sure:
    print('Dropping all tables')
    models.Base.metadata.drop_all(engine)

# the tables are created whether or not the drop above was executed
print('Creating all tables')
models.Base.metadata.create_all(engine)

In [None]:
# populate the database
# note: drop_all=False is passed here, so presumably the existing tables are kept
# TODO: fewer and less verbose warnings when drop_all=False
# safety switch: populate is only called when this flag is True
im_sure = True
if im_sure:
    populate.populate(url, drop_all=False, errors='ignore')

In [None]:
# visualize the schema
# NOTE(review): the `from eralchemy import render_er` line in the import cell is
# commented out, so this cell raises a NameError unless that import is restored
render_er(models.Base.metadata, '../test-schema.png')

## Accessing the database

In [None]:
# create an engine for the database at `url` and open a session bound to it;
# this `session` is the one used by all of the cells below
engine = db.create_engine(url)
Session = db.orm.sessionmaker(bind=engine)
session = Session()

In [None]:
# fetch and display all MasterCellLine records
session.query(models.MasterCellLine).all()

In [None]:
# roll back the current transaction of the session
session.rollback()

In [None]:
# count the number of cell lines
# (loads every CellLine record into `lines`, then takes the length of that list)
lines = session.query(models.CellLine).all()
len(lines)

In [None]:
# close the session
# NOTE(review): the cells below keep using this same `session` object after the close
session.close()

## Inserting FACS results

In [None]:
# load the cached FACS results:
# the properties table from a CSV file and the histograms from a JSON file
# (both are identified by plate_id and well_id in the cells below)
facs_properties = pd.read_csv('../results/2019-07-16_all-facs-results.csv')
with open('../../opencell-off-git/results/2019-07-16_all-dists.json', 'r') as file:
    facs_histograms = json.load(file)

In [None]:
# key the histograms by (plate_id, well_id)
d = {}
for row in facs_histograms:
    d[(row['plate_id'], row['well_id'])] = row
facs_histograms = d

In [None]:
# preview the first rows of the FACS properties table
facs_properties.head()

In [None]:
# look up the polyclonal line for one example plate/well and display its cell line
plate_id, well_id = 'P0019', 'A11'
pcl_ops = ops.PolyclonalLineOperations.from_plate_well(session, plate_id, well_id)
pcl_ops.cell_line

In [None]:
# insert the FACS results for every row of the FACS properties table
for ind, facs_row in facs_properties.iterrows():
    plate_id = facs_row.plate_id
    well_id = utils.format_well_id(facs_row.well_id)

    # the polyclonal line
    # (if the lookup raises a ValueError, the error is printed and the row is skipped)
    try:
        pcl_ops = ops.PolyclonalLineOperations.from_plate_well(session, plate_id, well_id)
    except ValueError as error:
        print(error)
        continue

    # the histograms (dict of 'x', 'y_sample', 'y_fitted_ref')
    # note: keyed by unformatted well_id, so this lookup must happen
    # before the id columns are dropped from the row
    histograms = facs_histograms.get((facs_row.plate_id, facs_row.well_id))

    # drop the id columns from the row that is actually inserted
    # (this used to operate on a stale `row` variable left over from another cell)
    facs_row = facs_row.drop(['plate_id', 'well_id'])
    pcl_ops.insert_facs_results(session, histograms, facs_row, errors='ignore')

In [None]:
# fetch all FACSResults records
f = session.query(models.FACSResults).all()

In [None]:
# the cell line of the polyclonal line at plate 'P0005', well 'G11'
line = ops.PolyclonalLineOperations.from_plate_well(session, 'P0005', 'G11').cell_line

In [None]:
# display the first FACS result of this line as a dict,
# with the 'histograms' entry removed before display
d = line.facs_results[0].as_dict()
_ = d.pop('histograms')
d

## Inserting microscopy datasets

These are the pipeline-related ML IDs from the 'Microscopy Master Key' Google Sheet.

In [None]:
# load the microscopy master key
# (presumably a table with one row per microscopy dataset - TODO confirm)
exp_md = file_utils.load_microscopy_master_key()

In [None]:
# TODO: loop over rows of exp_md and insert each dataset
dataset = models.MicroscopyDataset(
    pml_id=pml_id, 
    date=date, 
    user=user, 
    description=description,
    root_directory='plate_microscopy')

## Inserting sequencing results

In [None]:
# directory that contains the sequencing QC spreadsheets
# NOTE(review): this is an absolute, user-specific local path,
# so the sequencing cells only run on a machine that has this directory
SEQ_ROOT = '/Users/keith.cheveralls/Box-cache/PipelineSequencing/CRISPRessoOUT_QC_Spreadsheets/'

# filename of the spreadsheet for each plate, keyed by plate number
seq_sheet_filenames = {
    1: "mNGplate1REDO_sorted_CRISPResso_QC.xlsx", 
    2: "mNGplate2REDO_CRISPResso_QC.xlsx", 
    3: "mNGplate3REDO_HC_CRISPResso_QC.xlsx", 
    4: "mNGplate4REDO_sorted_CRISPResso_QC.xlsx", 
    5: "mNGplate5_CRISPResso_QC.xlsx", 
    6: "mNGplate6_CRISPResso_QC_HC.xlsx", 
    7: "mNGplate7_sorted_1to100_CRISPResso_QC.xlsx",
    8: "mNGplate8_CRISPResso_QC.xlsx",
    9: "mNGplate9_CRISPResso_QC.xlsx",
    10: "mNGplate10_CRISPResso_QC.xlsx",
    11: "mNGplate11_CRISPResso_QC.xlsx",
}

In [None]:
def to_float(value):
    """
    Coerce a value to float, returning None when it cannot be converted.

    Parameters
    ----------
    value : anything accepted by the built-in float() (numbers, numeric strings);
        any other value (e.g. '', 'n/a', None, a list) yields None

    Returns
    -------
    The value as a float, or None if float() rejects it
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        # ValueError: a string that is not a number (e.g. the empty string)
        # TypeError: a type float() does not accept at all (e.g. None)
        return None

In [None]:
# TODO: refactor this to use pandas to read the excel files
def read_sequencing_sheet(filepath):
    """
    Read the well ids and the two HDR ratios from the first sheet of an excel workbook.

    Parameters
    ----------
    filepath : path to the excel file (opened with xlrd)

    Returns
    -------
    A dataframe with one row per data row of the sheet and three columns:
    'well_id' (from the first column of the sheet), 'hdr_all' (from the
    second-to-last column), and 'hdr_modified' (from the last column).
    HDR values that cannot be coerced to float, or that are not less than 1,
    are replaced by a missing value.
    """
    # imported locally because the notebook's import cell does not import xlrd
    import xlrd

    def _below_one_or_none(value):
        # keep a value only if it is a number less than 1
        # (None and NaN both fail this test and are mapped to None)
        return value if (value is not None and value < 1) else None

    sheet = xlrd.open_workbook(filepath).sheet_by_index(0)
    num_rows = sheet.nrows
    num_cols = sheet.ncols

    # HACK: hard-coded columns corresponding to final HDR ratios (HDR/all and HDR/modified)
    WELL_ID_COLUMN_INDEX = 0
    HDR_ALL_COLUMN_INDEX = num_cols - 2
    HDR_MODIFIED_COLUMN_INDEX = num_cols - 1

    # rows up to and including START_ROW_INDEX are skipped
    START_ROW_INDEX = 3
    data_rows = range(START_ROW_INDEX + 1, num_rows)

    def _read_column(column_index):
        # the raw cell values of one column, restricted to the data rows
        return [sheet.cell_value(row_index, column_index) for row_index in data_rows]

    # well_ids
    well_ids = _read_column(WELL_ID_COLUMN_INDEX)

    # overall percent HDR
    hdr_all = np.array(_read_column(HDR_ALL_COLUMN_INDEX))

    # percent HDR of non-unmodified sequences
    hdr_modified = np.array(_read_column(HDR_MODIFIED_COLUMN_INDEX))

    d = pd.DataFrame(
        data=list(zip(well_ids, hdr_all, hdr_modified)),
        columns=['well_id', 'hdr_all', 'hdr_modified'])

    for column in ('hdr_all', 'hdr_modified'):
        # coerce to float
        d[column] = d[column].apply(to_float)

        # HACK: deal with missing/NaN values,
        # which for some reason are loaded by xlrd as either the number 7 or 15
        # (these look like xlrd's error-cell codes for #DIV/0! and #VALUE! - TODO confirm)
        d[column] = [_below_one_or_none(val) for val in d[column]]

    return d

In [None]:
# load and concat all of the spreadsheets,
# tagging the rows of each one with the plate number it belongs to
# (iterating over the filename mapping itself, in plate order, keeps this loop
# in sync with `seq_sheet_filenames` instead of relying on a hard-coded range)
sequencing_data = pd.concat(
    [
        read_sequencing_sheet(os.path.join(SEQ_ROOT, filename)).assign(plate_num=plate_num)
        for plate_num, filename in sorted(seq_sheet_filenames.items())
    ],
    axis=0)

In [None]:
# insert the sequencing results of every (plate, well) row into the database
for ind, row in sequencing_data.iterrows():

    plate_id = utils.format_plate_design_id(row.plate_num)
    well_id = utils.format_well_id(row.well_id)

    try:
        # the polyclonal line that corresponds to this plate_id and well_id
        pcl_ops = ops.PolyclonalLineOperations.from_plate_well(session, plate_id, well_id)
    except ValueError as error:
        # report the failed lookup and leave this row out
        print(error)
    else:
        # only the remaining (non-id) columns are passed to the insert
        row = row.drop(['plate_num', 'well_id'])
        pcl_ops.insert_sequencing_results(session, row, errors='ignore')