In [None]:
import os
import re
import io
import sys
import glob
import enum
import json
import dask
import xlrd
import base64
import imageio
import requests
import datetime
import numpy as np
import pandas as pd

import sqlalchemy as db
import sqlalchemy.orm
import sqlalchemy.ext.declarative

%load_ext autoreload
%autoreload 1

sys.path.append('..')
from opencell import constants, file_utils
from opencell.database import models
from opencell.database import operations as ops
from opencell.database import utils as db_utils

In [None]:
# Credentials file for the database that this notebook reads from and writes to.
# To target the dev database instead, point this at '../db-credentials-dev.json'.
CREDENTIALS_FILEPATH = '../db-credentials-cap.json'

# NOTE(review): if `url` is a plain string rather than a URL object, displaying it
# may show the database password in the saved cell output - confirm before sharing.
url = db_utils.url_from_credentials(CREDENTIALS_FILEPATH)
url

In [None]:
# create the database engine from the URL, and a scoped session registry bound to it
engine = db.create_engine(url)
session_factory = db.orm.sessionmaker(bind=engine)
Session = db.orm.scoped_session(session_factory)
# NOTE(review): the cells visible in this notebook all use `Session` directly;
# `session` does not appear to be referenced afterwards - confirm before removing
session = Session()

### Insert sequencing results

In [None]:
# directory that contains the CRISPResso QC spreadsheets listed below
# NOTE(review): this is an absolute path on one user's machine and must be edited to run elsewhere
SEQ_ROOT = '/Users/keith.cheveralls/Box/PipelineSequencing/CRISPRessoOUT_QC_Spreadsheets/'

# filename of the QC spreadsheet for each plate, keyed by plate number
# (the filenames do not follow a single naming pattern, so they are listed explicitly)
seq_sheet_filenames = {
    1: "mNGplate1REDO_sorted_CRISPResso_QC.xlsx", 
    2: "mNGplate2REDO_CRISPResso_QC.xlsx", 
    3: "mNGplate3REDO_HC_CRISPResso_QC.xlsx", 
    4: "mNGplate4REDO_sorted_CRISPResso_QC.xlsx", 
    5: "mNGplate5_CRISPResso_QC.xlsx", 
    6: "mNGplate6_CRISPResso_QC_HC.xlsx", 
    7: "mNGplate7_sorted_1to100_CRISPResso_QC.xlsx",
    8: "mNGplate8_CRISPResso_QC.xlsx",
    9: "mNGplate9_CRISPResso_QC.xlsx",
    10: "mNGplate10_CRISPResso_QC.xlsx",
    11: "mNGplate11_CRISPResso_QC.xlsx",
    12: "mNGplate12_sorted_CRISPResso_QC.xlsx",
    13: "mNGplate13sorted_CRISPResso_QC.xlsx",
    14: "mNGplate14_CRISPResso_QC.xlsx",
    15: "mNGplate15_CRISPResso_QC.xlsx",
    16: "mNGplate16_CRISPResso_QC.xlsx",
    17: "mNGplate17_CRISPResso_QC.xlsx",
}

In [None]:
def to_float(value):
    """
    Convert a value to a float, returning None when it cannot be converted.

    Parameters
    ----------
    value : anything accepted by `float` (numbers, numeric strings);
        any other value (empty or non-numeric strings, None, containers) yields None

    Returns
    -------
    float or None
    """
    try:
        return float(value)
    # float() raises ValueError for unparseable strings
    # but TypeError for None and other non-numeric objects
    except (TypeError, ValueError):
        return None

In [None]:
# TODO: refactor this to use pandas to read the excel files
def read_sequencing_sheet(filepath):
    """
    Read the per-well HDR ratios from the first sheet of a CRISPResso QC spreadsheet.

    The sheet layout is hard-coded: the data rows start at row index 4, the well ids
    are in the first column, and the last two columns hold the final HDR ratios
    (HDR/all and HDR/modified, in that order).

    Parameters
    ----------
    filepath : path to a spreadsheet that the installed version of xlrd can open

    Returns
    -------
    pandas.DataFrame with one row per data row of the sheet and the columns
    'well_id', 'hdr_all', and 'hdr_modified'. The two ratio columns are floats;
    a ratio is NaN when its cell is empty, is a spreadsheet error, cannot be parsed
    as a number, or is greater than one.
    """
    sheet = xlrd.open_workbook(filepath).sheet_by_index(0)

    # HACK: hard-coded columns corresponding to final HDR ratios (HDR/all and HDR/modified)
    WELL_ID_COLUMN_INDEX = 0
    HDR_ALL_COLUMN_INDEX = sheet.ncols - 2
    HDR_MODIFIED_COLUMN_INDEX = sheet.ncols - 1
    START_ROW_INDEX = 4

    # Only numeric and text cells can hold a ratio. In particular, xlrd represents
    # spreadsheet errors as cells of type XL_CELL_ERROR whose value is an integer error code
    # (e.g., 7 for #DIV/0! and 15 for #VALUE!, but also 0 for #NULL!),
    # so error cells must be recognized by their type and not by their value
    VALUE_CELL_TYPES = (xlrd.XL_CELL_NUMBER, xlrd.XL_CELL_TEXT)

    def read_ratio(row_ind, col_ind):
        # return the ratio in one cell as a float, or None if it is missing or invalid
        cell = sheet.cell(row_ind, col_ind)
        if cell.ctype not in VALUE_CELL_TYPES:
            return None
        value = to_float(cell.value)
        # ratios cannot exceed one, so larger values are treated as missing
        # (written as `not (value <= 1)` so that NaN is rejected as well)
        if value is None or not (value <= 1):
            return None
        return value

    records = [
        (
            sheet.cell_value(row_ind, WELL_ID_COLUMN_INDEX),
            # overall percent HDR
            read_ratio(row_ind, HDR_ALL_COLUMN_INDEX),
            # percent HDR of non-unmodified sequences
            read_ratio(row_ind, HDR_MODIFIED_COLUMN_INDEX),
        )
        for row_ind in range(START_ROW_INDEX, sheet.nrows)
    ]

    d = pd.DataFrame(data=records, columns=['well_id', 'hdr_all', 'hdr_modified'])

    # force a float dtype (with NaN for missing values) even if a whole column is missing
    for column in ('hdr_all', 'hdr_modified'):
        d[column] = d[column].astype(float)

    return d

In [None]:
# read the spreadsheet for each plate, label its rows with the plate number,
# and stack all of the plates into a single dataframe
sequencing_data = pd.concat(
    [
        read_sequencing_sheet(os.path.join(SEQ_ROOT, filename)).assign(plate_num=plate_num)
        for plate_num, filename in seq_sheet_filenames.items()
    ],
    axis=0,
)
sequencing_data.shape

In [None]:
# tally, for each plate, the rows whose overall HDR ratio is missing
missing_hdr_all = sequencing_data.hdr_all.isna()
sequencing_data.loc[missing_hdr_all].groupby('plate_num').count()

In [None]:
# display the URL of the engine to double-check which database the insert below targets
engine.url

In [None]:
# insert the HDR ratios of each well into the database
for row_label, row in sequencing_data.iterrows():

    # only wells that have an overall HDR ratio are inserted
    if not pd.isna(row.hdr_all):
        plate_id = db_utils.format_plate_design_id(row.plate_num)
        well_id = db_utils.format_well_id(row.well_id)

        try:
            pcl_ops = ops.PolyclonalLineOperations.from_plate_well(Session, plate_id, well_id)
        except Exception as error:
            # report the well that could not be looked up and move on to the next one
            print(str(error))
        else:
            pcl_ops.insert_sequencing_dataset(
                Session, row[['hdr_all', 'hdr_modified']], errors='warn'
            )

In [None]:
# number of sequencing datasets in the database
# (counted by the database itself instead of loading every row into memory)
Session.query(models.SequencingDataset).count()