In [None]:
import os
import re
import io
import sys
import glob
import enum
import json
import dask
import xlrd
import base64
import imageio
import requests
import datetime
import numpy as np
import pandas as pd

import sqlalchemy as db
import sqlalchemy.orm
import sqlalchemy.ext.declarative

%load_ext autoreload
%autoreload 1

sys.path.append('../..')
from opencell import constants, file_utils
from opencell.database import models
from opencell.database import metadata_operations
from opencell.database import utils as db_utils

In [None]:
# Select the database to target by choosing exactly ONE credentials file.
# Alternative: '../../db-credentials-dev.json' (presumably the development database).
# Building the URL only once avoids a silently discarded first assignment
# and makes it explicit which database the destructive cells below will touch.
credentials_filepath = '../../db-credentials-cap.json'
url = db_utils.url_from_credentials(credentials_filepath)

# NOTE(review): the displayed URL may include the database password;
# confirm before sharing this notebook with its outputs
url

In [None]:
# create the engine for the database identified by `url`
engine = db.create_engine(url)
# session factory bound to that engine
session_factory = db.orm.sessionmaker(bind=engine)
# `Session` is the scoped-session registry; it is what the cells below
# pass to the opencell helpers and use for queries and rollbacks
Session = db.orm.scoped_session(session_factory)
# a session instance obtained from the registry
# (not referenced again in the cells visible in this notebook)
session = Session()

In [None]:
def load_sheet(filepath, sheet_name=None):
    '''
    Read one sheet of a sequencing-results excel workbook into a dataframe

    filepath : path to the excel file
    sheet_name : name of the sheet to read;
        the first sheet is read if this is None (or otherwise falsy)

    Returns a dataframe whose column names are taken from the first three rows
    and in which every 'Missing' entry has been replaced by NaN
    '''
    # fall back to the first sheet when no sheet name is given
    selected_sheet = sheet_name if sheet_name else 0

    # the first three rows of every sheet hold the (multi-level) column names
    raw_sheet = pd.read_excel(filepath, header=[0, 1, 2], sheet_name=selected_sheet)

    # all of the sheets use the literal string 'Missing' to mark NAs
    return raw_sheet.replace(to_replace='Missing', value=np.nan)

### Plates 1-19

In [None]:
# this is the multi-index column in which the well_ids appear
# (this looks wrong but is empirically determined);
# it is a full three-level key, one entry for each of the three header rows,
# and `parse_plates_1thru19` uses it to look up the well_id column
canonical_well_id_column = ('enrichment', 'read-trimming', 'repair type')

In [None]:
# the names of the columns containing the sequence counts by category
# (these are found within the ('sorted', 'post-filter', :) multi-index block)
# NOTE: this list is rebound to its lower-case form in the insertion section below,
# so the sheets must be parsed before that cell is run
data_columns = ['Unmodified', 'NHEJ', 'HDR', 'MIXED']

In [None]:
# directory containing the sequencing QC spreadsheets for plates 1-19
# (NOTE: this is an absolute path on one particular machine)
seq_results_dirpath = (
    '/Users/keith.cheveralls/Box/PipelineSequencing/CRISPRessoOUT_QC_Spreadsheets/'
)

# spreadsheet filenames keyed by plate_id
seq_results_filenames = {
    'P0001': 'mNGplate1REDO_sorted_CRISPResso_QC.xlsx',
    'P0002': 'mNGplate2REDO_Rerun_CRISPResso_QC.xlsx',
    'P0003': 'mNGplate3REDO_Rerun_CRISPResso_QC.xlsx',
    'P0004': 'mNGplate4REDO_sorted_CRISPResso_QC.xlsx',
    'P0005': 'mNGplate5_CRISPResso_QC.xlsx',
    'P0006': 'mNGplate6_Rerun_CRISPResso_QC.xlsx',
    'P0007': 'mNGplate7_sorted_1to100_CRISPResso_QC.xlsx',
    'P0008': 'mNGplate8_Rerun_CRISPResso_QC.xlsx',
    'P0009': 'mNGplate9_Rerun_CRISPResso_QC.xlsx',
    'P0010': 'mNGplate10_Rerun_CRISPResso_QC.xlsx',
    'P0011': 'mNGplate11_Rerun_CRISPResso_QC.xlsx',
    'P0012': 'mNGplate12_Rerun_CRISPResso_QC.xlsx',
    'P0013': 'mNGplate13sorted_CRISPResso_QC.xlsx',
    'P0014': 'mNGplate14_Rerun_CRISPResso_QC.xlsx',
    'P0015': 'mNGplate15_Rerun_CRISPResso_QC.xlsx',
    'P0016': 'mNGplate16_Rerun_CRISPResso_QC.xlsx',
    'P0017': 'mNGplate17_CRISPResso_QC.xlsx',
    'P0018': 'mNGplate18_Rerun_CRISPResso_QC.xlsx',
    'P0019': 'mNGplate19_Rerun_CRISPResso_QC.xlsx',
}

In [None]:
def parse_plates_1thru19(sheet, plate_id, well_id_column=None, count_columns=None):
    '''
    Extract the well_ids and sequencing counts from a sequencing spreadsheet
    in the format shared by plates 1-19

    sheet : dataframe with three-level multi-index columns (as returned by load_sheet)
    plate_id : the plate_id assigned to every row of the output
    well_id_column : the full multi-index key of the column containing the well_ids
        (defaults to the module-level `canonical_well_id_column`)
    count_columns : names of the count columns to take from
        the ('sorted', 'post-filter') block of the sheet
        (defaults to the module-level `data_columns`)

    Returns a new dataframe with a 'well_id' column, one column per count column,
    and a constant 'plate_id' column; the input sheet is not modified
    '''
    # resolve the defaults at call time so that explicit arguments
    # can be used independently of the notebook's global state
    if well_id_column is None:
        well_id_column = canonical_well_id_column
    if count_columns is None:
        count_columns = data_columns

    # the column of well_ids; `rename` returns a new series,
    # so the series held by the caller's sheet keeps its original name
    well_ids = sheet[well_id_column].rename('well_id')

    # the columns of data we need
    counts = sheet[('sorted', 'post-filter')][list(count_columns)]

    # both pieces come from the same sheet, so they are joined on the row index
    df_out = pd.merge(well_ids, counts, left_index=True, right_index=True)
    df_out['plate_id'] = plate_id

    return df_out

In [None]:
# load and concatenate the data for plates 1-19
sheets = []
for plate_id, filename in seq_results_filenames.items():
    filepath = os.path.join(seq_results_dirpath, filename)
    # read the plate's spreadsheet and reduce it to the well_id, count, and plate_id columns
    sheet = parse_plates_1thru19(load_sheet(filepath), plate_id)
    sheets.append(sheet)

# NOTE: `plate_id`, `filename`, `filepath`, and `sheet` remain bound
# to the values for the last plate after this loop finishes

# stack the per-plate dataframes row-wise
# (`ignore_index` is not set, so each dataframe's original row index is kept)
all_plates = pd.concat(tuple(sheets), axis=0)
all_plates.shape

In [None]:
# number of rows loaded for each plate
# (the column created by `parse_plates_1thru19` is 'plate_id'; there is no 'plate_num' column)
all_plates.plate_id.value_counts()

### Plate21

In [None]:
# this is a giant google sheet with sequencing results for resorts, plate21, and plate22
# that James shared with me on 2021-02-02
# (NOTE: this rebinds `filepath`, which the loop over plates 1-19 also assigns,
# and it is an absolute path on one particular machine)
filepath = '/Users/keith.cheveralls/Downloads/missing_completed_CRISPResso_analysis.xlsx'

In [None]:
# load the plate21 sheet (the sheet named 'CZBSeqLib003689') from the workbook at `filepath`
plate21 = load_sheet(filepath, sheet_name='CZBSeqLib003689')

In [None]:
# the 'status' column is either 'sorted' or 'resort'; keep only the 'sorted' rows
is_original_sort = plate21[('enrichment', 'read-trimming', 'status')] == 'sorted'

# with the resorts dropped, the sheet can be parsed in the same way as plates 1-19
plate21 = parse_plates_1thru19(plate21.loc[is_original_sort], plate_id='P0021')

### Plate22

In [None]:
# load the plate22 sheet (the sheet named 'CZBSeqLib004062') from the workbook at `filepath`
plate22 = load_sheet(filepath, sheet_name='CZBSeqLib004062')

In [None]:
# parse the well_ids from the unnamed column they appear in,
# and move them into the column in which they appear in the sheets for plates 1-19
# (each value is split on '_' and the second piece is taken as the well_id;
# presumably 'mNG name.1' is the name pandas gave to a second 'mNG name' column - TODO confirm)
plate22[canonical_well_id_column] = (
    plate22['enrichment', 'read-trimming', 'mNG name.1'].apply(lambda s: s.split('_')[1])
)

In [None]:
# now the sheet can be parsed as usual
# (the well_ids are now in the column that `parse_plates_1thru19` reads them from)
plate22 = parse_plates_1thru19(plate22, plate_id='P0022')

In [None]:
# append the rows for plates 21 and 22 to those for plates 1-19
# (NOTE: this rebinds `all_plates`, so re-running this cell appends the same rows again)
all_plates = pd.concat((all_plates, plate21, plate22), axis=0)

In [None]:
# (n_rows, n_columns) of the combined dataframe after adding plates 21 and 22
all_plates.shape

### Insert the sequencing results for plates 1-19, 21, and 22

This is for the original sorts (with `sort_count=1`) only.

In [None]:
# switch the count-column names to lower case
# (they appear in the spreadsheets as 'Unmodified', 'NHEJ', 'HDR', 'MIXED');
# this rebinds `data_columns`, which `parse_plates_1thru19` reads,
# so all of the sheets must already have been parsed before this cell is run
data_columns = list(map(str.lower, data_columns))

# lower-case every column name of the combined dataframe, in place
all_plates.rename(columns=str.lower, inplace=True)

In [None]:
# WARNING: destructive - this deletes every row of the sequencing_dataset table
# in the database that `engine` is connected to.
# An explicit transaction is used because Engine.execute() is deprecated in SQLAlchemy 1.4
# and removed in 2.0; the transaction is committed when the `with` block exits without error
# and rolled back if the statement raises.
with engine.begin() as connection:
    num_deleted = connection.execute(db.text('delete from sequencing_dataset;')).rowcount

# the number of rows that were deleted
num_deleted

In [None]:
# roll back the scoped session's current transaction
# (presumably to clear a failed or partial insert before re-running the loop below - TODO confirm)
Session.rollback()

In [None]:
# insert one sequencing dataset for each row (that is, for each plate and well)
for ind, row in all_plates.iterrows():

    counts = row[data_columns]
    total = counts.sum()

    # skip rows without usable counts (this also avoids dividing by zero below)
    if counts.isna().any() or total == 0:
        print(
            'Some NAs or all zeros for sequencing counts from plate %s well %s'
            % (row.plate_id, row.well_id)
        )
        continue

    well_id = db_utils.format_well_id(row.well_id)

    # look up the cell line using this row's own plate_id
    # (a bare `plate_id` here would refer to the variable left over
    # from the loop that loaded plates 1-19, not to the plate of this row)
    ops = metadata_operations.PolyclonalLineOperations.from_plate_well(
        Session, row.plate_id, well_id, sort_count=1
    )

    if not ops:
        print('No cell line found for plate %s well %s' % (row.plate_id, row.well_id))
        continue

    # insert the counts for each category as fractions of the row total
    # (the categories are 'unmodified', 'nhej', 'hdr', 'mixed')
    # NOTE(review): these are fractions in [0, 1], not multiplied by 100;
    # confirm this is the scale that insert_sequencing_dataset expects
    ops.insert_sequencing_dataset(Session, counts/total)

In [None]:
# the number of sequencing datasets now in the database
# (counted by the database itself, rather than by loading every row as an ORM object)
Session.query(models.SequencingDataset).count()