# OpenCell database corrections
__January 2020__<br>
__Keith Cheveralls__

This notebook documents various corrections to the OpenCell database that had to be performed manually. Usually, these corrections fixed typos, updated mislabeled entities, or filled in missing information.

In [None]:
import os
import re
import io
import sys
import glob
import enum
import json
import dask
import base64
import imageio
import requests
import datetime
import numpy as np
import pandas as pd
import sqlalchemy as db

import dask.diagnostics
import sqlalchemy.orm
import sqlalchemy.ext.declarative
from matplotlib import pyplot as plt

%load_ext autoreload
%autoreload 1

sys.path.append('..')
%aimport opencell.imaging
%aimport opencell.imaging.managers
%aimport opencell.imaging.processors
%aimport opencell.file_utils
%aimport opencell.database.operations

from opencell import constants, file_utils
from opencell.cli import database as db_cli
from opencell.database import models
from opencell.database import operations as ops
from opencell.database import utils as db_utils
from opencell.imaging import utils as im_utils
from opencell.imaging import images, managers, processors, viz

In [None]:
# placeholder for the scoped session; it is rebound once the database connection is created
Session = None


def execute_and_commit(session, command):
    """
    Execute a command on the given session and commit it.

    If either the execution or the commit raises, the session is rolled back
    and the exception is re-raised unchanged.

    Returns whatever `session.execute(command)` returned.
    """
    try:
        outcome = session.execute(command)
        session.commit()
    except Exception:
        session.rollback()
        raise
    else:
        return outcome

In [None]:
# discard any scoped session left over from a previous run of the connection cells
if Session:
    Session.remove()

# credentials for the database to be corrected
# (use '../db-credentials-cap.json' instead to target the cap database)
credentials_filepath = '../db-credentials-dev.json'
url = db_utils.url_from_credentials(credentials_filepath)
url

In [None]:
# connect to the database at the url constructed above
engine = db.create_engine(url)

# create the tables registered on models.Base.metadata
# (NOTE(review): relies on SQLAlchemy's create_all skipping tables that already exist - confirm)
models.Base.metadata.create_all(engine)

# the scoped session used by all of the cells below
session_factory = db.orm.sessionmaker(bind=engine)
Session = db.orm.scoped_session(session_factory)

### Fix target_name for SEPTs

Because of autoformatting in Excel, the names for SEPT targets (SEPT2, SEPT5, etc) appear as '[num]-Sep' in the spreadsheet used to populate the crispr_design table. This requires renaming of the form `'[num]-Sep'` to `'SEPT[num]'`.

Target names must be changed in two places: in the crispr_design table and in the filenames of all of the processed microscopy images (for now, this is easier than regenerating these images). 

Note that all 'children' of a cell line - the FACS dataset, sequencing results, microscopy FOVs - are inserted on `(plate_id, well_id)`, so the crispr_design table is the only place in the database where this change must be made (this is by design). 

In [None]:
# rename the target_names in the crispr_design table
# (each update is restricted to rows whose name is exactly '[num]-Sep', so that a short pattern
# like '2-Sep' cannot also rewrite part of a longer name like '12-Sep')
for num in [2, 5, 6, 7, 8, 9, 10, 11]:
    command = (
        "UPDATE crispr_design SET target_name = 'SEPT%d' WHERE target_name = '%d-Sep';"
        % (num, num)
    )
    execute_and_commit(Session, command)

In [None]:
# processed microscopy images - conveniently, all of the SEPTs were on Plate10
proj_filepaths = glob.glob('/Volumes/ml_group/opencell-microscopy-2/proj/czML0383-P0010/*.tif')
proj_filepaths.sort()

crop_filepaths = glob.glob('/Volumes/ml_group/opencell-microscopy-2/crop/czML0383-P0010/*.png')
crop_filepaths.sort()

# display the number of files of each kind
len(proj_filepaths), len(crop_filepaths)

In [None]:
# rename processed microscopy images (both the projections and the crops)
# each filename is split on '_' into exactly three parts: prefix, target name, and suffix

# set to True to actually rename the files; when False, the planned renames are only printed
rename_files = False

# the mis-formatted SEPT names appear in the filenames as '[num]Sep';
# the second digit may be zero so that '10Sep' is matched as well
sept_pattern = re.compile(r'^([1-9][0-9]?)Sep$')

for src_filepath in proj_filepaths + crop_filepaths:
    src_dirpath, src_filename = os.path.split(src_filepath)
    prefix, target_name, suffix = src_filename.split('_')

    match = sept_pattern.match(target_name)
    if match is None:
        continue

    new_target_name = 'SEPT%s' % match.group(1)
    dst_filename = '%s_%s_%s' % (prefix, new_target_name, suffix)
    print('%s :: %s' % (src_filename, dst_filename))

    if rename_files:
        # build the destination from the directory so that only the filename itself is changed
        os.rename(src_filepath, os.path.join(src_dirpath, dst_filename))

### Fix the target name for (P0016, A12)
This line had no name in the original spreadsheet (instead, the template sequence was copied into the target_name column). By searching for the transcript_id, I determined that it is ATXN2L.

In [None]:
# look up the crispr design(s) for (P0016, A12) and display how many were found
designs = (
    Session.query(models.CrisprDesign)
    .filter(
        models.CrisprDesign.plate_design_id == 'P0016',
        models.CrisprDesign.well_id == 'A12',
    )
    .all()
)
design = designs[0]
len(designs)

In [None]:
# set the correct target name and persist it; if the commit fails, roll the session back
# so that it remains usable (the same pattern as in execute_and_commit)
design.target_name = 'ATXN2L'
try:
    Session.add(design)
    Session.commit()
except Exception:
    Session.rollback()
    raise

In [None]:
# processed microscopy images for the plate containing the mislabeled target (P0016)
proj_filepaths = glob.glob('/Volumes/ml_group/opencell-microscopy-2/proj/czML0383-P0016/*.tif')
proj_filepaths.sort()

crop_filepaths = glob.glob('/Volumes/ml_group/opencell-microscopy-2/crop/czML0383-P0016/*.png')
crop_filepaths.sort()

# display the number of files of each kind
len(proj_filepaths), len(crop_filepaths)

In [None]:
# the target name that should appear in the filenames
correct_atxn2l_name = 'ATXN2L'
# the string that the rename loop below looks for in the target-name position of each filename
# (presumably the template sequence that was copied into the target_name column - TODO confirm)
wrong_atxn2l_name = 'GCACCACGGCGCTCGGGCAGTGCTTCTCGATGTAGTCCTGGAAGCCCTGCACGCCACCACTTCCTGGACCTTGAAACAAAACTTCCAATCCGCCACCCATCATATCGGTAAAGGCCTTTTGCCACTCCTTGAAGTTGAGCTCGGTCATGGCGGCGGCGGGGGCGCGGGCGCGGGTGCGGGCGGGGG'

In [None]:
# rename processed microscopy images (both the projections and the crops)
# each filename is split on '_' into exactly three parts: prefix, target name, and suffix
# (NOTE(review): assumes the crop filenames follow the same three-part form as the projections - confirm)

# set to False to only print the planned renames without touching the files
rename_files = True

for src_filepath in proj_filepaths + crop_filepaths:
    src_dirpath, src_filename = os.path.split(src_filepath)
    prefix, target_name, suffix = src_filename.split('_')
    if target_name != wrong_atxn2l_name:
        continue

    dst_filename = '%s_%s_%s' % (prefix, correct_atxn2l_name, suffix)
    print('%s :: %s' % (src_filename, dst_filename))

    if rename_files:
        # build the destination from the directory so that only the filename itself is changed
        os.rename(src_filepath, os.path.join(src_dirpath, dst_filename))

### Editing annotation categories

This was originally done to add the 're_image' flag to all annotations that included one or more of the flags 'confluency_off', 'over_exposed', or 'disk_artifact'.

In [None]:
def build_url(cell_line_id, base_url='http://cap.czbiohub.org:5001'):
    """
    Build the URL of the annotations endpoint for one cell line.

    cell_line_id : the id of the cell line; it is interpolated into the URL as-is
    base_url : scheme, host, and port of the server; defaults to the cap server
        (pass 'http://localhost:5000' to target a locally running server instead)

    Returns a string of the form '<base_url>/annotations/<cell_line_id>'.
    """
    # tolerate a trailing slash on base_url so that the path never contains '//'
    return f"{base_url.rstrip('/')}/annotations/{cell_line_id}"

In [None]:
# the legacy FOV-related QC categories that should be condensed into the 're_image' category
fov_qc_categories = [
    'over_exposed',
    'disk_artifact',
    'confluency_off',
]

In [None]:
# the ids of all cell lines that have an annotation
annotation_query = Session.query(models.CellLineAnnotation)
ants = annotation_query.all()
cell_line_ids = [annotation.cell_line_id for annotation in ants]

# display the number of annotated cell lines
len(cell_line_ids)

In [None]:
# set to True to push the updated annotations back to the server (when False, this is a dry run)
overwrite = False

# seconds to wait on each request, so that an unresponsive server cannot hang the loop
request_timeout = 30

fov_qc_category_set = set(fov_qc_categories)

for cell_line_id in cell_line_ids:
    # named annotation_url (rather than url) so that the database url is not overwritten by this loop
    annotation_url = build_url(cell_line_id)

    result = requests.get(annotation_url, timeout=request_timeout)
    if not result.ok:
        print('Error retrieving annotation for %s (status %s)' % (cell_line_id, result.status_code))
        continue
    data = result.json()

    # only annotations with at least one of the legacy FOV QC categories need the new flag
    categories = data.get('categories')
    if not categories or not fov_qc_category_set.intersection(categories):
        continue

    # nothing to do if the flag is already present (e.g., from a previous run of this cell)
    if 're_image' in categories:
        continue

    print('Updating cell_line %s (categories %s)' % (cell_line_id, categories))

    # append the 're_image' flag, keeping the existing categories in their original order
    data['categories'] = list(categories) + ['re_image']

    # push the new annotation JSON to the database
    if overwrite:
        r = requests.put(
            annotation_url,
            data=json.dumps(data),
            headers={'Content-Type': 'application/json'},
            timeout=request_timeout)
        if r.status_code != 200:
            print('Error updating %s' % cell_line_id)

In [None]:
# build the annotation url for a single cell line, for a manual spot check
# (NOTE(review): the index 111 looks arbitrary - confirm; this also rebinds `url`,
# which earlier in the notebook held the database url)
url = build_url(cell_line_ids[111])

In [None]:
# retrieve the annotation at `url` and display the parsed JSON
result = requests.get(url)
data = result.json()
data

In [None]:
# send the `data` dict back to the same url as a JSON-encoded PUT request
r = requests.put(
    url, 
    data=json.dumps(data), 
    headers={'Content-Type': 'application/json'})

In [None]:
# display the status code of the PUT request
# (the update loop earlier in this notebook treats anything other than 200 as an error)
r.status_code