In [None]:
import os
import re
import io
import sys
import glob
import enum
import json
import dask
import requests
import datetime
import numpy as np
import pandas as pd

import dask.diagnostics
import sqlalchemy as db
import sqlalchemy.orm
import sqlalchemy.ext.declarative
from matplotlib import pyplot as plt

# add the directory two levels above the working directory to the import path
# (presumably the repo root that contains the `opencell` package imported below - TODO confirm)
sys.path.append('../..')

# with `%autoreload 1`, only the modules registered via %aimport are reloaded
# before each cell runs, so edits to uniprot_utils are picked up without a kernel restart
%load_ext autoreload
%autoreload 1
%aimport opencell.database.uniprot_utils

from opencell.database import models, uniprot_utils
from opencell.database import operations as ops
from opencell.database import utils as db_utils

In [None]:
# Build the database URL from one explicitly chosen credentials file.
# Only a single file is loaded, so it is unambiguous which database the rest
# of the notebook talks to (and the unused credentials file need not exist).
# To connect to the other database, point this at '../../db-credentials-dev.json'.
credentials_filepath = '../../db-credentials-cap.json'
url = db_utils.url_from_credentials(credentials_filepath)

# NOTE(review): if the URL embeds a password, displaying it stores the password
# in the saved notebook output - clear this output before sharing the notebook
url

In [None]:
# create the SQLAlchemy engine for the database URL built above, and a scoped
# session registry bound to that engine; `Session` is used below for ORM queries
# and `engine` is used below for raw SQL via pandas
engine = db.create_engine(url)
session_factory = db.orm.sessionmaker(bind=engine)
Session = db.orm.scoped_session(session_factory)

### Download and cache metadata from UniProtKB

In [None]:
# load every CrisprDesign row (not just the target names); the target_name and
# transcript_id attributes of these designs are used below to query UniprotKB
designs = Session.query(models.CrisprDesign).all()
len(designs)

In [None]:
# spot-check the metadata lookup by gene name using a single design
# (111 is presumably an arbitrary index; it raises IndexError if there are fewer than 112 designs)
uniprot_utils.get_uniprot_metadata(gene_name=designs[111].target_name)

In [None]:
# spot-check the raw UniprotKB query for one hard-coded query string
# (assumes `limit` caps the number of hits returned - TODO confirm in uniprot_utils)
uniprot_utils.query_uniprotkb('CAD', limit=2)

In [None]:
# retrieve and concatenate metadata for all crispr designs
# (one lazy dask task per design, each querying UniprotKB with the design's
# transcript_id and limit=1; nothing is requested until dask.compute is called)
tasks = [
    dask.delayed(uniprot_utils.query_uniprotkb)(design.transcript_id, limit=1) 
    for design in designs
]

# execute all of the queries, showing a progress bar while they run
with dask.diagnostics.ProgressBar():
    rows = dask.compute(*tasks)

# stack the per-design results row-wise; the original index of each result is kept,
# so index values may repeat in `metadata`
metadata = pd.concat(tuple(rows), axis=0)

In [None]:
# cache the concatenated metadata as a CSV in the current user's Downloads directory
# (the home directory is resolved at runtime instead of being hard-coded to one machine)
metadata.to_csv(
    os.path.expanduser('~/Downloads/2020-05-21_top-uniprotKB-hit-for-all-targets.csv')
)

In [None]:
# check for rows without a uniprot_id
# (displays the subset of `metadata` whose uniprot_id is null;
# an empty result means every retrieved row has an id)
metadata.loc[metadata.uniprot_id.isna()]

### Developing a regex to extract the primary name from the `protein_names` column

In [None]:
# load every crispr design together with its uniprot metadata from the database;
# the inner join drops designs whose uniprot_id has no matching row in uniprot_metadata
md = pd.read_sql(
    '''select * from crispr_design inner join uniprot_metadata using (uniprot_id)''',
    engine
)

In [None]:
# BUD23 is a troubling edge case in which the primary name contains
# a pair of parentheses that should be retained
# (the first matching row is used; .iloc[0] raises IndexError if BUD23 is absent from `md`)
row = md.loc[md.target_name == 'BUD23'].iloc[0]
row.protein_names

In [None]:
# see what the existing helper in uniprot_utils returns for this edge case
uniprot_utils.prettify_uniprot_protein_name(row.protein_names)

In [None]:
# first attempt: group 1 lazily captures everything before the first ' (',
# and group 2 captures the parenthesized span that follows, up to the first ') ';
# note that re.match returns None (so .groups() raises AttributeError)
# when the value has no parenthesized span followed by a space
result = re.match(r'^(.*?) (\(.*?\) )', row.protein_names)
result.groups()

In [None]:
# test the pattern on all protein_names:
# group 1 is the candidate primary name, i.e. the text that precedes
# the trailing run of ' (...)' groups at the end of the value
for val in md.protein_names:
    result = re.match(r'^(.*?)(?: \(.*?\))*$', val)

    # the pattern did not match: show the offending value and move on to the next one
    # (falling through would call .groups() on None and raise AttributeError)
    if result is None:
        print(val)
        continue

    print(result.groups()[0])

In [95]:
# split each gene_names entry on single spaces
# NOTE(review): the result of split() is discarded, so as written this loop only
# verifies that every entry supports .split (a null entry would raise AttributeError);
# confirm whether the split names were meant to be collected
for gene_names in md.gene_names:
    gene_names.split(' ')