In [None]:
import os
import re
import io
import sys
import glob
import enum
import json
import dask
import requests
import datetime
import numpy as np
import pandas as pd

import dask.diagnostics
import sqlalchemy as db
import sqlalchemy.orm
import sqlalchemy.ext.declarative
from matplotlib import pyplot as plt

# NOTE(review): autoreload mode 1 reloads only modules registered with `%aimport`,
# and no `%aimport` is visible in this cell - confirm that the opencell modules
# imported below are actually being reloaded as intended
%load_ext autoreload
%autoreload 1

from opencell.database import models
from opencell.database import operations as ops
from opencell.database import utils as db_utils

In [None]:
# Credentials file for the database to connect to. Only one database is used per run;
# point this at '../db-credentials-dev.json' to target the dev database instead.
credentials_filepath = '../db-credentials-cap.json'
url = db_utils.url_from_credentials(credentials_filepath)

# NOTE(review): the url itself is not echoed, because it presumably embeds the
# database password - confirm before displaying it. Show which file was used instead.
credentials_filepath

In [None]:
# create the database engine from `url`
engine = db.create_engine(url)

# build a session factory bound to the engine and wrap it in a scoped session;
# `session` is the session instance used for the queries in the cells below
session_factory = db.orm.sessionmaker(bind=engine)
Session = db.orm.scoped_session(session_factory)
session = Session()

### Download and cache metadata from UniProtKB

In [None]:
# the unique target names over all crispr designs
# (deduplicated through a set, so the order of the list is arbitrary)
names = list({design.target_name for design in session.query(models.CrisprDesign).all()})
len(names)

In [None]:
# Template for the search url: '%s' is replaced by the search string, and the query
# is restricted to reviewed entries for organism 9606, sorted by score and limited
# to one row (the top hit) in tab-delimited format, with an explicit list of columns.
# NOTE(review): this looks like the legacy UniProt query endpoint - confirm that
# it is still served before relying on it.
url = ''.join([
    'https://www.uniprot.org/uniprot/?',
    'query=reviewed:yes+AND+organism:9606+AND+%s',
    '&sort=score&format=tab&limit=1&',
    'columns=id,entry name,reviewed,protein names,genes,organism,length,comment(FUNCTION),families',
])

In [None]:
def get_uniprot(name, timeout=30):
    '''
    Retrieve the result of the UniProt search for a single search string.

    The search string is substituted into the module-level `url` template,
    and the tab-delimited response body is parsed into a dataframe.

    Parameters
    ----------
    name : the search string (here, a target name)
    timeout : number of seconds to wait for the server before giving up

    Returns
    -------
    A pandas dataframe parsed from the response body, or None (after printing
    a message) if the request failed, the server returned an error status,
    or the response body was empty.
    '''
    try:
        response = requests.get(url % name, timeout=timeout)
    except requests.RequestException as error:
        # report and skip, so that one failed request does not abort a whole batch
        print('Request failed for %s: %s' % (name, error))
        return None

    # the body of an error response may be non-empty without being a table of results
    if not response.ok:
        print('HTTP error %s for %s' % (response.status_code, name))
        return None

    # a whitespace-only body has no header row, which read_csv cannot parse
    text = response.text
    if not text.strip():
        print('No result for %s' % name)
        return None

    return pd.read_csv(io.StringIO(text), sep='\t')

In [None]:
# wrap the query function once, then create one lazy task per target name
delayed_get_uniprot = dask.delayed(get_uniprot)
tasks = list(map(delayed_get_uniprot, names))

# run all of the tasks, showing a progress bar while they execute
with dask.diagnostics.ProgressBar():
    rows = dask.compute(*tasks)

In [None]:
# keep only the targets for which a result was retrieved (get_uniprot returns None otherwise)
hits = [row for row in rows if row is not None]

# each per-target dataframe carries its own index, so renumber the rows
# to give the concatenated dataframe a unique index
d = pd.concat(hits, axis=0, ignore_index=True)

# cache the results in the current user's Downloads directory
# (rather than in an absolute path that exists on only one machine)
cache_filepath = os.path.join(
    os.path.expanduser('~'),
    'Downloads',
    '2019-12-16_top-uniprotKB-hit-for-all-targets.csv',
)
d.to_csv(cache_filepath)