In [2]:
import re
import csv
from covid_19.preprocess import get_metadata_dict, get_zip_texts_for_entry, get_metadata_df
from indra_db import get_primary_db

In [3]:
# Path to the ranked-document CSV, relative to the notebook's working directory.
covid_docs_file = '../covid_docs_ranked_corona.csv'
# Zero-based index of the CSV column whose value is collected as the PMID.
PMID_COLUMN = 4
covid_pmids = set()
# newline='' is what the csv module expects for files it reads: it lets the
# reader handle line endings (including newlines inside quoted fields) itself.
with open(covid_docs_file, 'rt', newline='') as f:
    csv_reader = csv.reader(f, delimiter=',')
    for row in csv_reader:
        # Blank or truncated lines have no PMID column; skip them rather than
        # aborting the whole load with an IndexError.
        if len(row) <= PMID_COLUMN:
            continue
        # NOTE(review): if the file starts with a header row, its column label
        # ends up in the set as well -- confirm against the file.
        covid_pmids.add(row[PMID_COLUMN])

In [4]:
# Number of distinct column-4 values collected from the ranked-docs CSV
# (covid_pmids is a set, so repeated values are counted once).
len(covid_pmids)

3212

In [5]:
# Load the corpus metadata through covid_19.preprocess.get_metadata_dict().
# The extraction cell further down iterates over `md` and indexes each entry
# by 'pubmed_id', 'title' and 'cord_uid'.
md = get_metadata_dict()

INFO: [2020-05-18 17:05:38] numexpr.utils - Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO: [2020-05-18 17:05:38] numexpr.utils - NumExpr defaulting to 8 threads.


In [6]:
aa_reg = '[ACDEFGHIKLMNPQRSTVWY]'
mut_reg = '\s+' + aa_reg + '\d+' + aa_reg + '\s+'

In [9]:
# Three-letter amino acid abbreviations, lower case.
aa_short = ['ala', 'arg', 'asn', 'asp', 'cys', 'gln', 'glu', 'gly',
            'his', 'ile', 'leu', 'lys', 'met', 'phe', 'pro', 'ser',
            'thr', 'trp', 'tyr', 'val']
# Alternation of all abbreviations: 'ala|arg|...|val'.
aa_short_reg = '|'.join(aa_short)
# One abbreviation immediately followed by 2 to 5 digits, e.g. "ser473".
# Written as a raw string so that \d is not an invalid string escape.
aa_seq_reg = r'(?:%s)\d{2,5}' % aa_short_reg

In [None]:
# Disabled cell: the loop below is wrapped in a triple-quoted string, so the
# cell only evaluates a string literal and none of it runs. As written it would
# collect into spr_docs every metadata entry with a text matching
# '\splasmon\s'.
"""
spr_docs = []
for md_entry in md:
    texts = get_zip_texts_for_entry(md_entry, zip=False)
    for _, text_type, text in texts:
        if re.search('\splasmon\s', text):
            spr_docs.append(md_entry)
"""

In [None]:
# Tokens that have the <letter><digits><letter> shape of a mutation but are
# known to mean something else.
ignore_list = (
    'Y2H', # Yeast two-hybrid
    'C3H', # Mouse strain
    'D980R', # HeLa cell strain
    'E3L', # vaccinia virus E3L
    'S1P', # Sphingosine-1-phosphate
    'Q7R', # quercetin 7-rhamnoside
    'S6K', # S6 kinase
)
# Compile once instead of re-parsing the pattern for every text.
mut_pattern = re.compile(mut_reg, flags=re.IGNORECASE)
# Prefix filter applied to each candidate: tokens starting with H<digit>N
# (presumably influenza subtype fragments such as H1N) or with
# S<digit> followed by one of A, B, C, D, E, G are discarded.
non_mut_pattern = re.compile(r'H\dN|S\d[ABCDEG]')

# by_mut: mutation token -> set of (title, pmid) documents mentioning it.
# by_doc: (title, pmid)  -> set of mutation tokens found in that document.
by_mut = {}
by_doc = {}
for md_entry in md:
    pmid = md_entry['pubmed_id']
    # Only documents listed in the ranked-docs CSV are scanned.
    if pmid not in covid_pmids:
        continue
    title = md_entry['title']
    doc_key = (title, pmid)
    texts = get_zip_texts_for_entry(md_entry, zip=False)
    for _, text_type, text in texts:
        for match in mut_pattern.findall(text):
            # The search is case-insensitive, so normalise to upper case
            # before filtering and counting. Otherwise lower-case spellings
            # bypass the (upper-case) filters above and the same mutation is
            # tallied under several keys, e.g. 'd614g' and 'D614G'.
            ms = match.strip().upper()
            if non_mut_pattern.match(ms) or ms in ignore_list:
                continue
            by_mut.setdefault(ms, set()).add(doc_key)
            by_doc.setdefault(doc_key, set()).add(ms)

In [None]:
# Rank documents by how many distinct mutations they mention, most first.
# Each entry becomes ((title, pmid), [mutation, ...]).
docs = [(doc_key, list(mut_set)) for doc_key, mut_set in by_doc.items()]
docs.sort(key=lambda pair: len(pair[1]), reverse=True)

# Rank mutations by how many documents mention them, most first.
# Each entry becomes (mutation, [(title, pmid), ...]).
muts = [(mut_key, list(doc_set)) for mut_key, doc_set in by_mut.items()]
muts.sort(key=lambda pair: len(pair[1]), reverse=True)

In [None]:
def dump_docs(docs_sorted, fname='docs_ranked_by_muts.csv'):
    """Write the per-document mutation ranking to a CSV file.

    One row is written per (document, mutation) pair, after a header row.

    Parameters
    ----------
    docs_sorted : iterable
        Entries of the form ((title, pmid), mutations), where mutations is a
        sized iterable of mutation strings. Rows are written in the order
        given.
    fname : str
        Path of the CSV file to write. Defaults to 'docs_ranked_by_muts.csv'
        in the current working directory.
    """
    docs_rows = [['title', 'pmid', 'pmid_link', 'mutation_count', 'mutation']]
    for (title, pmid), doc_muts in docs_sorted:
        pmid_link = f'https://www.ncbi.nlm.nih.gov/pubmed/{pmid}'
        count = len(doc_muts)
        docs_rows.extend([title, pmid, pmid_link, count, mut]
                         for mut in doc_muts)
    # newline='' is required for files handed to csv.writer; without it an
    # extra carriage return is emitted per row on platforms that translate
    # '\n' on write.
    with open(fname, 'wt', newline='') as f:
        csvwriter = csv.writer(f, delimiter=',')
        csvwriter.writerows(docs_rows)
        
def dump_muts(muts_sorted, fname='muts_ranked_by_docs.csv'):
    """Write the per-mutation document ranking to a CSV file.

    One row is written per (mutation, document) pair, after a header row.

    Parameters
    ----------
    muts_sorted : iterable
        Entries of the form (mutation, documents), where documents is a sized
        iterable of (title, pmid) pairs. Rows are written in the order given.
    fname : str
        Path of the CSV file to write. Defaults to 'muts_ranked_by_docs.csv'
        in the current working directory.
    """
    muts_rows = [['mutation', 'doc_count', 'title', 'pmid', 'pmid_link']]
    for mut, mut_docs in muts_sorted:
        count = len(mut_docs)
        for title, pmid in mut_docs:
            pmid_link = f'https://www.ncbi.nlm.nih.gov/pubmed/{pmid}'
            muts_rows.append([mut, count, title, pmid, pmid_link])
    # newline='' is required for files handed to csv.writer; without it an
    # extra carriage return is emitted per row on platforms that translate
    # '\n' on write.
    with open(fname, 'wt', newline='') as f:
        csvwriter = csv.writer(f, delimiter=',')
        csvwriter.writerows(muts_rows)
        
# Write both rankings to CSV files; the file names are relative, so they land
# in the current working directory.
dump_docs(docs)
dump_muts(muts)

In [None]:
# Show the top-ranked entry: the mutation found in the most documents, with
# its list of (title, pmid) pairs.
muts[0]

In [None]:
# Keep the object returned by indra_db.get_primary_db().
# NOTE(review): `db` is not referenced by any other cell visible here.
db = get_primary_db()

In [None]:
from indra_db import client

In [None]:
from indra.sources import indra_db_rest as idr

In [None]:
# Request the statements for a single paper, identified by PMID, through the
# indra_db_rest client (`idr`).
# NOTE(review): `_47` is IPython's output-history name for the result of cell
# 47. It only exists in the session that produced it, so this cell raises
# NameError after a kernel restart -- replace it with an explicit PMID value.
idrp = idr.get_statements_for_paper([('pmid', _47)])

In [None]:
# Display the statements attached to the query result.
idrp.statements

In [None]:
# Display the text of the first evidence of the first statement.
# NOTE(review): presumably both are lists, in which case this raises
# IndexError when the paper has no statements -- confirm.
idrp.statements[0].evidence[0].text