In [12]:
import os, sys, json

import numpy as np
import pandas as pd

from bionlp.util import fs, io, func
from bionlp.spider import pubtator

sys.path.append(os.path.join(os.path.expanduser('~'), 'source', 'py', 'evntextrc', 'bin'))
import bnlpst

DATA_PATH = os.path.join(os.path.expanduser('~'), 'data', 'bioevent', 'bnlpst')
YEAR = '2011'
TASK = 'bgi'
TRAIN_DATA_PATH = os.path.join(DATA_PATH, YEAR, TASK, 'train')
DEV_DATA_PATH = os.path.join(DATA_PATH, YEAR, TASK, 'dev')
TEST_DATA_PATH = os.path.join(DATA_PATH, YEAR, TASK, 'test')

In [20]:
# Use PubTator to obtain the annotated concepts from the related PubMed data
pubtator_cli = pubtator.PubTatorAPI()

entity_by_type, concept_map, from_pmid = {}, {}, {}

for ds_path in [TRAIN_DATA_PATH, DEV_DATA_PATH]:
    for fpath in fs.listf(ds_path, pattern='.*.a1', full_path=True):
        prgrphs = pubtator_cli.get_concepts_pmid('all', fpath.split('-')[1])
        concepts = func.flatten_list([[(prgrph['text'][concept['span']['begin']:concept['span']['end']], concept['obj']) for concept in prgrph['denotations'] if concept['obj']] for prgrph in prgrphs if len(prgrph['denotations'])>0])
        concept_map.update(dict(concepts))
        sent_bndry, words, annots, gddfs, coref = bnlpst.get_a1_2011(fpath, disable=['parse'])
        for ent_type, ent_str in zip(annots['type'], annots['str']):
            entity_by_type.setdefault(ent_type, set([])).add(ent_str.lower())
            from_pmid.setdefault(ent_str.lower(), []).append(fpath.split('-')[1])
entity_by_type = dict([(k, list(v)) for k, v in entity_by_type.items()])

In [28]:
# Append the PMIDs
entity_concept_by_type = dict([(k, [(s, [c for w, c in concept_map.items() if w in s], from_pmid.setdefault(s, [])) for s in v]) for k, v in entity_by_type.items()])

In [14]:
concept_map

{u'116-124': u'Chemical:C448907',
 u'13-amino-acid': u'Chemical:',
 u'14-amino-acid': u'Chemical:',
 u'2-keto-3-deoxygluconate': u'Chemical:C002957',
 u'2-keto-3-deoxygluconate-6-phosphate': u'Chemical:',
 u'267:12055-12060': u'Chemical:C024336',
 u'A/T': u'Mutation:c|SUB|A||T',
 u'A863G': u'Mutation:c|SUB|A|863|G',
 u'ATP': u'Chemical:D000255',
 u'Alanine': u'Chemical:CHEBI:16449',
 u'Amino acid': u'Chemical:CHEBI:33704',
 u'Arginine': u'Chemical:CHEBI:29016',
 u'Asp': u'Chemical:CHEBI:22660',
 u'B. subtilis': u'Species:1423',
 u'B. subtilis 168': u'Species:224308',
 u'B. subtilis strain': u'Species:1423',
 u'Bacillus subtilis': u'Species:1423',
 u'Bacillus subtilis 168': u'Species:224308',
 u'Bacillus subtilis strain 168': u'Species:224308',
 u'Bacteriol': u'Chemical:',
 u'C': u'Chemical:',
 u'C. acetobutylicum': u'Species:1488',
 u'C. perfringens': u'Species:1502',
 u'Ca': u'Chemical:',
 u'Clostridium': u'Species:1488',
 u'Clostridium acetobutylicum': u'Species:1488',
 u'Clostridium

In [29]:
entity_concept_by_type

{'Action': [('control', [], ['9004507']),
  ('co-transcribed', [], ['9987136']),
  ('presence', [], ['10234829']),
  ('low concentration', [], ['10788508', '10788508']),
  ('stress-induced increase', [], ['9696771']),
  ('concentration', [], ['10197994', '10411757']),
  ('activity',
   [],
   ['10323866',
    '10383978',
    '10503549',
    '11069677',
    '1575712',
    '1938874',
    '9696775',
    '9852014',
    '10411757']),
  ('rising level', [], ['10075739']),
  ('its', [], ['10197994', '10400595', '10468601', '10468601']),
  ('capacity', [], ['10323866']),
  ('that', [], ['10197994']),
  ('depletion', [], ['9852014']),
  ('absence', [], ['11069677']),
  ('production',
   [],
   ['10075739', '10383978', '10464187', '11069677', '9696775']),
  ('proper localization', [], ['9922240']),
  ('synthesis', [], ['1902213']),
  ('localization', [], ['10747015']),
  ('activation', [], ['7883171', '8631668']),
  ('processing', [], ['7883171']),
  ('proteolytic processing', [], ['1575712']),


In [6]:
# Combined the annotated text to query; Failed :(
# res = pubtator_cli.get_concepts_rawtxtlist('all', [' '.join(words) for words in entity_by_type.values()])
# for ent_type, ent_strs in entity_by_type.iteritems():
#     entity_by_type[ent_type] = [(ent_str, [concept['obj'] for concept in pubtator_cli.get_concepts_rawtxt('gene', ent_str)['denotations']]) for ent_str in ent_strs]

In [25]:
io.write_json(json.dumps(entity_concept_by_type), 'entity_concept_by_type.json')
io.write_obj(entity_concept_by_type, 'entity_concept_by_type.pkl')

In [None]:
# Test1
text = 'A kinetic model identifies phosphorylated estrogen receptor-a (ERa) as a critical regulator of ERa dynamics in breast cancer.'
pubtator_cli.get_concepts_rawtxtlist('chemical', text.split())

In [None]:
# Test2
pubtator_cli.get_concepts_pmid('chemical', '19894120')