In [1]:
import json
import pickle

from indra.literature.adeft_tools import universal_extract_text

from indra_db.util.content_scripts import get_text_content_from_pmids
from indra_db.util.content_scripts import get_stmts_with_agent_text_like
from indra_db.util.content_scripts import get_text_content_from_stmt_ids

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator

from adeft_indra.ground import gilda_ground

In [2]:
# Look up statements by agent text 'MIP' and keep the entry stored under that key.
MIP_stmts = get_stmts_with_agent_text_like('MIP')['MIP']

# Fetch the text content behind those statements; only the values of `content`
# are used below.
ids, content = get_text_content_from_stmt_ids(MIP_stmts)

# Drop empty/missing content entries, then run text extraction on the rest.
texts = [
    universal_extract_text(raw, contains=['MIP'])
    for raw in filter(None, content.values())
]

In [3]:
# Number of extracted texts available for mining.
len(texts)

530

In [4]:
# Build a miner for the shortform 'MIP' and feed it the extracted texts so that
# candidate longforms (expansions) can be queried from it afterwards.
miner = AdeftMiner('MIP')
miner.process_texts(texts)

In [5]:
# Ask the miner for its top 40 candidates; specific entries are pulled out of
# this list further down.
top = miner.top(40)

In [6]:
# Inspect the (candidate, score) pairs by eye to spot genuine expansions of MIP.
top

[('molecularly imprinted polymer', 91.88),
 ('protein', 56.18),
 ('macrophage inflammatory protein', 54.31034482758619),
 ('maximal inspiratory pressure', 38.82051282051282),
 ('a molecularly imprinted polymer', 27.0),
 ('maximum intensity projection', 20.363636363636363),
 ('mycobacterium indicus pranii', 18.85714285714286),
 ('pressure', 17.93333333333333),
 ('major intrinsic protein', 17.88888888888889),
 ('peptide', 17.846153846153847),
 ('inspiratory pressure', 14.759999999999998),
 ('and macrophage inflammatory protein', 12.933333333333332),
 ('minimally invasive parathyroidectomy', 11.8),
 ('molecularly inversion probe', 11.0),
 ('of molecularly imprinted polymer', 9.636363636363637),
 ('malaria in pregnancy', 8.666666666666666),
 ('maximum inspiratory pressure', 7.777777777777778),
 ('myoinhibitory peptide', 7.555555555555555),
 ('intrinsic protein', 6.719999999999999),
 ('inhibitory peptide', 6.181818181818182),
 ('a new molecularly imprinted polymer', 6.0),
 ('myd88 inhibitor

In [7]:
# Miner's own longform selection with cutoff=5. In the recorded run this list
# omits several plausible expansions that do appear in `top` with scores above 5
# (e.g. 'macrophage inflammatory protein'), so it is corrected by hand later.
longforms = miner.get_longforms(cutoff=5)

In [8]:
# Review the automatic selection before curating it by hand.
longforms

[('molecularly imprinted polymer', 91.88),
 ('protein', 56.18),
 ('maximal inspiratory pressure', 38.82051282051282),
 ('maximum intensity projection', 20.363636363636363),
 ('mycobacterium indicus pranii', 18.85714285714286),
 ('peptide', 17.846153846153847),
 ('minimally invasive parathyroidectomy', 11.8),
 ('molecularly inversion probe', 11.0),
 ('malaria in pregnancy', 8.666666666666666)]

In [9]:
# Manual curation of the miner's longform list.
#
# Entries are selected by their longform text rather than by list position, so
# the cell keeps doing the right thing if the miner's ranking shifts, and it can
# be re-run safely (running it twice yields the same list, with no duplicates
# and no extra deletions).

# Generic fragments returned by get_longforms that are not real expansions of MIP.
fragments = {'protein', 'peptide'}

# Genuine expansions that appear in `top` but were not returned by get_longforms.
rescued = {
    'macrophage inflammatory protein',
    'major intrinsic protein',
    'myoinhibitory peptide',
    'myd88 inhibitory peptide',
}

curated = [entry for entry in longforms if entry[0] not in fragments]
already_present = {entry[0] for entry in curated}
curated.extend(
    entry for entry in top
    if entry[0] in rescued and entry[0] not in already_present
)

# Fail loudly if the miner output no longer contains an expected longform,
# instead of silently continuing with an incomplete list.
missing = rescued - {entry[0] for entry in curated}
assert not missing, f'expected longforms not found: {sorted(missing)}'

# Highest score first; `longforms` stays a list of (longform, score) pairs.
longforms = sorted(curated, key=lambda entry: -entry[1])

In [10]:
# Curated (longform, score) pairs, highest score first.
longforms

[('molecularly imprinted polymer', 91.88),
 ('macrophage inflammatory protein', 54.31034482758619),
 ('maximal inspiratory pressure', 38.82051282051282),
 ('maximum intensity projection', 20.363636363636363),
 ('mycobacterium indicus pranii', 18.85714285714286),
 ('major intrinsic protein', 17.88888888888889),
 ('minimally invasive parathyroidectomy', 11.8),
 ('molecularly inversion probe', 11.0),
 ('malaria in pregnancy', 8.666666666666666),
 ('myoinhibitory peptide', 7.555555555555555),
 ('myd88 inhibitory peptide', 5.714285714285714)]

In [27]:
# Split the (longform, score) pairs into two parallel tuples.
# NOTE: this rebinds `longforms` from a list of pairs to a tuple of strings, so
# re-running this cell on its own output fails; re-run the cells that build the
# pair list first.
longforms, scores = zip(*longforms)

In [31]:
# Automatic grounding pass: call gilda_ground once per longform and keep only
# results whose first element is truthy, stored as '<first>:<second>'.
grounding_map = {
    term: f'{match[0]}:{match[1]}'
    for term, match in ((candidate, gilda_ground(candidate)) for candidate in longforms)
    if match[0]
}

In [32]:
# Check what the automatic pass produced. It is empty in the recorded run; the
# longforms are grounded manually further down.
grounding_map

{}

In [30]:
# Sanity check: gilda_ground returns a non-empty result for a well-known gene
# symbol (see the recorded output), so the empty map above is not a broken import.
gilda_ground('ESR1')

('HGNC', '3467', 'ESR1')

In [34]:
# Ground the longforms through adeft's GUI helper. NOTE(review): the outcome
# presumably depends on choices made interactively, so it is not reproducible
# from code alone; the recorded output of `result` documents the choices made.
result = ground_with_gui(longforms, scores)

In [35]:
# Inspect the three-part result of the grounding step.
result

({'macrophage inflammatory protein': 'FPLX:Macrophage_Inflammatory_Proteins',
  'major intrinsic protein': 'FPLX:AQP',
  'malaria in pregnancy': 'ungrounded',
  'maximal inspiratory pressure': 'MESH:D000072277',
  'maximum intensity projection': 'ungrounded',
  'minimally invasive parathyroidectomy': 'ungrounded',
  'molecularly imprinted polymer': 'polymer',
  'molecularly inversion probe': 'ungrounded',
  'mycobacterium indicus pranii': 'ungrounded',
  'myd88 inhibitory peptide': 'ungrounded',
  'myoinhibitory peptide': 'ungrounded'},
 {'FPLX:Macrophage_Inflammatory_Proteins': 'Macrophage_Inflammatory_Proteins',
  'FPLX:AQP': 'AQP',
  'MESH:D000072277': 'Maximal Respiratory Pressures',
  'polymer': 'Molecularly Imprinted Polymer'},
 ['FPLX:AQP', 'FPLX:Macrophage_Inflammatory_Proteins'])

In [36]:
# Unpack the grounding result: longform -> grounding, grounding -> display name,
# and the list of groundings to treat as positive labels.
# This replaces the (empty) grounding_map built by the automatic pass.
grounding_map, names, pos_labels = result

In [37]:
# Key the longform -> grounding map by its shortform; this dict is what gets
# passed to AdeftLabeler.
grounding_dict = {'MIP': grounding_map}

In [38]:
# Classifier for the shortform 'MIP'; pos_labels come from the grounding result.
classifier = AdeftClassifier('MIP', pos_labels)

In [39]:
# Parameter grid with a single candidate value per parameter.
# It is not consumed by any of the cells visible here.
param_grid = dict(C=[100.0], max_features=[10000])

In [40]:
# Labeler configured with the shortform -> {longform: grounding} dictionary.
labeler = AdeftLabeler(grounding_dict)

In [41]:
# Build the labeled corpus from the extracted texts.
# NOTE(review): the only output recorded for this cell is 'polymer', which is the
# one grounding in grounding_map without a 'NAMESPACE:ID' form. This may be the
# tail of an error message - confirm the cell ran cleanly before relying on `corpus`.
corpus = labeler.build_from_texts(texts)

'polymer'