In [2]:
import json
import pickle

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from indra_db.util.content_scripts import get_text_content_from_pmids
from indra_db.util.content_scripts import get_stmts_with_agent_text_like
from indra_db.util.content_scripts import get_text_content_from_stmt_ids

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator

from adeft_indra.s3 import model_to_s3
from adeft_indra.ground import gilda_ground

In [3]:
# Shortforms to disambiguate, plus the genes/families whose Entrez-curated
# literature supplies additional labeled text.
shortforms = ['ABL']
genes = ['ABL1']
# NOTE(review): this family was listed with the KCNH1-KCNH8 genes, which looks
# like a leftover from a different shortform's notebook. FamPlex lists ABL1
# and ABL2 under ABL_family -- confirm against the FamPlex relations table.
families = {'ABL_family': ['ABL1', 'ABL2']}
# `genes` and `groundings` are consumed pairwise with zip() by the cell that
# collects Entrez texts, so they must be kept the same length.
groundings = [f'HGNC:{get_hgnc_id(gene)}' for gene in genes]
for family, members in families.items():
    genes.extend(members)
    # One family grounding per member. Appending the grounding only once made
    # `groundings` shorter than `genes`, and zip() then silently dropped every
    # family member after the first.
    groundings.extend([f'FPLX:{family}'] * len(members))
with open('../data/entrez_all_pmids.json', 'r') as f:
    all_pmids = json.load(f)

In [4]:
# Collect text content for the PMIDs listed under each gene in `all_pmids`,
# tagging every extracted text with the grounding paired with that gene, and
# remember which content keys were used.
entrez_texts = []
entrez_refs = set()
for gene, grounding in zip(genes, groundings):
    if gene not in all_pmids:
        # No PMIDs recorded for this gene; skip it.
        continue
    pmids = all_pmids[gene]
    _, content = get_text_content_from_pmids(pmids)
    for text in content.values():
        if text:
            entrez_texts.append((universal_extract_text(text), grounding))
    entrez_refs.update(content.keys())

In [5]:
# For each shortform, fetch the text content behind statements whose agent
# text matches it, drop references already collected in `entrez_refs`, and
# feed the remaining texts to an AdeftMiner.
miners = dict()
all_texts = set()
for shortform in shortforms:
    stmts = get_stmts_with_agent_text_like(shortform)[shortform]
    _, content = get_text_content_from_stmt_ids(stmts)
    shortform_texts = [universal_extract_text(text, contains=shortforms)
                       for ref, text in content.items()
                       if text and ref not in entrez_refs]
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(shortform_texts)
    # set.union() returns a new set and leaves the receiver unchanged, so the
    # texts were never accumulated; update() adds them in place.
    all_texts.update(shortform_texts)

It is then necessary to check whether Acromine produced the correct results; any errors must be fixed manually.

In [6]:
top = miners['ABL'].top()

In [7]:
top

[('alveolar bone loss', 31.823529411764707),
 ('acetylbritannilactone', 11.125),
 ('agaricus bisporus lectin', 10.333333333333334),
 ('1 o acetylbritannilactone', 7.75),
 ('abetalipoproteinemia', 7.666666666666666),
 ('average sleep bout length', 7.4),
 ('abelson', 7.111111111111111),
 ('kinase', 5.5),
 ('acid bone lysate', 4.666666666666667),
 ('on alveolar bone loss', 4.0),
 ('loss', 3.7837837837837824),
 ('basolateral amygdala', 3.6),
 ('lectin', 3.571428571428571),
 ('level', 2.5),
 ('patients with abetalipoproteinemia', 2.5),
 ('binding loop', 2.4),
 ('abelson murine leukemia viral oncogene homolog', 2.333333333333333),
 ('abetalipoproteinaemia', 2.0),
 ('abelson kinase', 2.0),
 ('of alveolar bone loss', 2.0),
 ('vertical alveolar bone loss', 2.0),
 ('edible mushroom agaricus bisporus lectin', 2.0),
 ('the abelson tyrosine kinase 1', 2.0),
 ('a sleep duration and b average sleep bout length', 2.0),
 ('bone loss', 1.942857142857143),
 ('amygdala', 1.666666666666667),
 ('agaricus bi

In [7]:
odd = top[11][0]

In [8]:
odd

'basolateral amygdala'

In [20]:
# NOTE(review): `grounder` is only assigned in a later cell
# (`grounder = AdeftGrounder()`), so on a fresh kernel this cell raises
# NameError unless that cell has been run first.
grounder.ground('1-O-Acetylbritannilactone')

[]

In [8]:
longforms0 = miners['ABL'].get_longforms()

In [9]:
longforms0

[('alveolar bone loss', 34, 0.8399940091612061),
 ('acetylbritannilactone', 17, 0.5634141862971518),
 ('agaricus bisporus lectin', 12, 0.7566427025585514),
 ('abetalipoproteinemia', 10, 0.6989512039226189),
 ('average sleep bout length', 10, 0.6826730608584346),
 ('abelson', 9, 0.7306272408509953),
 ('acid bone lysate', 6, 0.7790157542358851),
 ('basolateral amygdala', 5, 0.7084951552942763),
 ('agaricus bisporus', 5, 0.6466115592082996),
 ('ablation', 3, 0.8723476565530921),
 ('abaloparatide', 3, 0.8686623494967154),
 ('adenylate binding loop', 3, 0.7984818687850783),
 ('abetalipoproteinaemia', 3, 0.7781069647142249),
 ('abelson kinase', 3, 0.49027361512639545),
 ('antigen binding lymphocytes', 2, 1.0),
 ('ampc beta lactamase', 2, 0.9048374180359595),
 ('atp binding loop', 2, 0.7866278610665535),
 ('activated b lymphoblastic', 1, 1.0),
 ('adult blood leukocytes', 1, 1.0),
 ('alpha blocker', 1, 1.0),
 ('apoptotic body like liposomes', 1, 0.8913012289830017),
 ('acetyl britannilactone',

In [10]:
from adeft.ground import AdeftGrounder

In [11]:
grounder = AdeftGrounder()

In [12]:
# Pair each mined longform with whatever the grounder returns for it.
groundings = []
for longform, _count, _score in longforms0:
    groundings.append((longform, grounder.ground(longform)))

In [19]:
groundings

[('alveolar bone loss',
  [{'name_space': 'MESH',
    'id': 'D016301',
    'canonical_name': 'Alveolar Bone Loss',
    'type': 'name',
    'raw_text': 'Alveolar Bone Loss',
    'longform_text': 'alveolar bone loss',
    'score': (1.0, 2)}]),
 ('acetylbritannilactone', []),
 ('agaricus bisporus lectin', []),
 ('abetalipoproteinemia',
  [{'name_space': 'MESH',
    'id': 'D000012',
    'canonical_name': 'Abetalipoproteinemia',
    'type': 'name',
    'raw_text': 'Abetalipoproteinemia',
    'longform_text': 'abetalipoproteinemia',
    'score': (1.0, 2)}]),
 ('average sleep bout length', []),
 ('abelson', []),
 ('acid bone lysate', []),
 ('basolateral amygdala',
  [{'name_space': 'MESH',
    'id': 'D000679',
    'canonical_name': 'Amygdala',
    'type': 'name',
    'raw_text': 'Amygdala',
    'longform_text': 'amygdala',
    'score': (1.0, 2)}]),
 ('agaricus bisporus', []),
 ('ablation', []),
 ('abaloparatide', []),
 ('adenylate binding loop', []),
 ('abetalipoproteinaemia', []),
 ('abelson

In [17]:
def _same_token_count(entry):
    """Return True when the first match covers as many tokens as the longform.

    `entry` is a (longform, matches) pair; entries with no matches are
    rejected. Tokens are whitespace-separated.
    """
    longform, matches = entry
    if not matches:
        return False
    matched_tokens = matches[0]['longform_text'].split()
    return len(matched_tokens) == len(longform.split())


g = list(filter(_same_token_count, groundings))

In [18]:
g

[('alveolar bone loss',
  [{'name_space': 'MESH',
    'id': 'D016301',
    'canonical_name': 'Alveolar Bone Loss',
    'type': 'name',
    'raw_text': 'Alveolar Bone Loss',
    'longform_text': 'alveolar bone loss',
    'score': (1.0, 2)}]),
 ('abetalipoproteinemia',
  [{'name_space': 'MESH',
    'id': 'D000012',
    'canonical_name': 'Abetalipoproteinemia',
    'type': 'name',
    'raw_text': 'Abetalipoproteinemia',
    'longform_text': 'abetalipoproteinemia',
    'score': (1.0, 2)}]),
 ('pentyl acetate',
  [{'name_space': 'CHEBI',
    'id': 'CHEBI:87362',
    'canonical_name': 'pentyl acetate',
    'type': 'name',
    'raw_text': 'pentyl acetate',
    'longform_text': 'pentyl acetate',
    'score': (1.0, 2)}])]

In [53]:
# Keep every entry of `longforms0` except those at the listed positions.
excluded_positions = [2, 6, 7]
longforms = []
for i, x in enumerate(longforms0):
    if i not in excluded_positions:
        longforms.append(x)

In [13]:
!pip install python-levenshtein

Collecting python-levenshtein
[?25l  Downloading https://files.pythonhosted.org/packages/42/a9/d1785c85ebf9b7dfacd08938dd028209c34a0ea3b1bcdb895208bd40a67d/python-Levenshtein-0.12.0.tar.gz (48kB)
[K     |████████████████████████████████| 51kB 3.8MB/s eta 0:00:011
Building wheels for collected packages: python-levenshtein
  Building wheel for python-levenshtein (setup.py) ... [?25ldone
[?25h  Created wheel for python-levenshtein: filename=python_Levenshtein-0.12.0-cp37-cp37m-macosx_10_14_x86_64.whl size=79615 sha256=baa8c609eaeb0375accda8cf76335deb18f5e20b04561fb4c6faeb62b716829e
  Stored in directory: /Users/albertsteppi/Library/Caches/pip/wheels/de/c2/93/660fd5f7559049268ad2dc6d81c4e39e9e36518766eaf7e342
Successfully built python-levenshtein
Installing collected packages: python-levenshtein
Successfully installed python-levenshtein-0.12.0
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [54]:
top[29]

('octadecenyl 2 acetylglycerol', 1.0)

In [55]:
longforms.extend([top[4], top[11], top[17], top[29]])

In [56]:
longforms.sort(key=lambda x: -x[1])

In [57]:
longforms

[('electroantennogram', 50.84313725490196),
 ('electroantennography', 9.833333333333332),
 ('ethanol extract of angelica gigas', 7.833333333333333),
 ('ether a go go', 7.8),
 ('electroantennographic', 5.714285714285714),
 ('experimental autoimmune glomerulonephritis', 4.0),
 ('ether à go go', 3.6),
 ('experimental autoimmune gastritis', 2),
 ('1 o hexadecyl 2 acetylglycerol', 2.0),
 ('octadecenyl 2 acetylglycerol', 1.0)]

In [58]:
longforms, scores = zip(*longforms)

In [49]:
# Seed the grounding map with "<first>:<second>" built from the first two
# fields of each gilda_ground() result; longforms whose first field is falsy
# are left out.
grounding_map = {}
for longform in longforms:
    grounding = gilda_ground(longform)
    if not grounding[0]:
        continue
    grounding_map[longform] = f'{grounding[0]}:{grounding[1]}'

RuntimeError: Received response with code{response.status_code}

In [50]:
grounding_map

{}

In [59]:
grounding_map, names, pos_labels = ground_with_gui(longforms, scores, grounding_map=grounding_map)

In [60]:
result = (grounding_map, names, pos_labels)

In [61]:
result

({'1 o hexadecyl 2 acetylglycerol': 'CHEBI:75882',
  'electroantennogram': 'electroantennogram',
  'electroantennographic': 'electroantennogram',
  'electroantennography': 'electroantennogram',
  'ethanol extract of angelica gigas': 'ungrounded',
  'ether a go go': 'FPLX:KCNH',
  'ether à go go': 'FPLX:KCNH',
  'experimental autoimmune gastritis': 'ungrounded',
  'experimental autoimmune glomerulonephritis': 'ungrounded',
  'octadecenyl 2 acetylglycerol': 'CHEBI:75882'},
 {'CHEBI:75882': '1-alkyl-2-acetylglycerol',
  'electroantennogram': 'electroantennogram',
  'FPLX:KCNH': 'KCNH'},
 ['CHEBI:75882', 'FPLX:KCNH'])

In [62]:
# Hard-coded copy of the (grounding_map, names, pos_labels) tuple shown as the
# output of the preceding `result` cell.
# NOTE(review): these longforms and groundings do not appear to belong to the
# 'ABL' shortform configured at the top of the notebook -- confirm which
# shortform this notebook is meant to model.
grounding_map, names, pos_labels = ({'1 o hexadecyl 2 acetylglycerol': 'CHEBI:75882',
      'electroantennogram': 'electroantennogram',
      'electroantennographic': 'electroantennogram',
      'electroantennography': 'electroantennogram',
      'ethanol extract of angelica gigas': 'ungrounded',
      'ether a go go': 'FPLX:KCNH',
      'ether à go go': 'FPLX:KCNH',
      'experimental autoimmune gastritis': 'ungrounded',
      'experimental autoimmune glomerulonephritis': 'ungrounded',
      'octadecenyl 2 acetylglycerol': 'CHEBI:75882'},
     {'CHEBI:75882': '1-alkyl-2-acetylglycerol',
      'electroantennogram': 'electroantennogram',
      'FPLX:KCNH': 'KCNH'},
     ['CHEBI:75882', 'FPLX:KCNH'])

In [63]:
# Maps a shortform to its longform -> grounding map.
# NOTE(review): the key 'EAG' differs from the `shortforms = ['ABL']` set in
# the configuration cell -- confirm which shortform is intended.
grounding_dict = {'EAG': grounding_map}

In [64]:
classifier = AdeftClassifier('EAG', pos_labels=pos_labels)

In [65]:
param_grid = {'C': [10.0, 100.0], 'max_features': [1000, 10000]}

In [66]:
labeler = AdeftLabeler(grounding_dict)

In [67]:
corpus = labeler.build_from_texts(shortform_texts)

In [68]:
corpus.extend(entrez_texts)

In [69]:
texts, labels = zip(*corpus)

In [70]:
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=8)

INFO: [2019-12-19 13:28:29] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [10.0, 100.0], 'max_features': [1000, 10000]}
INFO: [2019-12-19 13:30:33] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Best f1 score of 0.99 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 1000}


In [71]:
classifier.stats

{'label_distribution': {'electroantennogram': 55,
  'FPLX:KCNH': 92,
  'ungrounded': 11,
  'CHEBI:75882': 3},
 'f1': {'mean': 0.9740682788051209, 'std': 0.015838198371254553},
 'precision': {'mean': 0.9589750692520773, 'std': 0.020548042527284788},
 'recall': {'mean': 0.99, 'std': 0.020000000000000018}}

In [72]:
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [73]:
d = disamb.disambiguate(shortform_texts)

In [83]:
# Collect the texts whose disambiguation result has 'electroantennogram' as
# its first field.
a = []
for pred, text in zip(d, shortform_texts):
    if pred[0] == 'electroantennogram':
        a.append(text)

In [87]:
a[1]

"The dual roles of CRY as a photoreceptor and a clock component are well-demonstrated in Malpighian tubules (MT) (Stanewsky et al.,  1998 ; Ivanchenko et al.,  2001 ). In this organ, a light pulse given during the subjective night induces the degradation of TIM, resulting in a reset of the phase of circadian oscillation. Light-induced TIM degradation was eliminated in  cry b , hypomorphic mutants lacking functional CRY. In addition, the PER and TIM oscillations in the MT disappeared in  cry b  mutants (Stanewsky et al.,  1998 ; Ivanchenko et al.,  2001 ). These studies suggest that CRY has roles both in light entrainment and in the molecular oscillatory machinery of the clock. Another example is the antenna. Its response to odorants, as measured by an electroantennogram (EAG), increases at night and decreases during the day under light-dark (LD) cycles (Krishnan et al.,  1999 ). The olfactory EAG rhythm is driven by a peripheral circadian clock in antennal olfactory sensory neurons, pe

In [74]:
# Write the disambiguator out under the given model name and directory.
# NOTE(review): the model name 'MED' does not match the 'EAG' shortform the
# classifier above was built with, nor the 'ABL' shortform configured at the
# top -- confirm before running, as this may write over another model.
disamb.dump('MED', '../results')

In [188]:
from adeft.disambiguate import load_disambiguator

In [189]:
d = load_disambiguator('HIR', '../results')

In [190]:
d.disambiguate(texts[0])

('HGNC:6091',
 'INSR',
 {'HGNC:6091': 0.9495047219792215,
  'MESH:D015427': 0.03378744088190508,
  'ungrounded': 0.016707837138873322})

In [32]:
a = load_disambiguator('AR')

In [33]:
a.disambiguate('Androgen')

('HGNC:644',
 'AR',
 {'FPLX:ADRB': 4.719192246728643e-10,
  'GO:GO:0007340': 1.3624879544267196e-09,
  'HGNC:381': 5.2828214761836554e-12,
  'HGNC:644': 0.9999384718638125,
  'HGNC:651': 1.8656671170325e-12,
  'ungrounded': 6.152629463170691e-05})

In [36]:
logit = d.classifier.estimator.named_steps['logit']

In [37]:
logit.classes_

array(['FPLX:AQP', 'FPLX:Macrophage_inflammatory_pro', 'MESH:D000072277',
       'polymer', 'ungrounded'], dtype='<U32')

In [89]:
model_to_s3(disamb)