In [1]:
import os
import json
import pickle
import random
from collections import defaultdict, Counter

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator


from adeft_indra.ground.ground import AdeftGrounder
from adeft_indra.model_building.s3 import model_to_s3
from adeft_indra.model_building.escape import escape_filename
from adeft_indra.db.content import get_pmids_for_agent_text, get_pmids_for_entity, \
    get_plaintexts_for_pmids

In [2]:
# Grounder used further down to propose a grounding for each mined longform
# (via adeft_grounder.ground(longform)).
adeft_grounder = AdeftGrounder()

In [3]:
# Shortform(s) this notebook builds a disambiguation model for.
shortforms = ['CF']
# Model name: the filename-escaped shortforms, sorted and joined with ':'.
model_name = ':'.join(sorted(map(escape_filename, shortforms)))
# Absolute path of results/<model_name>, resolved two levels above the
# current working directory.
results_path = os.path.abspath(
    os.path.join('../..', 'results', model_name)
)

In [4]:
# --- Collect texts and mine candidate longforms for every shortform ---------
miners = dict()
all_texts = {}
for shortform in shortforms:
    pmids = get_pmids_for_agent_text(shortform)
    if len(pmids) > 10000:
        # Subsample WITHOUT replacement. random.choices draws with replacement,
        # so it can return the same pmid several times and silently shrink the
        # number of distinct texts fetched below.
        pmids = random.sample(list(pmids), k=10000)
    text_dict = get_plaintexts_for_pmids(pmids, contains=shortforms)
    # Keep only texts longer than 5 characters.
    text_dict = {pmid: text for pmid, text in text_dict.items() if len(text) > 5}
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(text_dict.values())
    all_texts.update(text_dict)

# --- Keep the longforms whose count * score exceeds 2 ----------------------
longform_dict = {}
for shortform in shortforms:
    longforms = miners[shortform].get_longforms()
    longforms = [(longform, count, score) for longform, count, score in longforms
                 if count*score > 2]
    longform_dict[shortform] = longforms

# --- Sum longform counts across all shortforms -----------------------------
combined_longforms = Counter()
for longform_rows in longform_dict.values():
    combined_longforms.update({longform: count for longform, count, score
                               in longform_rows})

# --- Propose an automatic grounding for each longform ----------------------
# Only the first result returned by the grounder is used; longforms with no
# result get no entry in grounding_map.
grounding_map = {}
names = {}
for longform in combined_longforms:
    groundings = adeft_grounder.ground(longform)
    if groundings:
        grounding = groundings[0]['grounding']
        grounding_map[longform] = grounding
        names[grounding] = groundings[0]['name']

# Parallel tuples of longforms and counts, most frequent first.
longforms, counts = zip(*combined_longforms.most_common())
pos_labels = []

In [5]:
# Display the mined longforms paired with their counts, most frequent first.
list(zip(longforms, counts))

[('cystic fibrosis', 2195),
 ('carboxyfluorescein', 134),
 ('cardiac fibroblasts', 112),
 ('climbing fiber', 107),
 ('coronary flow', 104),
 ('culture filtrate', 68),
 ('contact force', 38),
 ('chloroform', 30),
 ('caffeine', 27),
 ('counting factor', 19),
 ('cationized ferritin', 18),
 ('cocoa flavanol', 17),
 ('chemical fertilizer', 17),
 ('characteristic frequency', 17),
 ('compassion fatigue', 16),
 ('chloroform fraction', 16),
 ('clofibrate', 16),
 ('core factor', 16),
 ('compressive force', 15),
 ('cardiac fibrosis', 13),
 ('cytotoxic factor', 13),
 ('carbon fiber', 12),
 ('conductance filament', 11),
 ('carbon footprint', 10),
 ('cognitive function', 10),
 ('cells free', 10),
 ('climbing fibre', 10),
 ('cytoplasmic fragment', 10),
 ('chronic fatigue', 10),
 ('conventional fraction', 10),
 ('caspofungin', 10),
 ('core fucosylation', 9),
 ('conditions factor', 9),
 ('contraction frequency', 8),
 ('crude fiber', 8),
 ('competence factor', 8),
 ('calcium fructoborate', 7),
 ('comple

In [6]:
# Hand the automatic groundings to the grounding GUI; whatever it returns
# replaces grounding_map, names and pos_labels.
grounding_map, names, pos_labels = ground_with_gui(
    longforms,
    counts,
    grounding_map=grounding_map,
    names=names,
    pos_labels=pos_labels,
    no_browser=True,
    port=8890,
)

INFO: [2020-10-08 00:56:45] indra.ontology.bio.ontology - Loading INDRA bio ontology from cache at /home/ubuntu/.indra/bio_ontology/1.2/bio_ontology.pkl


In [7]:
# Bundle the three grounding structures into one list so they can be shown
# together.
result = [grounding_map, names, pos_labels]

In [8]:
# Display [grounding_map, names, pos_labels].
result

[{'caffeine': 'CHEBI:CHEBI:27732',
  'calcium folinate': 'CHEBI:CHEBI:31340',
  'calcium fructoborate': 'MESH:C507177',
  'calcofluor': 'ungrounded',
  'carbofuran': 'CHEBI:CHEBI:34611',
  'carbon fiber': 'MESH:D000077482',
  'carbon fibre': 'MESH:D000077482',
  'carbon foam': 'ungrounded',
  'carbon footprint': 'MESH:D058572',
  'carboxyfluorescein': 'CHEBI:CHEBI:138465',
  'cardiac failure': 'DOID:DOID:6000',
  'cardiac fibroblasts': 'CL:0002548',
  'cardiac fibrosis': 'ungrounded',
  'cardiorespiratory fitness': 'MESH:D000072599',
  'carthami flos': 'ungrounded',
  'caspofungin': 'CHEBI:CHEBI:474180',
  'cationized ferritin': 'MESH:C010297',
  'cellfood ™': 'ungrounded',
  'cells free': 'ungrounded',
  'center frequency': 'ungrounded',
  'cerebellofugal': 'ungrounded',
  'cf patients': 'ungrounded',
  'cftr knockout': 'ungrounded',
  'characteristic frequency': 'ungrounded',
  'chemical fertilizer': 'CHEBI:CHEBI:33287',
  'chemotactic factor': 'ungrounded',
  'chick fibroblasts': 'M

In [6]:
# Grounding info written out as a literal and unpacked into the same three
# names the ground_with_gui cell assigns:
#   1. grounding_map: longform -> grounding string ('ungrounded' if none)
#   2. names:         grounding string -> name
#   3. pos_labels:    list of grounding strings, later passed to
#                     AdeftClassifier(pos_labels=...)
# NOTE(review): this literal is not identical to the saved `result` output
# shown earlier (there 'cardiac fibroblasts' maps to 'CL:0002548', here to
# 'CL:CL:0002548'), and the saved classifier.stats output lists 'CL:0002548',
# 'CL:0002363' and 'OMIT:OMIT:0006446' while this literal uses
# 'CL:CL:0002548', 'CL:CL:0002363' and 'OMIT:0006446'. Presumably the literal
# was edited by hand after those outputs were produced -- confirm which
# prefix form is intended and re-run the notebook top to bottom.
grounding_map, names, pos_labels = [{'caffeine': 'CHEBI:CHEBI:27732',
  'calcium folinate': 'CHEBI:CHEBI:31340',
  'calcium fructoborate': 'MESH:C507177',
  'calcofluor': 'ungrounded',
  'carbofuran': 'CHEBI:CHEBI:34611',
  'carbon fiber': 'MESH:D000077482',
  'carbon fibre': 'MESH:D000077482',
  'carbon foam': 'ungrounded',
  'carbon footprint': 'MESH:D058572',
  'carboxyfluorescein': 'CHEBI:CHEBI:138465',
  'cardiac failure': 'DOID:DOID:6000',
  'cardiac fibroblasts': 'CL:CL:0002548',
  'cardiac fibrosis': 'ungrounded',
  'cardiorespiratory fitness': 'MESH:D000072599',
  'carthami flos': 'ungrounded',
  'caspofungin': 'CHEBI:CHEBI:474180',
  'cationized ferritin': 'MESH:C010297',
  'cellfood ™': 'ungrounded',
  'cells free': 'ungrounded',
  'center frequency': 'ungrounded',
  'cerebellofugal': 'ungrounded',
  'cf patients': 'ungrounded',
  'cftr knockout': 'ungrounded',
  'characteristic frequency': 'ungrounded',
  'chemical fertilizer': 'CHEBI:CHEBI:33287',
  'chemotactic factor': 'ungrounded',
  'chick fibroblasts': 'MESH:D005347',
  'chickpea flour': 'ungrounded',
  'chitosan fiber': 'ungrounded',
  'chloroform': 'CHEBI:CHEBI:35255',
  'chloroform fraction': 'CHEBI:CHEBI:35255',
  'chlorophyll fluorescent': 'ungrounded',
  'chondrogenic factor': 'ungrounded',
  'chronic fatigue': 'HP:HP:0012432',
  'ciprofloxacin': 'CHEBI:CHEBI:100241',
  'cisplatin and 5 fluorouracil': 'ungrounded',
  'citrovorum factor': 'CHEBI:CHEBI:63606',
  'clastogenic factor': 'ungrounded',
  'cleavage furrow': 'GO:GO:0032154',
  'climbing fiber': 'GO:GO:0044301',
  'climbing fibre': 'GO:GO:0044301',
  'clofibrate': 'CHEBI:CHEBI:3750',
  'cloud fraction': 'ungrounded',
  'cocoa flavanol': 'ungrounded',
  'coelomic fluid': 'ungrounded',
  'coffee': 'ungrounded',
  'cognitive fatigability': 'OMIT:0006446',
  'cognitive fatigue': 'OMIT:0006446',
  'cognitive frailty': 'MESH:D000073496',
  'cognitive function': 'GO:GO:0050890',
  'colony formation': 'ungrounded',
  'compassion fatigue': 'MESH:D000068376',
  'competence factor': 'ungrounded',
  'complement fixation': 'ungrounded',
  'complement fixing': 'ungrounded',
  'complementary food': 'MESH:D005502',
  'compressive film': 'ungrounded',
  'compressive flow': 'ungrounded',
  'compressive force': 'ungrounded',
  'concentration factor': 'ungrounded',
  'concordance factor': 'ungrounded',
  'conditions factor': 'ungrounded',
  'conditions fear': 'ungrounded',
  'conductance filament': 'ungrounded',
  'coniferyl ferulate': 'MESH:C044571',
  'constant flow': 'ungrounded',
  'constant frequency': 'ungrounded',
  'contact force': 'ungrounded',
  'contamination frequency': 'ungrounded',
  'continuous flooding': 'ungrounded',
  'continuous flow': 'ungrounded',
  'contractile force': 'ungrounded',
  'contraction frequency': 'ungrounded',
  'control fed': 'ungrounded',
  'conventional fraction': 'ungrounded',
  'core factor': 'ungrounded',
  'core fucosylation': 'GO:GO:0036065',
  'corneal fibroblasts': 'CL:CL:0002363',
  'corni fructus': 'ungrounded',
  'coronary blood flow': 'ungrounded',
  'coronary flow': 'ungrounded',
  'correction factor': 'ungrounded',
  'cortical fibroblasts': 'ungrounded',
  'counting factor': 'ungrounded',
  'crotonis fructus': 'ungrounded',
  'crude fiber': 'ungrounded',
  'culture filtrate': 'ungrounded',
  'cystatin f': 'HGNC:2479',
  'cystic fibrosis': 'MESH:D003550',
  'cytoplasmic fragment': 'ungrounded',
  'cytosolic fraction': 'ungrounded',
  'cytostatic protein factor': 'ungrounded',
  'cytotoxic factor': 'ungrounded',
  'f508del cftr homozygous': 'ungrounded',
  'of cm and': 'ungrounded'},
 {'CHEBI:CHEBI:27732': 'caffeine',
  'CHEBI:CHEBI:31340': 'Calcium folinate',
  'MESH:C507177': 'calcium fructoborate',
  'CHEBI:CHEBI:34611': 'carbofuran',
  'MESH:D000077482': 'Carbon Fiber',
  'MESH:D058572': 'Carbon Footprint',
  'CHEBI:CHEBI:138465': '5(6)-carboxyfluorescein',
  'DOID:DOID:6000': 'congestive heart failure',
  'CL:CL:0002548': 'fibroblast of cardiac tissue',
  'MESH:D000072599': 'Cardiorespiratory Fitness',
  'CHEBI:CHEBI:474180': 'caspofungin',
  'MESH:C010297': 'polycationic ferritin',
  'CHEBI:CHEBI:33287': 'fertilizer',
  'MESH:D005347': 'Fibroblasts',
  'CHEBI:CHEBI:35255': 'chloroform',
  'HP:HP:0012432': 'Chronic fatigue',
  'CHEBI:CHEBI:100241': 'ciprofloxacin',
  'CHEBI:CHEBI:63606': '(6S)-5-formyltetrahydrofolic acid',
  'GO:GO:0032154': 'cleavage furrow',
  'GO:GO:0044301': 'climbing fiber',
  'CHEBI:CHEBI:3750': 'clofibrate',
  'OMIT:0006446': 'Mental Fatigue',
  'MESH:D000073496': 'Frailty',
  'GO:GO:0050890': 'cognition',
  'MESH:D000068376': 'Compassion Fatigue',
  'MESH:D005502': 'Food',
  'MESH:C044571': 'coniferyl ferulate',
  'GO:GO:0036065': 'fucosylation',
  'CL:CL:0002363': 'keratocyte',
  'HGNC:2479': 'CST7',
  'MESH:D003550': 'Cystic Fibrosis'},
 ['CHEBI:CHEBI:138465',
  'CHEBI:CHEBI:33287',
  'CL:CL:0002548',
  'GO:GO:0044301',
  'MESH:C010297',
  'MESH:D000068376',
  'MESH:D003550']]

In [7]:
# Longforms listed here are left out when grounding_dict is built below.
excluded_longforms = []

In [8]:
# Per-shortform grounding map: for every mined longform of a shortform, keep
# its grounding unless the longform has no entry in grounding_map or is
# listed in excluded_longforms.
grounding_dict = {
    shortform: {
        longform: grounding_map[longform]
        for longform, _, _ in longform_rows
        if longform in grounding_map and longform not in excluded_longforms
    }
    for shortform, longform_rows in longform_dict.items()
}
result = [grounding_dict, names, pos_labels]

# makedirs also creates missing parent directories (results_path sits under
# '../../results'), and exist_ok=True removes the check-then-create race of
# os.path.exists followed by os.mkdir.
os.makedirs(results_path, exist_ok=True)
with open(os.path.join(results_path, f'{model_name}_preliminary_grounding_info.json'), 'w') as f:
    json.dump(result, f)

In [9]:
# Extra entities keyed by '<namespace>:<id>' strings: each key is split on the
# first ':' and passed to get_pmids_for_entity further down, and the dict is
# later merged into `names`. Empty for this model.
additional_entities = {}

In [10]:
# Maps an entity label to a collection of agent texts; texts fetched for those
# agent texts are appended to the corpus under that label further down.
# Empty for this model.
unambiguous_agent_texts = {}

In [11]:
# Build the labeled corpus from all collected texts, using the curated
# groundings; each text is passed in together with its pmid.
labeler = AdeftLabeler(grounding_dict)
corpus = labeler.build_from_texts(
    (text, pmid) for pmid, text in all_texts.items()
)

# Group the ids of the corpus rows by the label each row received.
agent_text_pmid_map = defaultdict(list)
for text, label, id_ in corpus:
    agent_text_pmid_map[label].append(id_)

# For every additional entity, the set of pmids returned by
# get_pmids_for_entity(<namespace>, <id>, major_topic=True).
entity_pmid_map = {
    entity: set(
        get_pmids_for_entity(*entity.split(':', maxsplit=1), major_topic=True)
    )
    for entity in additional_entities
}

In [15]:
# Pairwise pmid overlap between additional entities (every ordered pair,
# including each entity with itself).
intersection1 = [
    (entity1, entity2, len(pmids1 & pmids2))
    for entity1, pmids1 in entity_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [16]:
# pmid overlap between every corpus label and every additional entity.
intersection2 = [
    (entity1, entity2, len(set(pmids1) & pmids2))
    for entity1, pmids1 in agent_text_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [17]:
# Display the entity/entity overlaps.
# NOTE(review): the saved output mentions 'HGNC:17981', which is not a key of
# additional_entities as defined in this notebook (it is empty) -- the output
# looks stale; re-run to refresh.
intersection1

[('HGNC:17981', 'HGNC:17981', 29)]

In [18]:
# Display the label/entity overlaps.
# NOTE(review): the saved output mentions 'HGNC:17981', which is not a key of
# additional_entities as defined in this notebook (it is empty) -- the output
# looks stale; re-run to refresh.
intersection2

[('ungrounded', 'HGNC:17981', 0)]

In [20]:
# --- Add texts for unambiguous agent texts ---------------------------------
# For each entity, fetch texts for its agent texts that are not already in
# all_texts and were not already fetched for this entity, and append them to
# the corpus labeled with the entity.
all_used_pmids = set()
for entity, agent_texts in unambiguous_agent_texts.items():
    used_pmids = set()
    for agent_text in agent_texts:
        pmids = set(get_pmids_for_agent_text(agent_text))
        new_pmids = list(pmids - all_texts.keys() - used_pmids)
        text_dict = get_plaintexts_for_pmids(new_pmids, contains=agent_texts)
        corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])
        used_pmids.update(new_pmids)
    all_used_pmids.update(used_pmids)

# --- Add texts for additional entities -------------------------------------
for entity, pmids in entity_pmid_map.items():
    new_pmids = list(set(pmids) - all_texts.keys() - all_used_pmids)
    if len(new_pmids) > 10000:
        # Subsample WITHOUT replacement; random.choices draws with replacement
        # and would return duplicate pmids.
        new_pmids = random.sample(new_pmids, k=10000)
    # NOTE(review): this `contains` list is hard-coded and unrelated to the
    # 'CF' shortform or to `entity` -- it looks carried over from a different
    # model's notebook. Harmless while additional_entities is empty; replace
    # it with the synonyms of `entity` before adding any entity.
    text_dict = get_plaintexts_for_pmids(new_pmids, contains=['RTCA', 'RTCD1', 'RPC', 'RTC1', 'RTC'])
    corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])

In [None]:
# Merge the additional entities into `names`.
# (Fixes a NameError: the argument was misspelled `additional_entitie`.)
names.update(additional_entities)

In [12]:
%%capture

classifier = AdeftClassifier(shortforms, pos_labels=pos_labels, random_state=1729)
param_grid = {'C': [100.0], 'max_features': [10000]}
texts, labels, pmids = zip(*corpus)
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=5)

INFO: [2020-10-23 03:12:21] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-10-23 03:13:56] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Best f1 score of 0.9634236850748318 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [13]:
# Display the statistics stored on the classifier after cross-validation.
classifier.stats

{'label_distribution': {'ungrounded': 411,
  'MESH:C010297': 16,
  'MESH:D003550': 1489,
  'GO:GO:0044301': 87,
  'CHEBI:CHEBI:27732': 13,
  'CHEBI:CHEBI:138465': 84,
  'CHEBI:CHEBI:3750': 9,
  'MESH:D005347': 1,
  'CHEBI:CHEBI:63606': 4,
  'CHEBI:CHEBI:100241': 6,
  'CHEBI:CHEBI:35255': 35,
  'DOID:DOID:6000': 4,
  'CL:0002548': 78,
  'CHEBI:CHEBI:31340': 3,
  'CHEBI:CHEBI:33287': 10,
  'CHEBI:CHEBI:34611': 4,
  'GO:GO:0032154': 4,
  'MESH:C507177': 6,
  'CHEBI:CHEBI:474180': 3,
  'CL:0002363': 3,
  'HP:HP:0012432': 5,
  'GO:GO:0036065': 6,
  'HGNC:2479': 3,
  'MESH:D000068376': 7,
  'MESH:D000077482': 12,
  'MESH:C044571': 2,
  'GO:GO:0050890': 7,
  'MESH:D058572': 7,
  'MESH:D005502': 2,
  'OMIT:OMIT:0006446': 4,
  'MESH:D000072599': 4,
  'MESH:D000073496': 3},
 'f1': {'mean': 0.963424, 'std': 0.004772},
 'precision': {'mean': 0.957548, 'std': 0.007412},
 'recall': {'mean': 0.972897, 'std': 0.005518},
 'MESH:D003550': {'f1': {'mean': 0.977536, 'std': 0.00394},
  'pr': {'mean': 0.993

In [13]:
# Combine the trained classifier with the grounding dict and names into a
# disambiguator object.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [14]:
# Dump the disambiguator under model_name into results_path
# (presumably writes the model files to that directory -- confirm in adeft).
disamb.dump(model_name, results_path)

In [15]:
# Print the model summary text; print() is used so the tabs and newlines in
# the text are rendered rather than shown escaped.
print(disamb.info())

Disambiguation model for CF

Produces the disambiguations:
	(6S)-5-formyltetrahydrofolic acid	CHEBI:CHEBI:63606
	5(6)-carboxyfluorescein*	CHEBI:CHEBI:138465
	CST7	HGNC:2479
	Calcium folinate	CHEBI:CHEBI:31340
	Carbon Fiber	MESH:D000077482
	Carbon Footprint	MESH:D058572
	Cardiorespiratory Fitness	MESH:D000072599
	Chronic fatigue	HP:HP:0012432
	Compassion Fatigue*	MESH:D000068376
	Cystic Fibrosis*	MESH:D003550
	Fibroblasts	MESH:D005347
	Food	MESH:D005502
	Frailty	MESH:D000073496
	Mental Fatigue	OMIT:0006446
	caffeine	CHEBI:CHEBI:27732
	calcium fructoborate	MESH:C507177
	carbofuran	CHEBI:CHEBI:34611
	caspofungin	CHEBI:CHEBI:474180
	chloroform	CHEBI:CHEBI:35255
	ciprofloxacin	CHEBI:CHEBI:100241
	cleavage furrow	GO:GO:0032154
	climbing fiber*	GO:GO:0044301
	clofibrate	CHEBI:CHEBI:3750
	cognition	GO:GO:0050890
	congestive heart failure	DOID:DOID:6000
	coniferyl ferulate	MESH:C044571
	fertilizer*	CHEBI:CHEBI:33287
	fibroblast of cardiac tissue*	CL:CL:0002548
	fucosylation	GO:GO:0036065
	kerat

In [16]:
# Hand the finished disambiguator to model_to_s3
# (presumably uploads it to S3 -- confirm in adeft_indra.model_building.s3).
model_to_s3(disamb)