In [1]:
import os
import json
import pickle
import random
from collections import defaultdict, Counter

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator


from adeft_indra.ground.ground import AdeftGrounder
from adeft_indra.model_building.s3 import model_to_s3
from adeft_indra.model_building.escape import escape_filename
from adeft_indra.db.content import get_pmids_for_agent_text, get_pmids_for_entity, \
    get_plaintexts_for_pmids

In [2]:
# Grounder used further down to propose an automatic grounding for each
# mined longform (via adeft_grounder.ground(longform)).
adeft_grounder = AdeftGrounder()

In [3]:
# Shortforms handled by this disambiguation model.
shortforms = ['MSC', 'MSCs']

# The model name is the filename-escaped shortforms, sorted and joined by ':'.
escaped_shortforms = sorted(map(escape_filename, shortforms))
model_name = ':'.join(escaped_shortforms)

# Absolute path of the directory that receives this model's output files.
results_dir = os.path.join('../..', 'results', model_name)
results_path = os.path.abspath(results_dir)

In [4]:
# Step 1: for every shortform, fetch the texts it appears in and feed them to
# an AdeftMiner. all_texts accumulates every retained text keyed by PMID.
miners = dict()
all_texts = {}
for shortform in shortforms:
    pmids = get_pmids_for_agent_text(shortform)
    text_dict = get_plaintexts_for_pmids(pmids, contains=shortforms)
    # Discard texts of five characters or fewer.
    text_dict = {pmid: text for pmid, text in text_dict.items() if len(text) > 5}
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(text_dict.values())
    all_texts.update(text_dict)

# Step 2: keep, per shortform, only the mined longforms whose count * score
# exceeds 2.
longform_dict = {}
for shortform in shortforms:
    longforms = miners[shortform].get_longforms()
    longforms = [(longform, count, score) for longform, count, score in longforms
                 if count*score > 2]
    longform_dict[shortform] = longforms

# Step 3: merge the per-shortform longforms into one Counter. Counter.update
# adds counts, so a longform found for several shortforms has its counts summed.
combined_longforms = Counter()
for longform_rows in longform_dict.values():
    combined_longforms.update({longform: count for longform, count, score
                               in longform_rows})

# Step 4: propose a grounding for each longform, taking the first result the
# grounder returns; longforms with no result stay out of grounding_map.
grounding_map = {}
names = {}
for longform in combined_longforms:
    groundings = adeft_grounder.ground(longform)
    if groundings:
        grounding = groundings[0]['grounding']
        grounding_map[longform] = grounding
        names[grounding] = groundings[0]['name']

# Step 5: split into parallel tuples ordered by descending count.
# zip(*[]) yields nothing, so unpacking it into two names would raise
# ValueError when no longform survived the filter; fall back to empty tuples.
if combined_longforms:
    longforms, counts = zip(*combined_longforms.most_common())
else:
    longforms, counts = (), ()

In [None]:
# Manual curation step: ground_with_gui receives the mined longforms, their
# counts and the automatic groundings/names; its three return values replace
# grounding_map and names and define pos_labels.
# NOTE(review): presumably this serves a curation GUI on port 8890 and blocks
# until the curator submits; the result is not reproducible from code alone.
grounding_map, names, pos_labels = ground_with_gui(longforms, counts, 
                                                   grounding_map=grounding_map,
                                                   names=names, no_browser=True, port=8890)

In [7]:
# Bundle the three outputs of the grounding step into one list.
result = [grounding_map, names, pos_labels]

In [8]:
# Display the bundled grounding_map, names and pos_labels.
result

[{'marijuana smoke condensate': 'ungrounded',
  'mcs': 'ungrounded',
  'mechanosensitive channel': 'FPLX:Mechanosensitive_Ion_Channels',
  'mechanosensitive channels': 'FPLX:Mechanosensitive_Ion_Channels',
  'mechanosensitive ion channels': 'FPLX:Mechanosensitive_Ion_Channels',
  'mesenchymal': 'MESH:D059630',
  'mesenchymal stem cells': 'MESH:D059630',
  'methanol extract of sorbus commixta cortex': 'ungrounded',
  'methyl l selenocysteine': 'CHEBI:CHEBI:27812',
  'methyl selenocysteine': 'CHEBI:CHEBI:27812',
  'methylselenocysteine': 'CHEBI:CHEBI:27812',
  'molecular sentinel on chip': 'ungrounded',
  'msc': 'ungrounded',
  'msc conditions medium': 'MESH:D059630',
  'mscs': 'ungrounded',
  'multi aminoacyl trna synthetase complex': 'MESH:D000604',
  'multi synthetase complex': 'MESH:D000604',
  'multi trna synthetase complex': 'MESH:D000604',
  'multifocal serpiginoid choroiditis': 'MESH:D000080364',
  'multiple aminoacyl trna synthetase complex': 'MESH:D000604',
  'multiple scatter 

In [26]:
# Hard-coded grounding results, pinned here so the remaining cells can run
# without repeating the interactive grounding step.

# Longform -> grounding ('ungrounded' where no grounding was assigned).
grounding_map = {
    'marijuana smoke condensate': 'ungrounded',
    'mcs': 'ungrounded',
    'mechanosensitive channel': 'FPLX:Mechanosensitive_ion_channels',
    'mechanosensitive channels': 'FPLX:Mechanosensitive_ion_channels',
    'mechanosensitive ion channels': 'FPLX:Mechanosensitive_ion_channels',
    'mesenchymal': 'MESH:D059630',
    'mesenchymal stem cells': 'MESH:D059630',
    'methanol extract of sorbus commixta cortex': 'ungrounded',
    'methyl l selenocysteine': 'CHEBI:CHEBI:27812',
    'methyl selenocysteine': 'CHEBI:CHEBI:27812',
    'methylselenocysteine': 'CHEBI:CHEBI:27812',
    'molecular sentinel on chip': 'ungrounded',
    'msc': 'ungrounded',
    'msc conditions medium': 'MESH:D059630',
    'mscs': 'ungrounded',
    'multi aminoacyl trna synthetase complex': 'MESH:D000604',
    'multi synthetase complex': 'MESH:D000604',
    'multi trna synthetase complex': 'MESH:D000604',
    'multifocal serpiginoid choroiditis': 'MESH:D000080364',
    'multiple aminoacyl trna synthetase complex': 'MESH:D000604',
    'multiple scatter correction': 'ungrounded',
    'multisynthetase complex': 'MESH:D000604',
    'muscimol': 'CHEBI:CHEBI:7035',
    'musculin': 'HGNC:7321',
    'musculoskeletal complaints': 'ungrounded',
    'se methylseleno l cysteine': 'CHEBI:CHEBI:27812',
}

# Grounding -> standard name.
names = {
    'FPLX:Mechanosensitive_ion_channels': 'Mechanosensitive_ion_channels',
    'MESH:D059630': 'Mesenchymal Stem Cells',
    'CHEBI:CHEBI:27812': 'Se-methyl-L-selenocysteine',
    'MESH:D000604': 'Amino Acyl-tRNA Synthetases',
    'MESH:D000080364': 'Multifocal Choroiditis',
    'CHEBI:CHEBI:7035': 'muscimol',
    'HGNC:7321': 'MSC',
}

# Groundings treated as positive labels.
pos_labels = [
    'CHEBI:CHEBI:27812',
    'CHEBI:CHEBI:7035',
    'FPLX:Mechanosensitive_ion_channels',
    'HGNC:7321',
    'MESH:D000080364',
    'MESH:D000604',
    'MESH:D059630',
]

In [27]:
# Longforms left out of grounding_dict when it is built from grounding_map.
excluded_longforms = ['msc', 'mcs', 'mscs']

In [28]:
# For each shortform, map its mined longforms to their groundings, keeping
# only longforms that have an entry in grounding_map and are not excluded.
grounding_dict = {shortform: {longform: grounding_map[longform]
                              for longform, _, _ in longforms if longform in grounding_map
                              and longform not in excluded_longforms}
                  for shortform, longforms in longform_dict.items()}
result = [grounding_dict, names, pos_labels]

# makedirs(exist_ok=True) also creates a missing parent directory and does not
# fail if the directory already exists, unlike an exists() check plus mkdir().
os.makedirs(results_path, exist_ok=True)
# Persist the preliminary grounding information as JSON.
with open(os.path.join(results_path, f'{model_name}_preliminary_grounding_info.json'), 'w') as f:
    json.dump(result, f)

In [29]:
# Extra entities to collect training texts for. Each value is
# [name, texts]: the name is later copied into `names`, and the list is later
# passed as the `contains` argument when fetching plaintexts for the entity.
additional_entities = {'HGNC:7321': ['MSC', ['MSC', 'musculin', 'Musculin']]}

In [30]:
# Entities with agent texts used to gather extra labeled texts. Each value is
# [name, agent_texts]: the name is later copied into `names`, and PMIDs are
# later looked up for each agent text in the list.
unambiguous_agent_texts = {'HGNC:7321': ['MSC', ['musculin', 'Musculin']]}

In [31]:
# Build the labeled corpus from all collected texts; each corpus row is
# unpacked below as (text, label, id).
labeler = AdeftLabeler(grounding_dict)
text_pmid_pairs = ((text, pmid) for pmid, text in all_texts.items())
corpus = labeler.build_from_texts(text_pmid_pairs)

# Group the ids of the corpus rows by the label they received.
agent_text_pmid_map = defaultdict(list)
for text, label, id_ in corpus:
    agent_text_pmid_map[label].append(id_)

# For each additional entity 'NS:ID', split on the first ':' and collect the
# PMIDs returned for that entity with major_topic=True.
entity_pmid_map = {
    entity: set(get_pmids_for_entity(*entity.split(':', maxsplit=1),
                                     major_topic=True))
    for entity in additional_entities
}

In [32]:
# Number of PMIDs shared by every ordered pair of additional entities
# (including each entity paired with itself).
intersection1 = [
    (entity1, entity2, len(pmids1 & pmids2))
    for entity1, pmids1 in entity_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [33]:
# Number of PMIDs shared between each corpus label and each additional entity.
intersection2 = [
    (entity1, entity2, len(set(pmids1) & pmids2))
    for entity1, pmids1 in agent_text_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [34]:
# Display the PMID overlap counts between additional entities.
intersection1

[('HGNC:7321', 'HGNC:7321', 21)]

In [35]:
# Display the PMID overlap counts between corpus labels and additional entities.
intersection2

[('CHEBI:CHEBI:27812', 'HGNC:7321', 0),
 ('MESH:D059630', 'HGNC:7321', 0),
 ('ungrounded', 'HGNC:7321', 0),
 ('MESH:D000604', 'HGNC:7321', 0),
 ('CHEBI:CHEBI:7035', 'HGNC:7321', 0),
 ('FPLX:Mechanosensitive_ion_channels', 'HGNC:7321', 0),
 ('HGNC:7321', 'HGNC:7321', 0),
 ('MESH:D000080364', 'HGNC:7321', 0)]

In [36]:
# Part 1: add texts found through each entity's unambiguous agent texts,
# labeled with that entity. PMIDs already in all_texts, or already used for
# the same entity, are skipped.
all_used_pmids = set()
for entity, agent_texts in unambiguous_agent_texts.items():
    used_pmids = set()
    for agent_text in agent_texts[1]:
        pmids = set(get_pmids_for_agent_text(agent_text))
        new_pmids = list(pmids - all_texts.keys() - used_pmids)
        # NOTE(review): `agent_texts` is the whole [name, [texts]] entry, not
        # just the list of texts -- confirm this is what `contains` expects.
        text_dict = get_plaintexts_for_pmids(new_pmids, contains=agent_texts)
        corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items() if len(text) >= 5])
        used_pmids.update(new_pmids)
    all_used_pmids.update(used_pmids)

# Part 2: add texts for PMIDs associated with each additional entity, skipping
# PMIDs already in all_texts or consumed in part 1.
for entity, pmids in entity_pmid_map.items():
    new_pmids = list(set(pmids) - all_texts.keys() - all_used_pmids)
    if len(new_pmids) > 10000:
        # Cap at 10000 PMIDs. random.sample draws without replacement;
        # random.choices draws with replacement and could pick the same PMID
        # several times, adding duplicate rows to the corpus.
        new_pmids = random.sample(new_pmids, k=10000)
    _, contains = additional_entities[entity]
    text_dict = get_plaintexts_for_pmids(new_pmids, contains=contains)
    corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items() if len(text) >= 5])

In [37]:
# Register the name (first element of each value) of every additional and
# unambiguous entity; the unambiguous entries are applied last and so win on
# a shared key.
for extra_entities in (additional_entities, unambiguous_agent_texts):
    for key, value in extra_entities.items():
        names[key] = value[0]

# Every such entity also becomes a positive label.
pos_labels = list(
    set(pos_labels) | additional_entities.keys() | unambiguous_agent_texts.keys()
)

In [38]:
%%capture

# Train the classifier with a grid search over a single parameter point
# (C=100.0, max_features=10000), called with cv=5 and n_jobs=5.
classifier = AdeftClassifier(shortforms, pos_labels=pos_labels, random_state=1729)
param_grid = {'C': [100.0], 'max_features': [10000]}
# Split corpus rows into parallel tuples. This rebinds the notebook-level
# names `texts`, `labels` and `pmids`.
texts, labels, pmids = zip(*corpus)
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=5)

INFO: [2020-10-30 03:09:04] /adeft/PP/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-10-30 03:10:52] /adeft/PP/adeft/adeft/modeling/classify.py - Best f1 score of 0.9880264679919669 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [39]:
# Display the label distribution and cross-validation metrics.
classifier.stats

{'label_distribution': {'CHEBI:CHEBI:27812': 49,
  'MESH:D059630': 4506,
  'ungrounded': 13,
  'MESH:D000604': 23,
  'CHEBI:CHEBI:7035': 2,
  'FPLX:Mechanosensitive_ion_channels': 19,
  'HGNC:7321': 11,
  'MESH:D000080364': 3},
 'f1': {'mean': 0.988026, 'std': 0.003813},
 'precision': {'mean': 0.985542, 'std': 0.004788},
 'recall': {'mean': 0.991545, 'std': 0.002862},
 'MESH:D059630': {'f1': {'mean': 0.994262, 'std': 0.001619},
  'pr': {'mean': 0.999556, 'std': 0.000888},
  'rc': {'mean': 0.989028, 'std': 0.002926}},
 'MESH:D000604': {'f1': {'mean': 0.949206, 'std': 0.063014},
  'pr': {'mean': 0.91, 'std': 0.111355},
  'rc': {'mean': 1.0, 'std': 0.0}},
 'CHEBI:CHEBI:7035': {'f1': {'mean': 0.0, 'std': 0.0},
  'pr': {'mean': 0.0, 'std': 0.0},
  'rc': {'mean': 0.0, 'std': 0.0}},
 'CHEBI:CHEBI:27812': {'f1': {'mean': 0.842484, 'std': 0.094266},
  'pr': {'mean': 0.74, 'std': 0.149666},
  'rc': {'mean': 1.0, 'std': 0.0}},
 'ungrounded': {'f1': {'mean': 0.0, 'std': 0.0},
  'pr': {'mean': 0.0,

In [40]:
# Combine the trained classifier with the grounding dictionary and names.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [41]:
# Write the disambiguator out under model_name, passing results_path as the
# target location.
disamb.dump(model_name, results_path)

In [42]:
# Print the model summary: disambiguations produced and per-class metrics.
print(disamb.info())

Disambiguation model for MSC, and MSCs

Produces the disambiguations:
	Amino Acyl-tRNA Synthetases*	MESH:D000604
	MSC*	HGNC:7321
	Mechanosensitive_ion_channels*	FPLX:Mechanosensitive_ion_channels
	Mesenchymal Stem Cells*	MESH:D059630
	Multifocal Choroiditis*	MESH:D000080364
	Se-methyl-L-selenocysteine*	CHEBI:CHEBI:27812
	muscimol*	CHEBI:CHEBI:7035

Class level metrics:
--------------------
Grounding                    	Count	F1     
       Mesenchymal Stem Cells*	4506	0.99426
   Se-methyl-L-selenocysteine*	  49	0.84248
  Amino Acyl-tRNA Synthetases*	  23	0.94921
Mechanosensitive_ion_channels*	  19	0.64762
                   Ungrounded	  13	    0.0
                          MSC*	  11	0.21333
       Multifocal Choroiditis*	   3	    0.0
                     muscimol*	   2	    0.0

Weighted Metrics:
-----------------
	F1 score:	0.98803
	Precision:	0.98554
	Recall:		0.99154

* Positive labels
See Docstring for explanation



In [51]:
# NOTE(review): presumably uploads the model to S3 (external side effect) --
# confirm before re-running this cell.
model_to_s3(disamb)

In [44]:
# Run the disambiguator on every collected text, in all_texts order.
preds = list(map(disamb.disambiguate, all_texts.values()))

In [45]:
# Keep the texts whose prediction has 'HGNC:7321' as its first element.
texts = []
for pred, text in zip(preds, all_texts.values()):
    if pred[0] == 'HGNC:7321':
        texts.append(text)

In [46]:
# Number of texts predicted as HGNC:7321.
len(texts)

4

In [50]:
# Inspect one of the texts predicted as HGNC:7321.
texts[3]

'This study aimed to explore whether intrauterine infusion of peripheral blood mononuclear cells (PBMCs) could induce favorable transcriptomic changes in the endometrium for embryo implantation and the potential mechanism. Twenty-one mice were randomly divided to five groups, including a normal pregnancy (NP) group, an embryo implantation dysfunction (EID) group, an EID with human chorionic gonadotropin (hCG) group, an EID with PBMCs group, and an EID with hCG co-cultured with PBMCs group. The endometrium in the implantation window from mice were collected and determined by RNA sequencing (RNA-Seq), and the expression of significantly different genes with high degree of coincidence was recommended and validated by quantitative real-time polymerase chain reaction (qRT-PCR). There were totally 1,366 up-regulated and 1,374 down-regulated genes in the EID mice compared with the normal pregnant mice. We selected (fold change ≥2, P<0.05) and verified the candidate genes associated with embry