In [2]:
import os
import pickle

from indra.databases.hgnc_client import get_hgnc_name
from indra.literature.adeft_tools import universal_extract_text

from adeft.disambiguate import load_disambiguator
from adeft.modeling.find_anomalies import AdeftAnomalyDetector

INFO: [2019-12-03 13:07:34] /Users/albertsteppi/adeft/adeft/recognize.py - OneShotRecognizer not available. AdeftLongformScorer has not been built successfully.


In [117]:
def build_models(shortform):
    """Cross-validate and save an anomaly detector per HGNC grounding.

    Loads the disambiguator for ``shortform`` and the pickled content at
    ``data/content/{shortform}_content.pkl``, extracts text from every
    non-empty content value and disambiguates the extracted texts. Then,
    for each grounding in ``disambiguator.names`` that starts with
    ``'HGNC'``:

    * the gene name is looked up with ``get_hgnc_name`` and the pickled
      content at ``data/entrez_content/{hgnc_name}_content.pkl`` is loaded;
    * an ``AdeftAnomalyDetector`` is created with ``shortform`` blacklisted
      and ``detector.cv`` is run on the Entrez texts together with the texts
      whose top disambiguation differs from the grounding and whose
      ``disamb[2]`` values are exactly ``{0.0, 1.0}``;
    * feature importances, sensitivity, specificity, best parameters and
      CV results are pickled to
      ``results/{shortform}/{grounding}/results.pkl``.

    Parameters
    ----------
    shortform : str
        Shortform whose disambiguator and content files are used.

    Returns
    -------
    None
        Results are written to disk; progress markers are printed.

    Raises
    ------
    FileNotFoundError
        If one of the expected pickle files cannot be opened.
    """
    disambiguator = load_disambiguator(shortform)
    # The numbered prints are progress markers; they are kept as-is because
    # the recorded cell output of this notebook shows them.
    print(0)
    # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
    with open(f'data/content/{shortform}_content.pkl', 'rb') as f:
        content = pickle.load(f)
    texts = [universal_extract_text(text, contains=[shortform])
             for text in content.values() if text]
    disambs = disambiguator.disambiguate(texts)
    print(1)
    for grounding in disambiguator.names:
        # Only gene (HGNC) groundings get an anomaly detector.
        if not grounding.startswith('HGNC'):
            continue
        print(2, grounding)
        # 'HGNC:381' -> '381'; plain split is equivalent to the former
        # rsplit (no maxsplit) and does not suggest a right-anchored split.
        hgnc_id = grounding.split(':')[1]
        hgnc_name = get_hgnc_name(hgnc_id)
        with open(f'data/entrez_content/{hgnc_name}_content.pkl', 'rb') as f:
            entrez_content = pickle.load(f)
        entrez_texts = [universal_extract_text(text)
                        for text in entrez_content if text]
        detector = AdeftAnomalyDetector(blacklist=[shortform])
        print(3, grounding)
        # Texts assigned to some other grounding, restricted to those whose
        # disamb[2] values are exactly 0.0 and 1.0.
        # NOTE(review): disamb[2] is presumably a grounding -> probability
        # mapping -- confirm against the disambiguator's return format.
        anomalous_texts = [text for text, disamb in zip(texts, disambs)
                           if disamb[0] != grounding
                           and set(disamb[2].values()) == {0.0, 1.0}]
        param_grid = {'nu': [0.01, 0.05, 0.1, 0.2],
                      'max_features': [10, 50, 100],
                      'ngram_range': [(1, 1)]}
        detector.cv(entrez_texts, anomalous_texts, param_grid,
                    n_jobs=10, cv=5)
        print(4, grounding)
        result = {'fi': detector.feature_importances(),
                  'sens': detector.sensitivity,
                  'spec': detector.specificity,
                  'params': detector.best_params,
                  'cv': detector.cv_results}
        directory = os.path.join('results', shortform, grounding)
        print(5, grounding)
        # exist_ok avoids the check-then-create race of
        # os.path.exists() followed by os.makedirs().
        os.makedirs(directory, exist_ok=True)
        with open(os.path.join(directory, 'results.pkl'), 'wb') as f:
            pickle.dump(result, f)
        print(6, grounding)

In [118]:
build_models('AR')

0
1
2 HGNC:381
3 HGNC:381


INFO: [2019-12-02 21:21:44] /Users/albertsteppi/adeft/adeft/modeling/find_anomalies.py - Best score of 0.4565571641269406 found for parameter values:
{'nu': 0.2, 'max_features': 100, 'ngram_range': (1, 1)}


4 HGNC:381
5 HGNC:381
6 HGNC:381
2 HGNC:644
3 HGNC:644


INFO: [2019-12-02 21:21:56] /Users/albertsteppi/adeft/adeft/modeling/find_anomalies.py - Best score of 0.4746048709479087 found for parameter values:
{'nu': 0.05, 'max_features': 100, 'ngram_range': (1, 1)}


4 HGNC:644
5 HGNC:644
6 HGNC:644
2 HGNC:651
3 HGNC:651


INFO: [2019-12-02 21:22:19] /Users/albertsteppi/adeft/adeft/modeling/find_anomalies.py - Best score of 0.5677292102745168 found for parameter values:
{'nu': 0.2, 'max_features': 100, 'ngram_range': (1, 1)}


4 HGNC:651
5 HGNC:651
6 HGNC:651


In [112]:
# Load the pickled results stored for grounding HGNC:644 of shortform 'AR'.
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
with open('results/AR/HGNC:644/results.pkl', 'rb') as f:
    results = pickle.load(f)

In [22]:
# Load the pickled results stored for grounding HGNC:9475 of shortform 'SP'.
# NOTE(review): this path goes through the parent directory ('../results'),
# unlike the 'results/...' path used elsewhere in this notebook -- presumably
# the cell was run from a different working directory; confirm.
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
with open('../results/SP/HGNC:9475/results.pkl', 'rb') as f:
    results = pickle.load(f)

In [23]:
results['params']

{'nu': 0.2, 'max_features': 50, 'ngram_range': (1, 1)}

In [25]:
results['sens']

0.7744022503516175

In [26]:
results['fi']

[('human', 3.307356495161779),
 ('trypsin', 3.132875633229689),
 ('pancreatitis', 3.0586583365015585),
 ('cells', 2.9852756180753177),
 ('proteins', 2.9726053238405408),
 ('protein', 2.797035123374756),
 ('activation', 2.5971825801804536),
 ('prss1', 2.584707510910278),
 ('cell', 2.4646361967198516),
 ('dna', 2.3697967467756316),
 ('cancer', 2.1962642927111156),
 ('patients', 2.1522461777374993),
 ('gene', 2.1519254622129322),
 ('genes', 2.0809366224966985),
 ('trypsinogen', 1.8783487109616448),
 ('mutation', 1.8393236679536735),
 ('pancreatic', 1.790170350762518),
 ('inhibitor', 1.7881957969319557),
 ('associated', 1.7090159665939157),
 ('identified', 1.595842500907057),
 ('expression', 1.5595286275304117),
 ('spink1', 1.540078569499547),
 ('using', 1.498263282181214),
 ('chronic', 1.491425523797326),
 ('risk', 1.4858060569370521),
 ('acute', 1.440768648117897),
 ('family', 1.3828644810453694),
 ('cftr', 1.3762290999569942),
 ('cp', 1.3533502588013024),
 ('variants', 1.334401218894839

In [107]:
disamb.names

{'FPLX:ADRB': 'ADRB',
 'GO:GO:0007340': 'Acrosome Reaction',
 'HGNC:381': 'AKR1B1',
 'HGNC:644': 'AR',
 'HGNC:651': 'AREG'}

In [82]:
disamb.names

{'HGNC:13633': 'ADIPOQ',
 'MESH:D000437': 'Alcoholism',
 'MESH:D000544': 'Alzheimer Disease',
 'MESH:D000735': 'Androstenedione',
 'MESH:D003876': 'Dermatitis, Atopic',
 'MESH:D004837': 'Epinephrine'}

In [81]:
t.rsplit(':')

['HGNC', '13633']

In [56]:
# Inputs for the step-by-step cells below: the shortform, plus the name and
# grounding of one gene it can refer to.
agent_text = 'AD'
hgnc_name = 'ADIPOQ'  # used to build the Entrez content pickle path
hgnc_id = 'HGNC:13633'  # full grounding string, 'HGNC:' prefix included

In [58]:
# Load the pickled inputs for the chosen shortform and gene.
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
with open(f'data/content/{agent_text}_refs.pkl', 'rb') as f:
    refs = pickle.load(f)  # not referenced by the other cells visible here
with open(f'data/content/{agent_text}_content.pkl', 'rb') as f:
    content = pickle.load(f)  # iterated with .values() when building texts
with open(f'data/entrez_content/{hgnc_name}_content.pkl', 'rb') as f:
    entrez_content = pickle.load(f)  # iterated directly when building entrez_texts

In [59]:
disamb = load_disambiguator(agent_text)

In [60]:
disamb.grounding_dict

{'AD': {'s disease': 'MESH:D000544',
  'atopic dermatitis': 'MESH:D003876',
  'alcohol dependence': 'MESH:D000437',
  'axial diffusivity': 'ungrounded',
  'anaerobic digestion': 'ungrounded',
  'activation domain': 'ungrounded',
  'autosomal dominant': 'ungrounded',
  'antidepressant': 'ungrounded',
  's': 'ungrounded',
  'androstenedione': 'MESH:D000735',
  'afterdischarge': 'ungrounded',
  'dementia': 'ungrounded',
  'aortic dissection': 'ungrounded',
  'autonomic dysreflexia': 'ungrounded',
  'androgen deprivation': 'ungrounded',
  'adenocarcinoma': 'ungrounded',
  'aortic distensibility': 'ungrounded',
  'adenosine': 'ungrounded',
  'disorders': 'ungrounded',
  'adiponectin': 'HGNC:13633',
  'anoxic depolarization': 'ungrounded',
  'actinomycin d': 'ungrounded',
  'atherogenic diet': 'ungrounded',
  'p': 'ungrounded',
  'adrenaline': 'MESH:D004837',
  'patients': 'ungrounded',
  'alzheimer': 'MESH:D000544',
  '3 17 dione': 'ungrounded',
  'cases': 'ungrounded',
  'adriamycin': 'ung

In [61]:
# Extract text from every non-empty content value, passing the shortform
# through the `contains` argument.
texts = [
    universal_extract_text(entry, contains=[agent_text])
    for entry in content.values()
    if entry
]

In [62]:
len(texts)

13133

In [63]:
# Extract text from every non-empty item of the gene's Entrez content.
entrez_texts = [
    universal_extract_text(entry)
    for entry in entrez_content
    if entry
]

In [64]:
len(entrez_texts)

2049

In [65]:
disambs = disamb.disambiguate(texts)

In [66]:
disamb.info()

'Disambiguation model for AD\n\nProduces the disambiguations:\n\tADIPOQ*\tHGNC:13633\n\tAlcoholism\tMESH:D000437\n\tAlzheimer Disease\tMESH:D000544\n\tAndrostenedione\tMESH:D000735\n\tDermatitis, Atopic\tMESH:D003876\n\tEpinephrine\tMESH:D004837\n\nModel statistics are not available.'

In [67]:
# Keep the texts whose top disambiguation is not hgnc_id and whose
# result[2] values are exactly the set {0.0, 1.0}. The loop variable is
# named `result` so it cannot be confused with the `disamb` disambiguator.
anomalous_texts = [
    text
    for text, result in zip(texts, disambs)
    if result[0] != hgnc_id and set(result[2].values()) == {0.0, 1.0}
]

In [68]:
len(anomalous_texts)

10734

In [91]:
len(a)

109

In [69]:
# Blacklist the shortform itself; use agent_text rather than repeating the
# literal so this cell stays in sync with the shortform chosen above.
detector = AdeftAnomalyDetector(blacklist=[agent_text])

In [70]:
param_grid = {'nu': [0.01, 0.05, 0.1], 'max_features': [100],
              'ngram_range': [(1, 1)]}

In [71]:
detector.cv(entrez_texts, anomalous_texts, param_grid, n_jobs=8)

INFO: [2019-12-02 15:44:39] /Users/albertsteppi/adeft/adeft/modeling/find_anomalies.py - Best score of 0.43764468526416467 found for parameter values:
{'nu': 0.1, 'max_features': 100, 'ngram_range': (1, 1)}


In [72]:
detector.feature_importances()

[('adiponectin', 24.076900233220364),
 ('apn', 14.440637123248656),
 ('gene', 11.585661093590954),
 ('serum', 9.492613020451502),
 ('levels', 9.47126868329198),
 ('associated', 9.441717064037311),
 ('leptin', 9.439173703437989),
 ('adipoq', 9.41710590753717),
 ('insulin', 8.946536257708775),
 ('human', 8.88095224260832),
 ('cells', 8.805737542732958),
 ('association', 8.576377218599577),
 ('cancer', 8.535744383725502),
 ('expression', 8.491760050111699),
 ('risk', 8.478306079463264),
 ('patients', 8.353630470030149),
 ('protein', 8.224327681894163),
 ('plasma', 8.21529776928965),
 ('obesity', 8.179828915567638),
 ('obese', 8.154944282611384),
 ('genetic', 7.74162601095031),
 ('type', 7.621764308283175),
 ('analysis', 7.392656384274771),
 ('diabetes', 7.374336620229934),
 ('study', 7.265491384558692),
 ('fat', 7.14758311704635),
 ('hmw', 7.141285119207971),
 ('polymorphisms', 7.138213765754079),
 ('women', 7.01090909231474),
 ('increased', 6.821080466360374),
 ('concentrations', 6.67321

In [73]:
preds = detector.predict(texts)

In [74]:
len(preds)

13133

In [75]:
sum(preds)

6672.0

In [76]:
detector.sensitivity

0.47531139703376823

In [77]:
detector.specificity

0.9623332882303964

In [78]:
detector.confidence_interval(texts)

(1, 1)

In [79]:
# Fraction of disambiguations whose top grounding is not hgnc_id.
1 - sum(1 for result in disambs if result[0] == hgnc_id) / len(disambs)

0.9989339830960177

In [None]:
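The confidence interval is highly accurate here: the estimate of (1, 1) closely matches the observed proportion of texts not disambiguated to HGNC:13633 (about 0.999).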

In [119]:
os.path.join('..', 'results', 'a.py')

'../results/a.py'

In [27]:
from indra.literature.adeft_tools import get_text_content_for_pmids

In [28]:
get_text_content_for_pmids??