In [1]:
import pickle

from indra.literature.adeft_tools import universal_extract_text

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.disambiguate import load_disambiguator
from adeft.modeling.classify import AdeftClassifier
from adeft.modeling.find_anomalies import AdeftAnomalyDetector

ModuleNotFoundError: No module named 'adeft.modeling.find_anomalies'

In [2]:
# Shortform (agent text) whose disambiguation model is examined in this notebook.
agent_text = 'AD'
# HGNC gene symbol of the target grounding; also used to locate the Entrez content pickle.
hgnc_name = 'ADIPOQ'
# Namespace-prefixed identifier compared against the top grounding of each disambiguation result.
hgnc_id = 'HGNC:13633'

In [3]:
def _read_pickle(path):
    """Return the object unpickled from the file at ``path``."""
    # NOTE(review): pickle.load can execute arbitrary code while loading;
    # only point this at files produced by a trusted pipeline.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# References and text content collected for the shortform, plus the content
# collected for the target gene.
refs = _read_pickle(f'data/content/{agent_text}_refs.pkl')
content = _read_pickle(f'data/content/{agent_text}_content.pkl')
entrez_content = _read_pickle(f'data/entrez_content/{hgnc_name}_content.pkl')

FileNotFoundError: [Errno 2] No such file or directory: 'data/content/AD_refs.pkl'

In [59]:
# Load the Adeft disambiguation model for the shortform held in `agent_text`.
disamb = load_disambiguator(agent_text)

In [60]:
# Inspect the longform -> grounding mapping the model carries for each shortform.
disamb.grounding_dict

{'AD': {'s disease': 'MESH:D000544',
  'atopic dermatitis': 'MESH:D003876',
  'alcohol dependence': 'MESH:D000437',
  'axial diffusivity': 'ungrounded',
  'anaerobic digestion': 'ungrounded',
  'activation domain': 'ungrounded',
  'autosomal dominant': 'ungrounded',
  'antidepressant': 'ungrounded',
  's': 'ungrounded',
  'androstenedione': 'MESH:D000735',
  'afterdischarge': 'ungrounded',
  'dementia': 'ungrounded',
  'aortic dissection': 'ungrounded',
  'autonomic dysreflexia': 'ungrounded',
  'androgen deprivation': 'ungrounded',
  'adenocarcinoma': 'ungrounded',
  'aortic distensibility': 'ungrounded',
  'adenosine': 'ungrounded',
  'disorders': 'ungrounded',
  'adiponectin': 'HGNC:13633',
  'anoxic depolarization': 'ungrounded',
  'actinomycin d': 'ungrounded',
  'atherogenic diet': 'ungrounded',
  'p': 'ungrounded',
  'adrenaline': 'MESH:D004837',
  'patients': 'ungrounded',
  'alzheimer': 'MESH:D000544',
  '3 17 dione': 'ungrounded',
  'cases': 'ungrounded',
  'adriamycin': 'ung

In [61]:
# Extract text from every non-empty content entry, passing the shortform via
# `contains` (presumably restricting extraction to passages that mention it —
# TODO confirm against universal_extract_text).
texts = [
    universal_extract_text(raw_text, contains=[agent_text])
    for raw_text in content.values()
    if raw_text
]

In [62]:
# Number of texts extracted for the shortform.
len(texts)

13133

In [63]:
# Extract text from every non-empty entry of the gene-specific content.
# NOTE(review): this iterates `entrez_content` directly, whereas `content` is
# iterated through `.values()`; if `entrez_content` is a dict this walks its
# keys rather than its texts — confirm the pickled type.
entrez_texts = [
    universal_extract_text(raw_text)
    for raw_text in entrez_content
    if raw_text
]

In [64]:
# Number of texts extracted from the gene-specific content.
len(entrez_texts)

2049

In [65]:
# Run the disambiguator over all texts; later cells read element [0] of each
# result as the predicted grounding and element [2] as a dict of scores.
disambs = disamb.disambiguate(texts)

In [66]:
# info() returns a multi-line, tab-separated report as a plain string; print it
# so the newlines and tabs are rendered instead of showing the escaped repr.
print(disamb.info())

'Disambiguation model for AD\n\nProduces the disambiguations:\n\tADIPOQ*\tHGNC:13633\n\tAlcoholism\tMESH:D000437\n\tAlzheimer Disease\tMESH:D000544\n\tAndrostenedione\tMESH:D000735\n\tDermatitis, Atopic\tMESH:D003876\n\tEpinephrine\tMESH:D004837\n\nModel statistics are not available.'

In [67]:
# Keep the texts whose predicted grounding is not the target gene and whose
# score values (element [2] of the result) are exactly the set {0.0, 1.0}.
# The loop variable is named `result` so it does not visually shadow the
# notebook-level `disamb` model object.
anomalous_texts = [
    text
    for text, result in zip(texts, disambs)
    if result[0] != hgnc_id and set(result[2].values()) == {0.0, 1.0}
]

In [68]:
# Number of texts selected as examples that are not about the target gene.
len(anomalous_texts)

10734

In [91]:
# NOTE(review): `a` is not assigned in any visible cell, and this cell's
# execution count (91) is out of sequence with its neighbours, so it will
# likely raise NameError on a fresh kernel — confirm what `a` was, or remove
# this cell.
len(a)

109

In [69]:
# Build the anomaly detector, blacklisting the shortform itself. The value is
# taken from `agent_text` rather than repeating the literal, so the notebook
# stays consistent when it is re-run for another shortform.
# NOTE(review): presumably the blacklist excludes these tokens from the
# detector's features — confirm against AdeftAnomalyDetector.
detector = AdeftAnomalyDetector(blacklist=[agent_text])

In [70]:
# Hyperparameter grid passed to the detector's cross-validation: three
# candidate `nu` values, a single feature cap, and unigram features only.
param_grid = {
    'nu': [0.01, 0.05, 0.1],
    'max_features': [100],
    'ngram_range': [(1, 1)],
}

In [71]:
# Search `param_grid` by cross-validation; the recorded log reports the best
# score and the parameter values that achieved it.
# NOTE(review): the first argument looks like the in-class (gene) texts and the
# second the anomalous examples; n_jobs=8 is presumably the number of parallel
# workers — confirm against AdeftAnomalyDetector.cv.
detector.cv(entrez_texts, anomalous_texts, param_grid, n_jobs=8)

INFO: [2019-12-02 15:44:39] /Users/albertsteppi/adeft/adeft/modeling/find_anomalies.py - Best score of 0.43764468526416467 found for parameter values:
{'nu': 0.1, 'max_features': 100, 'ngram_range': (1, 1)}


In [72]:
# Show the fitted detector's features; the recorded output is a list of
# (token, score) pairs with the highest scores first.
detector.feature_importances()

[('adiponectin', 24.076900233220364),
 ('apn', 14.440637123248656),
 ('gene', 11.585661093590954),
 ('serum', 9.492613020451502),
 ('levels', 9.47126868329198),
 ('associated', 9.441717064037311),
 ('leptin', 9.439173703437989),
 ('adipoq', 9.41710590753717),
 ('insulin', 8.946536257708775),
 ('human', 8.88095224260832),
 ('cells', 8.805737542732958),
 ('association', 8.576377218599577),
 ('cancer', 8.535744383725502),
 ('expression', 8.491760050111699),
 ('risk', 8.478306079463264),
 ('patients', 8.353630470030149),
 ('protein', 8.224327681894163),
 ('plasma', 8.21529776928965),
 ('obesity', 8.179828915567638),
 ('obese', 8.154944282611384),
 ('genetic', 7.74162601095031),
 ('type', 7.621764308283175),
 ('analysis', 7.392656384274771),
 ('diabetes', 7.374336620229934),
 ('study', 7.265491384558692),
 ('fat', 7.14758311704635),
 ('hmw', 7.141285119207971),
 ('polymorphisms', 7.138213765754079),
 ('women', 7.01090909231474),
 ('increased', 6.821080466360374),
 ('concentrations', 6.67321

In [73]:
# Apply the fitted detector to every text extracted for the shortform.
preds = detector.predict(texts)

In [74]:
# Sanity check: one prediction per input text is expected.
len(preds)

13133

In [75]:
# Sum of the predictions; this equals the number of flagged texts if `preds`
# holds 0/1 indicators — TODO confirm the encoding returned by predict().
sum(preds)

6672.0

In [76]:
# Sensitivity stored on the detector (presumably estimated during the
# cross-validation run above — confirm).
detector.sensitivity

0.47531139703376823

In [77]:
# Specificity stored on the detector (presumably estimated during the
# cross-validation run above — confirm).
detector.specificity

0.9623332882303964

In [78]:
# Interval estimate computed by the detector for `texts`; the recorded result
# is a (lower, upper) pair.
# NOTE(review): presumably this bounds the proportion of anomalous texts —
# confirm against AdeftAnomalyDetector.confidence_interval.
detector.confidence_interval(texts)

(1, 1)

In [79]:
# Fraction of texts whose predicted grounding is NOT the target gene, for
# comparison with the detector's confidence interval.
1 - sum(1 for result in disambs if result[0] == hgnc_id) / len(disambs)

0.9989339830960177

In [None]:
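The confidence interval computed above, (1, 1), is highly accurate here: it closely matches the fraction of texts (about 0.9989) that the disambiguator does not assign to ADIPOQ.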