In [1]:
import pickle

from indra.literature.adeft_tools import universal_extract_text

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.disambiguate import load_disambiguator
from adeft.modeling.classify import AdeftClassifier
from adeft.modeling.find_anomalies import AdeftAnomalyDetector

INFO: [2019-12-02 21:10:47] /Users/albertsteppi/adeft/adeft/recognize.py - OneShotRecognizer not available. AdeftLongformScorer has not been built successfully.


In [2]:
def _unpickle(path):
    """Return the object stored in the pickle file at ``path``.

    pickle.load can execute arbitrary code embedded in the file, so this
    must only be pointed at data files from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Three pickled inputs for the 'AR' analysis, loaded in the original order.
AR_refs = _unpickle('data/content/AR_refs.pkl')
AR_content = _unpickle('data/content/AR_content.pkl')
AR_entrez = _unpickle('data/entrez_content/AR_content.pkl')

In [3]:
# Fetch the disambiguator identified by 'AR' through adeft's
# load_disambiguator (presumably a previously trained model — TODO confirm).
AR_disamb = load_disambiguator('AR')

In [4]:
# Display the disambiguator's grounding_dict; per the output below it is keyed
# by shortform and maps each longform string to a grounding identifier or
# 'ungrounded'.
AR_disamb.grounding_dict

{'AR': {'androgen receptor': 'HGNC:644',
  'aldose reductase': 'HGNC:381',
  'allergic rhinitis': 'ungrounded',
  'acrosome reaction': 'GO:GO:0007340',
  'amphiregulin': 'HGNC:651',
  'acute rejection': 'ungrounded',
  'pc 3': 'ungrounded',
  'resistant': 'ungrounded',
  'response': 'ungrounded',
  'aortic regurgitation': 'ungrounded',
  'autosomal recessive': 'ungrounded',
  'androgen': 'HGNC:644',
  'adrenoceptor': 'FPLX:ADRB',
  'adventitious root': 'ungrounded',
  'ar': 'ungrounded',
  'ratio': 'ungrounded',
  'antigen retrieval': 'ungrounded',
  'gene': 'ungrounded',
  'ankyrin repeat': 'ungrounded',
  'nes': 'ungrounded',
  'astragali radix': 'ungrounded',
  'rate': 'ungrounded',
  'red': 'ungrounded',
  'autoregressive': 'ungrounded',
  '”': 'ungrounded',
  'acid rain': 'ungrounded',
  'after ripening': 'ungrounded'}}

In [None]:
# Run universal_extract_text over every truthy entry of AR_content, passing
# contains=['AR'] (presumably this restricts the extracted text to passages
# mentioning 'AR' — TODO confirm against universal_extract_text).
AR_texts = [
    universal_extract_text(content, contains=['AR'])
    for content in AR_content.values()
    if content
]

In [114]:
# Number of texts extracted from AR_content.
len(AR_texts)

10467

In [115]:
# Extract text from every truthy entry of AR_entrez; no `contains` filter is
# passed here.
# NOTE(review): AR_entrez is iterated directly, whereas AR_content is iterated
# via .values(); if AR_entrez is a dict this walks its keys — confirm.
AR_entrez_texts = [
    universal_extract_text(entry)
    for entry in AR_entrez
    if entry
]

In [116]:
# Number of texts extracted from AR_entrez.
len(AR_entrez_texts)

2152

In [117]:
# Disambiguate every extracted text. Later cells zip this result with
# AR_texts, so one result per input text is assumed.
disambs = AR_disamb.disambiguate(AR_texts)

In [118]:
# Inspect a single result (index 2) to see what a disambiguation record
# looks like.
disambs[2]

('HGNC:644',
 'AR',
 {'HGNC:381': 0.0,
  'FPLX:ADRB': 0.0,
  'ungrounded': 0.0,
  'HGNC:651': 0.0,
  'GO:GO:0007340': 0.0,
  'HGNC:644': 1.0})

In [119]:
# Keep the texts whose result has a first element other than 'HGNC:644' and
# whose third element (a dict) contains exactly the values 0.0 and 1.0.
# NOTE(review): element 0 looks like the predicted grounding and element 2
# like per-grounding probabilities, so this selects fully confident
# non-HGNC:644 predictions — confirm against the disambiguate() return format.
anomalous_texts = [
    doc
    for doc, result in zip(AR_texts, disambs)
    if result[0] != 'HGNC:644'
    and set(result[2].values()) == {0.0, 1.0}
]

In [120]:
# Number of texts selected as anomalous examples.
len(anomalous_texts)

1688

In [91]:
# NOTE(review): `a` is not defined anywhere in the visible notebook and this
# cell's execution count (91) is out of sequence, so it appears to rely on
# leftover kernel state. It would raise NameError under Restart & Run All;
# either define `a` explicitly or remove this cell.
len(a)

109

In [121]:
# Create the anomaly detector with 'AR' as its blacklist (presumably tokens
# excluded from the feature set so the shortform itself is not used as a
# feature — TODO confirm against AdeftAnomalyDetector).
AR_detector = AdeftAnomalyDetector(blacklist=['AR'])

In [128]:
# Hyperparameter grid passed to AR_detector.cv: three candidate values for
# nu, one max_features setting, and ngram_range fixed at (1, 1).
param_grid = dict(
    nu=[0.01, 0.05, 0.1],
    max_features=[100],
    ngram_range=[(1, 1)],
)

In [129]:
# Cross-validate the detector over param_grid using 8 parallel jobs.
# NOTE(review): the positional arguments look like (reference texts,
# anomalous texts, grid) — confirm against AdeftAnomalyDetector.cv. The
# hard-coded n_jobs=8 may need adjusting for the machine running this.
AR_detector.cv(AR_entrez_texts, anomalous_texts, param_grid, n_jobs=8)

INFO: [2019-11-26 13:47:47] /Users/albertsteppi/adeft/adeft/modeling/find_anomalies.py - Best score of 0.4735723935800604 found for parameter values:
{'nu': 0.05, 'max_features': 100, 'ngram_range': (1, 1)}


In [130]:
# Display the detector's feature importances; the output below is a list of
# (term, score) pairs in descending score order.
AR_detector.feature_importances()

[('androgen', 9.33449909669644),
 ('receptor', 8.624016369114962),
 ('genes', 5.52580672280176),
 ('cag', 5.253883510862585),
 ('patients', 5.012425001196676),
 ('gene', 4.941435518282439),
 ('cancer', 4.700616232096251),
 ('expression', 4.700563245183466),
 ('breast', 4.544330808472303),
 ('prostate', 4.519195855852777),
 ('human', 4.439268526513311),
 ('cell', 4.249908252271126),
 ('mutations', 4.238535092364618),
 ('protein', 4.12466746768825),
 ('men', 4.01941023682817),
 ('interaction', 3.89014558099108),
 ('tumors', 3.8701814398418617),
 ('activity', 3.8278600315181146),
 ('binding', 3.7916558084273433),
 ('pca', 3.773134614759867),
 ('repeats', 3.642490445503589),
 ('repeat', 3.6262930430461795),
 ('cases', 3.5309788060963765),
 ('high', 3.466578067422709),
 ('ligand', 3.4523743624359375),
 ('effect', 3.4014154522211517),
 ('study', 3.3620250190856686),
 ('dna', 3.3550014243001534),
 ('cells', 3.3548771695248116),
 ('dht', 3.3441016018360163),
 ('associated', 3.2886496155227714)

In [131]:
# Apply the fitted detector to every extracted text.
preds = AR_detector.predict(AR_texts)

In [133]:
# Sanity check: number of predictions, to compare against len(AR_texts).
len(preds)

10467

In [135]:
# Sensitivity attribute of the detector (presumably estimated during the
# cross-validation run — TODO confirm).
AR_detector.sensitivity

0.6599230944814145

In [134]:
# Specificity attribute of the detector (presumably estimated during the
# cross-validation run — TODO confirm).
AR_detector.specificity

0.8136492990986458

In [136]:
# Interval computed by the detector for AR_texts; the output is a
# (lower, upper) pair. NOTE(review): presumably a confidence interval for the
# proportion of anomalous texts — confirm against AdeftAnomalyDetector.
AR_detector.confidence_interval(AR_texts)

(0.2701708652367167, 0.3080066092612979)

In [139]:
# Fraction of results whose first element is not 'HGNC:644', for comparison
# with the interval returned by AR_detector.confidence_interval.
1 - sum(1 for result in disambs if result[0] == 'HGNC:644') / len(disambs)

0.3074424381389128

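The confidence interval is very accurate here: the fraction of texts not disambiguated to `HGNC:644` (0.307) lies inside the estimated interval (0.270, 0.308), just below its upper bound.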