In [50]:
import pickle

from indra.literature.adeft_tools import universal_extract_text

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.disambiguate import load_disambiguator
from adeft.modeling.classify import AdeftClassifier
from adeft.modeling.find_anomalies import AdeftAnomalyDetector

In [55]:
# Load the three cached pickle files for the shortform "TG":
#   TG_refs    <- data/content/TG_refs.pkl (not referenced by the later cells visible here)
#   TG_content <- data/content/TG_content.pkl (accessed with .values() later, so dict-like)
#   TG_entrez  <- data/entrez_content/TG_content.pkl (iterated directly later)
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# run this cell against files that were generated locally and are trusted.
with open('data/content/TG_refs.pkl', 'rb') as f:
    TG_refs = pickle.load(f)
with open('data/content/TG_content.pkl', 'rb') as f:
    TG_content = pickle.load(f)
with open('data/entrez_content/TG_content.pkl', 'rb') as f:
    TG_entrez = pickle.load(f)

In [56]:
# Load the adeft disambiguator stored under the shortform 'TG'.
TG_disamb = load_disambiguator('TG')

In [57]:
# Display the shortform -> {longform: grounding} mapping held by the disambiguator.
# NOTE(review): the displayed map contains entries such as 'the', 'of' and 'd'
# marked 'ungrounded'; these look like extraction noise rather than real longforms.
TG_disamb.grounding_dict

{'TG': {'triglyceride': 'CHEBI:CHEBI:17855',
  'thapsigargin': 'CHEBI:CHEBI:9516',
  'triacylglycerol': 'CHEBI:CHEBI:17855',
  'transgenic': 'ungrounded',
  'trigeminal ganglion': 'ungrounded',
  'trigeminal ganglia': 'ungrounded',
  'thyroglobulin': 'HGNC:11764',
  'thrombin generation': 'ungrounded',
  'the': 'ungrounded',
  'transglutaminase': 'ungrounded',
  'log': 'ungrounded',
  'transgenic mice': 'ungrounded',
  'of': 'ungrounded',
  'group': 'ungrounded',
  'thioguanine': 'ungrounded',
  'and': 'ungrounded',
  'thioglycollate': 'ungrounded',
  'triacylglyceride': 'CHEBI:CHEBI:17855',
  'trophoblast giant': 'ungrounded',
  'd': 'ungrounded',
  'transplant glomerulopathy': 'ungrounded',
  'ln': 'ungrounded',
  'trigeminal': 'ungrounded',
  'for': 'ungrounded',
  'high risk': 'ungrounded',
  'with': 'ungrounded',
  'target gene': 'ungrounded',
  'to': 'ungrounded',
  'or': 'ungrounded',
  'transglycosylation': 'ungrounded'}}

In [81]:
# Extract plain text from every non-empty entry of TG_content, passing
# contains=['TG'] through to universal_extract_text.
TG_texts = [
    universal_extract_text(content, contains=['TG'])
    for content in TG_content.values()
    if content
]

In [82]:
# Number of texts extracted from TG_content.
len(TG_texts)

7820

In [60]:
# Extract plain text from every non-empty item of TG_entrez. Unlike the
# TG_texts extraction, no `contains` filter is passed here.
# NOTE(review): TG_entrez is iterated directly, whereas TG_content is iterated
# via .values(); if TG_entrez is a dict this walks its keys — confirm its type.
TG_entrez_texts = [
    universal_extract_text(content)
    for content in TG_entrez
    if content
]

In [61]:
# Number of texts extracted from the Entrez content.
len(TG_entrez_texts)

176

In [83]:
# Run the disambiguator over every extracted text. Later cells index each
# result as result[0] (grounding string) and result[2] (dict of scores).
disambs = TG_disamb.disambiguate(TG_texts)

In [85]:
# Spot-check a single result (index 2 is an arbitrary pick) to see the output structure.
disambs[2]

('ungrounded',
 None,
 {'CHEBI:CHEBI:9516': 0.0,
  'CHEBI:CHEBI:17855': 0.0,
  'HGNC:11764': 0.0,
  'ungrounded': 1.0})

In [88]:
# Keep the texts whose top grounding is NOT an HGNC entry and whose score
# dict contains exactly the values 0.0 and 1.0.
# NOTE(review): presumably the 0.0/1.0 pattern marks results decided with full
# certainty — confirm against the disambiguator's output contract.
anomalous_texts = [
    text
    for text, result in zip(TG_texts, disambs)
    if not result[0].startswith('HGNC')
    and set(result[2].values()) == {0.0, 1.0}
]

In [89]:
# Number of texts selected as anomalous examples.
len(anomalous_texts)

5837

In [90]:
# Collect the disambiguation results whose top grounding is an HGNC entry.
a = [
    result
    for result in disambs
    if result[0].startswith('HGNC')
]

In [91]:
# Number of results whose top grounding is an HGNC entry.
len(a)

109

In [92]:
# Create the anomaly detector with 'TG' in its blacklist.
# NOTE(review): presumably blacklisted tokens are excluded from the model's
# features — confirm against AdeftAnomalyDetector.
TG_detector = AdeftAnomalyDetector(blacklist=['TG'])

In [93]:
# Hyperparameter grid handed to TG_detector.cv: 3 x 3 x 2 = 18 combinations.
param_grid = {
    'nu': [0.2, 0.4, 0.6],
    'max_features': [10, 50, 100],
    'ngram_range': [(1, 1), (1, 2)],
}

In [94]:
# Search param_grid with the detector's cv method, passing the Entrez texts as
# the first argument and the anomalous texts as the second.
# NOTE(review): presumably the first argument is the in-class (baseline) corpus
# and the second the known anomalies — confirm against AdeftAnomalyDetector.cv.
# n_jobs=8 is hard-coded; adjust it to the cores available on the machine.
TG_detector.cv(TG_entrez_texts, anomalous_texts, param_grid, n_jobs=8)

INFO: [2019-11-26 13:30:48] /Users/albertsteppi/adeft/adeft/modeling/find_anomalies.py - Best score of 0.8400477446484829 found for parameter values:
{'nu': 0.2, 'max_features': 10, 'ngram_range': (1, 1)}


In [96]:
# Show the (term, score) pairs the fitted detector reports as feature importances.
TG_detector.feature_importances()

[('thyroglobulin', 9.140296109158923),
 ('gene', 7.285102631462286),
 ('thyroid', 6.859173126477999),
 ('patients', 6.4009891282238724),
 ('disease', 4.8707526750566705),
 ('10', 4.077762563429923),
 ('analysis', 3.926549661139732),
 ('study', 3.7133358400852265),
 ('snps', 3.6538125862103623),
 ('association', 3.5264001449168236)]

In [98]:
# Apply the fitted detector to every extracted TG text.
preds = TG_detector.predict(TG_texts)

In [101]:
# Sum of the predictions over all texts.
# NOTE(review): presumably 1.0 marks a text flagged as anomalous, which would
# make this the count of flagged texts — confirm against predict().
sum(preds)

7344.0

In [102]:
# Sensitivity reported by the detector (how it is estimated is not shown in this notebook).
TG_detector.sensitivity

0.9218770175253255

In [103]:
# Specificity reported by the detector (how it is estimated is not shown in this notebook).
TG_detector.specificity

0.9181707271231574

The anomaly detector did a great job of identifying how problematic TG is: its predictions sum to 7344 across the 7820 extracted texts.