In [28]:
import pickle

from indra.literature.adeft_tools import universal_extract_text

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.disambiguate import load_disambiguator
from adeft.modeling.classify import AdeftClassifier
from adeft.modeling.find_anomalies import AdeftAnomalyDetector

In [32]:
# agent_text: the shortform; selects the refs/content pickles and the disambiguator.
# hgnc_name: selects the Entrez content pickle.
agent_text, hgnc_name = 'GC', 'GBA'

In [33]:
# Load the three cached inputs, each from its own pickle file.
# NOTE: pickle.load can execute arbitrary code while unpickling; only run this
# against files from a trusted source.
refs_path = f'data/content/{agent_text}_refs.pkl'
with open(refs_path, 'rb') as refs_file:
    refs = pickle.load(refs_file)

content_path = f'data/content/{agent_text}_content.pkl'
with open(content_path, 'rb') as content_file:
    content = pickle.load(content_file)

entrez_path = f'data/entrez_content/{hgnc_name}_content.pkl'
with open(entrez_path, 'rb') as entrez_file:
    entrez_content = pickle.load(entrez_file)

In [34]:
# Load the adeft disambiguator for agent_text.
disamb = load_disambiguator(agent_text)

In [35]:
# Show the disambiguator's grounding dictionary (shortform -> {longform: grounding}).
disamb.grounding_dict

{'GC': {'gastric cancer': 'MESH:D013274',
  'glucocorticoid': 'MESH:D005938',
  'germinal center': 'ungrounded',
  'cells': 'ungrounded',
  'cyclase': 'FPLX:GUCY',
  'gastric carcinoma': 'MESH:D013274',
  'gene conversion': 'ungrounded',
  'germinal centre': 'ungrounded',
  'glucocerebrosidase': 'HGNC:4177',
  'gas chromatography': 'ungrounded',
  'growth cone': 'ungrounded',
  'glucocorticosteroids': 'MESH:D005938',
  'chitosan': 'ungrounded',
  'gustatory cortex': 'ungrounded',
  'glucosylceramide': 'HGNC:4177',
  'neisseria gonorrhoeae': 'ungrounded',
  'glucocorticoid hormone': 'MESH:D005938',
  'gastric adenocarcinoma': 'MESH:D013274',
  'carbon': 'ungrounded',
  'c': 'ungrounded',
  'control': 'ungrounded',
  'genomic classifier': 'ungrounded'}}

In [36]:
# Extract text from every non-empty entry of `content`; empty/falsy entries are
# dropped before extraction. `contains=[agent_text]` is forwarded to the
# extractor — presumably restricting output to passages mentioning the
# shortform; confirm against universal_extract_text.
texts = [
    universal_extract_text(document, contains=[agent_text])
    for document in filter(None, content.values())
]

In [37]:
# Number of texts extracted from `content`.
len(texts)

8143

In [38]:
# Extract text from every non-empty item of `entrez_content` (no `contains`
# filter is applied here).
# NOTE(review): `entrez_content` is iterated directly, whereas `content` is
# iterated via `.values()` — confirm `entrez_content` is a sequence and not a
# dict, otherwise this iterates over its keys.
entrez_texts = [
    universal_extract_text(document)
    for document in filter(None, entrez_content)
]

In [39]:
# Number of texts extracted from `entrez_content`.
len(entrez_texts)

355

In [40]:
# Disambiguate every extracted text. Later cells read element [0] of each
# result as the predicted grounding and element [2] as a mapping whose values
# are inspected.
disambs = disamb.disambiguate(texts)

In [41]:
# Show the disambiguator's summary string (the groundings it can produce).
disamb.info()

'Disambiguation model for GC\n\nProduces the disambiguations:\n\tGBA*\tHGNC:4177\n\tGUCY*\tFPLX:GUCY\n\tGlucocorticoids\tMESH:D005938\n\tStomach Neoplasms\tMESH:D013274\n\nModel statistics are not available.'

In [42]:
# Keep the texts whose predicted grounding (result[0]) is not HGNC:4177 and
# whose result[2] values are exactly the set {0.0, 1.0} — presumably a fully
# confident prediction; confirm against the disambiguator's output format.
# The loop variable is named `result` so it does not shadow the `disamb` object.
anomalous_texts = [
    text
    for text, result in zip(texts, disambs)
    if result[0] != 'HGNC:4177' and set(result[2].values()) == {0.0, 1.0}
]

In [43]:
# Number of texts selected as anomalous examples.
len(anomalous_texts)

4521

In [91]:
# NOTE(review): `a` is not defined in any visible cell, and this cell's
# execution count (91) is out of sequence with its neighbours. Unless `a` is
# defined elsewhere, this raises NameError on Restart & Run All — define `a`
# above this cell or remove the cell.
len(a)

109

In [44]:
# Build the anomaly detector. The shortform is blacklisted via `agent_text`
# (rather than a repeated 'GC' literal) so this cell stays in sync with the
# configuration cell when the notebook is reused for another shortform.
detector = AdeftAnomalyDetector(blacklist=[agent_text])

In [45]:
# Hyperparameter grid passed to detector.cv: three candidate values for `nu`,
# a single setting each for `max_features` and `ngram_range`.
param_grid = {
    'nu': [0.01, 0.05, 0.1],
    'max_features': [100],
    'ngram_range': [(1, 1)],
}

In [46]:
# Cross-validate the detector over param_grid using the Entrez texts and the
# anomalous texts; the log output reports the best score and parameter values.
# NOTE(review): n_jobs=8 is hard-coded — adjust for the machine running this.
detector.cv(entrez_texts, anomalous_texts, param_grid, n_jobs=8)

INFO: [2019-12-02 15:01:58] /Users/albertsteppi/adeft/adeft/modeling/find_anomalies.py - Best score of 0.6781034497476468 found for parameter values:
{'nu': 0.1, 'max_features': 100, 'ngram_range': (1, 1)}


In [47]:
# List (feature, importance) pairs for the fitted detector.
detector.feature_importances()

[('glucocerebrosidase', 2.527763049552582),
 ('gaucher', 2.4279755044494125),
 ('disease', 2.4047516847007504),
 ('human', 2.2840302697021895),
 ('gba', 2.2383178775329964),
 ('proteins', 2.114225759297341),
 ('gba1', 1.9339734691323078),
 ('gcase', 1.8401752374131628),
 ('cells', 1.7160214693450375),
 ('syn', 1.6354404380733238),
 ('pd', 1.6125487199724262),
 ('associated', 1.5971235582636587),
 ('binding', 1.5636455019677706),
 ('genes', 1.5620571477850131),
 ('mutations', 1.490631376939289),
 ('analysis', 1.4780823634667524),
 ('type', 1.4597015515314744),
 ('mutation', 1.4130278983028932),
 ('lysosomal', 1.3940924883274937),
 ('cell', 1.3623796434144229),
 ('association', 1.3536276884065508),
 ('non', 1.3415446436036682),
 ('patients', 1.2901338900874975),
 ('activity', 1.237178959443848),
 ('glucosylceramidase', 1.2019428402596786),
 ('n370s', 1.194958741276297),
 ('beta', 1.191800672854855),
 ('sequence', 1.1262417494803363),
 ('gene', 1.1259812376266316),
 ('using', 1.1090716497

In [48]:
# Run the fitted detector on every extracted text (one prediction per text).
preds = detector.predict(texts)

In [49]:
# Sanity check: the number of predictions should equal len(texts).
len(preds)

8143

In [50]:
# Sum of the predictions — presumably the count of texts flagged as anomalous,
# assuming predictions are 0/1; confirm against AdeftAnomalyDetector.predict.
sum(preds)

5798.0

In [51]:
# Sensitivity attribute reported by the detector.
detector.sensitivity

0.7201970371094706

In [52]:
# Specificity attribute reported by the detector.
detector.specificity

0.9579064126381761

In [53]:
# Interval returned by the detector for `texts` — presumably for the proportion
# of anomalous texts; it is compared below with the proportion of texts not
# disambiguated to HGNC:4177.
detector.confidence_interval(texts)

(0.9733319919443564, 1)

In [55]:
# Proportion of texts whose predicted grounding is NOT HGNC:4177, for
# comparison with the detector's confidence interval.
gba_count = sum(1 for result in disambs if result[0] == 'HGNC:4177')
1 - gba_count / len(disambs)

0.9920176839002824

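The confidence interval is accurate here: the proportion of texts not disambiguated to HGNC:4177 (0.992) lies within the estimated interval (0.973, 1).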

In [None]:
# The confidence interval is highly accurate here: the proportion of texts not
# disambiguated to HGNC:4177 (0.992) lies within the estimated interval (0.973, 1).