In [3]:
import pickle

from indra.literature.adeft_tools import universal_extract_text

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.disambiguate import load_disambiguator
from adeft.modeling.classify import AdeftClassifier
from adeft.modeling.find_anomalies import AdeftAnomalyDetector

In [4]:
def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Load the three pickled inputs used by this notebook. GH_refs is loaded here
# but is not referenced by any of the cells visible below.
GH_refs = _read_pickle('data/content/GH_refs.pkl')
GH_content = _read_pickle('data/content/GH_content.pkl')
GH1_entrez = _read_pickle('data/entrez_content/GH1_content.pkl')

In [8]:
# Load the adeft disambiguator stored under the name 'GH'.
GH_disamb = load_disambiguator('GH')

In [43]:
# Show which long forms of 'GH' the disambiguator knows and the grounding
# each one maps to (some are marked 'ungrounded').
GH_disamb.grounding_dict

{'GH': {'growth hormone': 'HGNC:4261',
  'glycoside hydrolase': 'MESH:D006026',
  'gestational hypertension': 'ungrounded',
  'glycogenic hepatopathy': 'ungrounded'}}

In [11]:
# Extract text from every non-empty document in GH_content.
# NOTE(review): contains=['GH'] presumably restricts extraction to passages
# mentioning 'GH' -- confirm against the INDRA documentation.
GH_texts = [
    universal_extract_text(document, contains=['GH'])
    for document in GH_content.values()
    if document
]

In [14]:
# Number of texts extracted from GH_content.
len(GH_texts)

9117

In [15]:
# Extract text from every non-empty document loaded from
# data/entrez_content/GH1_content.pkl (no `contains` filter is applied here).
GH1_texts = [
    universal_extract_text(document)
    for document in GH1_entrez
    if document
]

In [16]:
# Number of texts extracted from GH1_entrez.
len(GH1_texts)

408

In [18]:
# Run the disambiguator over all GH texts; yields one result per text
# (it is zipped element-wise with GH_texts further down).
disambs = GH_disamb.disambiguate(GH_texts)

In [19]:
# Inspect the first result. Per the output below it is a 3-tuple: the chosen
# grounding, a name string, and a dict of per-grounding scores.
disambs[0]

('HGNC:4261',
 'GH1',
 {'HGNC:4261': 1.0, 'MESH:D006026': 0.0, 'ungrounded': 0.0})

In [48]:
# Keep the texts whose predicted grounding is HGNC:4261 and whose per-grounding
# scores consist only of the values 0.0 and 1.0 (no intermediate scores).
# NOTE(review): these texts are later passed as the second positional argument
# of GH_detector.cv under the name `anomalous_texts`, yet they are the ones
# labelled HGNC:4261 (growth hormone) -- confirm that `==` rather than `!=`
# is the intended comparison.
anomalous_texts = [
    text for text, disamb in zip(GH_texts, disambs)
    if disamb[0] == 'HGNC:4261' and set(disamb[2].values()) == {0.0, 1.0}
]
# Display the count: the recorded output of this cell is a bare integer, which
# a plain assignment cannot produce.
len(anomalous_texts)

4817

In [36]:
# Create the anomaly detector, passing 'GH1' and 'GH' as its blacklist.
# NOTE(review): presumably blacklisted tokens are excluded from the model's
# features -- confirm against the adeft documentation.
GH_detector = AdeftAnomalyDetector(blacklist=['GH1', 'GH'])

In [44]:
# Candidate values explored during cross-validation: 3 x 3 x 2 = 18
# parameter combinations in total.
param_grid = {
    'nu': [0.5, 0.6, 0.7],
    'max_features': [5, 10, 20],
    'ngram_range': [(1, 1), (1, 2)],
}

In [45]:
# Cross-validate the detector over param_grid using GH1_texts and
# anomalous_texts, with n_jobs=8; the best score and parameters are logged.
# NOTE(review): the execution counts show this cell (In [45]) was run before
# the displayed version of the cell defining anomalous_texts (In [48]), so the
# logged result may come from an earlier definition -- re-run in order to confirm.
GH_detector.cv(GH1_texts, anomalous_texts, param_grid, n_jobs=8)

INFO: [2019-11-26 13:02:29] /Users/albertsteppi/adeft/adeft/modeling/find_anomalies.py - Best score of 0.0 found for parameter values:
{'nu': 0.5, 'max_features': 10, 'ngram_range': (1, 1)}


In [46]:
# Show the raw cross-validation results stored on the detector: arrays of
# fit/score timings (and further entries) with one value per parameter
# combination.
GH_detector.cv_results

{'mean_fit_time': array([0.16244869, 0.57272477, 0.21738338, 0.57852497, 0.19949379,
        0.58279052, 0.20834751, 0.57075543, 0.1915998 , 0.58622084,
        0.19860215, 0.58121395, 0.20395479, 0.57543654, 0.20918937,
        0.58977809, 0.21994257, 0.49488001]),
 'std_fit_time': array([0.01567579, 0.01647477, 0.03048168, 0.02233889, 0.0305974 ,
        0.02115509, 0.02313954, 0.02503815, 0.00622446, 0.01364771,
        0.01617415, 0.01371395, 0.01448588, 0.02755995, 0.02968387,
        0.01586859, 0.01123935, 0.04730596]),
 'mean_score_time': array([0.15804343, 0.27334886, 0.17753038, 0.26328063, 0.18106966,
        0.28554573, 0.19294434, 0.28051977, 0.19592667, 0.28248997,
        0.18303943, 0.28556447, 0.17464871, 0.27556663, 0.19183102,
        0.28690486, 0.1922606 , 0.19702697]),
 'std_score_time': array([0.03068639, 0.02638121, 0.02629305, 0.04619839, 0.02616604,
        0.04008969, 0.02484096, 0.04675247, 0.02521806, 0.021546  ,
        0.02880851, 0.03816113, 0.0156302 , 

In [47]:
# Ask the detector for a confidence interval computed from all GH texts.
# NOTE(review): which quantity the interval bounds is not visible here
# (presumably the proportion of anomalous texts) -- confirm against the adeft
# documentation. The recorded result (0, 1) spans the whole unit interval.
GH_detector.confidence_interval(GH_texts)

(0, 1)

GH is not very ambiguous, so a model wasn't really necessary here.