In [1]:
import pickle

from indra.literature.adeft_tools import universal_extract_text

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.disambiguate import load_disambiguator
from adeft.modeling.classify import AdeftClassifier
from adeft.modeling.find_anomalies import AdeftAnomalyDetector

from adeft.modeling.find_anomalies import AdeftTfidfVectorizer

INFO: [2020-01-09 16:10:07] /Users/albertsteppi/adeft/adeft/recognize.py - OneShotRecognizer not available. AdeftLongformScorer has not been built successfully.


In [2]:
# Shortform under study. It is interpolated into the data file paths below and
# passed to load_disambiguator and universal_extract_text.
agent_text = 'TF'

In [3]:
# SECURITY NOTE: pickle.load can execute arbitrary code while deserializing.
# Only point these paths at files produced by this project's own pipeline.
def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


refs = _read_pickle(f'../data/content/{agent_text}_refs.pkl')
content = _read_pickle(f'../data/content/{agent_text}_content.pkl')
entrez_content = _read_pickle(f'../data/entrez_content/{agent_text}_content.pkl')

In [4]:
# Load the disambiguator for the shortform held in agent_text.
disamb = load_disambiguator(agent_text)

In [5]:
disamb.grounding_dict

{'TF': {'tissue factor': 'HGNC:3541',
  'transferrin': 'HGNC:11740',
  'tail flick': 'ungrounded',
  'transmitted founder': 'ungrounded',
  'theaflavin': 'CHEBI:CHEBI:136609',
  'tolylfluanid': 'CHEBI:CHEBI:75182',
  'thomsen friedenreich': 'ungrounded',
  'time frequency': 'ungrounded',
  'total flavonoids': 'ungrounded',
  'ab': 'ungrounded',
  'force': 'ungrounded',
  'tibiofemoral': 'ungrounded',
  'fat': 'ungrounded',
  'telephone filtered': 'ungrounded',
  'follicular': 'ungrounded',
  'f3': 'ungrounded',
  'serotransferrin': 'HGNC:11740'}}

In [6]:
# Run universal_extract_text over every non-empty value in `content`.
# contains=[agent_text] is forwarded to the extractor (presumably restricting the
# result to passages mentioning the shortform — TODO confirm).
texts = [universal_extract_text(entry, contains=[agent_text])
         for entry in filter(None, content.values())]

In [7]:
len(texts)

5039

In [8]:
# Run universal_extract_text over every non-empty item of entrez_content.
# NOTE(review): this iterates entrez_content directly, whereas `texts` is built
# from content.values(); if entrez_content is a dict this walks its keys rather
# than its values — confirm the pickled object's type.
entrez_texts = [universal_extract_text(entry) for entry in filter(None, entrez_content)]

In [9]:
len(entrez_texts)

340

In [10]:
# One result per text. Later cells compare result[0] against a grounding ID and
# call .values() on result[2], so each result is indexable with a dict at index 2.
disambs = disamb.disambiguate(texts)

In [11]:
print(disamb.info())

Disambiguation model for TF

Produces the disambiguations:
	F3*	HGNC:3541
	TF*	HGNC:11740
	Theaflavin*	CHEBI:CHEBI:136609
	Toylfluanid*	CHEBI:CHEBI:75182

Model statistics are not available.


In [12]:
# Keep the texts whose predicted grounding (result[0]) is not HGNC:11740 and whose
# result[2] values are exactly {0.0, 1.0}. The loop variable is named `result` so it
# does not reuse the name of the `disamb` disambiguator object.
anomalous_texts = [
    text
    for text, result in zip(texts, disambs)
    if result[0] != 'HGNC:11740' and set(result[2].values()) == {0.0, 1.0}
]

In [13]:
len(anomalous_texts)

2328

In [14]:
# Build the anomaly detector on the PubMed dictionary file. The blacklist is the
# shortform itself; agent_text is used instead of repeating the literal 'TF' so
# this cell follows the shortform chosen at the top of the notebook.
detector = AdeftAnomalyDetector('../results/pubmed_dictionary.pkl', blacklist=[agent_text])

In [15]:
# Vectorizer backed by the same dictionary file as the detector. The fit log
# below reports that 5000 tokens are kept, matching max_features=5000.
tfidf = AdeftTfidfVectorizer('../results/pubmed_dictionary.pkl', max_features=5000)

In [16]:
# Fit the vectorizer on the first 50 Entrez texts only.
tfidf.fit(entrez_texts[:50])

INFO: [2020-01-09 16:32:32] gensim.utils - loading Dictionary object from ../results/pubmed_dictionary.pkl
INFO: [2020-01-09 16:32:33] gensim.utils - loaded ../results/pubmed_dictionary.pkl
INFO: [2020-01-09 16:32:33] gensim.corpora.dictionary - adding document #0 to Dictionary(0 unique tokens: [])
INFO: [2020-01-09 16:32:33] gensim.corpora.dictionary - built Dictionary(11386 unique tokens: ['000', '005', '05', '10', '100']...) from 50 documents (total 163082 corpus positions)
INFO: [2020-01-09 16:32:33] gensim.corpora.dictionary - discarding 5840 tokens: [('murphy', 1), ('mut', 1), ('mutageneis', 1), ('mutagenic', 1), ('myszka', 1), ('nangiana', 1), ('nicholls', 1), ('nii', 1), ('noncovalently', 1), ('oriented', 1)]...
INFO: [2020-01-09 16:32:33] gensim.corpora.dictionary - keeping 5000 tokens which were in no less than 1 and no more than 50 (=100.0%) documents
INFO: [2020-01-09 16:32:33] gensim.corpora.dictionary - resulting dictionary: Dictionary(5000 unique tokens: ['000', '005', '

AdeftTfidfVectorizer(dict_path='../results/pubmed_dictionary.pkl',
                     max_features=5000, stop_words=[])

In [17]:
# Transform the Entrez texts that were not used to fit the vectorizer. The
# vectorizer was fit on entrez_texts[0:50] (indices 0-49), so the remainder starts
# at index 50; starting at 51 would silently drop document 50.
X = tfidf.transform(entrez_texts[50:])

In [18]:
# Two small, already-tokenized documents used below to inspect the vectorizer's
# dictionary and model by hand.
examples = [['genetics', 'women', 'mesenchymal'], ['stem', 'tfs', 'genetics']]

In [19]:
# Bag-of-words encoding of each example document via the vectorizer's dictionary.
small = [tfidf.dictionary.doc2bow(tokens) for tokens in examples]

In [20]:
# Apply the vectorizer's model to the bag-of-words documents. The result keeps a
# reference to its input: `corpus.corpus`, displayed further down, shows the raw
# (token_id, count) pairs from `small`.
corpus = tfidf.model[small]

In [21]:
corpus.metadata

False

In [22]:
from gensim.matutils import corpus2csc

In [24]:
# Convert the corpus to a sparse matrix. Per the shape displayed below, (5000, 2),
# rows are terms and columns are documents.
# NOTE(review): num_terms=5000 repeats the max_features=5000 passed to
# AdeftTfidfVectorizer; the two must be kept in sync by hand.
# NOTE(review): this rebinds X, discarding the matrix from tfidf.transform above.
X = corpus2csc(corpus, num_terms=5000)

In [25]:
X.shape

(5000, 2)

In [26]:
corpus.corpus

[[(546, 1), (2337, 1)], [(546, 1), (993, 1), (4794, 1)]]

In [67]:
# Hyperparameter grid handed to detector.cv in the next cell.
# NOTE(review): the logged best value, nu=0.2, is the largest nu in this grid, so
# the optimum may lie beyond it — consider extending the range.
param_grid = {'nu': [0.05, 0.1, 0.15, 0.2], 'max_features': [100]}

In [68]:
# Cross-validated search over param_grid, given the Entrez texts and the texts
# collected in anomalous_texts. The log below reports the best score and parameters.
# NOTE(review): n_jobs=8 is hard-coded — presumably a worker count; confirm it
# suits the machine running the notebook.
detector.cv(entrez_texts, anomalous_texts, param_grid, n_jobs=8)

INFO: [2020-01-09 18:09:12] /Users/albertsteppi/adeft/adeft/modeling/find_anomalies.py - Best score of 0.34509293569289845 found for parameter values:
{'nu': 0.2, 'max_features': 100}
INFO: [2020-01-09 18:09:12] gensim.utils - loading Dictionary object from ../results/pubmed_dictionary.pkl
INFO: [2020-01-09 18:09:12] gensim.utils - loaded ../results/pubmed_dictionary.pkl
INFO: [2020-01-09 18:09:12] gensim.corpora.dictionary - adding document #0 to Dictionary(0 unique tokens: [])
INFO: [2020-01-09 18:09:13] gensim.corpora.dictionary - built Dictionary(13800 unique tokens: ['000', '005', '05', '10', '100']...) from 340 documents (total 223107 corpus positions)
INFO: [2020-01-09 18:09:13] gensim.corpora.dictionary - discarding 12829 tokens: [('000', 26), ('005', 10), ('110', 5), ('120', 9), ('121', 5), ('123', 4), ('125', 9), ('126', 2), ('150', 11), ('15056', 1)]...
INFO: [2020-01-09 18:09:13] gensim.corpora.dictionary - keeping 100 tokens which were in no less than 1 and no more than 34

In [69]:
detector.best_score

0.34509293569289845

In [70]:
detector.best_params

{'nu': 0.2, 'max_features': 100}

In [71]:
detector.sensitivity

0.6215635239281924

In [72]:
detector.specificity

0.723529411764706

In [42]:
# NOTE(review): `splits` is not defined in any cell visible in this notebook, and
# this cell's execution count (42) is out of sequence with its neighbours. It will
# presumably raise NameError on Restart & Run All — define `splits` or remove the cell.
splits[0][1]

array([   1,   10,   12,   14,   18,   20,   22,   27,   33,   35,   45,
         47,   49,   50,   68,   87,   93,   95,   96,   99,  103,  106,
        123,  125,  130,  135,  140,  148,  150,  151,  152,  157,  161,
        174,  184,  185,  189,  190,  192,  204,  218,  220,  227,  233,
        234,  238,  239,  240,  241,  242,  247,  252,  253,  256,  261,
        275,  277,  281,  285,  291,  306,  307,  312,  314,  321,  329,
        336,  338,  343,  347,  348,  355,  356,  357,  366,  375,  377,
        379,  383,  390,  391,  392,  400,  402,  405,  413,  434,  435,
        438,  442,  463,  470,  471,  476,  481,  501,  506,  514,  519,
        520,  521,  524,  527,  531,  537,  538,  540,  542,  552,  555,
        561,  574,  577,  587,  588,  589,  596,  597,  602,  604,  606,
        610,  622,  631,  634,  638,  644,  646,  652,  658,  661,  662,
        665,  679,  680,  686,  697,  700,  702,  704,  711,  712,  714,
        719,  731,  735,  753,  754,  764,  768,  7

In [73]:
detector.feature_importances()

[('transferrin', 4.755108162819198),
 ('tissue', 3.9647423187622897),
 ('iron', 3.689060083685279),
 ('identify', 2.8762116627001517),
 ('genetic', 2.862615150752703),
 ('plasma', 2.8555030288402605),
 ('interaction', 2.801698630579818),
 ('large', 2.7606277197476023),
 ('sequence', 2.7592925526559453),
 ('hfe', 2.64089570386582),
 ('interactions', 2.5746517086445797),
 ('identification', 2.5714542159144895),
 ('membrane', 2.56467585342454),
 ('acid', 2.454503766256504),
 ('polymorphisms', 2.370574475273075),
 ('fe', 2.3532828938751718),
 ('complex', 2.3523554253915955),
 ('spectrometry', 2.3479916898394593),
 ('serum', 2.2443862665107215),
 ('polymorphism', 2.1421655946301597),
 ('chain', 2.055696904055092),
 ('method', 2.053223986935547),
 ('individuals', 1.9728985653714828),
 ('risk', 1.9695299717343948),
 ('blood', 1.9275612260720865),
 ('new', 1.8290040446282352),
 ('investigated', 1.7977176411438438),
 ('healthy', 1.7684207409418318),
 ('mass', 1.7470756087062784),
 ('multiple', 

In [36]:
# Predict for every extracted text; len(preds) shown below (5039) equals len(texts).
# NOTE(review): this cell's execution count (36) is lower than that of the
# detector.cv cell (68), so the stored predictions may come from a different fit
# than the one displayed above — re-run the notebook in order to confirm.
preds = detector.predict(texts)

In [23]:
len(preds)

5039

In [43]:
detector.sensitivity

0.6821431538142047

In [44]:
detector.specificity

0.7

In [26]:
detector.confidence_interval(texts)

(0.9442772863393372, 1)

In [27]:
# Fraction of texts whose predicted grounding is anything other than HGNC:11740.
1 - sum(1 for result in disambs if result[0] == 'HGNC:11740') / len(disambs)

0.951379241913078

The confidence interval reported above, (0.944, 1), is not accurate here.

In [45]:
# NOTE(review): the output saved with this cell ({'nu': 0.005, 'max_features': 1000})
# disagrees with the earlier display of detector.best_params
# ({'nu': 0.2, 'max_features': 100}); the execution counts (45 vs 70) indicate they
# come from different runs. Re-run the notebook top to bottom before relying on either.
detector.best_params

{'nu': 0.005, 'max_features': 1000}

In [28]:
# Scratch arithmetic: 2328 matches len(anomalous_texts) displayed earlier; 68 is
# presumably one fifth of the 340 Entrez texts (340 / 5) — TODO confirm.
2328 + 68

2396

In [29]:
# 2328 (the displayed len(anomalous_texts)) divided by 5 — presumably a per-fold
# size for 5-fold cross-validation; TODO confirm.
2328/5

465.6

In [30]:
# 2328 anomalous texts plus 340 Entrez texts; both counts are displayed earlier
# as len(anomalous_texts) and len(entrez_texts).
2328 + 340

2668

In [31]:
# 2668 (= 2328 + 340) divided by 5 — presumably a per-fold size; TODO confirm.
2668/5

533.6