In [3]:
import pickle

from indra.literature.adeft_tools import universal_extract_text

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.disambiguate import load_disambiguator
from adeft.modeling.classify import AdeftClassifier
from adeft.modeling.find_anomalies import AdeftAnomalyDetector

INFO: [2019-12-02 15:12:07] /Users/albertsteppi/adeft/adeft/recognize.py - OneShotRecognizer not available. AdeftLongformScorer has not been built successfully.


In [2]:
agent_text = 'TF'

In [3]:
with open(f'data/content/{agent_text}_refs.pkl', 'rb') as f:
    refs = pickle.load(f)
with open(f'data/content/{agent_text}_content.pkl', 'rb') as f:
    content = pickle.load(f)
with open(f'data/entrez_content/{agent_text}_content.pkl', 'rb') as f:
    entrez_content = pickle.load(f)

In [4]:
disamb = load_disambiguator(agent_text)

In [5]:
disamb.grounding_dict

{'TF': {'tissue factor': 'HGNC:3541',
  'transferrin': 'HGNC:11740',
  'tail flick': 'ungrounded',
  'transmitted founder': 'ungrounded',
  'theaflavin': 'CHEBI:CHEBI:136609',
  'tolylfluanid': 'CHEBI:CHEBI:75182',
  'thomsen friedenreich': 'ungrounded',
  'time frequency': 'ungrounded',
  'total flavonoids': 'ungrounded',
  'ab': 'ungrounded',
  'force': 'ungrounded',
  'tibiofemoral': 'ungrounded',
  'fat': 'ungrounded',
  'telephone filtered': 'ungrounded',
  'follicular': 'ungrounded',
  'f3': 'ungrounded',
  'serotransferrin': 'HGNC:11740'}}

In [6]:
texts = [universal_extract_text(text, contains=[agent_text]) for text in content.values()
         if text]

In [7]:
len(texts)

5039

In [9]:
entrez_texts = [universal_extract_text(text) for text in entrez_content if text]

In [10]:
len(entrez_texts)

340

In [11]:
disambs = disamb.disambiguate(texts)

In [20]:
disambs[5]

('HGNC:3541',
 'F3',
 {'CHEBI:CHEBI:136609': 0.0049276234891150175,
  'CHEBI:CHEBI:75182': 0.0041165543373885794,
  'HGNC:11740': 0.014490057876546144,
  'HGNC:3541': 0.9664821495120481,
  'ungrounded': 0.00998361478490201})

In [17]:
from collections import Counter
counts = Counter([disamb[0] for disamb in disambs])

In [18]:
counts

Counter({'HGNC:3541': 4380,
         'ungrounded': 364,
         'HGNC:11740': 245,
         'CHEBI:CHEBI:136609': 34,
         'CHEBI:CHEBI:75182': 16})

In [22]:
anomalous_texts = [text for text, disamb in zip(texts, disambs) if
                   disamb[0] != 'HGNC:11740' and set(disamb[2].values()) == set([0.0, 1.0])]

In [23]:
len(anomalous_texts)

2328

In [42]:
detector = AdeftAnomalyDetector(blacklist=['TF', 'binding', 'expression', 'cells', 'cell', 'gene', 'genes'])

In [46]:
param_grid = {'nu': [0.2, 0.4, 0.6], 'max_features': [10, 50, 100],
              'ngram_range': [(1, 1), (1, 2)]}

In [47]:
detector.cv(texts, anomalous_texts, param_grid, n_jobs=8, cv=5)

INFO: [2019-11-26 16:20:16] /Users/albertsteppi/adeft/adeft/modeling/find_anomalies.py - Best score of -0.17802450262126845 found for parameter values:
{'nu': 0.6, 'max_features': 100, 'ngram_range': (1, 1)}


In [48]:
detector.feature_importances()

[('factor', 230.81908884713255),
 ('tfs', 195.3469401252208),
 ('activity', 185.14070610095345),
 ('et', 177.17540141791963),
 ('al', 176.75007547106193),
 ('induced', 169.05868761197843),
 ('protein', 159.155108795253),
 ('patients', 154.46511094966945),
 ('fviia', 152.41886024636182),
 ('coagulation', 152.26481385598825),
 ('tissue', 151.9379876715055),
 ('thrombin', 147.39801702812437),
 ('activation', 137.66463160005833),
 ('cancer', 137.40187731728537),
 ('fig', 135.98362268739376),
 ('transcription', 135.38367832922555),
 ('human', 133.1708596967484),
 ('tfpi', 130.65454674897745),
 ('mice', 126.96838805368198),
 ('levels', 125.98333545731029),
 ('study', 123.01775222607336),
 ('increased', 122.0480621208196),
 ('effect', 120.67555143099669),
 ('blood', 118.66023996207686),
 ('tumor', 116.2132758588517),
 ('data', 115.39558394336129),
 ('dna', 115.20453628478484),
 ('figure', 115.17955318957232),
 ('using', 112.27192834423542),
 ('endothelial', 112.15231773575996),
 ('10', 111.84

In [49]:
preds = detector.predict(texts)

In [50]:
len(preds)

5039

In [51]:
sum(preds)

3025.0

In [52]:
detector.sensitivity

0.5760247357976833

In [53]:
detector.specificity

0.24595076158104826

In [136]:
AR_detector.confidence_interval(AR_texts)

(0.2701708652367167, 0.3080066092612979)

In [139]:
1 - len([disamb for disamb in disambs if disamb[0] == 'HGNC:644'])/len(disambs)

0.3074424381389128

The confidence interval is very accurate here

In [1]:
from adeft import available_shortforms

In [2]:
available_shortforms

{'PC': 'PC',
 'EMT': 'EMT',
 'SP': 'SP',
 'PE': 'PE',
 'ROS': 'ROS',
 'NP': 'NP:NP_S',
 'NPs': 'NP:NP_S',
 'MS': 'MS',
 'MT': 'MT',
 'BP': 'BP',
 'GH': 'GH',
 'AD': 'AD',
 'GT': 'GT',
 'DA': 'DA',
 'GR': 'GR',
 'IR': 'IR',
 'HK2': 'HK2',
 'ARF': 'ARF',
 'CS': 'CS',
 'EC': 'EC',
 'STD': 'STD',
 'PD1': 'PD1',
 'TGH': 'TGH',
 'PKD': 'PKD',
 'RA': 'RA',
 'PCP': 'PCP',
 'PI': 'PI',
 'PS': 'PS',
 'PA': 'PA',
 'MB': 'MB',
 'HA': 'HA',
 'AR': 'AR',
 'HR': 'HR',
 'NE': 'NE',
 'UBC': 'UBC',
 'GSC': 'GSC',
 'AA': 'AA',
 'NIS': 'NIS',
 'GC': 'GC',
 'CM': 'CM',
 'RB': 'RB:R_B',
 'Rb': 'RB:R_B',
 'LH': 'LH',
 'ER': 'ER',
 'TF': 'TF',
 'PGP': 'PGP',
 'MCT': 'MCT',
 'TG': 'TG'}

In [5]:
disambs = [load_disambiguator(shortform) for shortform in set(available_shortforms.keys())]

In [6]:
names = [disamb.names for disamb in disambs]

In [8]:
names[1]

{'CHEBI:CHEBI:18303': 'phosphatidyl-L-serine',
 'HGNC:9456': 'PROS1',
 'HGNC:9498': 'PSAP',
 'HGNC:9508': 'PSEN1'}

In [9]:
names

[{'HGNC:10261': 'ROS1',
  'MESH:D012374': 'Rod Outer Segment',
  'MESH:D017382': 'Reactive Oxygen Species'},
 {'CHEBI:CHEBI:18303': 'phosphatidyl-L-serine',
  'HGNC:9456': 'PROS1',
  'HGNC:9498': 'PSAP',
  'HGNC:9508': 'PSEN1'},
 {'CHEBI:CHEBI:29865': 'benzo[a]pyrene',
  'MESH:D001794': 'Blood Pressure',
  'MESH:D004164': 'Diphosphonates'},
 {'CHEBI:CHEBI:17642': 'pentachlorophenol',
  'CHEBI:CHEBI:8058': 'phencyclidine',
  'HGNC:1067': 'BMP1',
  'HGNC:9344': 'PRCP',
  'MESH:D016764': 'Cell Polarity'},
 {'HGNC:4612': 'GSC'},
 {'CHEBI:CHEBI:33322': 'RUBIDIUM',
  'CHEBI:CHEBI:52261': 'ROSE-BENGAL',
  'GO:GO:0045730': 'RESPIRATORY-BURST',
  'HGNC:9884': 'RB1'},
 {'CHEBI:CHEBI:79516': 'Nonylphenol',
  'FPLX:Natriuretic_peptide': 'Natriuretic Peptide',
  'HGNC:33173': 'CTF2P',
  'MESH:D000070614': 'Nucleus Pulposus',
  'MESH:D009437': 'Neuralgia',
  'MESH:D009698': 'Nucleoproteins',
  'MESH:D019590': 'Nucleocapsid Proteins',
  'MESH:D053758': 'Nanoparticles'},
 {'HGNC:13633': 'ADIPOQ',
  'M