In [1]:
import json

from collections import Counter
from collections import defaultdict

from src.utils import load_corpus
from src.utils import load_knowledge_base

from src.rel_extract_extend.data import DatasetExt

In [2]:
corpus = load_corpus(start_file_index=1, end_file_index=1016)
kb = load_knowledge_base()

In [3]:
print(corpus.examples[0])

Example(entity_1='weight', entity_2='sodium', left='Determinations of the molecular ', mention_1='weight', middle=' of the enzyme based on its amino acid composition, sedimentation velocity, and ', mention_2='sodium', right=' dodecyl sulfate gel electrophoresis gave values of 17680, 17470 and 18300, respectively.', left_POS='Determinations/NNS of/IN the/DT molecular/JJ', mention_1_POS='weight/NN', middle_POS='of/IN the/DT enzyme/NN based/VBN on/IN its/PRP$ amino/NN acid/NN composition/NN ,/, sedimentation/NN velocity/NN ,/, and/CC', mention_2_POS='sodium/NN', right_POS='dodecyl/NN sulfate/NN gel/JJ electrophoresis/NN gave/VBD values/NNS of/IN 17680/CD ,/, 17470/CD and/CC 18300/CD ,/, respectively/RB ./.')


In [4]:
print(kb)

KB with 510,316 triples


In [5]:
dataset = DatasetExt(corpus, kb)

In [6]:
dataset

Corpus with 5,227,699 examples; KB with 510,316 triples

In [7]:
dataset.__dir__()

['corpus',
 'kb',
 '__module__',
 'count_rel_ent_pairs',
 'create_kb_tpl_corp_covered',
 'remove_least_common_kb_ents',
 '__doc__',
 '__init__',
 'find_unrelated_pairs',
 'featurize',
 'build_dataset',
 'build_splits',
 'count_examples',
 'count_relation_combinations',
 '__str__',
 '__repr__',
 '__dict__',
 '__weakref__',
 '__hash__',
 '__getattribute__',
 '__setattr__',
 '__delattr__',
 '__lt__',
 '__le__',
 '__eq__',
 '__ne__',
 '__gt__',
 '__ge__',
 '__new__',
 '__reduce_ex__',
 '__reduce__',
 '__subclasshook__',
 '__init_subclass__',
 '__format__',
 '__sizeof__',
 '__dir__',
 '__class__']

In [8]:
len(dataset.find_unrelated_pairs())

320478

In [9]:
dataset.count_examples()

                                             examples
relation               examples    triples    /triple
--------               --------    -------    -------
accelerated                  78          1      78.00
acquired                    236        176       1.34
aggravated                15350       1106      13.88
altered                    5562        345      16.12
caused                  1440528     463713       3.11
changed                     676        277       2.44
decreased                205048      18721      10.95
delayed                     344        268       1.28
discoloured                2088        594       3.52
generalised                   0         89       0.00
impaired                   1287        319       4.03
improved                      2         77       0.03
increased                257805      22718      11.35
infected                   1353         88      15.38
lowered                       4         14       0.29
not achieved                

In [10]:
count_rel_ents = dataset.count_rel_ent_pairs()

In [11]:
print(count_rel_ents["increased"])

Counter({'start_react::cardiac::troponin': 14344, 'start_react::calcium::vitamin_d': 13607, 'start_react::cholesterol::blood_pressure': 10186, 'start_react::heparin::weight': 9632, 'start_react::cholesterol::lipids': 9448, 'start_react::sodium::weight': 9192, 'start_react::sodium::blood_pressure': 8539, 'start_react::cholesterol::weight': 7537, 'start_react::calcium::weight': 4033, 'start_react::medium::weight': 4021, 'start_react::calcium::blood_pressure': 3976, 'start_react::cholesterol::blood_cholesterol': 3881, 'start_react::calcium::energy': 3432, 'start_react::cholesterol::blood_glucose': 2535, 'start_react::potassium::blood_pressure': 2386, 'start_react::sodium::glomerular_filtration_rate': 2233, 'start_react::captopril::blood_pressure': 2106, 'start_react::calcium::manganese': 2101, 'start_react::sodium::energy': 2004, 'start_react::magnesium::manganese': 1921, 'start_react::nifedipine::blood_pressure': 1688, 'start_react::propranolol::heart_rate': 1612, 'start_react::potassium

In [12]:
with open("data/statistic/relation_drug_reaction_counts.json", "w") as json_file:
    json.dump(count_rel_ents, json_file, indent=2)

In [13]:
new_knowledge_base = dataset.create_kb_tpl_corp_covered(count_rel_ents)
print(len(new_knowledge_base))

77771


In [14]:
total_drug_react_counter =  dataset.remove_least_common_kb_ents(count_rel_ents)

In [15]:
print(total_drug_react_counter)

Counter({('caused', 'cardiac', 'death'): 38950, ('caused', 'cardiac', 'myocardial_infarction'): 19149, ('increased', 'cardiac', 'troponin'): 15909, ('decreased', 'calcium', 'vitamin_d'): 14606, ('increased', 'calcium', 'vitamin_d'): 14606, ('caused', 'cardiac', 'hypertension'): 12540, ('caused', 'tamoxifen', 'breast_cancer'): 12045, ('decreased', 'sodium', 'weight'): 10618, ('increased', 'sodium', 'weight'): 10618, ('increased', 'cholesterol', 'blood_pressure'): 10364, ('decreased', 'heparin', 'weight'): 9960, ('increased', 'heparin', 'weight'): 9960, ('increased', 'cholesterol', 'lipids'): 9910, ('decreased', 'sodium', 'blood_pressure'): 9303, ('increased', 'sodium', 'blood_pressure'): 9303, ('caused', 'cardiac', 'arrhythmia'): 9168, ('caused', 'sodium', 'hypertension'): 7934, ('decreased', 'cholesterol', 'weight'): 7887, ('increased', 'cholesterol', 'weight'): 7887, ('caused', 'nasal', 'rhinitis'): 7816, ('caused', 'nasal', 'nasal_obstruction'): 7278, ('caused', 'nasal', 'nasal_polyp

In [16]:
new_knowledge_base = [
    rel_drug_react for rel_drug_react in total_drug_react_counter.keys()
]

# Please change "all" for your naming, if you make a knowledge base subset
with open("data/knowledge_base/rel_drug_react_triple_occ_all.json", "w") as json_file:
    json.dump(new_knowledge_base, json_file, indent=2)

In [17]:
print(len(new_knowledge_base))

55067


In [19]:
dataset.kb.kb_triples_by_relation["increased"]

[KBTriple(rel='increased', sbj='metformin', obj='mean_cell_haemoglobin'),
 KBTriple(rel='increased', sbj='selenium', obj='blood_cholesterol'),
 KBTriple(rel='increased', sbj='famciclovir', obj='mean_cell_volume'),
 KBTriple(rel='increased', sbj='ciprofloxacin', obj='lipase'),
 KBTriple(rel='increased', sbj='rosuvastatin', obj='tumour_marker'),
 KBTriple(rel='increased', sbj='gatifloxacin', obj='blood_creatinine'),
 KBTriple(rel='increased', sbj='dexamfetamine', obj='creatinine_urine'),
 KBTriple(rel='increased', sbj='tobramycin', obj='prothrombin_level'),
 KBTriple(rel='increased', sbj='potassium', obj='sputum'),
 KBTriple(rel='increased', sbj='motilium', obj='blood_creatine_phosphokinase'),
 KBTriple(rel='increased', sbj='lapatinib', obj='body_temperature'),
 KBTriple(rel='increased', sbj='acetonide', obj='bone_density'),
 KBTriple(rel='increased', sbj='glycopyrronium', obj='hepatic_enzyme'),
 KBTriple(rel='increased', sbj='amlodipine', obj='carbohydrate_antigen_125'),
 KBTriple(rel='