In [1]:
import os

import dist_sup_lib.utils as utils
import dist_sup_lib.rel_ext as rel_ext

from src.utils import read_json_examples
from src.utils import read_kb_triples
from src.utils import read_kb_triples_json

from src.rel_extract_extend.data import DatasetExt
from src.rel_extract_extend.kfold import make_kfold_val

from src.rel_extract_extend.featurizers import start_bag_of_words_featurizer
from src.rel_extract_extend.featurizers import middle_bag_of_words_featurizer
from src.rel_extract_extend.featurizers import end_bag_of_words_featurizer

In [2]:
# Fix the random seeds through the project helper before any data is loaded.
utils.fix_random_seeds()

rel_ext_data_sents = os.path.join('data', 'featurized_sentences')
rel_ext_data_kb = os.path.join("data", "knowledge_base")

# Gather the examples of the featurized sentence files numbered 0001-0019.
# The file names change with the download year: for files downloaded in 2021
# the prefix has to be changed to "featurized_sents_pubmed21n".
example_data = []
for index in range(1, 20):
    file_suffix = str(index).zfill(4)  # zero-pad the file number to 4 digits
    tagged_sent_file = f"featurized_sents_pubmed20n{file_suffix}.json"
    file_path = os.path.join(rel_ext_data_sents, tagged_sent_file)
    example_data.extend(read_json_examples(file_path))

# Build the knowledge base from the triple file.
kb_path = os.path.join(rel_ext_data_kb, "rel_drug_react_triple_occ_all.json")
kb_triples = read_kb_triples_json(kb_path)
kb = rel_ext.KB(kb_triples)

# Pair corpus and knowledge base in the extended dataset class.
corpus = rel_ext.Corpus(example_data)
dataset = DatasetExt(corpus, kb)

In [18]:
# Number of folds handed to the k-fold run.
k = 5

# Featurizers used for every fold.
bow_featurizers = [
    start_bag_of_words_featurizer,
    middle_bag_of_words_featurizer,
    end_bag_of_words_featurizer,
]

# `results` is iterated fold by fold further down; avg_results=False presumably
# keeps the per-fold entries separate (TODO confirm in make_kfold_val).
results, train_setups, test_setups = make_kfold_val(
    dataset, bow_featurizers, avg_results=False, k=k)

{'0': Corpus with 14,864 examples; KB with 10,700 triples, '1': Corpus with 10,327 examples; KB with 9,650 triples, '2': Corpus with 10,011 examples; KB with 11,019 triples, '3': Corpus with 19,112 examples; KB with 10,295 triples, '4': Corpus with 20,222 examples; KB with 13,403 triples, 'all': Corpus with 74,536 examples; KB with 55,067 triples}




relation              precision     recall    f-score    support       size
------------------    ---------  ---------  ---------  ---------  ---------
acquired                  1.000      1.000      1.000          3        448
aggravated                0.943      0.902      0.922         92        537
altered                   1.000      0.870      0.930         23        468
caused                    0.963      0.986      0.975       8924       9369
changed                   1.000      0.783      0.878         23        468
decreased                 0.877      0.948      0.911        734       1179
delayed                   1.000      1.000      1.000         13        458
discoloured               1.000      0.964      0.982         28        473
impaired                  1.000      0.933      0.966         15        460
increased                 0.879      0.953      0.915        765       1210
infected                  1.000      1.000      1.000         12        457
lowered     

  _warn_prf(average, modifier, msg_start, len(result))


relation              precision     recall    f-score    support       size
------------------    ---------  ---------  ---------  ---------  ---------
acquired                  1.000      1.000      1.000          2        409
aggravated                0.863      0.887      0.875         71        478
altered                   1.000      0.833      0.909         18        425
caused                    0.963      0.987      0.975       8117       8524
changed                   1.000      0.941      0.970         17        424
decreased                 0.870      0.956      0.911        659       1066
delayed                   1.000      1.000      1.000          8        415
discoloured               1.000      0.913      0.955         23        430
impaired                  0.929      0.929      0.929         14        421
increased                 0.855      0.950      0.900        659       1066
infected                  1.000      1.000      1.000          3        410
prolonged   

In [16]:
import pandas as pd

In [30]:
# Column labels for the five values of each per-fold result row
# (assumes the rows list them in this order; TODO confirm in make_kfold_val).
columns = ["precision", "recall", "f-score", "support", "size"]

# One DataFrame per relation: one row for every element of `results`.
total_results = {}
for key in kb.all_relations:
    # An element without a (truthy) entry for this relation contributes a row
    # of five zeros instead.
    rel_res = [
        part_result.get(key) or [0] * 5
        for part_result in results
    ]
    total_results[key] = pd.DataFrame(data=rel_res, columns=columns)

In [40]:
# Average precision and recall per relation over the rows of its DataFrame,
# then derive an F-score from those two averages.
#
# The loop variable is deliberately NOT called `results`: at notebook level a
# `for` target stays bound after the loop, so reusing that name would replace
# the k-fold `results` with a DataFrame and break a re-run of the cell that
# builds `total_results`.
avg_vals = {}
for rel, rel_frame in total_results.items():
    avg_precision = rel_frame.precision.mean()
    avg_recall = rel_frame.recall.mean()
    avg_vals[rel] = {
        "avg_precision": avg_precision,
        "avg_recall": avg_recall,
        # Harmonic mean of the averaged precision and recall; falls back to 0
        # when either average is 0 so the division cannot hit a zero sum.
        "avg_fscore": (
            2 * avg_precision * avg_recall / (avg_precision + avg_recall)
            if avg_precision and avg_recall else 0
        ),
    }

In [41]:
# Show the per-relation averages as the cell output.
avg_vals

{'accelerated': {'avg_precision': 0.0, 'avg_recall': 0.0, 'avg_fscore': 0},
 'acquired': {'avg_precision': 1.0,
  'avg_recall': 0.9866666666666667,
  'avg_fscore': 0.9932885906040269},
 'aggravated': {'avg_precision': 0.9150276618516674,
  'avg_recall': 0.911536410692215,
  'avg_fscore': 0.9132786997246934},
 'altered': {'avg_precision': 0.985103448275862,
  'avg_recall': 0.8764527260179434,
  'avg_fscore': 0.9276073583743355},
 'caused': {'avg_precision': 0.9664457810205747,
  'avg_recall': 0.9835598356293296,
  'avg_fscore': 0.9749277083194314},
 'changed': {'avg_precision': 1.0,
  'avg_recall': 0.9026517700901872,
  'avg_fscore': 0.9488354982030166},
 'decreased': {'avg_precision': 0.8807896042277534,
  'avg_recall': 0.9435276479683747,
  'avg_fscore': 0.9110798493316706},
 'delayed': {'avg_precision': 1.0,
  'avg_recall': 0.9623931623931623,
  'avg_fscore': 0.9808362369337978},
 'discoloured': {'avg_precision': 0.9671506352087114,
  'avg_recall': 0.9076343156562732,
  'avg_fscore':

In [21]:
# Flatten the per-relation tables into one wide table: one row per relation
# and one "<metric>_<row index>" column for every metric of every row.
# `total_results` is keyed by relation name, so handing it straight to
# pd.DataFrame together with columns=columns cannot select any of its entries.
wide_rows = []
for rel, rel_frame in total_results.items():
    wide_row = {"rel": rel}
    for fold_idx in range(len(rel_frame)):
        for metric in columns:
            wide_row[f"{metric}_{fold_idx}"] = rel_frame[metric].iloc[fold_idx]
    wide_rows.append(wide_row)

data = pd.DataFrame(wide_rows)

In [22]:
# Show the `data` frame as the cell output.
data

Unnamed: 0,rel,precision_0,recall_0,f-score_0,support_0,size_0,precision_1,recall_1,f-score_1,support_1,...,precision_3,recall_3,f-score_3,support_3,size_3,precision_4,recall_4,f-score_4,support_4,size_4
0,accelerated,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0,...,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0,0
1,acquired,1.0,1.0,1.0,3,448,1.0,1.0,1.0,2,...,1.0,1.0,1.0,5,436,1.0,0.933333,0.965517,15,404
2,aggravated,0.943182,0.902174,0.922222,92,537,0.863014,0.887324,0.875,71,...,0.957447,0.918367,0.9375,98,529,0.915663,0.904762,0.91018,84,473
3,altered,1.0,0.869565,0.930233,23,468,1.0,0.833333,0.909091,18,...,0.965517,0.933333,0.949153,30,461,0.96,0.888889,0.923077,27,416
4,caused,0.963433,0.986105,0.974637,8924,9369,0.963453,0.987311,0.975236,8117,...,0.966043,0.986643,0.976235,8535,8966,0.976765,0.969401,0.973069,11275,11664
5,changed,1.0,0.782609,0.878049,23,468,1.0,0.941176,0.969697,17,...,1.0,0.894737,0.944444,19,450,1.0,1.0,1.0,17,406
6,decreased,0.876574,0.948229,0.910995,734,1179,0.870166,0.955994,0.911063,659,...,0.880051,0.956104,0.916502,729,1160,0.904387,0.900336,0.902357,893,1282
7,delayed,1.0,1.0,1.0,13,458,1.0,1.0,1.0,8,...,1.0,1.0,1.0,11,442,1.0,0.923077,0.96,13,402
8,discoloured,1.0,0.964286,0.981818,28,473,1.0,0.913043,0.954545,23,...,0.931034,0.964286,0.947368,28,459,0.931034,0.794118,0.857143,34,423
9,impaired,1.0,0.933333,0.965517,15,460,0.928571,0.928571,0.928571,14,...,1.0,0.823529,0.903226,17,448,1.0,0.75,0.857143,16,405


In [23]:
# Iterating a DataFrame yields its column labels; print them one per line.
for column_label in data.columns:
    print(column_label)

rel
precision_0
recall_0
f-score_0
support_0
size_0
precision_1
recall_1
f-score_1
support_1
size_1
precision_2
recall_2
f-score_2
support_2
size_2
precision_3
recall_3
f-score_3
support_3
size_3
precision_4
recall_4
f-score_4
support_4
size_4


Corpus with 25,191 examples; KB with 20,350 triples

In [3]:
def predict(splits, train_result, split_name='dev', sampling_rate=0.1, vectorize=True):
    """Run the trained per-relation models on one split.

    Parameters
    ----------
    splits : mapping
        Maps split names to dataset objects that offer ``build_dataset`` and
        ``featurize``.
    train_result : mapping
        Must provide the keys 'featurizers', 'vectorizer', 'all_relations'
        and 'models' ('models' maps a relation to an object with ``predict``).
    split_name : str
        Key of the split in ``splits`` to predict on.
    sampling_rate : float
        Forwarded to ``build_dataset``.
    vectorize : bool
        Forwarded to ``featurize``.

    Returns
    -------
    tuple
        ``(predictions, assess_y, test_setup)``: ``predictions`` maps each
        relation that could be predicted to its model output; the other two
        values are passed through from ``build_dataset``.
    """
    assess_dataset = splits[split_name]
    assess_o, assess_y, test_setup = assess_dataset.build_dataset(sampling_rate=sampling_rate)
    test_X, _ = assess_dataset.featurize(
        assess_o,
        featurizers=train_result['featurizers'],
        vectorizer=train_result['vectorizer'],
        vectorize=vectorize)
    predictions = {}
    for rel in train_result['all_relations']:
        # Relations without features in this split are skipped silently.
        # A KeyError while predicting (e.g. no model stored for the relation)
        # is reported and the relation is left out of the result.
        try:
            if test_X.get(rel) is not None:
                predictions[rel] = train_result['models'][rel].predict(test_X[rel])
        except KeyError:
            print(f"WARNING: No entries for '{rel}'")
    return predictions, assess_y, test_setup

In [5]:
def experiment(
        splits,
        featurizers,
        train_split='train',
        test_split='dev',
        model_factory=(lambda: LogisticRegression(
            fit_intercept=True, solver='liblinear', random_state=42)),
        train_sampling_rate=0.1,
        test_sampling_rate=0.1,
        vectorize=True,
        verbose=True):
    """Train models on one split, predict on another and evaluate the result.

    Parameters
    ----------
    splits : mapping
        Split name -> dataset object; forwarded to ``train_models`` and
        ``predict``.
    featurizers : list
        Featurizers forwarded to ``train_models``.
    train_split, test_split : str
        Names of the splits used for training and for prediction.
    model_factory : callable
        Zero-argument callable forwarded to ``train_models``.
    train_sampling_rate, test_sampling_rate : float
        Sampling rates forwarded to ``train_models`` and ``predict``.
    vectorize : bool
        Forwarded to both ``train_models`` and ``predict``.
    verbose : bool
        Forwarded to ``train_models`` and ``evaluate_predictions``.

    Returns
    -------
    The value returned by ``train_models``.

    NOTE(review): ``LogisticRegression``, ``train_models`` and
    ``evaluate_predictions`` are not imported in the visible cells; confirm
    they are in scope before this function is called.
    """
    print("=========== Train Models ===========")
    train_result = train_models(
        splits,
        featurizers=featurizers,
        split_name=train_split,
        model_factory=model_factory,
        sampling_rate=train_sampling_rate,
        vectorize=vectorize,
        verbose=verbose)
    print("============= Predict =============")
    # predict() returns three values (predictions, gold labels, test setup);
    # the test setup is not needed here, so it is discarded explicitly.
    predictions, test_y, _ = predict(
        splits,
        train_result,
        split_name=test_split,
        sampling_rate=test_sampling_rate,
        vectorize=vectorize)
    print("============ Evaluate =============")
    evaluate_predictions(
        predictions,
        test_y,
        verbose)
    return train_result

In [6]:
import os
import random

from collections import Counter
from collections import defaultdict

import dist_sup_lib.rel_ext as rel_ext
import dist_sup_lib.utils as utils

from src.utils import read_json_examples
from src.utils import read_kb_triples
from src.utils import read_kb_triples_json

In [8]:
# Fix the random seeds through the project helper before any data is loaded.
utils.fix_random_seeds()

rel_ext_data_sents = os.path.join('data', 'featurized_sentences')
rel_ext_data_kb = os.path.join("data", "knowledge_base")

# Read the featurized sentence files numbered 0001-0019 into one list.
# File names depend on the download year: for files downloaded in 2021 the
# prefix has to be changed to "featurized_sents_pubmed21n".
example_data = []
for index in range(1, 20):
    file_suffix = str(index).zfill(4)  # zero-pad the file number to 4 digits
    tagged_sent_file = f"featurized_sents_pubmed20n{file_suffix}.json"
    file_path = os.path.join(rel_ext_data_sents, tagged_sent_file)
    example_data.extend(read_json_examples(file_path))

# Build the knowledge base from the triple file.
kb_path = os.path.join(rel_ext_data_kb, "rel_drug_react_triple_occ_all.json")
kb_triples = read_kb_triples_json(kb_path)
kb = rel_ext.KB(kb_triples)

# This cell uses the library's own Dataset class for corpus + knowledge base.
corpus = rel_ext.Corpus(example_data)
dataset = rel_ext.Dataset(corpus, kb)

In [19]:
# Number of equally sized splits.
k = 5

# Splits are named "0" .. str(k - 1) and each receives the fraction 1/k.
split_names = [str(fold) for fold in range(k)]
split_fracs = [1 / k] * k

# Compare with a tolerance: the float sum of k copies of 1/k is not exactly 1
# for every k (k = 10 gives 0.9999999999999999), so `== 1` would reject
# perfectly valid settings.
assert abs(sum(split_fracs) - 1) < 1e-9, "split fractions must sum to 1"

splits = dataset.build_splits(split_names=split_names, split_fracs=split_fracs)
splits

{'0': Corpus with 14,864 examples; KB with 10,700 triples,
 '1': Corpus with 10,327 examples; KB with 9,650 triples,
 '2': Corpus with 10,011 examples; KB with 11,019 triples,
 '3': Corpus with 19,112 examples; KB with 10,295 triples,
 '4': Corpus with 20,222 examples; KB with 13,403 triples,
 'all': Corpus with 74,536 examples; KB with 55,067 triples}

In [3]:
class A():
    """Minimal wrapper around one value, used to try out `+` and `+=`."""

    def __init__(self, x: int):
        self.x = x

    def __add__(self, other):
        """Return the plain sum of both wrapped values (not a new ``A``)."""
        return self.x + other.x

    def __iadd__(self, other):
        """Add ``other.x`` to this instance's value and return the instance."""
        total = self.x + other.x
        self.x = total
        return self

    def __str__(self):
        """Render the wrapped value as text."""
        return str(self.x)

a, b = A(2), A(5)

print(a + b)  # `+` yields the plain sum of both values
print(a)      # the left operand is untouched by `+`
a += b        # `+=` changes `a` itself
print(a)
        

7
2
7


In [32]:
# Scratch check of dict.update: the entries of d2 are added to d1 in place.
# The keys of both dicts are disjoint here, so nothing gets overwritten.
d2 = {2: 5, "f": {"d": 4}}
d1 = {1: 3, "d": {"1": 3}}
d1.update(d2)
d1

{1: 3, 'd': {'1': 3}, 2: 5, 'f': {'d': 4}}