In [1]:
import sys
from pathlib import Path
from tqdm import tqdm
import json

# Make the model package directory importable.  sys.path entries must be plain
# strings: the import machinery silently ignores any other type, so appending a
# Path object here would have no effect at all.
sys.path.append(str(Path.cwd() / "indra_stmt_classifier_model"))

from indra_stmt_classifier_model import indra_stmt_classifier


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the first MAX_SAMPLES records of the annotated benchmark sample
# (JSON Lines: one JSON object per line).  Despite the variable name, this is
# the head of the file, not a random draw.
MAX_SAMPLES = 100
sample_path = Path.cwd() / 'data' / 'indra_benchmark_corpus_annotated_sample.jsonl'

random_annotated_sample_data = []
# Explicit UTF-8: open()'s default encoding is platform-dependent and can
# mis-decode non-ASCII characters in the sentences.
with open(sample_path, 'r', encoding='utf-8') as f:
    for line in f:
        if len(random_annotated_sample_data) >= MAX_SAMPLES:
            break  # stop early instead of parsing the whole file and slicing afterwards
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at the end of the file
        random_annotated_sample_data.append(json.loads(line))



In [3]:
# Checkpoint directory of the statement-type classifier, relative to the working directory.
classifier_model_path = Path.cwd().joinpath('output', 'indra_stmt_classifier', 'checkpoint-790')
# Instantiate the classifier wrapper from that path.
classifier = indra_stmt_classifier.IndraStmtClassifier(classifier_model_path)


In [4]:
# Annotated statement type of every sample, in sample order.
actual_labels = [sample['statement']['type'] for sample in random_annotated_sample_data]

# Label predicted by the classifier for each sample's text, with a progress bar.
predicted_labels = [
    classifier.predict(sample['text'])['predicted_label']
    for sample in tqdm(
        random_annotated_sample_data,
        desc="Predicting labels",
        total=len(random_annotated_sample_data),
    )
]


Predicting labels: 100%|██████████| 100/100 [00:17<00:00,  5.70it/s]


In [5]:
# Fraction of samples whose predicted statement type equals the annotated one.
n_correct = sum(actual == predicted for actual, predicted in zip(actual_labels, predicted_labels))
accuracy = n_correct / len(actual_labels)
accuracy


0.65

In [None]:
# Try the classifier on a single hand-written sentence and display the raw result.
probe_sentence = "X has a tendency to increase expression of Y"
classifier.predict(probe_sentence)


{'predicted_label': 'IncreaseAmount',
 'confidence': 0.47127407789230347,
 'probabilities': {'Acetylation': 0.006579377688467503,
  'Activation': 0.3926742970943451,
  'Complex': 0.049027130007743835,
  'Deacetylation': 0.009745579212903976,
  'DecreaseAmount': 0.011002132669091225,
  'Demethylation': 0.010597997345030308,
  'Dephosphorylation': 0.0023687861394137144,
  'Desumoylation': 0.003114895662292838,
  'Deubiquitination': 0.0021336455829441547,
  'Glycosylation': 0.005952021572738886,
  'Hydroxylation': 0.003951319493353367,
  'IncreaseAmount': 0.47127407789230347,
  'Inhibition': 0.006291397847235203,
  'Methylation': 0.007436672691255808,
  'Phosphorylation': 0.0026918970979750156,
  'Sumoylation': 0.0030380042735487223,
  'Translocation': 0.004635949619114399,
  'Ubiquitination': 0.00748478015884757},
 'input_ids': tensor([[   2,   65, 2029,   42, 9098, 1701, 2461, 2181, 1685,   66,    3,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,

In [None]:
# Import the names the following cells use explicitly instead of `import *`, so
# it is clear where each one comes from and nothing is silently shadowed.
from transformers import AutoTokenizer
from indra_stmt_agents_ner_model.preprocess import (
    build_label_mappings,
    load_and_preprocess_from_raw_data,
    preprocess_examples_from_dataset,
)


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from datasets import Dataset

# Tokenizer handed to the preprocessing helpers below.
tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")

# Read the annotated sample and derive the label <-> id tables from it.
input_path = Path.cwd().joinpath('data', 'indra_benchmark_corpus_annotated_sample.jsonl')
examples = load_and_preprocess_from_raw_data(input_path)
# NOTE(review): another cell in this notebook calls build_label_mappings with the
# examples only (no tokenizer) — confirm which signature is current.
label2id, id2label = build_label_mappings(examples, tokenizer)


def _encode_batch(batch):
    """Run the project's preprocessing on one batch handed over by `Dataset.map`."""
    return preprocess_examples_from_dataset(batch, tokenizer, label2id)


hf_dataset = Dataset.from_list(examples)
processed_dataset = hf_dataset.map(_encode_batch, batched=True)


Loading and preprocessing: 2000it [00:00, 154004.19it/s]
Map: 100%|██████████| 2000/2000 [00:01<00:00, 1669.85 examples/s]


In [2]:
from indra_stmt_agents_ner_model import indra_stmt_agents_tagger

# Checkpoint directory of the agent-tagging (NER) model.  The variable keeps the
# name `classifier_model_path`, so it overwrites the classifier path set earlier.
classifier_model_path = Path.cwd().joinpath('output', 'indra_stmt_agents_ner', 'checkpoint-790')
tagger = indra_stmt_agents_tagger.IndraAgentsTagger(classifier_model_path)


In [18]:
# Tag the agents of a "Complex" statement in one example sentence and display the raw result.
statement_type = "Complex"
sentence = "Ubiquitin coupled to the receptor then binds Hrs through its ubiquitin interacting motif, while Hrs also interacts simultaneously with membrane PtdIns3P via a FYVE domain and with clathrin, leading to the concentration of activated receptor molecules into specialized clathrin-coated regions of early endosomes [ xref \u2013 xref ]."
tagger.predict(statement_type, sentence)


{'annotated_text': '<members.0>Ubiquitin </members.0>coupled to the receptor then binds <members.1>Hrs </members.1>through its ubiquitin interacting motif, while Hrs also interacts simultaneously with membrane PtdIns3P via a FYVE domain and with clathrin, leading to the concentration of activated receptor molecules into specialized clathrin-coated regions of early endosomes [ xref – xref ].',
 'tokens': ['[CLS]',
  'complex',
  '[SEP]',
  'ubiquitin',
  'coupled',
  'to',
  'the',
  'receptor',
  'then',
  'binds',
  'hrs',
  'through',
  'its',
  'ubiquitin',
  'interacting',
  'motif',
  ',',
  'while',
  'hrs',
  'also',
  'interacts',
  'simultaneously',
  'with',
  'membrane',
  'ptd',
  '##ins',
  '##3',
  '##p',
  'via',
  'a',
  'f',
  '##y',
  '##ve',
  'domain',
  'and',
  'with',
  'clathrin',
  ',',
  'leading',
  'to',
  'the',
  'concentration',
  'of',
  'activated',
  'receptor',
  'molecules',
  'into',
  'specialized',
  'clathrin',
  '-',
  'coated',
  'regions',
  '

In [4]:
from transformers import AutoTokenizer
from indra_stmt_agents_ner_model.preprocess import load_and_preprocess_from_raw_data, build_label_mappings, preprocess_examples_from_dataset
from datasets import Dataset

tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")

# ---- Load raw data ----
stratified_sample_path = Path.cwd().joinpath("data", "indra_benchmark_corpus_annotated_stratified_sample.jsonl")
raw_examples = load_and_preprocess_from_raw_data(stratified_sample_path)

# ---- Build label mappings from dataset ----
label2id, id2label = build_label_mappings(raw_examples)

# ---- Convert to HuggingFace Dataset ----
dataset = Dataset.from_list(raw_examples)

# ---- Split dataset ----
# 70% train; the held-out 30% is then divided 2:1 into validation and test
# (i.e. 70 / 20 / 10 overall).  Both splits use the same fixed seed.
split_dataset = dataset.train_test_split(test_size=0.3, seed=42)
train_dataset, temp_dataset = split_dataset["train"], split_dataset["test"]
val_test_split = temp_dataset.train_test_split(test_size=1/3, seed=42)
val_dataset, test_dataset = val_test_split["train"], val_test_split["test"]


# ---- Preprocess for model ----
def _encode_example(example):
    """Apply the project's preprocessing to one example (map runs with batched=False)."""
    return preprocess_examples_from_dataset(example, tokenizer, label2id)


train_dataset = train_dataset.map(_encode_example, batched=False)
val_dataset = val_dataset.map(_encode_example, batched=False)
test_dataset = test_dataset.map(_encode_example, batched=False)


Loading and preprocessing: 1800it [00:00, 155639.22it/s]
Map: 100%|██████████| 1260/1260 [00:01<00:00, 1199.52 examples/s]
Map: 100%|██████████| 360/360 [00:00<00:00, 1794.59 examples/s]
Map: 100%|██████████| 180/180 [00:00<00:00, 1830.77 examples/s]


In [5]:
# Inspect which fields a preprocessed training example carries.
train_dataset[0].keys()


dict_keys(['matches_hash', 'statement_type', 'annotated_text', 'input_ids', 'attention_mask', 'labels', 'tokens', 'ner_tags'])

In [12]:
# Show the annotated sentence of one training example (agents are marked with inline tags).
train_dataset[1]['annotated_text']


'<subj>transforming growth factor-β1</subj> reduces <obj>BMP4 signaling</obj> in pulmonary artery smooth muscle cells, a response that is exacerbated on the background of reduced bone morphogenetic protein responsiveness due to BMPR-II mutations.'

In [None]:
# Display the mapping from numeric label id to NER tag name.
id2label


{0: 'B-agent',
 1: 'B-enz',
 2: 'B-members',
 3: 'B-obj',
 4: 'B-sub',
 5: 'B-subj',
 6: 'I-agent',
 7: 'I-enz',
 8: 'I-members',
 9: 'I-obj',
 10: 'I-sub',
 11: 'I-subj',
 12: 'O'}

In [14]:
# Show, token by token, the NER tag and the numeric label id of one training example.
i = 100
inspected = train_dataset[i]
for tok, tag, tag_id in zip(inspected['tokens'], inspected['ner_tags'], inspected['labels']):
    print(f"{tok}, {tag}, {tag_id}")


[CLS], O, -100
de, O, -100
##ub, O, -100
##iqu, O, -100
##itin, O, -100
##ation, O, -100
[SEP], O, -100
the, O, 12
ubiquitin, B-enz, 1
mutants, O, 12
k2, O, 12
##9, O, 12
##r, O, 12
and, O, 12
k1, O, 12
##1r, O, 12
,, O, 12
on, O, 12
the, O, 12
other, O, 12
hand, O, 12
,, O, 12
decreased, O, 12
ed, B-sub, 4
##d, I-sub, 10
mediated, O, 12
ubiquitination, O, 12
and, O, 12
its, O, 12
effects, O, 12
on, O, 12
beta, O, 12
-, O, 12
catenin, O, 12
., O, 12
[SEP], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD], O, -100
[PAD],

In [9]:
from collections import Counter

# Frequency of each NER tag over the training split; positions labelled -100 are left out.
all_labels = [
    id2label[label_id]
    for example in train_dataset
    for label_id in example["labels"]
    if label_id != -100
]
print(Counter(all_labels))


Counter({'O': 39813, 'I-enz': 1058, 'I-sub': 904, 'B-enz': 867, 'B-sub': 811, 'I-obj': 543, 'I-subj': 486, 'B-subj': 278, 'B-obj': 272, 'I-members': 229, 'B-members': 142, 'B-agent': 75, 'I-agent': 75})


In [None]:
import json

stratified_path = Path.cwd() / 'data' / 'indra_benchmark_corpus_annotated_stratified_sample.jsonl'
activation_path = Path.cwd() / 'data' / 'indra_benchmark_corpus_annotated_sample_activation.jsonl'

# Read the stratified sample (JSON Lines).  Explicit UTF-8 because open()'s
# default encoding is platform-dependent; blank lines are skipped so a trailing
# newline does not crash json.loads.
with open(stratified_path, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

# get only data of type 'Activation'
activation_data = [d for d in data if d['statement']['type'] == 'Activation']

# save activation data as jsonl
with open(activation_path, 'w', encoding='utf-8') as f:
    for d in activation_data:
        f.write(json.dumps(d) + '\n')
