In [3]:
import sys
from pathlib import Path
from tqdm import tqdm
import json

sys.path.append(Path.cwd() / "indra_stmt_classifier_model")

from indra_stmt_classifier_model import indra_stmt_classifier


In [2]:
random_annotated_sample_data = []
with open(Path.cwd() / 'data' / 'indra_benchmark_corpus_annotated_sample.jsonl', 'r') as f:
    for line in f:
        random_annotated_sample_data.append(json.loads(line))
random_annotated_sample_data = random_annotated_sample_data[:100]



In [3]:
classifier_model_path = Path.cwd() / 'output' / 'indra_stmt_classifier' / 'checkpoint-790'
classifier = indra_stmt_classifier.IndraStmtClassifier(classifier_model_path)


In [4]:
actual_labels = [data['statement']['type'] for data in random_annotated_sample_data]
predicted_labels = []
for data in tqdm(random_annotated_sample_data, desc="Predicting labels", total=len(random_annotated_sample_data)):
    predicted_labels.append(classifier.predict(data['text'])['predicted_label'])


Predicting labels: 100%|██████████| 100/100 [00:17<00:00,  5.70it/s]


In [5]:
accuracy = sum([1 for actual, predicted in zip(actual_labels, predicted_labels) if actual == predicted]) / len(actual_labels)
accuracy


0.65

In [None]:
classifier.predict("X has a tendency to increase expression of Y")


{'predicted_label': 'IncreaseAmount',
 'confidence': 0.47127407789230347,
 'probabilities': {'Acetylation': 0.006579377688467503,
  'Activation': 0.3926742970943451,
  'Complex': 0.049027130007743835,
  'Deacetylation': 0.009745579212903976,
  'DecreaseAmount': 0.011002132669091225,
  'Demethylation': 0.010597997345030308,
  'Dephosphorylation': 0.0023687861394137144,
  'Desumoylation': 0.003114895662292838,
  'Deubiquitination': 0.0021336455829441547,
  'Glycosylation': 0.005952021572738886,
  'Hydroxylation': 0.003951319493353367,
  'IncreaseAmount': 0.47127407789230347,
  'Inhibition': 0.006291397847235203,
  'Methylation': 0.007436672691255808,
  'Phosphorylation': 0.0026918970979750156,
  'Sumoylation': 0.0030380042735487223,
  'Translocation': 0.004635949619114399,
  'Ubiquitination': 0.00748478015884757},
 'input_ids': tensor([[   2,   65, 2029,   42, 9098, 1701, 2461, 2181, 1685,   66,    3,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,

In [None]:
from indra_stmt_agents_ner_model.preprocess import *


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
input_path = Path.cwd() / 'data' / 'indra_benchmark_corpus_annotated_sample.jsonl'
tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")
examples = load_and_preprocess_from_raw_data(input_path)
label2id, id2label = build_label_mappings(examples, tokenizer)

from datasets import Dataset

hf_dataset = Dataset.from_list(examples)
processed_dataset = hf_dataset.map(
    lambda x: preprocess_examples_from_dataset(x, tokenizer, label2id),
    batched=True
)


Loading and preprocessing: 2000it [00:00, 154004.19it/s]
Map: 100%|██████████| 2000/2000 [00:01<00:00, 1669.85 examples/s]


In [None]:
from indra_stmt_agents_ner_model import indra_stmt_agents_tagger

classifier_model_path = Path.cwd() / 'output' / 'indra_stmt_agents_ner' / 'checkpoint-79'

tagger = indra_stmt_agents_tagger.IndraAgentsTagger(classifier_model_path)


In [None]:
tagger.predict("Activation", "X has a tendency to activate Y")


{'annotated_text': '<agent>X </agent><agent>has </agent><agent>a </agent><agent>tendency </agent><agent>to </agent><agent>activate </agent><agent>Y</agent>',
 'tokens': ['[CLS]',
  'activation',
  '[SEP]',
  'x',
  'has',
  'a',
  'tendency',
  'to',
  'activate',
  'y',
  '[SEP]'],
 'bio_tags': ['B-agent',
  'B-agent',
  'B-agent',
  'B-agent',
  'B-agent',
  'B-agent',
  'B-agent',
  'B-agent',
  'B-agent',
  'B-agent',
  'B-agent']}

In [None]:
from transformers import AutoTokenizer
from indra_stmt_agents_ner_model.preprocess import load_and_preprocess_from_raw_data, build_label_mappings, preprocess_examples_from_dataset
from datasets import Dataset

tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")

# ---- Load raw data ----
raw_examples = load_and_preprocess_from_raw_data(Path.cwd() / "data" /"indra_benchmark_corpus_annotated_stratified_sample.jsonl")

# ---- Build label mappings from dataset ----
label2id, id2label = build_label_mappings(raw_examples, tokenizer)

# ---- Convert to HuggingFace Dataset ----
dataset = Dataset.from_list(raw_examples)

# ---- Split dataset ----
split_dataset = dataset.train_test_split(test_size=0.3, seed=42)
train_dataset = split_dataset["train"]
temp_dataset = split_dataset["test"]
val_test_split = temp_dataset.train_test_split(test_size=1/3, seed=42)
val_dataset = val_test_split["train"]
test_dataset = val_test_split["test"]

# ---- Preprocess for model ----
train_dataset = train_dataset.map(
    lambda x: preprocess_examples_from_dataset(x, tokenizer, label2id),
    batched=True
)
val_dataset = val_dataset.map(
    lambda x: preprocess_examples_from_dataset(x, tokenizer, label2id),
    batched=True
)
test_dataset = test_dataset.map(
    lambda x: preprocess_examples_from_dataset(x, tokenizer, label2id),
    batched=True
)


Loading and preprocessing: 1800it [00:00, 211934.63it/s]
Map: 100%|██████████| 1260/1260 [00:00<00:00, 1890.65 examples/s]
Map: 100%|██████████| 360/360 [00:00<00:00, 1989.08 examples/s]
Map: 100%|██████████| 180/180 [00:00<00:00, 1964.11 examples/s]


In [None]:
label2id


{'B-agent': 0,
 'B-enz': 1,
 'B-members': 2,
 'B-obj': 3,
 'B-sub': 4,
 'B-subj': 5,
 'I-agent': 6,
 'I-enz': 7,
 'I-members': 8,
 'I-obj': 9,
 'I-sub': 10,
 'I-subj': 11,
 'O': 12}

In [None]:
train_dataset[0].keys()


dict_keys(['matches_hash', 'statement_type', 'annotated_text', 'input_ids', 'attention_mask', 'labels', 'tokens', 'ner_tags'])

In [None]:
train_dataset[0]['annotated_text']


'Moreover, modulation of DAX-1 and steroidogenic factor-1 intracellular levels in these cells suggests that these transcription factors could be involved in <subj>MAPK</subj> suppression of <obj>StAR</obj> expression.'

In [None]:
for token, label in zip(train_dataset[0]['tokens'], train_dataset[0]['labels']):
    print(f"{token}: {id2label[label]}")  # Print token and its corresponding label


[CLS]: O
inhibition: O
[SEP]: O
moreover: O
,: O
modulation: O
of: O
da: O
##x: O
-: O
1: O
and: O
steroidogenic: O
factor: O
-: O
1: O
intracellular: O
levels: O
in: O
these: O
cells: O
suggests: O
that: O
these: O
transcription: O
factors: O
could: O
be: O
involved: O
in: O
mapk: B-subj
suppression: O
of: O
star: B-obj
expression: O
.: O
[SEP]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[

In [None]:
from collections import Counter
all_labels = [l for example in train_dataset for l in example["labels"] if l != -100]
print(Counter(all_labels))


Counter({12: 639380, 7: 1058, 10: 904, 1: 867, 4: 811, 9: 543, 11: 486, 5: 278, 3: 272, 8: 229, 2: 142, 0: 75, 6: 75})
