In [1]:
import sys
from itertools import combinations
from pathlib import Path

# Register each local project package directory on the import path,
# resolved against the notebook's current working directory.
sys.path.extend(
    (Path.cwd() / package_dir).resolve().as_posix()
    for package_dir in ("ner_agent_detector", "indra_stmt_classifier", "utils")
)


In [2]:
from ner_agent_detector.model import AgentNERModel
from indra_stmt_classifier.model import IndraStmtClassifier
from utils.annotate import annotate_entities

import os

# Root of the local indra_bert checkout that holds the model checkpoints.
# Defaults to the location used when this notebook was written; set the
# INDRA_BERT_ROOT environment variable to run the notebook on another machine.
INDRA_BERT_ROOT = os.environ.get("INDRA_BERT_ROOT", "/Users/thomaslim/gyorilab/indra_bert")

# Agent detection model, loaded from its checkpoint directory.
agent_detection_model = AgentNERModel(
    os.path.join(INDRA_BERT_ROOT, "output", "ner_agent_detection", "checkpoint-2450")
)
# INDRA statement classifier, loaded from its checkpoint directory.
indra_stmt_classifier_model = IndraStmtClassifier(
    os.path.join(INDRA_BERT_ROOT, "output", "indra_stmt_classifier", "checkpoint-790")
)


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Sentence used as the running example by the prediction and tokenization cells.
example_text = (
    "C5a promotes the proliferation of human nasopharyngeal carcinoma cells through PCAF-mediated STAT3 acetylation."
)


In [4]:
# Run agent detection on the example sentence and pull out the entity spans.
agent_prediction = agent_detection_model.predict(example_text)
entity_spans = agent_prediction["entity_spans"]

# Every unordered pair of detected entities. combinations() yields (a, b)
# with a before b in entity_spans, the same order a nested index loop gives.
pairs = list(combinations(entity_spans, 2))

# Annotate the sentence for each pair, classify it, and print a short report.
for entity_pair in pairs:
    pair_wise_annotation = annotate_entities(example_text, entity_pair)
    stmt_prediction = indra_stmt_classifier_model.predict(pair_wise_annotation)
    print("---PREDICTION---")
    print("Entity pair prediction:")
    print(f"  {entity_pair}")

    print("Annotated text:")
    print(f"  {pair_wise_annotation}")

    print("INDRA statement prediction:")
    print(f"  {stmt_prediction['predicted_label']}")

    print("INDRA statement prediction confidence:")
    print(f"  {stmt_prediction['confidence']}")

    print("")


---PREDICTION---
Entity pair prediction:
  ({'start': 0, 'end': 3, 'text': 'MEK'}, {'start': 19, 'end': 22, 'text': 'ERK'})
Annotated text:
  <e>MEK</e> phosphorylates <e>ERK</e>
INDRA statement prediction:
  Phosphorylation
INDRA statement prediction confidence:
  0.9950677156448364



In [None]:
# Tokenizer attached to the statement classifier.
tokenizer = indra_stmt_classifier_model.tokenizer

# Encode a statement label together with the example sentence as a text pair,
# keeping the character offsets of every token.
encoding = tokenizer(
    # inputs
    text="Phosphorylation",
    text_pair=example_text,
    # special tokens / length handling
    add_special_tokens=True,
    padding=False,
    truncation=True,
    max_length=512,
    # extra outputs
    return_offsets_mapping=True,
)


In [None]:
# Show each token id decoded back to text next to its character offsets.
token_offset_pairs = zip(encoding["input_ids"], encoding["offset_mapping"])
for token, offset in token_offset_pairs:
    decoded_token = tokenizer.decode(token)
    print(f"Token: {decoded_token}, Offset: {offset}")


Token: [CLS], Offset: (0, 0)
Token: phosphorylation, Offset: (0, 15)
Token: [SEP], Offset: (0, 0)
Token: mek, Offset: (0, 3)
Token: phosphoryl, Offset: (4, 14)
Token: ##ates, Offset: (14, 18)
Token: erk, Offset: (19, 22)
Token: [SEP], Offset: (0, 0)


In [None]:
# Parse a role-tagged sentence. The displayed result is a 3-tuple:
# (plain text, [(start, end, role), ...] spans, text re-tagged with generic <e> tags).
# NOTE(review): parse_and_generalize_tags is not imported or defined in any
# visible cell — confirm which module it comes from and import it explicitly.
parse_and_generalize_tags("A <subj>MEK</subj> phosphorylates <obj>ERK</obj>")


('A MEK phosphorylates ERK',
 [(2, 5, 'subj'), (21, 24, 'obj')],
 'A <e>MEK</e> phosphorylates <e>ERK</e>')

In [None]:
# Element 1 of the parse result holds the (start, end, role) spans.
parsed_example = parse_and_generalize_tags("A <subj>MEK</subj> phosphorylates <obj>ERK</obj>")
role_spans = parsed_example[1]


In [None]:
# List the fields returned by the tokenizer call.
encoding.keys()


dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping'])

In [None]:
# Pair every token with the role label computed for it and print them side by side.
# NOTE(review): the text handed to char_to_token_labels is the decoded token
# sequence, while offset_mapping appears to index into the tokenizer's input
# text — confirm this combination is intended.
tokens = tokenizer.convert_ids_to_tokens(encoding['input_ids'])
decoded_text = tokenizer.decode(encoding['input_ids'])
token_labels = char_to_token_labels(decoded_text, encoding["offset_mapping"], role_spans)
for x, y in zip(tokens, token_labels):
    print(f"Token: {x}, Label: {y}")


Token: [CLS], Label: O
Token: phosphorylation, Label: I-subj
Token: [SEP], Label: O
Token: a, Label: O
Token: <e>, Label: B-subj
Token: mek, Label: O
Token: </e>, Label: O
Token: phosphoryl, Label: I-obj
Token: ##ates, Label: I-obj
Token: <e>, Label: O
Token: erk, Label: O
Token: </e>, Label: O
Token: [SEP], Label: O


In [1]:
from pathlib import Path
from transformers import AutoTokenizer
import sys
# Put the local indra_agent_role_assigner directory on the import path.
sys.path.append(Path.cwd().joinpath("indra_agent_role_assigner").resolve().as_posix())
from indra_agent_role_assigner.preprocess import load_and_preprocess_from_raw_data, build_label_mappings, preprocess_examples_from_dataset, SpecialTokenOffsetFixTokenizer
from datasets import Dataset

import os

# ---- Configuration ----
# Root of the local indra_bert checkout. Defaults to the location used when
# this notebook was written; set INDRA_BERT_ROOT to run on another machine.
INDRA_BERT_ROOT = os.environ.get("INDRA_BERT_ROOT", "/Users/thomaslim/gyorilab/indra_bert")
RAW_DATA_PATH = os.path.join(
    INDRA_BERT_ROOT,
    "data",
    "indra_benchmark_annotated_data",
    "indra_benchmark_corpus_annotated_stratified_sample.jsonl",
)

# ---- Load tokenizer ----
tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")
special_tokenizer = SpecialTokenOffsetFixTokenizer(tokenizer)

# ---- Load raw data ----
raw_examples = load_and_preprocess_from_raw_data(RAW_DATA_PATH)

# ---- Build label mappings from dataset ----
label2id, id2label = build_label_mappings(raw_examples)

# ---- Convert to HuggingFace Dataset ----
dataset = Dataset.from_list(raw_examples)

# ---- Split 70 / 20 / 10 ----
# Hold out 30% of the data, then give one third of that hold-out to the test
# set and the rest to validation. Both splits use a fixed seed.
split_dataset = dataset.train_test_split(test_size=0.3, seed=42)
train_dataset = split_dataset["train"]
temp_dataset = split_dataset["test"]
val_test_split = temp_dataset.train_test_split(test_size=1/3, seed=42)
val_dataset = val_test_split["train"]
test_dataset = val_test_split["test"]


# ---- Preprocess for model ----
def _encode_example(example):
    """Preprocess one dataset example with the shared tokenizer and label map."""
    return preprocess_examples_from_dataset(example, special_tokenizer, label2id)


# Apply the same per-example preprocessing to every split.
train_dataset = train_dataset.map(_encode_example, batched=False)
val_dataset = val_dataset.map(_encode_example, batched=False)
test_dataset = test_dataset.map(_encode_example, batched=False)


  from .autonotebook import tqdm as notebook_tqdm
Loading and preprocessing: 1800it [00:00, 110207.24it/s]
Map: 100%|██████████| 1260/1260 [00:00<00:00, 3776.76 examples/s]
Map: 100%|██████████| 360/360 [00:00<00:00, 3862.33 examples/s]
Map: 100%|██████████| 180/180 [00:00<00:00, 3586.88 examples/s]


In [None]:
import re
# Tag name (capture group 1) of the first <tag>...</tag> pair in the snippet.
next(re.finditer(r"<([^<>]+?)>(.*?)</\1>", "olved in <subj>MAPK</subj> suppressio")).group(1)


'subj'