# RE Pipeline based on BERT
This relation-extraction (RE) pipeline follows the approach described in https://arxiv.org/pdf/1904.05255

## Installs

In [1]:
 !pip install transformers datasets torch



## Imports

In [2]:
import json
import torch
import itertools
import torch.nn as nn
from transformers import BertTokenizer, BertModel
from datasets import load_dataset

# Run on the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
from huggingface_hub import login
import os


# Read the token from the environment only, so no real token is ever pasted into the notebook.
HF_TOKEN = os.environ.get("HF_TOKEN")
if HF_TOKEN:
    login(token=HF_TOKEN)
else:
    # Previously a placeholder string was sent to login() as if it were a token.
    # The Hub message recorded further down in this notebook states that
    # authentication is optional for public models, so skip the login instead.
    print("HF_TOKEN is not set; skipping Hugging Face login.")

## Load Sample Data / NER Output
In this case `mock.json`, which first needs to be uploaded to Colab.

In [None]:
SAMPLE_PATH = "/content/mock.json"  # Replace with your uploaded file path

# JSON text is UTF-8; decode it explicitly instead of relying on the platform's default encoding.
# NOTE: `data` is reassigned later by the NER conversion cell (data = ner_to_re_input(...)).
with open(SAMPLE_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

The sample NER data needs extra preprocessing to fit the RE input structure: a list of dicts, each holding one sentence and the (at least two) entities found in it.

In [6]:

import json, re

def split_sentences(text):
    """
    Break `text` into sentences.

    A boundary is any run of whitespace that directly follows '.', '!' or '?'.
    The punctuation stays attached to the sentence it ends; surrounding
    whitespace of the whole text is removed first and empty pieces are dropped.
    """
    # The lookbehind keeps the terminator; only the whitespace after it is consumed.
    boundary = r'(?<=[\.!?])\s+'
    pieces = re.split(boundary, text.strip())
    return list(filter(None, pieces))

# Transform the NER output format into the RE input format
def ner_to_re_input(ner_path):
    """
    Convert document-level NER output into sentence-level RE input.

    Reads a JSON file of documents:
      [ { "doc": "...", "entities": [ { "word": "...", "start": ..., "end": ... }, ... ]}, ... ]
    Each entity's "start"/"end" are compared against character offsets of the
    sentence inside "doc".

    Returns:
      [ {"sentence": "...", "entities": ["E1","E2",...]}, ... ]
    Only sentences that contain at least two distinct entity words are kept.
    """
    # JSON text is UTF-8; decode explicitly rather than using the platform default.
    with open(ner_path, "r", encoding="utf-8") as f:
        docs = json.load(f)

    re_data = []
    for doc in docs:
        text = doc.get("doc", "")
        ents = doc.get("entities", [])
        # Search for each sentence starting where the previous one ended. A plain
        # text.find(sent) always returns the FIRST occurrence, which assigns the
        # wrong offsets (and therefore the wrong entities) to a repeated sentence.
        cursor = 0
        for sent in split_sentences(text):
            start_idx = text.find(sent, cursor)
            if start_idx == -1:
                # Should not happen because sentences are substrings of `text`; skip defensively.
                continue
            end_idx = start_idx + len(sent)
            cursor = end_idx
            # collect all entity words whose spans fall fully inside this sentence
            in_sent = [
                e["word"]
                for e in ents
                if e["start"] >= start_idx and e["end"] <= end_idx
            ]
            # dedupe, preserve order
            entities = list(dict.fromkeys(in_sent))
            if len(entities) >= 2:
                re_data.append({
                    "sentence": sent,
                    "entities": entities
                })
    return re_data

Apply the conversion to the NER data.

In [14]:
ner_path = "/content/testnewformat.json"  # adjust if needed
data = ner_to_re_input(ner_path)

# Show a summary and a short preview instead of dumping every record.
print(f"{len(data)} sentence(s) with at least two entities")
if not data:
    # An empty list makes the inference loop below a silent no-op and the saved file an empty list.
    print("WARNING: no sentence contains two or more entities; check the input file and its entity offsets.")
print(data[:3])

[]


## TACRED labels

In [8]:
# TACRED relation inventory: "no_relation" plus 41 relation types (42 labels in total).
# The four labels in the last group were missing from the original list. They are
# appended at the end so that the ids of all previously listed labels stay unchanged.
# NOTE(review): ids are positional; if trained classifier weights are ever loaded,
# this order must match the label mapping used during training — confirm.
label_list = [
    "no_relation",
    "org:alternate_names", "org:city_of_headquarters", "org:country_of_headquarters", "org:dissolved",
    "org:founded", "org:founded_by", "org:member_of", "org:members", "org:number_of_employees/members",
    "org:parents", "org:stateorprovince_of_headquarters", "org:subsidiaries", "org:top_members/employees",
    "org:website",
    "per:age", "per:alternate_names", "per:children", "per:cities_of_residence", "per:city_of_birth",
    "per:city_of_death", "per:countries_of_residence", "per:country_of_birth", "per:country_of_death",
    "per:date_of_birth", "per:date_of_death", "per:employee_of", "per:origin", "per:other_family",
    "per:parents", "per:religion", "per:schools_attended", "per:siblings", "per:spouse",
    "per:stateorprovince_of_birth", "per:stateorprovince_of_death", "per:stateorprovinces_of_residence",
    "per:title",
    "org:political/religious_affiliation", "org:shareholders", "per:cause_of_death", "per:charges"
]
# Forward and reverse lookups between relation name and integer class id.
rel2id = {label: i for i, label in enumerate(label_list)}
id2rel = {i: label for label, i in rel2id.items()}

## Tokenizer

In [9]:
# Load the tokenizer for the same "bert-base-uncased" checkpoint that
# BERTRelationClassifier loads its encoder from.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

## Entity Marking and Tokenization

In [10]:
def mark_entities(sentence, e1, e2, subj_type="PER", obj_type="ORG"):
    """
    Replace entity mentions in `sentence` with typed placeholders.

    Every occurrence of `e1` becomes "SUBJ-<subj_type>" and every occurrence of
    `e2` becomes "OBJ-<obj_type>".

    Both mentions are substituted in a single left-to-right pass over the
    original sentence. Two chained str.replace calls would let the second
    replacement rewrite text inside the marker just inserted (e.g. e2 == "PER"),
    and would break the longer mention when one entity is a substring of the
    other (e.g. "York" / "New York").

    Args:
        sentence: Text containing the mentions.
        e1: Subject mention text.
        e2: Object mention text.
        subj_type: Type suffix for the subject marker.
        obj_type: Type suffix for the object marker.

    Returns:
        The sentence with mentions replaced; unchanged if neither mention occurs.
    """
    s_mark = f"SUBJ-{subj_type}"
    o_mark = f"OBJ-{obj_type}"

    # Empty mentions are ignored: an empty pattern would match at every position.
    replacements = {}
    if e2:
        replacements[e2] = o_mark
    if e1:
        # Assigned last so the subject marker wins when both mentions are identical.
        replacements[e1] = s_mark
    if not replacements:
        return sentence

    # Longest alternative first so "New York" is preferred over "York" at the same position.
    alternation = "|".join(
        re.escape(mention) for mention in sorted(replacements, key=len, reverse=True)
    )
    return re.sub(alternation, lambda m: replacements[m.group(0)], sentence)

def prepare_input(sentence, subj, obj, max_length=128):
    """
    Encode one example as "[CLS] sentence [SEP] subj [SEP] obj [SEP]".

    The sentence and the "subj [SEP] obj" suffix are passed to the tokenizer as
    a text pair, and the tokenizer adds the leading [CLS] and the separating and
    trailing [SEP] itself. Writing those tokens into the input string as well
    (as before) yields a doubled [CLS] at the start and a doubled [SEP] at the
    end, because the tokenizer still adds its own.

    Truncation is restricted to the sentence so that a long sentence cannot push
    the subject/object segment out of the `max_length` window.

    Args:
        sentence: Sentence text (typically already entity-marked).
        subj: Subject entity text.
        obj: Object entity text.
        max_length: Fixed length every example is padded/truncated to.

    Returns:
        (input_ids, attention_mask) tensors moved to the global `device`.
    """
    encoded = tokenizer(
        sentence,
        f"{subj} [SEP] {obj}",
        return_tensors="pt",
        padding="max_length",
        max_length=max_length,
        truncation="only_first",
    )
    return encoded["input_ids"].to(device), encoded["attention_mask"].to(device)

## BERT Classifier

In [11]:
class BERTRelationClassifier(nn.Module):
    """
    Relation classifier: a "bert-base-uncased" encoder followed by a small MLP head.

    The head maps the encoder's hidden size to 256 units (ReLU, dropout 0.1) and
    then to `num_labels` logits.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        encoder_width = self.bert.config.hidden_size
        head_layers = [
            nn.Linear(encoder_width, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, num_labels),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        """Return the head's logits computed from the hidden state at position 0."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the last layer is the [CLS] slot.
        first_token_state = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(first_token_state)
        return logits

# Build the classifier with one output per entry in label_list and move it to the selected device.
# NOTE(review): no trained weights for the classification head are loaded anywhere in the
# visible notebook, so predictions presumably come from randomly initialised layers —
# confirm before trusting the results.
model = BERTRelationClassifier(num_labels=len(label_list)).to(device)
# Put the module into evaluation mode for inference.
model.eval()

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERTRelationClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

## Run

In [12]:
# Predict a relation for every ordered entity pair of every sentence.
# Gradients are never needed here, so the whole loop runs under no_grad.
results = []
with torch.no_grad():
    for sample in data:
        sentence = sample["sentence"]
        entities = sample["entities"]
        relations = []

        for e1, e2 in itertools.permutations(entities, 2):
            marked = mark_entities(sentence, e1, e2)
            input_ids, attention_mask = prepare_input(marked, e1, e2)
            logits = model(input_ids, attention_mask)
            # Highest-scoring class id, translated back to its relation name.
            pred_id = torch.argmax(logits, dim=1).item()
            relations.append({
                "head": e1,
                "tail": e2,
                "relation": id2rel[pred_id]
            })

        entry = {
            "sentence": sentence,
            "relations": relations
        }
        results.append(entry)

## Save file

In [13]:
# Keep the path in one place so the confirmation message cannot drift from the
# file that is actually written (the message used to say "results.json").
BERT_RESULTS_PATH = "bert_results.json"

with open(BERT_RESULTS_PATH, "w") as f:
    json.dump(results, f, indent=2)

print(f"✅ Results saved to {BERT_RESULTS_PATH}")


✅ Results saved to results.json


# RE Pipeline based on SpanBERT fine-tuned on TACRED

## Imports

In [None]:
import json
import itertools
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.nn.functional import softmax
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

## Load Model

In [None]:
# Hub identifier of the checkpoint; tokenizer and model are both loaded from it.
# These assignments rebind `tokenizer` and `model` from the BERT pipeline above.
model_name = "mrm8488/spanbert-large-finetuned-tacred"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): the load log recorded for this cell reports 'classifier.bias' and
# 'classifier.weight' as newly initialized rather than read from the checkpoint, so the
# relation predictions made further down presumably come from an untrained head — confirm,
# or switch to a checkpoint that ships its classification layer.
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
# Put the module into evaluation mode for inference.
model.eval()

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mrm8488/spanbert-large-finetuned-tacred and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1

## Define TACRED labels (workaround)

In [None]:
# TACRED relation inventory: "no_relation" plus 41 relation types (42 labels in total).
# The four labels on the last line were missing from the original list. They are
# appended at the end so that the indices of all previously listed labels stay unchanged.
# NOTE(review): predictions are mapped to names by position, so this order must match
# the label mapping the classifier was trained with — confirm.
label_list = [
    "no_relation", "org:alternate_names", "org:city_of_headquarters", "org:country_of_headquarters",
    "org:dissolved", "org:founded", "org:founded_by", "org:member_of", "org:members",
    "org:number_of_employees/members", "org:parents", "org:stateorprovince_of_headquarters",
    "org:subsidiaries", "org:top_members/employees", "org:website", "per:age", "per:alternate_names",
    "per:children", "per:cities_of_residence", "per:city_of_birth", "per:city_of_death",
    "per:countries_of_residence", "per:country_of_birth", "per:country_of_death", "per:date_of_birth",
    "per:date_of_death", "per:employee_of", "per:origin", "per:other_family", "per:parents",
    "per:religion", "per:schools_attended", "per:siblings", "per:spouse", "per:stateorprovince_of_birth",
    "per:stateorprovince_of_death", "per:stateorprovinces_of_residence", "per:title",
    "org:political/religious_affiliation", "org:shareholders", "per:cause_of_death", "per:charges"
]

## Load sample data

In [None]:
SAMPLE_PATH = "/content/mock.json"  # Adjust path after uploading in Colab
# JSON text is UTF-8; decode it explicitly instead of relying on the platform's default encoding.
with open(SAMPLE_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

## Helper: Insert tags

In [None]:
def insert_entity_markers(sentence, subj, obj):
    """
    Wrap one mention of `subj` in "[E1] ... [/E1]" and one mention of `obj` in
    "[E2] ... [/E2]".

    Both mentions are located in the ORIGINAL sentence before anything is
    inserted, and the two spans are required not to overlap. Replacing one
    entity and then searching the modified string for the other can place the
    second pair of tags inside the first (e.g. "York" inside
    "[E1] New York City [/E1]").

    The longer entity picks its first occurrence; the other entity takes its
    first occurrence that does not overlap that span.

    Args:
        sentence: Text to annotate.
        subj: Subject entity text.
        obj: Object entity text.

    Returns:
        The annotated sentence, or None when either entity is empty or has no
        usable (non-overlapping) occurrence in the sentence.
    """
    if not subj or not obj:
        return None

    # Longer entity first (stable for ties, so the subject goes first).
    candidates = sorted(
        [(subj, "[E1]", "[/E1]"), (obj, "[E2]", "[/E2]")],
        key=lambda item: -len(item[0]),
    )

    spans = []  # (start, end, start_tag, end_tag) in original-sentence coordinates
    for ent, start_tag, end_tag in candidates:
        pos = sentence.find(ent)
        # Skip occurrences that overlap a span which is already claimed.
        while pos != -1 and any(pos < end and start < pos + len(ent) for start, end, _, _ in spans):
            pos = sentence.find(ent, pos + 1)
        if pos == -1:
            return None  # Skip if entity not found in sentence
        spans.append((pos, pos + len(ent), start_tag, end_tag))

    # Insert from right to left so earlier offsets stay valid.
    for start, end, start_tag, end_tag in sorted(spans, reverse=True):
        sentence = f"{sentence[:start]}{start_tag} {sentence[start:end]} {end_tag}{sentence[end:]}"
    return sentence

## Run

In [None]:
results = []

# Predicted ids are mapped to names by position in label_list, which is only
# meaningful when the model's output size matches the list.
num_outputs = model.config.num_labels
if num_outputs != len(label_list):
    print(
        f"WARNING: model has {num_outputs} output classes but label_list has "
        f"{len(label_list)} entries; predicted relation names are unreliable."
    )

# Inference only: no gradients are needed anywhere in the loop.
with torch.no_grad():
    for sample in data:
        sentence = sample["sentence"]
        entities = sample["entities"]
        entry = {
            "sentence": sentence,
            "relations": []
        }

        # Every ordered (head, tail) pair is classified separately.
        for e1, e2 in itertools.permutations(entities, 2):
            marked_sentence = insert_entity_markers(sentence, e1, e2)
            if not marked_sentence:
                continue  # skip if entities not found

            inputs = tokenizer(
                marked_sentence,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=128
            ).to(device)

            logits = model(**inputs).logits
            probs = softmax(logits, dim=1)
            pred_label_id = torch.argmax(probs, dim=1).item()
            if pred_label_id < len(label_list):
                pred_label = label_list[pred_label_id]
            else:
                # Avoid an IndexError when the model has more classes than label_list names.
                pred_label = f"LABEL_{pred_label_id}"
            confidence = round(probs[0][pred_label_id].item(), 4)

            entry["relations"].append({
                "head": e1,
                "tail": e2,
                "relation": pred_label,
                "confidence": confidence
            })

        results.append(entry)

## Save output

In [None]:
# Persist the SpanBERT predictions as indented JSON, then confirm on stdout.
with open("spanbert_results.json", mode="w") as f:
    json.dump(results, fp=f, indent=2)

print("✅ Inference complete. Results saved to spanbert_results.json.")


✅ Inference complete. Results saved to spanbert_results.json.
