# Introduction Notebook

This notebook covers the main tools that will be used during our project with an example usage.

1. LLM interaction (NER + Clarification)
2. Sequence updating with entities + clarifications
3. Prepare dataset for T5


In [8]:
import ollama
import re
import json

## 1. LLM interaction (NER + Clarification)
- in the original paper there was use model **llama3:70b**
- for showing the usage of the tool there will be present of example on model **llama3.1:latest**

In [9]:
# 1 Step - install Ollama from the website https://ollama.com

# 2 Step - install a model from the terminal
!ollama pull llama3.1:latest

# 3 Step - check the correctness of installation of specific model
!ollama list

[?2026h[?25l[1Gpulling manifest ⠋ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠙ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠹ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠸ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠼ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠴ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest [K
pulling 667b0c1932bc: 100% ▕██████████████████▏ 4.9 GB                         [K
pulling 948af2743fc7: 100% ▕██████████████████▏ 1.5 KB                         [K
pulling 0ba8f0e314b4: 100% ▕██████████████████▏  12 KB                         [K
pulling 56bb8bd477a5: 100% ▕██████████████████▏   96 B                         [K
pulling 455f34728c9b: 100% ▕██████████████████▏  487 B                         [K
verifying sha256 digest [K
writing manifest [K
success [K[?25h[?2026l


NAME               ID              SIZE      MODIFIED               
llama3.1:latest    46e0c10c039e    4.9 GB    Less than a second ago    


In [10]:
def query_ollama(sentence: str, model: str = "llama3.1:latest") -> list:
    
    """Query LLM to extract entities in JSON format (as list)."""

    request = (
        "Please generate one list with all entities from the following text "
        "in JSON format, excluding numbers. Do not format the JSON output. "
        + sentence
    )
    response = ollama.chat(model=model, messages=[{"role": "user", "content": request}])
    output = response['message']['content']
    # Extract JSON array
    match = re.search(r'\[.*?\]', output, re.DOTALL)
    if match:
        try:
            entities = json.loads(match.group(0))
            return entities
        except json.JSONDecodeError:
            return []
    return []

In [11]:
def clarify_entity(entity: str, context: str, model: str = "llama3.1:latest") -> str:
    
    """Ask LLM to provide a 2-3 sentence description for the entity based on context."""

    request = (
        f"Just expand the following entity mention '{entity}' to a description "
        f"(2-3 sentences) based on context. Context: {context}"
    )
    response = ollama.chat(model=model, messages=[{"role": "user", "content": request}])
    return response['message']['content']

## 2. Sequence updating with entities + clarifications


In [12]:
def update_sequence_with_entities(sequence: str, clarify_entities: dict) -> str:
    """Insert [START_ENT]... [END_ENT] for all entities."""
    if not clarify_entities:
        return sequence

    escaped_keys = [re.escape(k) for k in clarify_entities.keys()]
    pattern = re.compile(r'\b(' + "|".join(escaped_keys) + r')\b', re.IGNORECASE)

    def repl(match):
        entity_text = match.group(0)
        return f"[START_ENT] {entity_text} [END_ENT]"

    return pattern.sub(repl, sequence)

def update_sequence_with_entities_clarify(sequence: str, clarify_entities: dict) -> str:
    """Insert [START_ENT]... [END_ENT][CLARIFY: ...] for all entities."""
    if not clarify_entities:
        return sequence

    escaped_keys = [re.escape(k) for k in clarify_entities.keys()]
    pattern = re.compile(r'\b(' + "|".join(escaped_keys) + r')\b', re.IGNORECASE)

    def repl(match):
        entity_text = match.group(0)
        # find the key in original dict (case-insensitive)
        real_key = next(k for k in clarify_entities if k.lower() == entity_text.lower())
        return f"[START_ENT] {entity_text} [END_ENT][CLARIFY: {clarify_entities[real_key]}]"

    return pattern.sub(repl, sequence)

## Example Usage

In [18]:
text = "Angelina met her partner Brad and her father Jon which is a pastor in AK "

# Step 1: extract entities
entities = query_ollama(text)
entities= ['Angelina', 'Brad', 'Jon', 'AK']  # for demonstration purposes
print("Extracted entities:", entities)

# Step 2: clarify each entity
clarify_entities_dict = {entity: clarify_entity(entity, text) for entity in entities}
print("Clarifications:", clarify_entities_dict)

# Step 3: update sequence with entities
augmented_text = update_sequence_with_entities(text, clarify_entities_dict)
print("Augmented text:", augmented_text)

# Step 3: update sequence with clarifications
clarify_augmented_text = update_sequence_with_entities_clarify(text, clarify_entities_dict)
print("Augmented text:", clarify_augmented_text)


Extracted entities: ['Angelina', 'Brad', 'Jon', 'AK']
Clarifications: {'Angelina': 'Here\'s an expanded description of "Angelina" based on the provided context:\n\nAngelina Jolie, humanitarian and philanthropist, met her longtime partner, actor Brad Pitt, while introducing herself as a mother of six to the press. Her father, Jon Voight, being a pastor himself, had likely instilled in her strong Christian values from an early age. As a well-known actress and activist, Angelina\'s compassionate nature has led her to advocate for women\'s rights and refugee causes around the world.', 'Brad': 'Here\'s an expanded description of "Brad" based on the provided context:\n\nBrad, presumably Angelina\'s partner, appears to be a significant figure in her life, as she has likely introduced him to her family. As he accompanies Angelina to meet her father, it suggests that their relationship is established and possibly even familial, given the informal introduction.', 'Jon': 'Here\'s an expanded desc

## 3. Prepare dataset for T5


In [14]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

class JointDataset(Dataset):
    def __init__(self, samples, tokenizer, max_length=128):
        self.samples = samples
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        encoding = self.tokenizer(
            sample["input_text"],
            text_target=sample["target_text"],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        # Flatten tensors
        item = {key: val.squeeze() for key, val in encoding.items()}
        return item


In [15]:
train_samples = [
    # NER task: input is plain text + target_ner
    {
        "input_text": "Angelina met Brad in AK. target_ner",
        "target_text": "[START_ENT] Angelina [END_ENT][CLARIFY: An actress] met [START_ENT] Brad [END_ENT][CLARIFY: Brad Pitt is Angelina's former partner, whom she married from 2014 to 2019] in [START_ENT] AK [END_ENT][CLARIFY: Alaska]"
    },
    # EL task: input is annotated text + target_el
    {
        "input_text": "[START_ENT] Angelina [END_ENT][CLARIFY: An actress] met [START_ENT] Brad [END_ENT][CLARIFY: Brad Pitt is Angelina's former partner, whom she married from 2014 to 2019] in [START_ENT] AK [END_ENT][CLARIFY: Alaska] target.el",
        "target_text": "Angelina Jolie met Brad Pitt in Alaska"
    }
]

tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

dataset = JointDataset(train_samples, tokenizer)

training_args = TrainingArguments(
    output_dir="./t5_joint_demo",
    overwrite_output_dir=True,
    num_train_epochs=50,
    per_device_train_batch_size=1,
    logging_steps=1,
    save_steps=10,
    save_total_limit=2,
    prediction_loss_only=True,
    learning_rate=5e-5,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset
)

trainer.train()

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Step,Training Loss
1,8.8962
2,13.108
3,7.6796
4,10.7537
5,6.5185
6,9.0947
7,9.6583
8,5.7733
9,7.9682
10,5.707




TrainOutput(global_step=100, training_loss=3.1856230801343917, metrics={'train_runtime': 117.1267, 'train_samples_per_second': 0.854, 'train_steps_per_second': 0.854, 'total_flos': 3383545036800.0, 'train_loss': 3.1856230801343917, 'epoch': 50.0})

In [24]:
test_sentence_ner = "Angelina met Jon in AK. target_ner"
inputs = tokenizer(test_sentence_ner, return_tensors="pt")
outputs = model.generate(**inputs, max_length=128)
print("NER Output:", tokenizer.decode(outputs[0], skip_special_tokens=True))

test_sentence_el = "[START_ENT] Angelina [END_ENT][CLARIFY: An actress] met [START_ENT] Brad [END_ENT][CLARIFY: Brad Pitt is Angelina's former partner, whom she married from 2014 to 2019] in [START_ENT] AK [END_ENT][CLARIFY: Alaska] target.el"
inputs = tokenizer(test_sentence_el, return_tensors="pt")
outputs = model.generate(**inputs, max_length=128)
print("EL Output:", tokenizer.decode(outputs[0], skip_special_tokens=True))

NER Output: Angerina Angelina met Jon Jon in AK
EL Output: AK  AK (AK)
