# Introduction Notebook

This notebook covers the main tools that will be used during our project with an example usage.

1. LLM interaction (NER + Clarification)
2. Sequence updating with entities + clarifications
3. Prepare dataset for T5


In [28]:
import ollama
import re
import json

## 1. LLM interaction (NER + Clarification)


In [29]:
def query_ollama(sentence: str, model: str = "llama3.1:latest") -> list:
    
    """Query LLM to extract entities in JSON format (as list)."""

    request = (
        "Please generate one list with all entities from the following text "
        "in JSON format, excluding numbers. Do not format the JSON output. "
        + sentence
    )
    response = ollama.chat(model=model, messages=[{"role": "user", "content": request}])
    output = response['message']['content']
    # Extract JSON array
    match = re.search(r'\[.*?\]', output, re.DOTALL)
    if match:
        try:
            entities = json.loads(match.group(0))
            return entities
        except json.JSONDecodeError:
            return []
    return []

In [30]:
def clarify_entity(entity: str, context: str, model: str = "llama3.1:latest") -> str:
    
    """Ask LLM to provide a 2-3 sentence description for the entity based on context."""

    request = (
        f"Just expand the following entity mention '{entity}' to a description "
        f"(2-3 sentences) based on context. Context: {context}"
    )
    response = ollama.chat(model=model, messages=[{"role": "user", "content": request}])
    return response['message']['content']

## 2. Sequence updating with entities + clarifications


In [31]:
def update_sequence_with_entities(sequence: str, clarify_entities: dict) -> str:
    """Insert [START_ENT]... [END_ENT] for all entities."""
    if not clarify_entities:
        return sequence

    escaped_keys = [re.escape(k) for k in clarify_entities.keys()]
    pattern = re.compile(r'\b(' + "|".join(escaped_keys) + r')\b', re.IGNORECASE)

    def repl(match):
        entity_text = match.group(0)
        return f"[START_ENT] {entity_text} [END_ENT]"

    return pattern.sub(repl, sequence)

def update_sequence_with_entities_clarify(sequence: str, clarify_entities: dict) -> str:
    """Insert [START_ENT]... [END_ENT][CLARIFY: ...] for all entities."""
    if not clarify_entities:
        return sequence

    escaped_keys = [re.escape(k) for k in clarify_entities.keys()]
    pattern = re.compile(r'\b(' + "|".join(escaped_keys) + r')\b', re.IGNORECASE)

    def repl(match):
        entity_text = match.group(0)
        # find the key in original dict (case-insensitive)
        real_key = next(k for k in clarify_entities if k.lower() == entity_text.lower())
        return f"[START_ENT] {entity_text} [END_ENT][CLARIFY: {clarify_entities[real_key]}]"

    return pattern.sub(repl, sequence)

## Example Usage

In [36]:
text = "Angelina met her partner Brad and her father Jon in AK"

# Step 1: extract entities
entities = query_ollama(text)
print("Extracted entities:", entities)

# Step 2: clarify each entity
clarify_entities_dict = {entity: clarify_entity(entity, text) for entity in entities}
print("Clarifications:", clarify_entities_dict)

# Step 3: update sequence with entities
augmented_text = update_sequence_with_entities(text, clarify_entities_dict)
print("Augmented text:", augmented_text)

# Step 3: update sequence with clarifications
clarify_augmented_text = update_sequence_with_entities_clarify(text, clarify_entities_dict)
print("Augmented text:", clarify_augmented_text)


2025-11-17 02:47:35,555 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


Extracted entities: ['Angelina', 'Brad', 'Jon', 'AK']


2025-11-17 02:47:55,316 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2025-11-17 02:48:04,965 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2025-11-17 02:48:18,551 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2025-11-17 02:48:31,758 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


Clarifications: {'Angelina': 'Here is an expansion of the entity mention "Angelina" into a brief description:\n\nAngelina Jolie, an actress known for her roles in films like "Tomb Raider", "Mr. & Mrs. Smith", and "Maleficent". She is often spotted in high-profile events with her family, including her partner Brad Pitt and children from previous relationships. In this context, she is likely visiting Anchorage, Alaska (AK) to spend quality time with her loved ones.', 'Brad': 'Here is a possible expansion of the entity mention "Brad" to a description based on the provided context:\n\n"Angelina\'s partner Brad, an American actor, producer, and director, was accompanying her and her father Jon on their trip to Alaska."', 'Jon': 'Here\'s an expanded description of the entity "Jon" based on the given context:\n\nJon is likely Angelina Jolie\'s father, Jon Voight, who was a renowned actor known for his versatility and iconic roles. As a renowned figure in Hollywood, he has been involved in var

## 3. Prepare dataset for T5


In [37]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

class JointDataset(Dataset):
    def __init__(self, samples, tokenizer, max_length=128):
        self.samples = samples
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        encoding = self.tokenizer(
            sample["input_text"],
            text_target=sample["target_text"],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        # Flatten tensors
        item = {key: val.squeeze() for key, val in encoding.items()}
        return item


In [38]:
train_samples = [
    # NER task: input is plain text + target_ner
    {
        "input_text": "Angelina met Brad in AK. target_ner",
        "target_text": "[START_ENT] Angelina [END_ENT] met [START_ENT] Brad [END_ENT] in [START_ENT] AK [END_ENT]"
    },
    # EL task: input is annotated text + target_el
    {
        "input_text": "[START_ENT] Angelina [END_ENT] met [START_ENT] Brad [END_ENT] in [START_ENT] AK [END_ENT]. target_el",
        "target_text": "[START_ENT] Angelina [END_ENT][ http://en.wikipedia.org/wiki/Angelina_Jolie ] met [START_ENT] Brad [END_ENT][ http://en.wikipedia.org/wiki/Brad_Pitt ] in [START_ENT] AK [END_ENT][ http://en.wikipedia.org/wiki/Alaska ]"
    },
    {
        "input_text": "Jon is Brad's father. target_ner",
        "target_text": "[START_ENT] Jon [END_ENT] is [START_ENT] Brad [END_ENT]'s father"
    },
    {
        "input_text": "[START_ENT] Jon [END_ENT] is [START_ENT] Brad [END_ENT]'s father. target_el",
        "target_text": "[START_ENT] Jon [END_ENT][ http://en.wikipedia.org/wiki/Jon_Sample ] is [START_ENT] Brad [END_ENT][ http://en.wikipedia.org/wiki/Brad_Pitt ]'s father"
    }
]

tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

dataset = JointDataset(train_samples, tokenizer)

training_args = TrainingArguments(
    output_dir="./t5_joint_demo",
    overwrite_output_dir=True,
    num_train_epochs=50,
    per_device_train_batch_size=1,
    logging_steps=1,
    save_steps=10,
    save_total_limit=2,
    prediction_loss_only=True,
    learning_rate=5e-5,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset
)

trainer.train()



Step,Training Loss
1,15.1339
2,8.2127
3,14.6813
4,5.4656
5,7.7905
6,4.3551
7,5.7984
8,8.4053
9,7.0169
10,5.4654




TrainOutput(global_step=200, training_loss=1.9187075981497765, metrics={'train_runtime': 161.5745, 'train_samples_per_second': 1.238, 'train_steps_per_second': 1.238, 'total_flos': 6767090073600.0, 'train_loss': 1.9187075981497765, 'epoch': 50.0})

In [42]:
test_sentence_ner = "Angelina met Jon in AK. target_ner"
inputs = tokenizer(test_sentence_ner, return_tensors="pt")
outputs = model.generate(**inputs, max_length=128)
print("NER Output:", tokenizer.decode(outputs[0], skip_special_tokens=True))

test_sentence_el = "[START_ENT] Angelina [END_ENT] met [START_ENT] Jon [END_ENT] in [START_ENT] AK [END_ENT]. target_el"
inputs = tokenizer(test_sentence_el, return_tensors="pt")
outputs = model.generate(**inputs, max_length=128)
print("EL Output:", tokenizer.decode(outputs[0], skip_special_tokens=True))

NER Output: :     Jon remet met Jon Jon in AK :
EL Output: [START_ENT] Angelina [END_ENT] Angelina [ [ [START_ENT] Angelina [ [START_ENT] [START_ENT] Angelina [END_ENT] [[
