# **NER with T5 Encoder**

In [None]:
!pip install transformers datasets sentencepiece accelerate peft -q

In [25]:
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import T5Tokenizer, T5EncoderModel
from torch.utils.data import DataLoader
from transformers import DataCollatorForTokenClassification, AdamW
from tqdm import tqdm

In [3]:
# Base seq2seq checkpoint; the adapter in the next cell is loaded on top of it.
# NOTE(review): the name `model` is rebound to the NER model in a later cell, so this
# seq2seq object is not the one that gets trained below.
checkpoint = "t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [4]:
# Checkpoint id holding the adapter weights; the tokenizer and the encoder are loaded
# from the same id in the next cell.
my_checkpoint = "danfarh2000/text-summarization-T5"
# Attach that adapter to the seq2seq model loaded above.
model.load_adapter(my_checkpoint)

In [5]:
# Tokenizer and encoder-only T5 are both loaded from the adapter checkpoint.
# The cell output lists the checkpoint's decoder.* LoRA weights as unexpected keys,
# presumably because T5EncoderModel has no decoder to receive them.
tokenizer = AutoTokenizer.from_pretrained(my_checkpoint)
encoder = T5EncoderModel.from_pretrained(my_checkpoint)

Loading adapter weights from danfarh2000/text-summarization-T5 led to unexpected keys not found in the model:  ['decoder.block.0.layer.0.SelfAttention.q.lora_A.default.weight', 'decoder.block.0.layer.0.SelfAttention.q.lora_B.default.weight', 'decoder.block.0.layer.0.SelfAttention.v.lora_A.default.weight', 'decoder.block.0.layer.0.SelfAttention.v.lora_B.default.weight', 'decoder.block.0.layer.1.EncDecAttention.q.lora_A.default.weight', 'decoder.block.0.layer.1.EncDecAttention.q.lora_B.default.weight', 'decoder.block.0.layer.1.EncDecAttention.v.lora_A.default.weight', 'decoder.block.0.layer.1.EncDecAttention.v.lora_B.default.weight', 'decoder.block.1.layer.0.SelfAttention.q.lora_A.default.weight', 'decoder.block.1.layer.0.SelfAttention.q.lora_B.default.weight', 'decoder.block.1.layer.0.SelfAttention.v.lora_A.default.weight', 'decoder.block.1.layer.0.SelfAttention.v.lora_B.default.weight', 'decoder.block.1.layer.1.EncDecAttention.q.lora_A.default.weight', 'decoder.block.1.layer.1.EncDecAt

In [6]:
from datasets import load_dataset

# CoNLL-2003 with train/validation/test splits; each row carries word-level `tokens`
# and one integer `ner_tags` id per word (see the outputs of the next two cells).
dataset = load_dataset("conll2003")

In [7]:
# Show the splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [8]:
# Peek at one raw training example: word-level tokens plus one integer tag id per word.
print(dataset['train'][0])

{'id': '0', 'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7], 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0], 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}


In [9]:
# Tag id -> tag name, listed in id order (BIO scheme):
#   "O"   word is outside any named entity
#   "B-*" first word of an entity, "I-*" following word of the same entity
#   PER = person, ORG = organization, LOC = location, MISC = miscellaneous
label_map = dict(
    enumerate(
        (
            "O",
            "B-PER",
            "I-PER",
            "B-ORG",
            "I-ORG",
            "B-LOC",
            "I-LOC",
            "B-MISC",
            "I-MISC",
        )
    )
)

# One output class per tag.
num_labels = len(label_map)

In [10]:
# Sanity check: number of NER classes.
num_labels

9

In [11]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and spread word-level NER tags over subword tokens.

    Args:
        examples: batch dict with "tokens" (one list of words per sentence) and
            "ner_tags" (one list of integer tag ids per sentence, one id per word).

    Returns:
        The tokenizer output for the batch with an added "labels" entry. For each
        sentence the list has one value per produced token: the word's tag on the
        first token of every word, and -100 on tokens that map to no word (special
        tokens such as the end-of-sequence marker) and on the remaining tokens of a
        word that was split into several pieces.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_labels = []
        previous_word = None
        for word in encoded.word_ids(row):
            if word is None or word == previous_word:
                # No source word, or a continuation piece of the word just labelled.
                row_labels.append(-100)
            else:
                # First piece of a new word carries that word's tag.
                row_labels.append(word_tags[word])
                previous_word = word
        aligned.append(row_labels)

    encoded["labels"] = aligned
    return encoded

In [12]:
# Apply tokenization and label alignment to the dataset; with batched=True the function
# receives a batch of examples (a dict of lists) per call.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

In [13]:
# Confirm that input_ids, attention_mask and labels were added to every split.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [14]:
# Drop the raw columns so only input_ids, attention_mask and labels remain.
tokenized_dataset = tokenized_dataset.remove_columns(['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'])

In [15]:
# Confirm that only the model-facing columns are left.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

## **Custom NER model**

In [20]:
import torch
import torch.nn as nn

class NERT5EncoderModel(nn.Module):
    """Token classifier: an encoder followed by one linear layer applied per token.

    Args:
        encoder: module called as ``encoder(input_ids=..., attention_mask=...)`` whose
            result exposes ``last_hidden_state``; ``encoder.config.hidden_size`` sets
            the input width of the classification layer.
        num_labels: number of classes predicted for each token.
    """

    def __init__(self, encoder, num_labels):
        super().__init__()
        self.encoder = encoder
        hidden_dim = encoder.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return the classifier logits for every position of the encoder output."""
        hidden_states = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state
        return self.classifier(hidden_states)

In [21]:
# Take the class count from the dataset's own tag names rather than the hand-written map.
num_labels = len(dataset["train"].features["ner_tags"].feature.names)
# Rebinds `model`: from here on it is the encoder plus the token-classification layer.
model = NERT5EncoderModel(encoder, num_labels)

In [27]:
# Batch collator built from the tokenizer; passed as collate_fn to the DataLoaders below.
# NOTE(review): presumably pads inputs to a common length and pads labels with -100 —
# confirm against the transformers documentation.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [28]:
# Training split, switched to torch output format.
train_dataset = tokenized_dataset["train"]
train_dataset.set_format("torch")

# Shuffled mini-batches of 8 examples, assembled by data_collator.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)

In [29]:
# Optimizer
# torch.optim.AdamW is used instead of transformers.AdamW, which transformers deprecated in
# favour of the PyTorch implementation. eps and weight_decay are set to the values the
# transformers class used by default so the hyperparameters carry over; small numerical
# differences between the two implementations are still possible.
# NOTE(review): the top-of-notebook `from transformers import ... AdamW` is now unused.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Device placement: use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



NERT5EncoderModel(
  (encoder): T5EncoderModel(
    (shared): Embedding(32128, 512)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 512)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): lora.Linear(
                  (base_layer): Linear(in_features=512, out_features=512, bias=False)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=512, out_features=8, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=8, out_features=512, bias=False)
                  )
                  (lora_embedding_A): ParameterDict()
                  (lora_embedding_B): ParameterDict()
                  (lora_magnitude_vector): M

## **Train the model**

In [30]:
from tqdm import tqdm

# Number of full passes over the training data.
NUM_EPOCHS = 10

# Built once instead of on every step. CrossEntropyLoss skips targets equal to its
# ignore_index (default -100), which is the value used for unlabeled token positions.
loss_fn = nn.CrossEntropyLoss()
num_batches = len(train_dataloader)

for epoch in range(NUM_EPOCHS):
    model.train()
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}", leave=False)

    total_loss = 0

    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        logits = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])

        # Flatten to (tokens, classes) vs (tokens,) so every token position is one sample.
        loss = loss_fn(logits.view(-1, num_labels), batch["labels"].view(-1))

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Read the scalar once and reuse it for the running total and the progress bar.
        batch_loss = loss.item()
        total_loss += batch_loss
        progress_bar.set_postfix({"batch_loss": batch_loss})

    avg_loss = total_loss / num_batches
    print(f"Epoch {epoch + 1} completed. Average Loss: {avg_loss:.4f}")

                                                                              

Epoch 1 completed. Average Loss: 1.4515


                                                                              

Epoch 2 completed. Average Loss: 0.7484


                                                                              

Epoch 3 completed. Average Loss: 0.5244


                                                                              

Epoch 4 completed. Average Loss: 0.4129


                                                                              

Epoch 5 completed. Average Loss: 0.3427


                                                                              

Epoch 6 completed. Average Loss: 0.2974


                                                                               

Epoch 7 completed. Average Loss: 0.2639


                                                                               

Epoch 8 completed. Average Loss: 0.2407


                                                                               

Epoch 9 completed. Average Loss: 0.2245


                                                                                

Epoch 10 completed. Average Loss: 0.2126




## **Evaluation & Test**

In [31]:
# Select the already-tokenized test split and switch it to torch output format
test_dataset = tokenized_dataset["test"]
test_dataset.set_format("torch")

# Same batch size and collator as the training loader; no shuffle argument is passed.
test_dataloader = DataLoader(
    test_dataset,
    batch_size=8,
    collate_fn=data_collator, 
)

In [32]:
# Only shows the DataLoader object's repr; no batches are read here.
test_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7d2490c1ed40>

In [33]:
# Switch to evaluation mode before running the test batches.
model.eval()

predictions = []
true_labels = []

# Gradients are not needed for inference.
with torch.no_grad():
    for batch in test_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
        )

        # Highest-scoring class per token; one row per sentence in the batch.
        logits = outputs.detach().cpu().numpy()
        preds = logits.argmax(axis=2)

        # extend() adds one row per sentence to each list.
        predictions.extend(preds)
        true_labels.extend(batch["labels"].cpu().numpy())

In [34]:
from sklearn.metrics import classification_report

# Flatten the per-sentence rows into one long sequence of token-level values.
flat_predictions = [p for sublist in predictions for p in sublist]
flat_true_labels = [l for sublist in true_labels for l in sublist]

# Score only positions that carry a real tag. -100 marks every position excluded from
# the loss: batch padding, special tokens and non-first pieces of a split word.
scored = [
    (pred, label)
    for pred, label in zip(flat_predictions, flat_true_labels)
    if label != -100
]
filtered_predictions = [pred for pred, _ in scored]
filtered_true_labels = [label for _, label in scored]

label_names = dataset["train"].features["ner_tags"].feature.names

# Passing `labels` explicitly keeps ids and names aligned even when some class does not
# occur in the evaluated data; without it, a missing class makes the number of observed
# classes differ from len(target_names) and the call fails.
report = classification_report(
    filtered_true_labels,
    filtered_predictions,
    labels=list(range(len(label_names))),
    target_names=label_names,
    zero_division=0,
)

print(report)

              precision    recall  f1-score   support

           O       0.97      1.00      0.98     38323
       B-PER       0.91      0.89      0.90      1617
       I-PER       0.92      0.97      0.94      1156
       B-ORG       0.70      0.58      0.64      1661
       I-ORG       0.72      0.50      0.59       835
       B-LOC       0.77      0.80      0.78      1668
       I-LOC       0.58      0.28      0.37       257
      B-MISC       0.82      0.56      0.66       702
      I-MISC       0.91      0.20      0.33       216

    accuracy                           0.95     46435
   macro avg       0.81      0.64      0.69     46435
weighted avg       0.94      0.95      0.94     46435



In [35]:
sentence = "Apple is looking to buy a startup in San Francisco for $1 billion."

# Encode the raw string as a batch of one and move its tensors to the model's device.
inputs = tokenizer(sentence, return_tensors="pt", truncation=True, is_split_into_words=False)
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

model.eval()
with torch.no_grad():
    outputs = model(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
    )
    logits = outputs.detach().cpu().numpy()

# Best class per token, then map class ids to tag names for the single sentence.
predicted_labels = logits.argmax(axis=2)
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
predicted_entities = [label_names[label] for label in predicted_labels[0]]

# One line per token piece, including the end-of-sequence token.
for token, entity in zip(tokens, predicted_entities):
    print(f"Token: {token}, Predicted Entity: {entity}")

Token: ▁Apple, Predicted Entity: B-ORG
Token: ▁is, Predicted Entity: O
Token: ▁looking, Predicted Entity: O
Token: ▁to, Predicted Entity: O
Token: ▁buy, Predicted Entity: O
Token: ▁, Predicted Entity: O
Token: a, Predicted Entity: O
Token: ▁startup, Predicted Entity: O
Token: ▁in, Predicted Entity: O
Token: ▁San, Predicted Entity: B-LOC
Token: ▁Francisco, Predicted Entity: I-LOC
Token: ▁for, Predicted Entity: O
Token: ▁$1, Predicted Entity: O
Token: ▁billion, Predicted Entity: O
Token: ., Predicted Entity: O
Token: </s>, Predicted Entity: O
