In [40]:
!pip install datasets



In [41]:
from random import shuffle
from math import ceil

import torch
import torch.nn as nn

from transformers import AutoModel, AutoTokenizer
import datasets

from tqdm.auto import tqdm

from collections import defaultdict
from urllib import request
import json
import pandas as pd

In [42]:
def parse_conllu_using_pandas(block):
    records = []
    for line in block.splitlines():
        if not line.startswith('#'):
            records.append(line.strip().split('\t'))
    return pd.DataFrame.from_records(
        records,
        columns=['ID', 'FORM', 'TAG', 'Misc1', 'Misc2'])

In [43]:
def tokens_to_labels(df):
    return (
        df.FORM.tolist(),
        df.TAG.tolist()
    )

In [44]:
PREFIX = "https://raw.githubusercontent.com/UniversalNER/"
DATA_URLS = {
    "en_ewt": {
        "train": "UNER_English-EWT/master/en_ewt-ud-train.iob2",
        "dev": "UNER_English-EWT/master/en_ewt-ud-dev.iob2",
        "test": "UNER_English-EWT/master/en_ewt-ud-test.iob2"
    },
    "en_pud": {
        "test": "UNER_English-PUD/master/en_pud-ud-test.iob2"
    }
}

In [45]:
# en_ewt is the main train-dev-test split
# en_pud is the OOD test set
data_dict = defaultdict(dict)
for corpus, split_dict in DATA_URLS.items():
    for split, url_suffix in split_dict.items():
        url = PREFIX + url_suffix
        with request.urlopen(url) as response:
            txt = response.read().decode('utf-8')
            data_frames = map(parse_conllu_using_pandas,
                              txt.split('\n\n'))
            token_label_alignments = list(map(tokens_to_labels,
                                              data_frames))
            data_dict[corpus][split] = token_label_alignments

In [46]:
# Saving the data so that you don't have to redownload it each time.
with open('ner_data_dict.json', 'w', encoding='utf-8') as out:
    json.dump(data_dict, out, indent=2, ensure_ascii=False)

In [7]:
# Each subset of each corpus is a list of tuples where each tuple
# is a list of tokens with a corresponding list of labels.

# Train on data_dict['en_ewt']['train']; validate on data_dict['en_ewt']['dev']
# and test on data_dict['en_ewt']['test'] and data_dict['en_pud']['test']
data_dict['en_ewt']['train'][0], data_dict['en_pud']['test'][1]


((['Where', 'in', 'the', 'world', 'is', 'Iguazu', '?'],
  ['O', 'O', 'O', 'O', 'O', 'B-LOC', 'O']),
 (['For',
   'those',
   'who',
   'follow',
   'social',
   'media',
   'transitions',
   'on',
   'Capitol',
   'Hill',
   ',',
   'this',
   'will',
   'be',
   'a',
   'little',
   'different',
   '.'],
  ['O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-LOC',
   'I-LOC',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O']))

In [47]:
def convert_to_word_tag_tuples(data):
    return [list(zip(words, tags)) for words, tags in data]

def convert_to_generic_bio(data):
    generic_bio_data = []
    for sentence in data:
        generic_sentence = []
        for word, tag in sentence:
            if tag == 'O':
                generic_tag = 'O'
            elif tag.startswith('B-'):
                generic_tag = 'B'
            elif tag.startswith('I-'):
                generic_tag = 'I'
            else:
                generic_tag = tag
            generic_sentence.append((word, generic_tag))
        generic_bio_data.append(generic_sentence)
    return generic_bio_data

# Apply to all splits in both corpora
for corpus in ['en_ewt', 'en_pud']:
    for split in data_dict[corpus]:
        # Step 1: Convert to (word, tag) tuples
        data_dict[corpus][split] = convert_to_word_tag_tuples(data_dict[corpus][split])
        # Step 2: Convert to generic BIO tags
        data_dict[corpus][split] = convert_to_generic_bio(data_dict[corpus][split])


In [None]:
data_dict[0]

KeyError: 0

In [10]:
def convert_to_word_tag_tuples(data):
    return [list(zip(words, tags)) for words, tags in data]


In [48]:
train_data = data_dict['en_ewt']['train']
val_data   = data_dict['en_ewt']['dev']
test_data  = data_dict['en_ewt']['test']
ood_data   = data_dict['en_pud']['test']


In [13]:
train_data[2]


[('Widely', 'O'),
 ('considered', 'O'),
 ('to', 'O'),
 ('be', 'O'),
 ('one', 'O'),
 ('of', 'O'),
 ('the', 'O'),
 ('most', 'O'),
 ('spectacular', 'O'),
 ('waterfalls', 'O'),
 ('in', 'O'),
 ('the', 'O'),
 ('world', 'O'),
 (',', 'O'),
 ('the', 'O'),
 ('Iguazu', 'B'),
 ('Falls', 'I'),
 ('on', 'O'),
 ('the', 'O'),
 ('border', 'O'),
 ('of', 'O'),
 ('Argentina', 'B'),
 ('and', 'O'),
 ('Brazil', 'B'),
 (',', 'O'),
 ('are', 'O'),
 ('a', 'O'),
 ('certainly', 'O'),
 ('must', 'O'),
 ('see', 'O'),
 ('attraction', 'O'),
 ('in', 'O'),
 ('the', 'O'),
 ('area', 'O'),
 ('.', 'O')]

In [49]:
# Let's see how many different labels we have to set up the classification_head
# accordingly:
labels = set()
for ex in train_data:
    # We assume that there will be no new POS tags in the dev and test sets.
    labels.update([el[1] for el in ex])
n_classes = len(labels)
sorted(labels)

['B', 'I', 'O']

In [50]:
# The models expect class numbers, not strings
label_to_i = {
    label: i
    for i, label in enumerate(sorted(labels))
}
i_to_label = {
    i: label
    for label, i in label_to_i.items()
}

In [18]:
label_to_i

{'B': 0, 'I': 1, 'O': 2}

In [51]:
model_tag = 'google-bert/bert-base-uncased'

tokeniser = AutoTokenizer.from_pretrained(model_tag)

In [17]:
# Our data is pretokenised, which we can use
example_input = [el[0] for el in train_data[0]]
exmaple_output = [el[1] for el in train_data[0]]
# Note is_split_into_words
example_tokenisation = tokeniser(example_input, is_split_into_words=True)
example_tokenisation

{'input_ids': [101, 2073, 1999, 1996, 2088, 2003, 1045, 19696, 9759, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [16]:
# Notice the subwords
print(tokeniser.decode(example_tokenisation.input_ids))
for input_id in example_tokenisation.input_ids:
    print(tokeniser.decode([input_id]), end=' ')

NameError: name 'example_tokenisation' is not defined

In [52]:
class ClassificationHead(nn.Module):
    def __init__(self, model_dim=768, n_classes=3):
        super().__init__()
        self.dropout = nn.Dropout(0.3)
        self.linear = nn.Linear(model_dim, n_classes)

    def forward(self, x):
        return self.linear(self.dropout(x))

In [54]:
def process_sentence(sentence, label_to_i, tokeniser, encoder, clf_head,
                     encoder_device, clf_head_device):
    # Handle empty sentences
    if not sentence:
        return torch.zeros((0, clf_head.linear.in_features), device=clf_head_device), \
               torch.tensor([], device=clf_head_device, dtype=torch.long)

    words = [word for word, _ in sentence]
    tokenisation = tokeniser(words, is_split_into_words=True, return_tensors='pt')

    # Handle cases where tokenization produces no input IDs
    if tokenisation.input_ids.size(1) < 2:  # Only [CLS] and [SEP]
        return torch.zeros((0, clf_head.linear.in_features), device=clf_head_device), \
               torch.tensor([], device=clf_head_device, dtype=torch.long)

    inputs = {k: v.to(encoder_device) for k, v in tokenisation.items()}
    outputs = encoder(**inputs).last_hidden_state[0, 1:-1, :]  # Skip [CLS] and [SEP]

    word_ids = tokenisation.word_ids()[1:-1]  # Align with outputs after slicing
    first_subword_embeddings = []
    processed_words = set()

    for i, word_id in enumerate(word_ids):
        if word_id is not None and word_id not in processed_words:
            first_subword_embeddings.append(outputs[i])
            processed_words.add(word_id)

    # Handle empty embeddings (no valid words)
    if not first_subword_embeddings:
        return torch.zeros((0, clf_head.linear.in_features), device=clf_head_device), \
               torch.tensor([], device=clf_head_device, dtype=torch.long)

    clf_head_inputs = torch.vstack(first_subword_embeddings).to(clf_head_device)
    gold_labels = torch.tensor([label_to_i[label] for _, label in sentence], device=clf_head_device)

    return clf_head(clf_head_inputs), gold_labels


In [55]:
def train_epoch(data, label_to_i, tokeniser, encoder, clf_head,
                encoder_device, clf_head_device, loss_fn, optimiser):
    encoder.train()
    epoch_losses = torch.empty(len(data))
    for step_n, sentence in tqdm(
        enumerate(data),
        total=len(data),
        desc='Train',
        leave=False
    ):
        if not sentence:
          continue
        else:
          optimiser.zero_grad()
          logits, gold_labels = process_sentence(
              sentence, label_to_i, tokeniser,
              encoder, clf_head, encoder_device,
              clf_head_device)
          loss = loss_fn(logits, gold_labels)
          loss.backward()
          optimiser.step()
          epoch_losses[step_n] = loss.item()
    return epoch_losses.mean().item()

In [56]:
def validate_epoch(data, label_to_i, tokeniser, encoder, clf_head,
                   encoder_device, clf_head_device):
    encoder.eval()
    epoch_accuracies = torch.empty(len(data))
    for step_n, sentence in tqdm(
        enumerate(data),
        total=len(data),
        desc='Eval',
        leave=False
    ):
        with torch.no_grad():
          if not sentence:
            epoch_accuracies[step_n] = 0.0
            continue
          logits, gold_labels = process_sentence(
              sentence, label_to_i, tokeniser,
              encoder, clf_head, encoder_device,
              clf_head_device)
        if logits.size(0) == 0 or gold_labels.size(0) == 0:
          epoch_accuracies[step_n] = 0.0
          continue
        predicted_labels = torch.argmax(logits, dim=-1)

        epoch_accuracies[step_n] = (
            predicted_labels == gold_labels).sum().item() / len(sentence)
    return epoch_accuracies.mean().item()

In [57]:
encoder_device = 0  # Can also be 'cpu'
encoder = AutoModel.from_pretrained(
    model_tag).to(encoder_device)
# NB: pass the number of different POS tags
clf_head = ClassificationHead(n_classes=n_classes)
clf_head_device = 0
clf_head.to(clf_head_device);

In [58]:
n_epochs = 4
loss_fn = nn.CrossEntropyLoss()
optimiser = torch.optim.AdamW(
    list(encoder.parameters()) + list(clf_head.parameters()), lr=2*10**(-5))
for epoch_n in tqdm(range(n_epochs)):
    loss = train_epoch(train_data, label_to_i, tokeniser, encoder, clf_head,
                       encoder_device, clf_head_device, loss_fn, optimiser)
    print(f'Epoch {epoch_n+1} training loss: {loss:.2f}')
    accuracy = validate_epoch(val_data, label_to_i, tokeniser, encoder,
                              clf_head, encoder_device, clf_head_device)
    print(f'Epoch {epoch_n+1} dev accuracy: {accuracy:.2f}')

  0%|          | 0/4 [00:00<?, ?it/s]

Train:   0%|          | 0/12544 [00:00<?, ?it/s]

Epoch 1 training loss: 0.06


Eval:   0%|          | 0/2002 [00:00<?, ?it/s]

Epoch 1 dev accuracy: 0.98


Train:   0%|          | 0/12544 [00:00<?, ?it/s]

Epoch 2 training loss: 0.03


Eval:   0%|          | 0/2002 [00:00<?, ?it/s]

Epoch 2 dev accuracy: 0.98


Train:   0%|          | 0/12544 [00:00<?, ?it/s]

Epoch 3 training loss: 0.02


Eval:   0%|          | 0/2002 [00:00<?, ?it/s]

Epoch 3 dev accuracy: 0.98


Train:   0%|          | 0/12544 [00:00<?, ?it/s]

Epoch 4 training loss: 0.01


Eval:   0%|          | 0/2002 [00:00<?, ?it/s]

Epoch 4 dev accuracy: 0.98


In [60]:
!pip install seqeval pandas




In [62]:
from seqeval.metrics import classification_report  # <-- ADD THIS IMPORT
def evaluate_simplified_tagset(model, tokeniser, data, label_to_i, i_to_label, encoder_device=0, clf_head_device=0):
    """Evaluates model on both span-level and token-level metrics for simplified B/I/O tags"""
    # Span processing functions
    def bio_to_spans(tags):
        """Converts B/I/O tags to spans with labels (B/I)"""
        spans = []
        current_start = None
        current_label = None
        for i, tag in enumerate(tags):
            if tag == 'B':
                if current_start is not None:
                    spans.append((current_start, i-1, current_label))
                current_start = i
                current_label = 'B'
            elif tag == 'I':
                if current_label != 'I':
                    if current_start is not None:
                        spans.append((current_start, i-1, current_label))
                    current_start = i
                current_label = 'I'
            else:  # O
                if current_start is not None:
                    spans.append((current_start, i-1, current_label))
                    current_start = None
                    current_label = None
        if current_start is not None:
            spans.append((current_start, len(tags)-1, current_label))
        return spans

    # Initialize metrics
    all_true_token = []
    all_pred_token = []
    labelled_true = []
    labelled_pred = []
    unlabelled_true = []
    unlabelled_pred = []
    label_counts = defaultdict(lambda: {'tp': 0, 'fp': 0, 'fn': 0})

    for sentence in tqdm(data, desc="Evaluating"):
        with torch.no_grad():
            # Process sentence
            words = [word for word, _ in sentence]
            gold_tags = [i_to_label[label_to_i[tag]] for _, tag in sentence]

            # Get predictions
            logits, gold_labels = process_sentence(
                sentence, label_to_i, tokeniser,
                model['encoder'], model['clf_head'],
                encoder_device, clf_head_device
            )
            pred_tags = [i_to_label[i] for i in torch.argmax(logits, dim=-1).cpu().numpy()]

            # Token-level metrics
            all_true_token.extend(gold_tags)
            all_pred_token.extend(pred_tags)

            # Span-level processing
            true_spans = bio_to_spans(gold_tags)
            pred_spans = bio_to_spans(pred_tags)

            # Labelled spans (boundaries + B/I labels)
            labelled_true.append([(s, e, l) for (s, e, l) in true_spans])
            labelled_pred.append([(s, e, l) for (s, e, l) in pred_spans])

            # Unlabelled spans (boundaries only)
            unlabelled_true.append([(s, e) for (s, e, _) in true_spans])
            unlabelled_pred.append([(s, e) for (s, e, _) in pred_spans])

            # Per-label counts for macro F1
            for tag in gold_tags:
                if tag != 'O':
                    label_counts[tag]['fn'] += 1
            for tag in pred_tags:
                if tag != 'O':
                    label_counts[tag]['fp'] += 1
            for t_tag, p_tag in zip(gold_tags, pred_tags):
                if t_tag == p_tag and t_tag != 'O':
                    label_counts[t_tag]['tp'] += 1
                    label_counts[t_tag]['fn'] -= 1

    # Token-level metrics
    # Token-level metrics
    token_report = classification_report(
        [all_true_token],  # seqeval expects list of lists (one per sentence)
        [all_pred_token],
        mode='strict',  # Required for BIO evaluation
        output_dict=True
    )


    # Span-level metrics
    def compute_span_metrics(true, pred):
        true_flat = set((i, s, e, *rest) for i, spans in enumerate(true) for span in spans for s,e,*rest in [span])
        pred_flat = set((i, s, e, *rest) for i, spans in enumerate(pred) for span in spans for s,e,*rest in [span])
        correct = true_flat & pred_flat

        precision = len(correct)/len(pred_flat) if pred_flat else 0
        recall = len(correct)/len(true_flat) if true_flat else 0
        f1 = 2*precision*recall/(precision+recall) if (precision+recall) else 0
        return {'precision': precision, 'recall': recall, 'f1': f1}

    # Macro F1 calculation (B/I only)
    macro_f1 = sum(
        2*(label_counts[l]['tp']/(label_counts[l]['tp'] + label_counts[l]['fp']))*(label_counts[l]['tp']/(label_counts[l]['tp'] + label_counts[l]['fn']))/
        ((label_counts[l]['tp']/(label_counts[l]['tp'] + label_counts[l]['fp'])) + (label_counts[l]['tp']/(label_counts[l]['tp'] + label_counts[l]['fn'])))
        if (label_counts[l]['tp'] + label_counts[l]['fp']) > 0 and (label_counts[l]['tp'] + label_counts[l]['fn']) > 0 else 0
        for l in ['B', 'I']
    ) / 2

    return {
        'token_level': token_report,
        'span_level': {
            'labelled': compute_span_metrics(labelled_true, labelled_pred),
            'unlabelled': compute_span_metrics(unlabelled_true, unlabelled_pred)
        },
        'macro_f1': macro_f1
    }


In [63]:
results = evaluate_simplified_tagset(
    model={'encoder': encoder, 'clf_head': clf_head},
    tokeniser=tokeniser,
    data=test_data,
    label_to_i=label_to_i,
    i_to_label=i_to_label,
    encoder_device=encoder_device,
    clf_head_device=clf_head_device
)

print("Token-Level Metrics:")
print(results['token_level'])

print("\nSpan-Level Metrics:")
print("Labelled:", results['span_level']['labelled'])
print("Unlabelled:", results['span_level']['unlabelled'])

print("\nMacro F1 (B/I):", results['macro_f1'])


Evaluating:   0%|          | 0/2078 [00:00<?, ?it/s]

Token-Level Metrics:
{'_': {'precision': np.float64(0.7755775577557755), 'recall': np.float64(0.8639705882352942), 'f1-score': np.float64(0.817391304347826), 'support': np.int64(1088)}, 'micro avg': {'precision': np.float64(0.7755775577557755), 'recall': np.float64(0.8639705882352942), 'f1-score': np.float64(0.817391304347826), 'support': np.int64(1088)}, 'macro avg': {'precision': np.float64(0.7755775577557755), 'recall': np.float64(0.8639705882352942), 'f1-score': np.float64(0.817391304347826), 'support': np.int64(1088)}, 'weighted avg': {'precision': np.float64(0.7755775577557755), 'recall': np.float64(0.8639705882352942), 'f1-score': np.float64(0.817391304347826), 'support': np.int64(1088)}}

Span-Level Metrics:
Labelled: {'precision': 0.7885646217986897, 'recall': 0.8927848954821308, 'f1': 0.8374446552814674}
Unlabelled: {'precision': 0.7903513996426444, 'recall': 0.894807821982468, 'f1': 0.8393421884882986}

Macro F1 (B/I): 0.585245170145587


In [27]:
results = evaluate_simplified_tagset(
    model={'encoder': encoder, 'clf_head': clf_head},
    tokeniser=tokeniser,
    data=ood_data,
    label_to_i=label_to_i,
    i_to_label=i_to_label,
    encoder_device=encoder_device,
    clf_head_device=clf_head_device
)

print("Token-Level Metrics:")
print(pd.DataFrame(results['token_level']).transpose())

print("\nSpan-Level Metrics:")
print("Labelled:", results['span_level']['labelled'])
print("Unlabelled:", results['span_level']['unlabelled'])

print("\nMacro F1 (B/I):", results['macro_f1'])


Evaluating:   0%|          | 0/1001 [00:00<?, ?it/s]

Token-Level Metrics:
              precision    recall  f1-score  support
_              0.860531  0.843721  0.852043   1075.0
micro avg      0.860531  0.843721  0.852043   1075.0
macro avg      0.860531  0.843721  0.852043   1075.0
weighted avg   0.860531  0.843721  0.852043   1075.0

Span-Level Metrics:
Labelled: {'precision': 0.867595818815331, 'recall': 0.870020964360587, 'f1': 0.8688066992323795}
Unlabelled: {'precision': 0.8794425087108014, 'recall': 0.8819007686932215, 'f1': 0.8806699232379623}

Macro F1 (B/I): 0.5984456476272972


In [39]:
results

NameError: name 'results' is not defined