![POS Tagging Pipeline](./assets/images/pos_tagging_pipeline.png)


# 1. Load Data

In [1]:
from typing import List, Dict
import torch
import torch.nn as nn
import numpy as np
import evaluate
from sklearn.model_selection import train_test_split
import nltk
nltk.download('treebank')

tagged_sentences = nltk.corpus.treebank.tagged_sents()
print(f'Number of samples: {len(tagged_sentences)}')

sentences, sentence_tags = [], []
for tagged_sentence in tagged_sentences:
    sentence, tags = zip(*tagged_sentence)
    sentences.append([word.lower() for word in sentence])
    sentence_tags.append(list(tags))

print(f'Number of sentences: {len(sentences)}')
print('Sample sentence: ', sentences[0])
print('Sample tags: ', sentence_tags[0])



[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


Number of samples: 3914
Number of sentences: 3914
Sample sentence:  ['pierre', 'vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'nov.', '29', '.']
Sample tags:  ['NNP', 'NNP', ',', 'CD', 'NNS', 'JJ', ',', 'MD', 'VB', 'DT', 'NN', 'IN', 'DT', 'JJ', 'NN', 'NNP', 'CD', '.']


# 2. Preprocess Data


In [None]:
train_sentences, test_sentences, train_tags, test_tags = train_test_split(
    sentences, sentence_tags, test_size=0.3, random_state=42
)
val_sentences, test_sentences, val_tags, test_tags = train_test_split(
    test_sentences, test_tags, test_size=0.5, random_state=42
)

print(f'Number of training samples: {len(train_sentences)}')
print(f'Number of test samples: {len(test_sentences)}')
print(f'Number of validation samples: {len(val_sentences)}')


Number of training samples: 2739
Number of test samples: 588
Number of validation samples: 587


In [3]:
# tokenize sentences
from transformers import AutoTokenizer
from torch.utils.data import Dataset

model_name = 'QCRI/bert-base-multilingual-cased-pos-english'
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
MAX_LEN = 256
PAD_TOKEN = '[PAD]'

In [4]:
class POSTaggingDataset(Dataset):
    def __init__(
        self, 
        sentences: List[List[str]], 
        tags: List[List[str]], 
        tokenizer: AutoTokenizer, 
        label2id: Dict[str, int],
        max_len: int = MAX_LEN,
    ):
        super().__init__()
        self.sentences = sentences
        self.tags = tags
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label2id = label2id

    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        tags = self.tags[idx]

        input_token = self.tokenizer.convert_tokens_to_ids(sentence)
        attention_mask = [1] * len(input_token)
        labels = [self.label2id[tag] for tag in tags]
        
        return {
            'input_ids': self.pad_and_truncate(input_token, pad_id=self.tokenizer.pad_token_id),
            'labels': self.pad_and_truncate(labels, pad_id=self.label2id[PAD_TOKEN]),
            'attention_mask': self.pad_and_truncate(attention_mask, pad_id=0),
        }
        
    def pad_and_truncate(self, input_token: List[int], pad_id: int):
        if len(input_token) >= self.max_len:
            input_token = input_token[:self.max_len]
        else:
            input_token.extend([pad_id] * (self.max_len - len(input_token)))
        return torch.as_tensor(input_token)


In [5]:
unique_tags = set(tag for tags in sentence_tags for tag in tags)
label2id = {tag: i for i, tag in enumerate(unique_tags)}
label2id.update({PAD_TOKEN: len(label2id)})
id2label = {i: tag for tag, i in label2id.items()}
unique_tags.add(PAD_TOKEN)
print(len(label2id))
print(label2id)

train_dataset = POSTaggingDataset(
    train_sentences, train_tags, tokenizer, label2id
)
test_dataset = POSTaggingDataset(
    test_sentences, test_tags, tokenizer, label2id
)
val_dataset = POSTaggingDataset(
    val_sentences, val_tags, tokenizer, label2id
)



47
{'NNP': 0, 'MD': 1, '-NONE-': 2, 'VBP': 3, "''": 4, 'JJ': 5, 'LS': 6, 'PRP': 7, '#': 8, 'WDT': 9, '``': 10, 'DT': 11, 'POS': 12, 'NNS': 13, 'IN': 14, 'WP': 15, 'JJR': 16, 'NN': 17, 'VBN': 18, 'FW': 19, 'JJS': 20, 'PRP$': 21, 'VBD': 22, ':': 23, 'WP$': 24, '-LRB-': 25, 'TO': 26, 'WRB': 27, 'NNPS': 28, 'VB': 29, 'EX': 30, 'CC': 31, 'RB': 32, 'RBS': 33, 'VBZ': 34, 'PDT': 35, '-RRB-': 36, ',': 37, 'UH': 38, 'RP': 39, '.': 40, 'RBR': 41, 'VBG': 42, '$': 43, 'SYM': 44, 'CD': 45, '[PAD]': 46}


# 3. Modeling

In [6]:
from transformers import AutoModelForTokenClassification

model_name = 'QCRI/bert-base-multilingual-cased-pos-english'
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

accuracy = evaluate.load('accuracy')
ignore_label = len(label2id)

def compute_metrics(p):
    preds, labels = p
    mask = labels != ignore_label
    preds = np.argmax(preds, axis=-1)
    return accuracy.compute(
        predictions=preds[mask], 
        references=labels[mask], 
    )


Some weights of the model checkpoint at QCRI/bert-base-multilingual-cased-pos-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at QCRI/bert-base-multilingual-cased-pos-english and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([46, 768]) in the checkpoint and torch.Size([47, 768]) in the model i

# 4. Trainer

In [7]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./models',
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.1179,0.099477,0.975664
2,0.0694,0.057556,0.984581
3,0.0517,0.045646,0.987975
4,0.0456,0.041235,0.989007
5,0.0399,0.038206,0.989539
6,0.0387,0.036355,0.990118
7,0.0323,0.035458,0.990411
8,0.0324,0.034829,0.990484
9,0.0316,0.034378,0.990537
10,0.0309,0.03432,0.990604


TrainOutput(global_step=1720, training_loss=0.07758897974394088, metrics={'train_runtime': 23981.0756, 'train_samples_per_second': 1.142, 'train_steps_per_second': 0.072, 'total_flos': 3579914951838720.0, 'train_loss': 0.07758897974394088, 'epoch': 10.0})

# 5. Inference

In [10]:
# tokenization
test_sentence = "We are exploring the topic of deep learning"
device = "cuda" if torch.cuda.is_available() else 'cpu'
input = torch.as_tensor([tokenizer.convert_tokens_to_ids(test_sentence.split())])
input = input.to(device)

# prediction
outputs = model(input)
_, preds = torch.max(outputs.logits,-1)
preds = preds[0].cpu().numpy()

# decode
pred_tags = ""
for pred in preds:
    pred_tags += id2label[pred] + " "

pred_tags # => PRP VBP RB DT NN IN JJ NN

'PRP VBP VBG DT NN IN JJ NN '