In [1]:
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_VISIBLE_DEVICES=0


In [2]:
import torch
import pandas as pd
import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline, TrainingArguments, Trainer
import datasets
from datasets import Dataset

In [3]:
model_name = "QCRI/bert-base-multilingual-cased-pos-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load dataset

In [4]:
lang = 'en'

if lang == 'en':
    TAG_FILE = 'data/ptb.2-21.tgs'
    TOKEN_FILE = 'data/ptb.2-21.txt'
    TEST_FILE = 'data/ptb.22.txt'
    TEST_TAGS = 'data/ptb.22.tgs'
    OUT_FILE = 'my_model.out'
elif lang == 'jv':
    TAG_FILE = 'data/jv.train.tgs'
    TOKEN_FILE = 'data/jv.train.txt'
    TEST_FILE = 'data/jv.test.txt'
    TEST_TAGS = 'data/jv.test.tgs'
    OUT_FILE = 'my_model_jv.out'
elif lang == 'btb':
    TAG_FILE = 'data/btb.train.tgs'
    TOKEN_FILE = 'data/btb.train.txt'
    TEST_FILE = 'data/btb.test.txt'
    TEST_TAGS = 'data/btb.test.tgs'
    OUT_FILE = 'my_model_btb.out'
else:
    print('No such language!')

In [5]:
with open(TAG_FILE) as tag_file, open(TOKEN_FILE) as token_file:
        tags = pd.Series(tag_file.readlines())
        tokens = pd.Series(token_file.readlines())

        if len(tags) != len(tokens):
            raise ValueError("Length is different for two files!")
        
tags = tags.str.split()
tokens = tokens.str.split()

train_dataset = pd.concat([tags, tokens], axis=1)
train_dataset.rename({0: 'tags', 1: 'tokens'}, axis=1, inplace=True)
train_dataset = Dataset.from_pandas(train_dataset)

In [6]:
def get_label_list(labels):
    # copied from https://github.com/huggingface/transformers/blob/66fd3a8d626a32989f4569260db32785c6cbf42a/examples/pytorch/token-classification/run_ner.py#L320
    unique_labels = set()
    for label in labels:
        unique_labels = unique_labels | set(label)
    label_list = list(unique_labels)
    label_list.sort()
    return label_list

In [7]:
all_labels_train = get_label_list(train_dataset["tags"])
train_dataset = train_dataset.cast_column("tags", datasets.Sequence(datasets.ClassLabel(names=all_labels_train)))

Casting the dataset:   0%|          | 0/39832 [00:00<?, ? examples/s]

In [8]:
train_dataset, dev_dataset = train_dataset.train_test_split(test_size=0.1).values()

In [9]:
dataset = datasets.DatasetDict({'train': train_dataset, 'dev': dev_dataset})

In [10]:
dataset

DatasetDict({
    train: Dataset({
        features: ['tags', 'tokens'],
        num_rows: 35848
    })
    dev: Dataset({
        features: ['tags', 'tokens'],
        num_rows: 3984
    })
})

# Preprocess dataset

In [11]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [12]:
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/35848 [00:00<?, ? examples/s]

Map:   0%|          | 0/3984 [00:00<?, ? examples/s]

In [13]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Evaluate

In [14]:
import evaluate

seqeval = evaluate.load("seqeval")

In [15]:
import numpy as np

label_list = dataset["train"].features[f"tags"].feature.names
example = dataset["train"][0]
labels = [label_list[i] for i in example["tags"]]

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Load Model

In [16]:
id2label = {idx: lbl for idx, lbl in enumerate(label_list)}
label2id = {lbl: idx for idx, lbl in enumerate(label_list)}

In [17]:
torch.cuda.is_available()

True

In [18]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [19]:
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True).to(device)

Some weights of the model checkpoint at QCRI/bert-base-multilingual-cased-pos-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at QCRI/bert-base-multilingual-cased-pos-english and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([46, 768]) in the checkpoint and torch.Size([45, 768]) in the model i

# Inference Before Training

In [20]:
model.eval()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, e

In [21]:
from transformers import pipeline

classifier = pipeline("ner", model=model, tokenizer=tokenizer, device=device)

In [22]:
import subprocess
with open(TEST_FILE) as fin, open(OUT_FILE, 'w') as fout:
    for line in fin.readlines():
        pos = classifier(line)
        predicted = [tag_info['entity'] for tag_info in pos if not tag_info['word'].startswith('##')]
        fout.write(" ".join(predicted) + '\n')
subprocess.run(['python', 'tag_acc.py', TEST_TAGS, OUT_FILE])



error rate by word:       0.9545329910013212  (38293 errors out of 40117)
error rate by sentence:   1.0  (1700 errors out of 1700)


CompletedProcess(args=['python', 'tag_acc.py', 'data/ptb.22.tgs', 'my_model.out'], returncode=0)

# Train

In [23]:
model.train()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, e

In [24]:
training_args = TrainingArguments(
    output_dir="model",
    learning_rate=3e-4,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_first_step=True,
    logging_strategy="steps",
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    report_to='none'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["dev"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
100,0.1222,0.071271,0.965321,0.968125,0.966721,0.977147
200,0.081,0.069491,0.965693,0.966812,0.966252,0.97686
300,0.0734,0.071859,0.965212,0.969246,0.967225,0.977646
400,0.0588,0.066123,0.966387,0.969182,0.967783,0.977837
500,0.0598,0.068149,0.965873,0.968137,0.967004,0.977476
600,0.0517,0.069659,0.966457,0.968341,0.967398,0.97772
700,0.0433,0.065733,0.969186,0.970099,0.969642,0.979302
800,0.0413,0.063613,0.969413,0.971093,0.970252,0.979811
900,0.0351,0.065887,0.968567,0.97122,0.969892,0.979419
1000,0.0294,0.066583,0.969,0.970864,0.969931,0.979599




TrainOutput(global_step=1405, training_loss=0.05128551634605245, metrics={'train_runtime': 650.8497, 'train_samples_per_second': 275.394, 'train_steps_per_second': 2.159, 'total_flos': 7522880809795200.0, 'train_loss': 0.05128551634605245, 'epoch': 5.0})

# Inference After Training

In [25]:
model.eval()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, e

In [26]:
from transformers import pipeline

classifier = pipeline("ner", model=model, tokenizer=tokenizer, device=device)

In [27]:
import subprocess
with open(TEST_FILE) as fin, open(OUT_FILE, 'w') as fout:
    for line in fin.readlines():
        pos = classifier(line)
        predicted = [tag_info['entity'] for tag_info in pos if not tag_info['word'].startswith('##')]
        fout.write(" ".join(predicted) + '\n')
subprocess.run(['python', 'tag_acc.py', TEST_TAGS, OUT_FILE])



error rate by word:       0.5127252785602113  (20569 errors out of 40117)
error rate by sentence:   0.8205882352941176  (1395 errors out of 1700)


CompletedProcess(args=['python', 'tag_acc.py', 'data/ptb.22.tgs', 'my_model.out'], returncode=0)