In [1]:
# Record the interpreter version this notebook was executed with.
!python -V

Python 3.10.11


In [8]:
import os
import itertools

import pandas as pd
import numpy as np
import torch
import evaluate
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification

In [10]:
# Report the installed torch build and whether a CUDA device is visible.
print(f"torch: {torch.__version__}")
print(f"Is GPU available: {torch.cuda.is_available()}")

torch: 2.0.0
Is GPU available: True


In [3]:
def get_all_tokens_and_ner_tags(directory):
    """Load every annotation file found directly inside ``directory``.

    Each regular file is parsed with ``get_tokens_and_ner_tags`` and the
    per-file frames are stacked into a single frame with a fresh 0..n-1 index.

    Args:
        directory: Path of the folder holding the annotation files.

    Returns:
        A ``pandas.DataFrame`` with the columns ``tokens`` and ``ner_tags``,
        one row per sentence.

    Raises:
        ValueError: If ``directory`` contains no regular files.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore anything seeded downstream) reproducible.
    candidate_paths = sorted(
        os.path.join(directory, entry) for entry in os.listdir(directory)
    )
    # Skip sub-directories (e.g. the ``.ipynb_checkpoints`` folder Jupyter
    # creates), which cannot be opened as annotation files.
    frames = [
        get_tokens_and_ner_tags(path)
        for path in candidate_paths
        if os.path.isfile(path)
    ]
    if not frames:
        raise ValueError(f"No annotation files found in {directory!r}")
    return pd.concat(frames, ignore_index=True)
    

def get_tokens_and_ner_tags(filename):
    """Parse one tab-separated annotation file into sentences.

    Every non-blank line is expected to hold a token in its first
    tab-separated column and the token's tag in its second column. Blank
    (empty or whitespace-only) lines separate sentences.

    Args:
        filename: Path of the UTF-8 encoded annotation file.

    Returns:
        A ``pandas.DataFrame`` with one row per sentence and two columns:
        ``tokens`` (list of token strings) and ``ner_tags`` (list of tag
        strings, aligned with ``tokens``).

    Raises:
        IndexError: If a non-blank line has no tab-separated second column.
    """
    with open(filename, 'r', encoding="utf8") as f:
        lines = f.readlines()

    # Group consecutive non-blank lines into sentences. Whitespace-only lines
    # count as separators too, so a stray space on an "empty" line does not
    # end up being parsed as a token row.
    sentences = [
        list(group)
        for is_blank, group in itertools.groupby(lines, lambda line: not line.strip())
        if not is_blank
    ]

    # Strip only the line terminator before splitting. Slicing off the last
    # character unconditionally would truncate the tag when the file's final
    # line has no trailing newline, or when a line carries extra columns.
    rows = [[line.rstrip('\n').split('\t') for line in sentence] for sentence in sentences]
    tokens = [[fields[0] for fields in sentence] for sentence in rows]
    entities = [[fields[1] for fields in sentence] for sentence in rows]
    return pd.DataFrame({'tokens': tokens, 'ner_tags': entities})
  
  
def get_un_token_dataset(train_directory, test_directory):
    """Build the train and test ``Dataset`` objects from two folders.

    Each folder is read with ``get_all_tokens_and_ner_tags`` and the resulting
    frame is converted with ``Dataset.from_pandas``.

    Args:
        train_directory: Folder containing the training annotation files.
        test_directory: Folder containing the test annotation files.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple.
    """
    train_dataset, test_dataset = (
        Dataset.from_pandas(get_all_tokens_and_ner_tags(folder))
        for folder in (train_directory, test_directory)
    )
    return (train_dataset, test_dataset)

In [4]:
# Tag inventory; a tag's position in this list is its integer class id.
label_list = [
    'O',
    'B-MISC', 'I-MISC',
    'B-PER', 'I-PER',
    'B-ORG', 'I-ORG',
    'B-LOC', 'I-LOC',
]

# Tag string -> class id used when building training labels.
label_encoding_dict = {
    # Tags that also appear in label_list, at their list positions.
    'O': 0,
    'B-MISC': 1,
    'I-MISC': 2,
    'B-PER': 3,
    'I-PER': 4,
    'B-ORG': 5,
    'I-ORG': 6,
    'B-LOC': 7,
    'I-LOC': 8,
    # Spellings that are not in label_list, folded onto existing ids
    # (presumably annotation typos in the data files — confirm).
    # NOTE(review): 'I-PRG' maps to I-MISC (2); if it is a typo for 'I-ORG'
    # it should be 6 — verify against the data.
    'I-PRG': 2,
    'I-I-MISC': 2,
    'I-OR': 6,
    'I-': 0,
    'VMISC': 0,
}

task = "ner"
model_checkpoint = "distilbert-base-uncased"
batch_size = 8

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

train_dataset, test_dataset = get_un_token_dataset(
    train_directory='../data/train/',
    test_directory='../data/test/',
)

In [5]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize pre-split sentences and align word-level tags to sub-tokens.

    Uses the notebook-level ``tokenizer``, ``task`` and ``label_encoding_dict``.

    Args:
        examples: A batch mapping with a ``"tokens"`` entry (one list of words
            per sentence) and a ``f"{task}_tags"`` entry (one list of tag
            strings per sentence, aligned with the words).
        label_all_tokens: When True (the default, matching the previous
            hard-coded behaviour) every sub-token of a word receives the
            word's label id. When False only the first sub-token is labelled
            and the remaining ones get -100.

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding, per
        sentence, one integer label per produced token.

    Raises:
        KeyError: If a tag is neither ``'0'`` nor a key of
            ``label_encoding_dict``.
    """
    tokenized_inputs = tokenizer(list(examples["tokens"]), truncation=True, is_split_into_words=True)

    labels = []
    for i, word_tags in enumerate(examples[f"{task}_tags"]):
        previous_word_idx = None
        label_ids = []
        for word_idx in tokenized_inputs.word_ids(batch_index=i):
            if word_idx is None:
                # Tokens that belong to no input word (word id is None) get -100.
                label_id = -100
            elif word_idx == previous_word_idx and not label_all_tokens:
                # Continuation sub-token of the previous word.
                label_id = -100
            elif word_tags[word_idx] == '0':
                # The digit zero is not a key of label_encoding_dict; it is
                # mapped to class 0 (presumably a typo for 'O' — confirm).
                label_id = 0
            else:
                tag = word_tags[word_idx]
                try:
                    label_id = label_encoding_dict[tag]
                except KeyError:
                    # Same exception type as before, but say which tag and
                    # where, so bad annotations can be located quickly.
                    raise KeyError(
                        f"Unknown NER tag {tag!r} at word {word_idx} of batch item {i}; "
                        "add it to label_encoding_dict or fix the annotation"
                    ) from None
            label_ids.append(label_id)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [6]:
# Apply tokenization and label alignment to both splits, batch by batch.
train_tokenized_dataset = train_dataset.map(
    function=tokenize_and_align_labels,
    batched=True,
)
test_tokenized_dataset = test_dataset.map(
    function=tokenize_and_align_labels,
    batched=True,
)

                                                                 

In [9]:
# Token-classification model loaded from the checkpoint, with one output
# class per entry of label_list.
# NOTE(review): no id2label/label2id mapping is passed here, so the saved
# config will not carry the tag names — consider adding them if downstream
# consumers do not rely on the default names.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
)

# Training hyper-parameters; evaluation runs once per epoch.
args = TrainingArguments(
    output_dir=f"test-{task}",
    evaluation_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=1e-5,
)

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
metric = evaluate.load("seqeval")


def compute_metrics(p):
    """Turn raw model outputs into overall seqeval scores.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` is reduced with
            an argmax over axis 2 (assumed to be the class axis — TODO
            confirm); ``labels`` holds integer label ids, where -100 marks
            positions to ignore.

    Returns:
        A dict with the keys ``precision``, ``recall``, ``f1`` and
        ``accuracy``, taken from the ``overall_*`` entries computed by the
        notebook-level ``metric``.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions whose gold label is not the -100 ignore value.
        kept_pairs = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        true_predictions.append([label_list[predicted] for predicted, _ in kept_pairs])
        true_labels.append([label_list[gold] for _, gold in kept_pairs])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
    
# Wire the model, hyper-parameters, tokenized splits, collator and metric
# function into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=test_tokenized_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune, run a final evaluation on the test split, then save the model.
trainer.train()
trainer.evaluate()
trainer.save_model('../models/un-ner.model')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN t

{'eval_loss': 0.05339331924915314, 'eval_precision': 0.7843881856540085, 'eval_recall': 0.8618451553082985, 'eval_f1': 0.8212944554892866, 'eval_accuracy': 0.982084804418499, 'eval_runtime': 7.0686, 'eval_samples_per_second': 293.409, 'eval_steps_per_second': 36.782, 'epoch': 1.0}


 36%|███▋      | 500/1374 [00:52<01:20, 10.86it/s]

{'loss': 0.0876, 'learning_rate': 6.360989810771471e-05, 'epoch': 1.09}


                                                  
 67%|██████▋   | 916/1374 [01:40<00:39, 11.60it/s]

{'eval_loss': 0.05892335996031761, 'eval_precision': 0.8255506607929516, 'eval_recall': 0.8687992582290218, 'eval_f1': 0.8466229952563813, 'eval_accuracy': 0.983465585346244, 'eval_runtime': 7.7334, 'eval_samples_per_second': 268.186, 'eval_steps_per_second': 33.62, 'epoch': 2.0}


 73%|███████▎  | 1000/1374 [01:49<00:35, 10.54it/s]

{'loss': 0.0277, 'learning_rate': 2.7219796215429405e-05, 'epoch': 2.18}


                                                   
100%|██████████| 1374/1374 [02:34<00:00,  8.92it/s]


{'eval_loss': 0.05757793039083481, 'eval_precision': 0.845216606498195, 'eval_recall': 0.8683356513676402, 'eval_f1': 0.8566201692202149, 'eval_accuracy': 0.9851784528262313, 'eval_runtime': 7.7402, 'eval_samples_per_second': 267.953, 'eval_steps_per_second': 33.591, 'epoch': 3.0}
{'train_runtime': 154.1108, 'train_samples_per_second': 71.189, 'train_steps_per_second': 8.916, 'train_loss': 0.04624308283499061, 'epoch': 3.0}


100%|██████████| 260/260 [00:08<00:00, 31.19it/s]
