In [5]:
import re

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
from datasets import Dataset

In [8]:
from datasets import DatasetDict

In [9]:
import transformers

In [10]:
from transformers import AutoTokenizer

In [11]:
from transformers import DataCollatorForTokenClassification

In [12]:
import evaluate

In [13]:
import numpy as np

In [14]:
from transformers import AutoModelForTokenClassification

In [15]:
from transformers import TrainingArguments

In [16]:
from transformers import Trainer

In [18]:
from util_funcs import get_most_recent_file

In [17]:
transformers.__version__

'4.21.3'

# Preparing the dataset

The training data set is a single text file. Each line of the file contains either 
* a word and tag separated by a tab, or 
* a blank line indicating the end of a sentence (or more precisely, couplet).

<code>read_labeled_data()</code> reads the data in and returns
* <b><code>token_sents</code></b> which is a list of lists of token strings, and<br>
* <b><code>tag_sents</code></b> which is a list of lists of tag strings.


In [19]:
def read_labeled_data(path):
    """Read a tab-separated token/tag file into parallel lists of sentences.

    The file is expected to hold one ``token<TAB>tag`` pair per line, with
    sentences (couplets) separated by one or more blank lines.

    Args:
        path: Path of the UTF-8 encoded annotation file.

    Returns:
        A ``(token_sents, tag_sents)`` tuple of two equally long lists; the
        i-th elements are the token strings and the tag strings of the i-th
        sentence. An empty (or whitespace-only) file yields ``([], [])``.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
            tab-separated fields.
    """
    with open(path, 'r', encoding='utf8') as annot_data_file:
        raw_text = annot_data_file.read().strip()

    token_sents = []
    tag_sents = []
    if not raw_text:
        # Splitting an empty string would yield one empty "sentence" whose
        # single empty line cannot be unpacked into (token, tag).
        return token_sents, tag_sents

    # A sentence ends at a run of one or more blank (or whitespace-only)
    # lines, so extra blank lines never leak into the next sentence.
    raw_sents = re.split(r'\n(?:[ \t\r]*\n)+', raw_text)
    for sent in raw_sents:
        tokens = []
        tags = []
        for line in sent.split('\n'):
            fields = line.split('\t')
            if len(fields) != 2:
                raise ValueError(
                    f'expected "token<TAB>tag" but got {line!r} in {path!r}'
                )
            token, tag = fields
            tokens.append(token)
            tags.append(tag)
        token_sents.append(tokens)
        tag_sents.append(tags)
    return token_sents, tag_sents

In [20]:
def print_aligned(token_list, tag_list, width=9):
    """Print two parallel sequences as two left-aligned columns.

    Each output row shows the item from ``tag_list`` first, padded to
    ``width`` characters, followed by the matching item from ``token_list``.
    Iteration stops at the end of the shorter sequence.

    Args:
        token_list: Items printed in the second (right-hand) column.
        tag_list: Items printed in the first (left-hand) column; each one is
            converted with ``str()`` before its length is measured.
        width: Width of the first column in characters (default 9).
    """
    for token, tag in zip(token_list, tag_list):
        label = str(tag)
        # Keep at least one space between the columns: a label as long as
        # (or longer than) the column would otherwise touch the token or get
        # a padding computed from a negative length.
        gap = ' ' * max(width - len(label), 1)
        print(label, token, sep=gap)

In [23]:
# Load the annotated corpus as two parallel lists: token sentences and tag sentences.
# NOTE(review): get_most_recent_file comes from the local util_funcs module; presumably it
# returns the path of the newest 'tagged_data_final*' file under data/tagged — confirm.
sents_list_tokens, sents_list_tags = read_labeled_data(get_most_recent_file(dir_path='data/tagged', prefix='tagged_data_final'))

In [24]:
sents_list_tokens

[['چو',
  'بنمود',
  'و',
  'برگشت',
  'و',
  'بهرام',
  'رفت',
  'خرامان',
  'بدان',
  'بیشهٔ',
  'کرگ',
  'تفت'],
 ['بدارید',
  'چون',
  'پیش',
  'بود',
  'اصفهان',
  'ز',
  'هر',
  'سو',
  'پراگنده',
  'کارآگهان'],
 ['سه',
  'دیگر',
  'سیامک',
  'ز',
  'توران',
  'سپاه',
  'بشد',
  'با',
  'گرازه',
  'به',
  'آوردگاه'],
 ['ز',
  'هیشوی',
  'قیصر',
  'بپرسد',
  'سخن',
  'نوست',
  'این',
  'نگشتست',
  'باری',
  'کهن'],
 ['برآشفت', 'ز', 'آوازش', 'اسفندیار', 'پیامی', 'فرستاد', 'زی', 'گرگسار'],
 ['ابا',
  'لشکر',
  'نوذر',
  'افراسیاب',
  'چو',
  'دریای',
  'جوشان',
  'بد',
  'و',
  'رود',
  'آب'],
 ['نخستین',
  'ز',
  'تور',
  'ایدر',
  'آمد',
  'بدی',
  'که',
  'برخاست',
  'زو',
  'فرهٔ',
  'ایزدی'],
 ['هم\u200cآورد',
  'با',
  'گیو',
  'نزدیک',
  'شد',
  'جهان',
  'چون',
  'شب',
  'تیره',
  'تاریک',
  'شد'],
 ['دلیران',
  'چو',
  'بهرام',
  'را',
  'یافتند',
  'پر',
  'از',
  'آب',
  'و',
  'خون',
  'دیده',
  'بشتافتند'],
 ['چو',
  'آمد',
  'بران',
  'شارستان',
  'بزرگ',
  'که',
  'می

In [25]:
sents_list_tags

[['O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O'],
 ['O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'B-PER'],
 ['O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'B-PER', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O'],
 ['O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O'],
 ['O', 'O

In [26]:
print_aligned(sents_list_tokens[0], sents_list_tags[0])

O        چو
O        بنمود
O        و
O        برگشت
O        و
B-PER    بهرام
O        رفت
O        خرامان
O        بدان
O        بیشهٔ
O        کرگ
O        تفت


Creating a train/validation split (the output data has the same format as the input data – list of lists):

In [27]:
# Hold out 20% of the couplets for validation. The seed is fixed so that the
# same sentences land in each split on every run, which keeps the index-based
# examples and the reported metrics reproducible.
train_tokens, val_tokens, train_tags, val_tags = train_test_split(
    sents_list_tokens, sents_list_tags, test_size=.2, random_state=42
)

Tag encodings (tag ↔ id mappings), reused later for the model's label configuration and inference widget:

In [73]:
# create a list of unique tags
unique_tags = sorted(set(tag for sent in sents_list_tags for tag in sent))
# assign each tag a number (id): "tag: id" dictionary
# (ids follow the sorted order of the tag strings; align_labels_with_tokens()
# maps ids 0 and 1 to ids 2 and 3 for word-internal sub-tokens, so it depends
# on this exact ordering of the B-/I- tags)
tag_to_id = {tag: t_id for t_id, tag in enumerate(unique_tags)}
# reverse dictionary to create "id: tag" dictionary
id_to_tag = {t_id: tag for tag, t_id in tag_to_id.items()}
# the same "id: tag" mapping, built directly from the enumeration instead of by inversion
id_to_tag_2 = {t_id: tag for t_id, tag in enumerate(unique_tags)}

In [74]:
id_to_tag

{0: 'B-LOC', 1: 'B-PER', 2: 'I-LOC', 3: 'I-PER', 4: 'O'}

In [75]:
id_to_tag_2

{0: 'B-LOC', 1: 'B-PER', 2: 'I-LOC', 3: 'I-PER', 4: 'O'}

In [71]:
type(tag_to_id['B-PER'])

int

In [33]:
unique_tags

['B-LOC', 'B-PER', 'I-LOC', 'I-PER', 'O']

In [34]:
print_aligned(unique_tags, [tag_to_id[t] for t in unique_tags])

0        B-LOC
1        B-PER
2        I-LOC
3        I-PER
4        O


Converting the data to a <code>Dataset</code> object:

In [35]:
# Encode the training tags as integer ids, sentence by sentence
train_tag_ids = [[tag_to_id[t] for t in tag_list] for tag_list in train_tags]
# Column-oriented dict: one 'tokens' list and one 'ner_tags' list per sentence
train_data_dict = {'tokens': train_tokens, 'ner_tags': train_tag_ids}
train_dataset = Dataset.from_dict(train_data_dict)
train_dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 7278
})

In [36]:
# Same encoding as for the training split, applied to the validation sentences
val_tag_ids = [[tag_to_id[t] for t in tag_list] for tag_list in val_tags]
val_data_dict = {'tokens': val_tokens, 'ner_tags': val_tag_ids}
val_dataset = Dataset.from_dict(val_data_dict)
val_dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 1820
})

And saving the datasets in a <code>DatasetDict</code>:

In [37]:
# Bundle both splits under the keys 'train' and 'validation' so they can be
# preprocessed together with a single map() call later on
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 7278
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 1820
    })
})

## Tokenization and alignment

Loading the <code>tokenizer</code> of the pretrained model, which will then be used to convert the tokens to token IDs:

In [38]:
# Name of the pretrained checkpoint; the same name is reused below when loading the model
model_checkpoint = 'HooshvareLab/bert-fa-zwnj-base'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Making sure the tokenizer is backed by the 🤗 Tokenizers library, so there’s a “fast” version available:

In [39]:
tokenizer.is_fast

True

Let's test the <code>tokenizer</code> on the first sentence (couplet) from our data. We are using the tokenizer with the <code>is_split_into_words=True</code> flag to tokenize (subword-tokenization) a pre-tokenized (word-level tokenization) input:

In [40]:
# Sub-word tokenize the first training sentence, which is already split into words
test_input = tokenizer(dataset_dict['train'][0]['tokens'], is_split_into_words=True)
# print_aligned() prints its second argument first, so each row shows the
# word index followed by the sub-word token that belongs to that word
print_aligned(test_input.tokens(), test_input.word_ids())

None     [CLS]
0        روا
0        ##رو
1        [UNK]
2        ز
3        درگاه
4        سام
5        مه
6        بانوان
7        خواندند
7        ##ش
8        به
9        نام
None     [SEP]


In [41]:
len(test_input.tokens())

14

In [42]:
len(dataset_dict['train'][0]['ner_tags'])

10

The tokenizer added the special tokens used by the model (<code>[CLS]</code> at the beginning and <code>[SEP]</code> at the end) and left most of the words untouched. Some words, however, were tokenized into multiple subwords. This introduces a mismatch between the inputs and the tags: the list of tags has only 10 elements, whereas the input now has 14 tokens. Accounting for the special tokens is easy, but we also need to make sure we align all the tags with the proper words.

We need to expand the tag list to match the tokens. First rule to apply:
* special tokens get a label of <code>-100</code>.

This is because <code>-100</code> is, by default, an index that is ignored by the loss function we will use (cross entropy). Then, 
* each token gets the same label as the token that started the word it’s inside,

since they are part of the same entity. Lastly,
* we replace the B- with I- for tokens inside a word, but not at the beginning (since the token does not begin the entity).

In [43]:
def align_labels_with_tokens(tag_ids, word_ids, begin_to_inside=None):
    """Expand word-level tag ids to one label per sub-word token.

    Args:
        tag_ids: Tag id of every word in the sentence, indexed by word position.
        word_ids: For every sub-word token, the index of the word it belongs
            to, or ``None`` for a special token.
        begin_to_inside: Optional mapping from a "B-" tag id to the matching
            "I-" tag id. It is applied to every token that continues a word.
            Ids missing from the mapping are kept unchanged. Defaults to
            ``{0: 2, 1: 3}``, i.e. the layout B-LOC=0, B-PER=1, I-LOC=2,
            I-PER=3 used in this notebook; pass an explicit mapping for any
            other tag inventory.

    Returns:
        A list with one label per entry of ``word_ids``: ``-100`` for special
        tokens, the word's tag id for the first token of a word, and the
        "inside" variant of that tag id for the remaining tokens of the word.
    """
    if begin_to_inside is None:
        # Historic behaviour of this function: ids 0 and 1 are shifted by two.
        begin_to_inside = {0: 2, 1: 3}

    new_tag_ids = []
    current_word_id = None
    for word_id in word_ids:
        if word_id is None:
            # Special token: -100 is ignored by the loss function
            new_tag_ids.append(-100)
        elif word_id != current_word_id:
            # Start of a new word: keep the word's own tag
            new_tag_ids.append(tag_ids[word_id])
        else:
            # Same word as previous token: a B-XXX label becomes I-XXX
            tag_id = tag_ids[word_id]
            new_tag_ids.append(begin_to_inside.get(tag_id, tag_id))
        current_word_id = word_id

    return new_tag_ids

Try it out on a sentence from the dataset:

In [44]:
# Index of the training sentence used for this spot check
n = 789
test_input = tokenizer(dataset_dict['train'][n]['tokens'], is_split_into_words=True)
aligned_tag_ids = align_labels_with_tokens(dataset_dict['train'][n]['ner_tags'], test_input.word_ids())
# [1:-1] drops the first and last positions ([CLS] / [SEP]); their label is
# -100, which has no entry in id_to_tag
print_aligned(test_input.tokens()[1:-1], [id_to_tag[t] for t in aligned_tag_ids[1:-1]])

O        ز
B-PER    خسرو
O        نب
O        ##د
O        پیش
O        ازین
O        کینه
O        چیز
O        کنون
O        کینه
O        بر
O        کین
O        بیف
O        ##زود
O        نیز


Before sub-word tokenization and alignment:

In [45]:
print_aligned(train_tokens[n], train_tags[n])

O        ز
B-PER    خسرو
O        نبد
O        پیش
O        ازین
O        کینه
O        چیز
O        کنون
O        کینه
O        بر
O        کین
O        بیفزود
O        نیز


To preprocess our whole dataset, we need to tokenize all the inputs and apply <code>align_labels_with_tokens()</code> on all the labels. To take advantage of the speed of our fast tokenizer, we will tokenize lots of texts at the same time. We’ll write a function that processes a list of examples and use the <code>Dataset.map()</code> method with the option <code>batched=True</code>. The <code>word_ids()</code> function needs to get the index of the example we want the word IDs of when the inputs to the tokenizer are lists of texts (or in our case, list of lists of words), so we add that too:

In [46]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach aligned labels.

    Args:
        examples: Batch with a 'tokens' entry (one word list per sentence) and
            a 'ner_tags' entry (one tag-id list per sentence).

    Returns:
        The tokenizer output for the batch, extended with a 'labels' entry
        holding one aligned label list per sentence.
    """
    encodings = tokenizer(
        examples['tokens'], truncation=True, is_split_into_words=True
    )
    # word_ids(idx) gives the token-to-word mapping of the idx-th sentence
    encodings['labels'] = [
        align_labels_with_tokens(sentence_tags, encodings.word_ids(idx))
        for idx, sentence_tags in enumerate(examples['ner_tags'])
    ]
    return encodings

In [47]:
# Preprocess both splits in batches. remove_columns drops the original
# 'tokens' and 'ner_tags' columns, leaving only what the preprocessing
# function returns (tokenizer outputs plus 'labels').
tokenized_datasets = dataset_dict.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset_dict['train'].column_names,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [48]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 7278
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1820
    })
})

In [49]:
tokenized_datasets.shape

{'train': (7278, 4), 'validation': (1820, 4)}

In [50]:
tokenized_datasets.column_names

{'train': ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
 'validation': ['input_ids', 'token_type_ids', 'attention_mask', 'labels']}

In [51]:
for key, value in tokenized_datasets['train'][0].items():
    print(key, ': ', value, sep='')

input_ids: [2, 25288, 1956, 1, 607, 5988, 2588, 2175, 8920, 15094, 1121, 1923, 2118, 3]
token_type_ids: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
attention_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
labels: [-100, 4, 4, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4, -100]


In [52]:
# tokenized_datasets.set_format(type='torch', columns=['attention_mask', 'input_ids', 'tags', 'token_type_ids'])

In [None]:
# tokenized_datasets['train'].format

In [None]:
# tokenized_datasets.cleanup_cache_files()

# Fine-tuning the model with the <code>Trainer</code> API

## Data collation

We can’t just use a DataCollatorWithPadding because that only pads the inputs (input IDs, attention mask, and token type IDs). Here our labels should be padded the exact same way as the inputs so that they stay the same size, using <code>-100</code> as a value so that the corresponding predictions are ignored in the loss computation.

This is all done by a DataCollatorForTokenClassification. Like the DataCollatorWithPadding, it takes the tokenizer used to preprocess the inputs:

In [53]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [54]:
# Collate the first two training examples into one batch and inspect its label tensor
batch = data_collator([tokenized_datasets['train'][i] for i in range(2)])
batch['labels']

tensor([[-100,    4,    4,    4,    4,    4,    1,    4,    4,    4,    4,    4,
            4, -100],
        [-100,    4,    4,    4,    4,    4,    4,    4,    4,    4,    4,    4,
            4, -100]])

In [55]:
for i in range(2):
    print(tokenized_datasets['train'][i]['labels'])

[-100, 4, 4, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4, -100]
[-100, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, -100]


## Metrics

To have the Trainer compute a metric every epoch, we will need to define a <code>compute_metrics()</code> function that takes the arrays of predictions and labels, and returns a dictionary with the metric names and values.

The traditional framework used to evaluate token classification prediction is <i>seqeval</i>. To use this metric, we first need to install the <i>seqeval</i> and <i>evaluate</i> libraries:

In [None]:
# ! pip install seqeval
# ! pip install evaluate

We can then load it via the <code>evaluate.load()</code> function (after importing <code>evaluate</code>):

In [56]:
metric = evaluate.load("seqeval")

This metric does not behave like the standard accuracy: it will actually take the lists of labels as strings, not integers, so we will need to fully decode the predictions and labels before passing them to the metric. Let’s see how it works. First, we’ll get the labels for a training example:

In [57]:
# Pick the first training sentence that contains both tag id 0 and tag id 1
# (B-LOC and B-PER in id_to_tag); labels stays empty if no sentence qualifies
labels = []
for l in dataset_dict['train']['ner_tags']:
    if 0 in l and 1 in l:
        labels = l.copy()
        break
# Decode the tag ids back to tag strings, as the metric works on strings
labels = [id_to_tag[i] for i in labels]
labels

['O', 'O', 'B-LOC', 'O', 'B-PER', 'O', 'O', 'O', 'O']

We can then create fake predictions for those by just changing the value at index 2:

In [59]:
# Start from a copy of the gold labels and overwrite one position with "O"
# to simulate a single prediction error
predictions = labels.copy()
predictions[2] = "O"
predictions

['O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O']

Note that the metric takes a list of predictions (not just one) and a list of labels:

In [60]:
metric.compute(predictions=[predictions], references=[labels])

  _warn_prf(average, modifier, msg_start, len(result))


{'LOC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1},
 'PER': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 0.5,
 'overall_f1': 0.6666666666666666,
 'overall_accuracy': 0.8888888888888888}

This is sending back a lot of information! We get the precision, recall, and F1 score for each separate entity, as well as overall. You can tweak the <code>compute_metrics()</code> function to return exactly the metrics you would like reported.

This <code>compute_metrics()</code> function first takes the argmax of the logits to convert them to predictions (as usual, the logits and the probabilities are in the same order, so we don’t need to apply the softmax). Then we have to convert both labels and predictions from integers to strings. We remove all the values where the label is <code>-100</code>, then pass the results to the <code>metric.compute()</code> method:

In [61]:
def compute_metrics(eval_preds):
    """Score predictions with the module-level ``metric``.

    Args:
        eval_preds: Pair ``(logits, labels)``; the prediction for each
            position is the argmax of the logits over the last axis.

    Returns:
        Whatever ``metric.compute()`` returns for the decoded tag strings.
    """
    logits, gold_ids = eval_preds
    pred_ids = np.argmax(logits, axis=-1)

    # Gold tags as strings, skipping every position labelled -100
    references = []
    for gold_row in gold_ids:
        references.append([id_to_tag[g] for g in gold_row if g != -100])

    # Predicted tags as strings, kept only where the gold label is not -100
    hypotheses = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        hypotheses.append(
            [id_to_tag[p] for p, g in zip(pred_row, gold_row) if g != -100]
        )

    return metric.compute(predictions=hypotheses, references=references)

Now that this is done, we are almost ready to define our <code>Trainer</code>. We just need a model to fine-tune!

## Defining the model

Since we are working on a token classification problem, we will use the <code>AutoModelForTokenClassification</code> class. The main thing to remember when defining this model is to pass along some information on the number of labels we have. The easiest way to do this is to pass that number with the <code>num_labels</code> argument, but if we want a nice inference widget working like the one we saw at the beginning of this section, it’s better to set the correct label correspondences instead.

They should be set by two dictionaries, <code>id2label</code> and <code>label2id</code>, which contain the mappings from ID to label and vice versa:

In [62]:
id2label = id_to_tag.copy()
id2label

{0: 'B-LOC', 1: 'B-PER', 2: 'I-LOC', 3: 'I-PER', 4: 'O'}

In [63]:
label2id = tag_to_id.copy()
label2id

{'B-LOC': 0, 'B-PER': 1, 'I-LOC': 2, 'I-PER': 3, 'O': 4}

Now we can just pass them to the <code>AutoModelForTokenClassification.from_pretrained()</code> method, and they will be set in the model’s configuration and properly saved:

In [64]:
# Load the pretrained checkpoint with a token-classification head; the two
# label mappings are passed along so they end up in the model's configuration
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at HooshvareLab/bert-fa-zwnj-base were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at HooshvareLab/bert-fa-zwnj-

Let’s double-check that our model has the right number of labels:

In [65]:
model.config.num_labels

5

## Fine-tuning the model

We are now ready to train our model! We just need to do one last thing before we define our <code>Trainer</code>: define our training arguments.

In [68]:
# Training configuration: the first positional argument is the output
# directory; evaluation and checkpoint saving both happen once per epoch
args = TrainingArguments(
    "bert-fa-zwnj-base-shahnameh-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

Finally, we just pass everything to the <code>Trainer</code> and launch the training:

In [69]:
# Wire together the model, the tokenized splits, the padding collator and the
# seqeval-based metric function defined above
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [70]:
trainer.train()

***** Running training *****
  Num examples = 7278
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2730


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
trainer.evaluate()

In [None]:
trainer.save_model('fine-tuned_model')