In [None]:
!pip install datasets transformers seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 7.0 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.21.3-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 81.2 MB/s 
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 2.3 MB/s 
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 96.0 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 89.4 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux20

Then you need to install Git-LFS. Uncomment the following instructions:

In [None]:
!apt install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.3.4-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 20 not upgraded.


# **Fine-tuning a model on a token classification task**

In [None]:
import transformers

print(transformers.__version__)

4.21.3


In [None]:
task = "ner" # Should be one of "ner", "pos" or "chunk"
model_checkpoint = "alexaapo/greek_legal_bert_v2"
batch_size = 16

## **Loading the dataset**

In [None]:
#connect to G-Drive
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
from pathlib import Path
import re

def read_wnut(file_path):
    file_path = Path(file_path)

    raw_text = file_path.read_text().strip()
    raw_docs = re.split(r'\n\s?\n', raw_text)

    token_docs = []
    tag_docs = []
    for doc in raw_docs:
        tokens = []
        tags = []
        for line in doc.split('\n'):
            token, tag = line.split(' ')
            tokens.append(token)
            tags.append(tag)
        token_docs.append(tokens)
        tag_docs.append(tags)

    return token_docs, tag_docs

texts, tags = read_wnut('/content/gdrive/MyDrive/Colab Notebooks/ORG_IOB_tags.txt')

In [None]:
texts = [ele for ele in texts if ele != ['']]
tags = [ele for ele in tags if ele != ['']]

Have a look on data! 

In [None]:
print(texts[0][0:100], tags[0][0:100], sep='\n')

['"', '1.', 'Αποστολή', 'του', 'Υπουργείου', 'Ανάπτυξης', 'και', 'Επενδύσεων', 'είναι', 'η', 'ισόρροπη', 'οικονομική', 'ανάπτυξη', 'των', 'περιφερειών', 'της', 'χώρας', ',', 'και', 'ιδίως', 'η', 'αύξηση', 'του', 'κατά', 'κεφαλήν', 'ακαθάριστου', 'εγχώριου', 'παραγόμενου', 'προϊόντος', ',', 'όπως', 'προκύπτει', 'από', 'την', 'ενίσχυση', 'των', 'επενδύσεων', 'ιδιωτικού', 'και', 'δημόσιου', 'τομέα', 'και', 'την', 'αύξηση', 'του', 'διαθέσιμου', 'εισοδήματος', ',', 'επιχειρήσεων', 'και', 'πολιτών', ',', 'που', 'καταναλώνεται', '.', '"']
['O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


## **Preprocessing the data**

Split Dataset

In [None]:
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=.2)

Dictionary of Tags to IDs and vise-versa

In [None]:
unique_tags = set(tag for doc in tags for tag in doc)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

In [None]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/497k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/89.0 [00:00<?, ?B/s]

Since our inputs have already been split into words, we pass the list of words to your tokenzier with the argument `is_split_into_words=True`:

In [None]:
train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
val_encodings = tokenizer(val_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Encode our Dataset

In [None]:
import numpy as np

def encode_tags(tags, encodings):
    labels = [[tag2id[tag] for tag in doc] for doc in tags]
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        # create an empty array of -100
        doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
        arr_offset = np.array(doc_offset)

        # set labels whose first offset position is 0 and the second is not 0
        doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
        encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels

train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)

In [None]:
import torch

class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_encodings.pop("offset_mapping") # we don't want to pass this to the model
val_encodings.pop("offset_mapping")
train_dataset = Dataset(train_encodings, train_labels)
val_dataset = Dataset(val_encodings, val_labels)

In [None]:
print(train_dataset[0]['input_ids'],train_dataset[0]['labels'])

tensor([    2,  4318,    13,   135,  2536,   426, 28303,   274,   414,   426,
        15305,   451, 21932,   422,   554, 32197,   142,   531,   414, 14963,
        21513,  5658,  5029,   414,  1988,   972,  2137,    18,     3,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

## **Fine-tuning the model**

Now that our data is ready, we can download the pretrained model and fine-tune it. Since all our tasks are about token classification, we use the `AutoModelForTokenClassification` class. Like with the tokenizer, the `from_pretrained` method will download and cache the model for us. The only thing we have to specify is the number of labels for our problem (which we can get from the features, as seen before):

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(unique_tags))

Downloading pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at alexaapo/greek_legal_bert_v2 were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

To instantiate a `Trainer`, we will need to define three more things. The most important is the [`TrainingArguments`](https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments), which is a class that contains all the attributes to customize the training. It requires one folder name, which will be used to save the checkpoints of the model, and all other arguments are optional:

In [None]:
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-{task}",
    evaluation_strategy = "epoch", 
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    push_to_hub=False,)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


The last thing to define for our `Trainer` is how to compute the metrics from the predictions. Here we will load the [`seqeval`](https://github.com/chakki-works/seqeval) metric (which is commonly used to evaluate results on the CONLL dataset) via the Datasets library.

In [None]:
from datasets import load_metric
metric = load_metric("seqeval")

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

This metric takes list of labels for the predictions and references:

In [None]:
labels = [id2tag[i] for i in train_labels[0] if i!= -100]
metric.compute(predictions=[labels], references=[labels])

{'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

So we will need to do a bit of post-processing on our predictions:
- select the predicted index (with the maximum logit) for each token
- convert it to its string label
- ignore everywhere we set a label of -100

The following function does all this post-processing on the result of `Trainer.evaluate` (which is a namedtuple containing predictions and labels) before applying the metric:

In [None]:
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [id2tag[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2tag[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

Note that we drop the precision/recall/f1 computed for each category and only focus on the overall precision/recall/f1/accuracy.

Then we just need to pass all of this along with our datasets to the `Trainer`:

In [None]:
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,           # evaluation dataset
    compute_metrics=compute_metrics
)

trainer.train()

***** Running training *****
  Num examples = 404
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 130


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.181193,0.697802,0.808917,0.749263,0.967442
2,No log,0.193741,0.698324,0.796178,0.744048,0.968106
3,No log,0.189425,0.72,0.802548,0.759036,0.968771
4,No log,0.196384,0.705556,0.808917,0.753709,0.96711
5,No log,0.201354,0.705556,0.808917,0.753709,0.966777


***** Running Evaluation *****
  Num examples = 101
  Batch size = 16
***** Running Evaluation *****
  Num examples = 101
  Batch size = 16
***** Running Evaluation *****
  Num examples = 101
  Batch size = 16
***** Running Evaluation *****
  Num examples = 101
  Batch size = 16
***** Running Evaluation *****
  Num examples = 101
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=130, training_loss=0.003036463260650635, metrics={'train_runtime': 83.5934, 'train_samples_per_second': 24.165, 'train_steps_per_second': 1.555, 'total_flos': 401022698846760.0, 'train_loss': 0.003036463260650635, 'epoch': 5.0})

The `evaluate` method allows you to evaluate again on the evaluation dataset or on another dataset:

In [None]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 101
  Batch size = 16


{'eval_loss': 0.20135380327701569,
 'eval_precision': 0.7055555555555556,
 'eval_recall': 0.8089171974522293,
 'eval_f1': 0.7537091988130564,
 'eval_accuracy': 0.9667774086378738,
 'eval_runtime': 0.7194,
 'eval_samples_per_second': 140.396,
 'eval_steps_per_second': 9.73,
 'epoch': 5.0}

To get the precision/recall/f1 computed for each category now that we have finished training, we can apply the same function as before on the result of the `predict` method:

In [None]:
predictions, labels, _ = trainer.predict(val_dataset)
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [id2tag[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [id2tag[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

***** Running Prediction *****
  Num examples = 101
  Batch size = 16


{'ORG': {'precision': 0.7692307692307693,
  'recall': 0.8241758241758241,
  'f1': 0.7957559681697612,
  'number': 182},
 'overall_precision': 0.7692307692307693,
 'overall_recall': 0.8241758241758241,
 'overall_f1': 0.7957559681697612,
 'overall_accuracy': 0.9765647327807945}

## Save Model

In [None]:
model.save_pretrained("gdrive/MyDrive/path/to/model")


Configuration saved in gdrive/MyDrive/path/to/model/config.json
Model weights saved in gdrive/MyDrive/path/to/model/pytorch_model.bin


In [None]:
tokenizer.save_pretrained("gdrive/MyDrive/path/to/model")


tokenizer config file saved in gdrive/MyDrive/path/to/model/tokenizer_config.json
Special tokens file saved in gdrive/MyDrive/path/to/model/special_tokens_map.json


('gdrive/MyDrive/path/to/model/tokenizer_config.json',
 'gdrive/MyDrive/path/to/model/special_tokens_map.json',
 'gdrive/MyDrive/path/to/model/vocab.txt',
 'gdrive/MyDrive/path/to/model/added_tokens.json',
 'gdrive/MyDrive/path/to/model/tokenizer.json')

## Make tests

In [None]:
model_test = AutoModelForTokenClassification.from_pretrained('/content/gdrive/MyDrive/path/to/model')    
tokenizer_test = AutoTokenizer.from_pretrained('/content/gdrive/MyDrive/path/to/model')

loading configuration file /content/gdrive/MyDrive/path/to/model/config.json
Model config BertConfig {
  "_name_or_path": "/content/gdrive/MyDrive/path/to/model",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.21.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 35000
}

loading weights file /content/gdrive/MyDrive/path/to/model/py

In [None]:
from transformers import TokenClassificationPipeline

pipe = TokenClassificationPipeline(model=model_test, tokenizer=tokenizer_test)
pipe("Γραφείο Υπευθύνου Προστασίας Δεδομένων (ΥΠΔ) (3) Γραφείο Συμβούλου Ακεραιότητας (ΓΣΑ) (4) Μονάδα Εσωτερικού Ελέγχου (ΜΕΕ")

[{'entity': 'LABEL_0',
  'score': 0.9356431,
  'index': 1,
  'word': 'γραφειο',
  'start': 0,
  'end': 7},
 {'entity': 'LABEL_2',
  'score': 0.9999579,
  'index': 2,
  'word': 'υπευθυνου',
  'start': 8,
  'end': 17},
 {'entity': 'LABEL_2',
  'score': 0.9999553,
  'index': 3,
  'word': 'προστασιας',
  'start': 18,
  'end': 28},
 {'entity': 'LABEL_2',
  'score': 0.99992347,
  'index': 4,
  'word': 'δεδομενων',
  'start': 29,
  'end': 38},
 {'entity': 'LABEL_1',
  'score': 0.9988728,
  'index': 5,
  'word': '(',
  'start': 39,
  'end': 40},
 {'entity': 'LABEL_0',
  'score': 0.9996486,
  'index': 6,
  'word': 'υπ',
  'start': 40,
  'end': 42},
 {'entity': 'LABEL_2',
  'score': 0.99952424,
  'index': 7,
  'word': '##δ',
  'start': 42,
  'end': 43},
 {'entity': 'LABEL_1',
  'score': 0.99954706,
  'index': 8,
  'word': ')',
  'start': 43,
  'end': 44},
 {'entity': 'LABEL_1',
  'score': 0.9996841,
  'index': 9,
  'word': '(',
  'start': 45,
  'end': 46},
 {'entity': 'LABEL_1',
  'score': 0.999