In [1]:
!pip install datasets transformers seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 6.9 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.21.3-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 73.4 MB/s 
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 2.3 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 76.7 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 87.4 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux20

Then you need to install Git-LFS. Uncomment the following instructions:

In [2]:
!apt install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.3.4-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 20 not upgraded.


In [4]:
!huggingface-cli login



        _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
        _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
        _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
        _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
        _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

        To login, `huggingface_hub` now requires a token generated from https://huggingface.co/settings/tokens .
        
Token: 
Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in yo

# **Fine-tuning a model on a token classification task**

In [5]:
import transformers

print(transformers.__version__)

4.21.3


In [6]:
task = "ner" # Should be one of "ner", "pos" or "chunk"
model_checkpoint = "alexaapo/greek_legal_bert_v2"
batch_size = 16

## **Loading the dataset**

In [7]:
#connect to G-Drive
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [8]:
from pathlib import Path
import re

def read_wnut(file_path):
    file_path = Path(file_path)

    raw_text = file_path.read_text().strip()
    raw_docs = re.split(r'\n\s?\n', raw_text)

    token_docs = []
    tag_docs = []
    for doc in raw_docs:
        tokens = []
        tags = []
        for line in doc.split('\n'):
            token, tag = line.split(' ')
            tokens.append(token)
            tags.append(tag)
        token_docs.append(tokens)
        tag_docs.append(tags)

    return token_docs, tag_docs

texts, tags = read_wnut('/content/gdrive/MyDrive/Colab Notebooks/ORG_IOB_tags.txt')

In [9]:
texts = [ele for ele in texts if ele != ['']]
tags = [ele for ele in tags if ele != ['']]

Have a look on data! 

In [10]:
print(texts[0][0:100], tags[0][0:100], sep='\n')

['"', '1.', 'Αποστολή', 'του', 'Υπουργείου', 'Ανάπτυξης', 'και', 'Επενδύσεων', 'είναι', 'η', 'ισόρροπη', 'οικονομική', 'ανάπτυξη', 'των', 'περιφερειών', 'της', 'χώρας', ',', 'και', 'ιδίως', 'η', 'αύξηση', 'του', 'κατά', 'κεφαλήν', 'ακαθάριστου', 'εγχώριου', 'παραγόμενου', 'προϊόντος', ',', 'όπως', 'προκύπτει', 'από', 'την', 'ενίσχυση', 'των', 'επενδύσεων', 'ιδιωτικού', 'και', 'δημόσιου', 'τομέα', 'και', 'την', 'αύξηση', 'του', 'διαθέσιμου', 'εισοδήματος', ',', 'επιχειρήσεων', 'και', 'πολιτών', ',', 'που', 'καταναλώνεται', '.', '"']
['O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


## **Preprocessing the data**

Split Dataset

In [11]:
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=.2)

Dictionary of Tags to IDs and vise-versa

In [12]:
unique_tags = set(tag for doc in tags for tag in doc)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

In [13]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/497k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/89.0 [00:00<?, ?B/s]

Since our inputs have already been split into words, we pass the list of words to your tokenzier with the argument `is_split_into_words=True`:

In [14]:
train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
val_encodings = tokenizer(val_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Encode our Dataset

In [15]:
import numpy as np

def encode_tags(tags, encodings):
    labels = [[tag2id[tag] for tag in doc] for doc in tags]
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        # create an empty array of -100
        doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
        arr_offset = np.array(doc_offset)

        # set labels whose first offset position is 0 and the second is not 0
        doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
        encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels

train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)

In [16]:
import torch

class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_encodings.pop("offset_mapping") # we don't want to pass this to the model
val_encodings.pop("offset_mapping")
train_dataset = Dataset(train_encodings, train_labels)
val_dataset = Dataset(val_encodings, val_labels)

In [17]:
print(train_dataset[0]['input_ids'],train_dataset[0]['labels'])

tensor([    2,   814,    13,  3233,   991,  4210,    16,  6027,   414,  4662,
          426,  4216,   451, 17959, 12496,  3554,  2523,  3674,    12,   553,
        19168, 11002,    13,    16,  4544,  1568,    18,     3,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

## **Fine-tuning the model**

Now that our data is ready, we can download the pretrained model and fine-tune it. Since all our tasks are about token classification, we use the `AutoModelForTokenClassification` class. Like with the tokenizer, the `from_pretrained` method will download and cache the model for us. The only thing we have to specify is the number of labels for our problem (which we can get from the features, as seen before):

In [18]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(unique_tags), id2label= id2tag, label2id= tag2id)

Downloading pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at alexaapo/greek_legal_bert_v2 were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

To instantiate a `Trainer`, we will need to define three more things. The most important is the [`TrainingArguments`](https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments), which is a class that contains all the attributes to customize the training. It requires one folder name, which will be used to save the checkpoints of the model, and all other arguments are optional:

In [24]:
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-{task}",
    evaluation_strategy = "steps", 
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_steps=20,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
    metric_for_best_model='eval_f1',
    load_best_model_at_end = True,)

The last thing to define for our `Trainer` is how to compute the metrics from the predictions. Here we will load the [`seqeval`](https://github.com/chakki-works/seqeval) metric (which is commonly used to evaluate results on the CONLL dataset) via the Datasets library.

In [21]:
from datasets import load_metric
metric = load_metric("seqeval")

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

This metric takes list of labels for the predictions and references:

In [22]:
labels = [id2tag[i] for i in train_labels[0] if i!= -100]
metric.compute(predictions=[labels], references=[labels])

{'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

So we will need to do a bit of post-processing on our predictions:
- select the predicted index (with the maximum logit) for each token
- convert it to its string label
- ignore everywhere we set a label of -100

The following function does all this post-processing on the result of `Trainer.evaluate` (which is a namedtuple containing predictions and labels) before applying the metric:

In [23]:
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [id2tag[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2tag[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

Note that we drop the precision/recall/f1 computed for each category and only focus on the overall precision/recall/f1/accuracy.

Then we just need to pass all of this along with our datasets to the `Trainer`:

In [25]:
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,           # evaluation dataset
    compute_metrics=compute_metrics
)

trainer.train()

Cloning https://huggingface.co/amichailidis/greek_legal_bert_v2-finetuned-ner into local empty directory.
***** Running training *****
  Num examples = 404
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 78


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
20,No log,0.273604,0.461538,0.174757,0.253521,0.906385
40,No log,0.133868,0.70297,0.68932,0.696078,0.958333
60,No log,0.10338,0.714286,0.800971,0.755149,0.97132


***** Running Evaluation *****
  Num examples = 101
  Batch size = 16
***** Running Evaluation *****
  Num examples = 101
  Batch size = 16
***** Running Evaluation *****
  Num examples = 101
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=78, training_loss=0.22668948540320763, metrics={'train_runtime': 53.6547, 'train_samples_per_second': 22.589, 'train_steps_per_second': 1.454, 'total_flos': 240613619308056.0, 'train_loss': 0.22668948540320763, 'epoch': 3.0})

The `evaluate` method allows you to evaluate again on the evaluation dataset or on another dataset:

In [None]:
trainer.evaluate()

To get the precision/recall/f1 computed for each category now that we have finished training, we can apply the same function as before on the result of the `predict` method:

In [27]:
predictions, labels, _ = trainer.predict(val_dataset)
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [id2tag[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [id2tag[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

***** Running Prediction *****
  Num examples = 101
  Batch size = 16


{'ORG': {'precision': 0.7533632286995515,
  'recall': 0.8155339805825242,
  'f1': 0.7832167832167831,
  'number': 206},
 'overall_precision': 0.7533632286995515,
 'overall_recall': 0.8155339805825242,
 'overall_f1': 0.7832167832167831,
 'overall_accuracy': 0.9729437229437229}

## Push to Hub Directly


In [26]:
trainer.push_to_hub()

Saving model checkpoint to greek_legal_bert_v2-finetuned-ner
Configuration saved in greek_legal_bert_v2-finetuned-ner/config.json
Model weights saved in greek_legal_bert_v2-finetuned-ner/pytorch_model.bin


Upload file pytorch_model.bin:   0%|          | 3.34k/429M [00:00<?, ?B/s]

Upload file runs/Sep08_09-20-50_cf69a67d4524/events.out.tfevents.1662628880.cf69a67d4524.75.0:  61%|######1   …

Upload file runs/Sep08_09-20-50_cf69a67d4524/1662628880.1003811/events.out.tfevents.1662628880.cf69a67d4524.75…

Upload file training_args.bin: 100%|##########| 3.30k/3.30k [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/amichailidis/greek_legal_bert_v2-finetuned-ner
   cde110c..5bd9169  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/amichailidis/greek_legal_bert_v2-finetuned-ner
   cde110c..5bd9169  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Token Classification', 'type': 'token-classification'}, 'metrics': [{'name': 'Precision', 'type': 'precision', 'value': 0.7142857142857143}, {'name': 'Recall', 'type': 'recall', 'value': 0.8009708737864077}, {'name': 'F1', 'type': 'f1', 'value': 0.7551487414187643}, {'name': 'Accuracy', 'type': 'accuracy', 'value': 0.9713203463203464}]}
To https://huggingface.co/amichailidis/greek_legal_bert_v2-finetuned-ner
   5bd9169..49b4043  main -> main

   5bd9169..49b4043  main -> main



'https://huggingface.co/amichailidis/greek_legal_bert_v2-finetuned-ner/commit/5bd9169990e5906118bddec293bb7541cbd0fca0'

In [30]:
tokenizer.push_to_hub('greek_legal_bert_v2-finetuned-ner')

tokenizer config file saved in greek_legal_bert_v2-finetuned-ner/tokenizer_config.json
Special tokens file saved in greek_legal_bert_v2-finetuned-ner/special_tokens_map.json
To https://huggingface.co/amichailidis/greek_legal_bert_v2-finetuned-ner
   49b4043..6c27927  main -> main

   49b4043..6c27927  main -> main



'https://huggingface.co/amichailidis/greek_legal_bert_v2-finetuned-ner/commit/6c279278f71c9ad506c1e9bdd267376951198824'

## Save Model Locally

In [31]:
model.save_pretrained("gdrive/MyDrive/path/to/model")


Configuration saved in gdrive/MyDrive/path/to/model/config.json
Model weights saved in gdrive/MyDrive/path/to/model/pytorch_model.bin


In [32]:
tokenizer.save_pretrained("gdrive/MyDrive/path/to/model")


tokenizer config file saved in gdrive/MyDrive/path/to/model/tokenizer_config.json
Special tokens file saved in gdrive/MyDrive/path/to/model/special_tokens_map.json


('gdrive/MyDrive/path/to/model/tokenizer_config.json',
 'gdrive/MyDrive/path/to/model/special_tokens_map.json',
 'gdrive/MyDrive/path/to/model/vocab.txt',
 'gdrive/MyDrive/path/to/model/added_tokens.json',
 'gdrive/MyDrive/path/to/model/tokenizer.json')

## Perform tests

In [33]:
model_test = AutoModelForTokenClassification.from_pretrained('/content/gdrive/MyDrive/path/to/model')    
tokenizer_test = AutoTokenizer.from_pretrained('/content/gdrive/MyDrive/path/to/model')

loading configuration file /content/gdrive/MyDrive/path/to/model/config.json
Model config BertConfig {
  "_name_or_path": "/content/gdrive/MyDrive/path/to/model",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "I-ORG",
    "2": "B-ORG"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-ORG": 2,
    "I-ORG": 1,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.21.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 35000
}

loading weights file /content/gdrive/MyDrive/path/to/model/pytorch_model.bin
All 

In [35]:
from transformers import TokenClassificationPipeline

pipe = TokenClassificationPipeline(model=model_test, tokenizer=tokenizer_test)
pipe("(3) Γραφείο Συμβούλου Ακεραιότητας (ΓΣΑ)")

[{'entity': 'B-ORG',
  'score': 0.7395654,
  'index': 4,
  'word': 'γραφειο',
  'start': 4,
  'end': 11},
 {'entity': 'I-ORG',
  'score': 0.94267017,
  'index': 5,
  'word': 'συμβουλου',
  'start': 12,
  'end': 21},
 {'entity': 'I-ORG',
  'score': 0.9736945,
  'index': 6,
  'word': 'ακεραιοτητας',
  'start': 22,
  'end': 34},
 {'entity': 'B-ORG',
  'score': 0.71079195,
  'index': 8,
  'word': 'γ',
  'start': 36,
  'end': 37},
 {'entity': 'I-ORG',
  'score': 0.5607338,
  'index': 9,
  'word': '##σα',
  'start': 37,
  'end': 39}]