# Importation des modules

In [1]:
import torch
checkpoint = "bert-base-uncased"

# Pré-traitement des données
import csv
import pandas as pd
import datasets
from transformers import DataCollatorForTokenClassification
from transformers import BertTokenizerFast
from sklearn.model_selection import train_test_split

# Entraînement du classifieur
from transformers import BertForTokenClassification
from transformers import TrainingArguments, Trainer

# Evaluation
import numpy as np
import seqeval

  from .autonotebook import tqdm as notebook_tqdm


# Pre-processing

## Label encoding

In [2]:
label2id = {
    "B-NPI": 0,
    "I-NPI": 1,
    "O": 2
}

id2label = {
    0: "B-NPI",
    1: "I-NPI",
    2: "O"
}

## Importation des données

In [3]:
raw_tokens = []
labels = []

with open("../data/fake_dataset.csv") as csvfile:
    
    reader = csv.reader(csvfile, delimiter=",")
    
    temp_tokens = []
    temp_labels = []
    
    next(reader) # Ignore première ligne (en-tête)
    
    for row in reader:
        
        if row == ["#", "#"]: # Nouvelle phrase
            
            # Ajout des listes tampon
            raw_tokens.append(temp_tokens)
            labels.append(temp_labels)
            
            # On vide les listes tampon
            temp_tokens = []
            temp_labels = []
        
        else:
            temp_tokens.append(row[0])
            temp_labels.append(label2id[row[1].strip()])
            
raw_tokens_train, raw_tokens_test, labels_train, labels_test = train_test_split(raw_tokens, labels, test_size=0.2)

In [4]:
data_train = {
    "tokens": raw_tokens_train,
    "labels": labels_train
}

data_test = {
    "tokens": raw_tokens_test,
    "labels": labels_test
}

# Création d'un objet Dataset
dataset = datasets.DatasetDict({
    "train": datasets.Dataset.from_pandas(pd.DataFrame(data_train)),
    "test": datasets.Dataset.from_pandas(pd.DataFrame(data_test))
})

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 8
    })
    test: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 2
    })
})

## Mise en forme des données dans le bon format pour BERT

In [6]:
# Création du tokenizer à partir du même checkpoint
tokenizer = BertTokenizerFast.from_pretrained(checkpoint)

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [7]:
def shift_label(label):
    # Transformation des B-NPI en I-NPI quand le token correspond à un subword
    if label % 2 == 1:
        label += 1
    return label

def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id is None:
            new_labels.append(-100)
        elif word_id != current_word:
            current_word = word_id
            new_labels.append(labels[word_id])
        else:
            new_labels.append(shift_label(labels[word_id]))
    return new_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"],
                                 truncation=True,
                                 is_split_into_words=True)
    new_labels = []
    for i, labels in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

processed_dataset = dataset.map(tokenize_and_align_labels, batched=True)

                                                 

# Metrics

In [8]:
label_list = label2id.keys()

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Entraînement du classifieur

In [9]:
model = BertForTokenClassification.from_pretrained(checkpoint, 
                                                    num_labels=3,
                                                    id2label=id2label, 
                                                    label2id=label2id)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [12]:
training_args = TrainingArguments(
    output_dir="npi_bio_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    # compute_metrics=compute_metrics,
    push
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens. If tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 8
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 10
  Number of trainable parameters = 108893955
 10%|█         | 1/10 [00:01<00:12,  1.43s/it]The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.f

{'eval_loss': 0.8799149394035339, 'eval_runtime': 0.0479, 'eval_samples_per_second': 41.784, 'eval_steps_per_second': 20.892, 'epoch': 1.0}


Model weights saved in npi_bio_model\checkpoint-1\pytorch_model.bin
tokenizer config file saved in npi_bio_model\checkpoint-1\tokenizer_config.json
Special tokens file saved in npi_bio_model\checkpoint-1\special_tokens_map.json
 20%|██        | 2/10 [00:03<00:15,  1.96s/it]The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens. If tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2
  Batch size = 16

 20%|██        | 2/10 [00:03<00:15,  1.96s/it]Saving model checkpoint to npi_bio_model\checkpoint-2
Configuration saved in npi_bio_model\checkpoint-2\config.json


{'eval_loss': 0.7675561904907227, 'eval_runtime': 0.0497, 'eval_samples_per_second': 40.262, 'eval_steps_per_second': 20.131, 'epoch': 2.0}


Model weights saved in npi_bio_model\checkpoint-2\pytorch_model.bin
tokenizer config file saved in npi_bio_model\checkpoint-2\tokenizer_config.json
Special tokens file saved in npi_bio_model\checkpoint-2\special_tokens_map.json
 30%|███       | 3/10 [00:06<00:15,  2.15s/it]The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens. If tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2
  Batch size = 16

 30%|███       | 3/10 [00:06<00:15,  2.15s/it]Saving model checkpoint to npi_bio_model\checkpoint-3
Configuration saved in npi_bio_model\checkpoint-3\config.json


{'eval_loss': 0.6738592386245728, 'eval_runtime': 0.0501, 'eval_samples_per_second': 39.914, 'eval_steps_per_second': 19.957, 'epoch': 3.0}


Model weights saved in npi_bio_model\checkpoint-3\pytorch_model.bin
tokenizer config file saved in npi_bio_model\checkpoint-3\tokenizer_config.json
Special tokens file saved in npi_bio_model\checkpoint-3\special_tokens_map.json
 40%|████      | 4/10 [00:08<00:13,  2.17s/it]The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens. If tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2
  Batch size = 16

 40%|████      | 4/10 [00:08<00:13,  2.17s/it]Saving model checkpoint to npi_bio_model\checkpoint-4
Configuration saved in npi_bio_model\checkpoint-4\config.json


{'eval_loss': 0.5955406427383423, 'eval_runtime': 0.0457, 'eval_samples_per_second': 43.739, 'eval_steps_per_second': 21.869, 'epoch': 4.0}


Model weights saved in npi_bio_model\checkpoint-4\pytorch_model.bin
tokenizer config file saved in npi_bio_model\checkpoint-4\tokenizer_config.json
Special tokens file saved in npi_bio_model\checkpoint-4\special_tokens_map.json
 50%|█████     | 5/10 [00:10<00:11,  2.21s/it]The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens. If tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2
  Batch size = 16

 50%|█████     | 5/10 [00:10<00:11,  2.21s/it]Saving model checkpoint to npi_bio_model\checkpoint-5
Configuration saved in npi_bio_model\checkpoint-5\config.json


{'eval_loss': 0.5273100137710571, 'eval_runtime': 0.0426, 'eval_samples_per_second': 46.956, 'eval_steps_per_second': 23.478, 'epoch': 5.0}


Model weights saved in npi_bio_model\checkpoint-5\pytorch_model.bin
tokenizer config file saved in npi_bio_model\checkpoint-5\tokenizer_config.json
Special tokens file saved in npi_bio_model\checkpoint-5\special_tokens_map.json
 60%|██████    | 6/10 [00:12<00:08,  2.21s/it]The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens. If tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2
  Batch size = 16

 60%|██████    | 6/10 [00:12<00:08,  2.21s/it]Saving model checkpoint to npi_bio_model\checkpoint-6
Configuration saved in npi_bio_model\checkpoint-6\config.json


{'eval_loss': 0.47988927364349365, 'eval_runtime': 0.0445, 'eval_samples_per_second': 44.948, 'eval_steps_per_second': 22.474, 'epoch': 6.0}


Model weights saved in npi_bio_model\checkpoint-6\pytorch_model.bin
tokenizer config file saved in npi_bio_model\checkpoint-6\tokenizer_config.json
Special tokens file saved in npi_bio_model\checkpoint-6\special_tokens_map.json
 70%|███████   | 7/10 [00:16<00:08,  2.84s/it]The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens. If tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2
  Batch size = 16

 70%|███████   | 7/10 [00:17<00:08,  2.84s/it]Saving model checkpoint to npi_bio_model\checkpoint-7
Configuration saved in npi_bio_model\checkpoint-7\config.json


{'eval_loss': 0.45590975880622864, 'eval_runtime': 0.0669, 'eval_samples_per_second': 29.906, 'eval_steps_per_second': 14.953, 'epoch': 7.0}


Model weights saved in npi_bio_model\checkpoint-7\pytorch_model.bin
tokenizer config file saved in npi_bio_model\checkpoint-7\tokenizer_config.json
Special tokens file saved in npi_bio_model\checkpoint-7\special_tokens_map.json
 80%|████████  | 8/10 [00:20<00:06,  3.13s/it]The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens. If tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2
  Batch size = 16

 80%|████████  | 8/10 [00:20<00:06,  3.13s/it]Saving model checkpoint to npi_bio_model\checkpoint-8
Configuration saved in npi_bio_model\checkpoint-8\config.json


{'eval_loss': 0.44076138734817505, 'eval_runtime': 0.064, 'eval_samples_per_second': 31.249, 'eval_steps_per_second': 15.625, 'epoch': 8.0}


Model weights saved in npi_bio_model\checkpoint-8\pytorch_model.bin
tokenizer config file saved in npi_bio_model\checkpoint-8\tokenizer_config.json
Special tokens file saved in npi_bio_model\checkpoint-8\special_tokens_map.json
 90%|█████████ | 9/10 [00:25<00:03,  3.50s/it]The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens. If tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2
  Batch size = 16

 90%|█████████ | 9/10 [00:25<00:03,  3.50s/it]Saving model checkpoint to npi_bio_model\checkpoint-9
Configuration saved in npi_bio_model\checkpoint-9\config.json


{'eval_loss': 0.4324716627597809, 'eval_runtime': 0.0493, 'eval_samples_per_second': 40.529, 'eval_steps_per_second': 20.264, 'epoch': 9.0}


Model weights saved in npi_bio_model\checkpoint-9\pytorch_model.bin
tokenizer config file saved in npi_bio_model\checkpoint-9\tokenizer_config.json
Special tokens file saved in npi_bio_model\checkpoint-9\special_tokens_map.json
100%|██████████| 10/10 [00:30<00:00,  4.01s/it]The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens. If tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2
  Batch size = 16

100%|██████████| 10/10 [00:30<00:00,  4.01s/it]Saving model checkpoint to npi_bio_model\checkpoint-10
Configuration saved in npi_bio_model\checkpoint-10\config.json


{'eval_loss': 0.42870035767555237, 'eval_runtime': 0.0449, 'eval_samples_per_second': 44.499, 'eval_steps_per_second': 22.25, 'epoch': 10.0}


Model weights saved in npi_bio_model\checkpoint-10\pytorch_model.bin
tokenizer config file saved in npi_bio_model\checkpoint-10\tokenizer_config.json
Special tokens file saved in npi_bio_model\checkpoint-10\special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from npi_bio_model\checkpoint-10 (score: 0.42870035767555237).
100%|██████████| 10/10 [00:34<00:00,  3.40s/it]

{'train_runtime': 34.088, 'train_samples_per_second': 2.347, 'train_steps_per_second': 0.293, 'train_loss': 0.47608723640441897, 'epoch': 10.0}





TrainOutput(global_step=10, training_loss=0.47608723640441897, metrics={'train_runtime': 34.088, 'train_samples_per_second': 2.347, 'train_steps_per_second': 0.293, 'train_loss': 0.47608723640441897, 'epoch': 10.0})