# Importation des modules

In [135]:
import torch
from transformers import pipeline

checkpoint = "bert-base-uncased"

# Pré-traitement des données
import csv
import pandas as pd
import datasets
from transformers import DataCollatorForTokenClassification
from transformers import BertTokenizerFast
from sklearn.model_selection import train_test_split

# Entraînement du classifieur
from transformers import BertForTokenClassification
from transformers import TrainingArguments, Trainer
from torch.optim import NAdam, Adam
from transformers import get_scheduler

# Evaluation
import numpy as np
import evaluate
from sklearn.preprocessing import LabelEncoder

# Pre-processing

## Label encoding

In [120]:
label2id = {
    "B-NPI": 0,
    "I-NPI": 1,
    "O": 2
}

id2label = {
    0: "B-NPI",
    1: "I-NPI",
    2: "O"
}

## Importation des données

In [121]:
raw_tokens = []
labels = []

with open("../data/fake_dataset.csv") as csvfile:
    
    reader = csv.reader(csvfile, delimiter=",")
    
    temp_tokens = []
    temp_labels = []
    
    next(reader) # Ignore première ligne (en-tête du CSV)
    
    for row in reader:
        
        if row == ["#", "#"]: # Nouvelle phrase
            
            # Ajout des listes tampon
            raw_tokens.append(temp_tokens)
            labels.append(temp_labels)
            
            # On vide les listes tampon
            temp_tokens = []
            temp_labels = []
        
        else:
            temp_tokens.append(row[0])
            temp_labels.append(label2id[row[1].strip()])

# Split en train/test
raw_tokens_train, raw_tokens_test, labels_train, labels_test = train_test_split(raw_tokens, labels, test_size=0.2)

In [122]:
data_train = {
    "tokens": raw_tokens_train,
    "labels": labels_train
}

data_test = {
    "tokens": raw_tokens_test,
    "labels": labels_test
}

# Création d'un objet Dataset comportant le train et le test
dataset = datasets.DatasetDict({
    "train": datasets.Dataset.from_pandas(pd.DataFrame(data_train)),
    "test": datasets.Dataset.from_pandas(pd.DataFrame(data_test))
})

In [123]:
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 8
    })
    test: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 2
    })
})

## Mise en forme des données dans le bon format pour BERT

In [124]:
# Création du tokenizer à partir du même checkpoint
tokenizer = BertTokenizerFast.from_pretrained(checkpoint)

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer,
                                                   max_length=20,
                                                   padding="max_length")

loading file vocab.txt from cache at C:\Users\aengp/.cache\huggingface\hub\models--bert-base-uncased\snapshots\0a6aa9128b6194f4f3c4db429b6cb4891cdb421b\vocab.txt
loading file tokenizer.json from cache at C:\Users\aengp/.cache\huggingface\hub\models--bert-base-uncased\snapshots\0a6aa9128b6194f4f3c4db429b6cb4891cdb421b\tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at C:\Users\aengp/.cache\huggingface\hub\models--bert-base-uncased\snapshots\0a6aa9128b6194f4f3c4db429b6cb4891cdb421b\tokenizer_config.json
loading configuration file config.json from cache at C:\Users\aengp/.cache\huggingface\hub\models--bert-base-uncased\snapshots\0a6aa9128b6194f4f3c4db429b6cb4891cdb421b\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient

In [125]:
def shift_label(label):
    # Transformation des B-NPI en I-NPI quand le token correspond à un subword
    if label % 2 == 1:
        label += 1
    return label

def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id is None:
            new_labels.append(-100)
        elif word_id != current_word:
            current_word = word_id
            new_labels.append(labels[word_id])
        else:
            new_labels.append(shift_label(labels[word_id]))
    return new_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"],
                                 truncation=True,
                                 is_split_into_words=True) # Déjà tokenizé
    new_labels = []
    for i, labels in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

processed_dataset = dataset.map(tokenize_and_align_labels, batched=True)




















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

In [126]:
processed_dataset["train"]["labels"]

[[-100, 2, 2, 2, 2, 2, 2, -100],
 [-100, 2, 2, 2, 2, 0, 2, -100],
 [-100, 2, 2, 2, 2, 0, 2, -100],
 [-100, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, -100],
 [-100, 2, 2, 2, 0, 1, 2, -100],
 [-100, 2, 2, 2, 2, 2, 0, 2, 2, -100],
 [-100, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, -100],
 [-100, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, -100]]

# Metrics

In [134]:
metric = evaluate.load("accuracy")

In [128]:
label_list = list(label2id.keys())

# def compute_metrics(p):
#     predictions, labels = p
    
#     predictions = np.argmax(predictions, axis=2)

#     # Initialize the label encoder
#     encoder = LabelEncoder()
#     encoder.fit(label_list)

#     # Convert the predicted and true labels to integer values
#     true_predictions = [[encoder.transform([label_list[p] if l != -100 else 'O'])[0] for (p, l) in zip(prediction, label)] for prediction, label in zip(predictions, labels)]
#     true_labels = [[encoder.transform([label_list[l] if l != -100 else 'O'])[0] for (p, l) in zip(prediction, label)] for prediction, label in zip(predictions, labels)]

#     print("=============================================")
#     print(true_labels)
#     print("=============================================")

#     results = metric.compute(predictions=true_predictions, references=true_labels)
#     return {
#         "precision": results["overall_precision"],
#         "recall": results["overall_recall"],
#         "f1": results["overall_f1"],
#         "accuracy": results["overall_accuracy"],
#     }
    
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Entraînement du classifieur

In [129]:
model = BertForTokenClassification.from_pretrained(checkpoint, 
                                                    num_labels=3,
                                                    id2label=id2label, 
                                                    label2id=label2id)

loading configuration file config.json from cache at C:\Users\aengp/.cache\huggingface\hub\models--bert-base-uncased\snapshots\0a6aa9128b6194f4f3c4db429b6cb4891cdb421b\config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "B-NPI",
    "1": "I-NPI",
    "2": "O"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-NPI": 0,
    "I-NPI": 1,
    "O": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at C:\Users\aen

In [132]:
training_args = TrainingArguments(
    output_dir="npi_bio_model",
    learning_rate=0.1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    num_train_epochs=20,
    max_grad_norm=10, # Gradient clipping
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True
)

optimizer = Adam(model.parameters(),
                  lr=0.1)

num_training_steps = training_args.num_train_epochs * len(processed_dataset["train"])

lr_scheduler = get_scheduler(name="linear", 
                             optimizer=optimizer,
                             num_warmup_steps=0,
                             num_training_steps=num_training_steps)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    optimizers=(optimizer, lr_scheduler),
    compute_metrics=compute_metrics
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
c:\Users\aengp\Desktop\memoire\NPI_memoire\test\npi_bio_model is already a clone of https://huggingface.co/delphine-nguyen/npi_bio_model. Make sure you pull the latest changes with `repo.git_pull()`.
The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens. If tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 8
  Num Epochs = 20
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 40
  Num

ValueError: Predictions and/or references don't match the expected format.
Expected format: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)},
Input predictions: [[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
 [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]],
Input references: [[-100    2    2    2    0    2 -100 -100 -100 -100 -100 -100 -100 -100
  -100 -100 -100 -100 -100 -100]
 [-100    2    2    2    0    2    2    2    2    2    2 -100 -100 -100
  -100 -100 -100 -100 -100 -100]]

In [None]:
trainer.push_to_hub()

NameError: name 'trainer' is not defined

# Utilisation du modèle

In [None]:
input = "Did you eat anything today?"

classifier = pipeline("ner", model="delphine-nguyen/npi_bio_model")
classifier(input)