In [None]:
import re

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

# BERT tokenizer (lower-casing WordPiece vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# BIO tagging scheme over three entity types: PER (person), DIS (disease)
# and MED (medication). "B-" marks the first word of an entity, "I-" a
# following word of the same entity, and "O" any word outside an entity.
label_map = {
    "O": 0,
    "B-PER": 1,
    "I-PER": 2,
    "B-DIS": 3,
    "B-MED": 4,
    "I-DIS": 5,
    "I-MED": 6,
}

# Size of the classifier output layer: one class per tag.
num_labels = len(label_map)

# Example sentences. They are annotated word by word, with every punctuation
# mark counted as a word of its own (so the final "." gets its own tag).
texts = [
    "Alex Smith has been diagnosed with Hypertension and prescribed Lisinopril.",
    "Jordan Brown was given Metformin for Type 2 Diabetes.",
    "Marie Curie was treated with Radiotherapy for her condition.",
    "Thomas Edison has been experiencing severe headaches and was prescribed Ibuprofen.",
    "Nikola Tesla showed symptoms of insomnia and received Zolpidem.",
    "Albert Einstein was diagnosed with Dyslexia and advised to follow special educational strategies.",
    "Isaac Newton suffered from chronic pain and used Morphine.",
]

# One BIO tag per word / punctuation mark of the sentence at the same index.
# The comment above each list spells out the words the tags line up with.
labels = [
    # Alex Smith has been diagnosed with Hypertension and prescribed Lisinopril .
    ["B-PER", "I-PER", "O", "O", "O", "O", "B-DIS", "O", "O", "B-MED", "O"],
    # Jordan Brown was given Metformin for Type 2 Diabetes .
    ["B-PER", "I-PER", "O", "O", "B-MED", "O", "B-DIS", "I-DIS", "I-DIS", "O"],
    # Marie Curie was treated with Radiotherapy for her condition .
    # NOTE(review): "Radiotherapy" is tagged B-DIS although it names a
    # treatment - confirm the intended tag.
    ["B-PER", "I-PER", "O", "O", "O", "B-DIS", "O", "O", "O", "O"],
    # Thomas Edison has been experiencing severe headaches and was prescribed Ibuprofen .
    ["B-PER", "I-PER", "O", "O", "O", "O", "O", "O", "O", "O", "B-MED", "O"],
    # Nikola Tesla showed symptoms of insomnia and received Zolpidem .
    ["B-PER", "I-PER", "O", "O", "O", "B-DIS", "O", "O", "B-MED", "O"],
    # Albert Einstein was diagnosed with Dyslexia and advised to follow special educational strategies .
    ["B-PER", "I-PER", "O", "O", "O", "B-DIS", "O", "O", "O", "O", "O", "O", "O", "O"],
    # Isaac Newton suffered from chronic pain and used Morphine .
    ["B-PER", "I-PER", "O", "O", "B-DIS", "I-DIS", "O", "O", "B-MED", "O"],
]

# Sanity check: a tag list that is shorter or longer than its sentence shifts
# every following tag onto the wrong word, so fail loudly right here.
_word_pattern = re.compile(r"\w+|[^\w\s]")
assert len(texts) == len(labels), "texts and labels must have the same number of entries"
for _sentence, _tags in zip(texts, labels):
    _n_words = len(_word_pattern.findall(_sentence))
    assert _n_words == len(_tags), (
        f"{_n_words} words but {len(_tags)} tags for: {_sentence!r}"
    )

# Data preparation
class NERDataset(Dataset):
    """Token-classification dataset that aligns word-level tags with wordpieces.

    Each sentence is split into words and stand-alone punctuation marks, every
    word is broken into wordpieces by ``tokenizer``, and the sequence is
    wrapped in ``[CLS]`` / ``[SEP]`` and padded to ``max_len``.

    Label alignment:
      * the first wordpiece of a word gets the word's tag;
      * following wordpieces of the same word get the matching ``I-`` tag when
        the word is tagged ``B-X`` and ``I-X`` exists in ``label_map``,
        otherwise the word's own tag;
      * ``[CLS]``, ``[SEP]``, padding, and words that have no tag (tag list
        shorter than the sentence) get ``ignore_index`` so that a loss using
        that ignore index skips them.

    Args:
        texts: list of raw sentences.
        labels: list of tag lists, one tag per word / punctuation mark.
        tokenizer: object exposing ``tokenize``, ``convert_tokens_to_ids`` and
            the ``cls_token_id`` / ``sep_token_id`` / ``pad_token_id``
            attributes (e.g. a Hugging Face ``BertTokenizer``).
        label_map: mapping from tag string to integer class id.
        max_len: fixed length of every returned sequence, special tokens
            included.
        ignore_index: label value for positions excluded from the loss;
            -100 is the default of ``torch.nn.CrossEntropyLoss``.

    Raises:
        ValueError: if ``max_len`` cannot hold ``[CLS]`` and ``[SEP]``.
        KeyError: (on item access) if a tag is missing from ``label_map``.
    """

    # A "word" is a run of word characters or one single punctuation mark.
    _WORD_PATTERN = re.compile(r"\w+|[^\w\s]")

    def __init__(self, texts, labels, tokenizer, label_map, max_len=512, ignore_index=-100):
        if max_len < 2:
            raise ValueError("max_len must be at least 2 to hold [CLS] and [SEP]")
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.label_map = label_map
        self.max_len = max_len
        self.ignore_index = ignore_index

    def __len__(self):
        """Return the number of sentences."""
        return len(self.texts)

    def _continuation_id(self, tag):
        """Return the class id used for the non-initial wordpieces of a word."""
        if tag.startswith("B-"):
            inside_tag = "I-" + tag[2:]
            if inside_tag in self.label_map:
                return self.label_map[inside_tag]
        return self.label_map[tag]

    def __getitem__(self, idx):
        """Encode sentence ``idx``.

        Returns:
            dict with ``input_ids``, ``token_type_ids``, ``attention_mask`` and
            ``labels``, each a ``torch.long`` tensor of shape ``(max_len,)``.
        """
        words = self._WORD_PATTERN.findall(self.texts[idx])
        word_labels = self.labels[idx]

        input_ids = [self.tokenizer.cls_token_id]
        label_ids = [self.ignore_index]
        for position, word in enumerate(words):
            piece_ids = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(word))
            if not piece_ids:
                # The tokenizer dropped this word entirely; nothing to label.
                continue
            if position < len(word_labels):
                tag = word_labels[position]
                first_id = self.label_map[tag]
                rest_id = self._continuation_id(tag)
            else:
                # No annotation for this word: keep it out of the loss.
                first_id = rest_id = self.ignore_index
            input_ids.extend(piece_ids)
            label_ids.append(first_id)
            label_ids.extend([rest_id] * (len(piece_ids) - 1))

        # Truncate so that [SEP] always fits, then close the sequence.
        keep = self.max_len - 1
        input_ids = input_ids[:keep] + [self.tokenizer.sep_token_id]
        label_ids = label_ids[:keep] + [self.ignore_index]

        n_real = len(input_ids)
        n_pad = self.max_len - n_real
        input_ids = input_ids + [self.tokenizer.pad_token_id] * n_pad
        label_ids = label_ids + [self.ignore_index] * n_pad
        attention_mask = [1] * n_real + [0] * n_pad

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            # Single-sentence input: every position belongs to segment 0.
            'token_type_ids': torch.zeros(self.max_len, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(label_ids, dtype=torch.long),
        }

# DataLoader creation
# The dataset pads every sentence to max_len. The example sentences are only a
# couple of dozen wordpieces long, so the 512 default would make the model
# spend almost all of its time on padding; longer inputs are still truncated.
MAX_LEN = 64
dataset = NERDataset(texts, labels, tokenizer, label_map, max_len=MAX_LEN)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)


In [None]:
from transformers import BertModel
from torch import nn

class BertForNER(nn.Module):
    """BERT encoder followed by a per-token linear classifier.

    Args:
        num_labels: number of output classes per token.
        model_name: name or path of the pretrained BERT checkpoint to load.
    """

    def __init__(self, num_labels, model_name='bert-base-uncased'):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        # Read the encoder width from the checkpoint's config instead of
        # hard-coding 768, so other BERT sizes work unchanged.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None):
        """Return per-token logits.

        Args:
            input_ids: token id tensor passed to the encoder as ``input_ids``.
            attention_mask: optional mask passed to the encoder as
                ``attention_mask``; ``None`` leaves masking to the encoder.

        Returns:
            The classifier applied to the encoder's ``last_hidden_state``,
            i.e. one vector of ``num_labels`` logits per input position.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state
        return self.classifier(sequence_output)

# Build the tagger with num_labels output classes per token.
model = BertForNER(num_labels=num_labels)


In [None]:
from torch.optim import AdamW
from tqdm import tqdm  # pour des barres de progression

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
num_epochs = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Label value that the loss skips (the CrossEntropyLoss default).
IGNORE_INDEX = -100
# Built once: the criterion holds no per-step state.
loss_fct = nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for batch in tqdm(dataloader):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Deliberately not called `labels`: that name is the module-level list
        # of tag sequences the dataset was built from, and rebinding it here
        # would break a re-run of the data cell.
        batch_labels = batch['labels'].to(device)

        # Padding positions carry no information; exclude them from the loss
        # so the model is not rewarded for predicting a tag on padding.
        batch_labels = batch_labels.masked_fill(attention_mask == 0, IGNORE_INDEX)

        logits = model(input_ids, attention_mask)

        # Flatten (batch, seq, classes) -> (batch * seq, classes) for the loss.
        loss = loss_fct(logits.view(-1, logits.size(-1)), batch_labels.view(-1))

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss/len(dataloader)}")


  item = {key: torch.tensor(val[0]) for key, val in encoding.items()}
100%|██████████| 4/4 [00:48<00:00, 12.07s/it]


Epoch 1/3, Loss: 0.9773187972605228


100%|██████████| 4/4 [00:42<00:00, 10.73s/it]


Epoch 2/3, Loss: 0.05642773490399122


100%|██████████| 4/4 [00:41<00:00, 10.47s/it]

Epoch 3/3, Loss: 0.04429928492754698





In [None]:
def predict_entities(text, model, tokenizer, label_map, max_len=512):
    """Tag every wordpiece of ``text`` with its predicted label.

    The input is encoded as ``[CLS] wordpieces [SEP]`` before it is passed to
    the model; the predictions for the two special tokens are dropped from
    the result.

    Args:
        text: raw sentence to tag.
        model: callable ``model(input_ids, attention_mask=...)`` returning
            logits of shape ``(batch, seq, num_labels)``.
        tokenizer: object exposing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token_id`` and ``sep_token_id``.
        label_map: mapping from tag string to integer class id.
        max_len: maximum sequence length including the two special tokens;
            extra wordpieces are dropped.

    Returns:
        List of ``(wordpiece, tag)`` tuples, empty when ``text`` yields no
        wordpieces.
    """
    model.eval()

    tokens = tokenizer.tokenize(text)[: max(max_len - 2, 0)]
    if not tokens:
        return []

    # Run on whatever device the model lives on rather than on a global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    ids = [tokenizer.cls_token_id] + tokenizer.convert_tokens_to_ids(tokens) + [tokenizer.sep_token_id]
    input_ids = torch.tensor([ids], dtype=torch.long, device=target_device)
    # No padding in a single-sentence batch: attend to every position.
    attention_mask = torch.ones_like(input_ids)

    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask)

    # First (only) batch element, without the [CLS] and [SEP] positions.
    # tolist() yields plain ints, including for a single-token input.
    label_ids = logits[0, 1:-1].argmax(dim=-1).tolist()

    # Class ids back to tag strings.
    inv_label_map = {v: k for k, v in label_map.items()}
    predicted_labels = [inv_label_map[i] for i in label_ids]

    # Pair each wordpiece with its predicted tag.
    return list(zip(tokens, predicted_labels))


# Usage example: tag a single sentence and print the (token, tag) pairs.
text = "Charlie Parker was prescribed Aspirin for his headaches."
predicted_entities = predict_entities(text, model, tokenizer, label_map)
print(predicted_entities)


[('charlie', 'O'), ('parker', 'O'), ('was', 'O'), ('prescribed', 'O'), ('as', 'O'), ('##pi', 'O'), ('##rin', 'O'), ('for', 'O'), ('his', 'O'), ('headache', 'O'), ('##s', 'O'), ('.', 'O')]
