# Get Data

téléchargez le json https://huggingface.co/datasets/Universal-NER/Pile-NER-type/blob/main/train.json
mettez le au meme niveau que ce script
puis éxécutez une fois le bloc suivant puis commentez le

In [1]:
# import json
# import re
# import ast
# from tqdm import tqdm

# def load_data(filepath):
#     """Loads data from a JSON file."""
#     with open(filepath, 'r') as f:
#         data = json.load(f)
#     return data

# def tokenize_text(text):
#     """Tokenizes the input text into a list of tokens."""
#     return re.findall(r'\w+(?:[-_]\w+)*|\S', text)

# def extract_entity_spans(entry):
#     """Extracts entity spans from an entry."""
#     len_start = len("What describes ")
#     len_end = len(" in the text?")
#     entity_types, entity_texts, negative = [], [], []

#     for c in entry['conversations']:
#         if c['from'] == 'human' and c['value'].startswith('Text: '):
#             text = c['value'][len('Text: '):]
#             tokenized_text = tokenize_text(text)
#         elif c['from'] == 'human' and c['value'].startswith('What describes '):
#             entity_type = c['value'][len_start:-len_end]
#             entity_types.append(entity_type)
#         elif c['from'] == 'gpt' and c['value'].startswith('['):
#             if c['value'] == '[]':
#                 negative.append(entity_types.pop())
#                 continue
#             texts_ents = ast.literal_eval(c['value'])
#             entity_texts.extend(texts_ents)
#             num_repeat = len(texts_ents) - 1
#             entity_types.extend([entity_types[-1]] * num_repeat)

#     entity_spans = []
#     for j, entity_text in enumerate(entity_texts):
#         entity_tokens = tokenize_text(entity_text)
#         matches = []
#         for i in range(len(tokenized_text) - len(entity_tokens) + 1):
#             if " ".join(tokenized_text[i:i + len(entity_tokens)]).lower() == " ".join(entity_tokens).lower():
#                 matches.append((i, i + len(entity_tokens) - 1, entity_types[j]))
#         if matches:
#             entity_spans.extend(matches)

#     return {"tokenized_text": tokenized_text, "ner": entity_spans, "negative": negative}

# def process_data(data):
#     """Processes a list of data entries to extract entity spans."""
#     all_data = [extract_entity_spans(entry) for entry in tqdm(data)]
#     return all_data

# def save_data_to_file(data, filepath):
#     """Saves the processed data to a JSON file."""
#     with open(filepath, 'w') as f:
#         json.dump(data, f)

# if __name__ == "__main__":
#     # download the pile-ner data: "wget https://huggingface.co/datasets/Universal-NER/Pile-NER-type/blob/main/train.json"
#     path_pile_ner = 'train.json'
#     data = load_data(path_pile_ner)
#     processed_data = process_data(data)
#     save_data_to_file(processed_data, 'pilener_train.json')
#     print(data[0])
#     print("dataset size:", len(processed_data))

# Prepare data

In [2]:
from transformers import BertTokenizer
import torch
from tqdm import tqdm
import json

def prepare_data_for_training(processed_data, tokenizer, max_length=128):
    # Créer un mapping des types d'entités vers des entiers
    entity_types = {entity for entry in processed_data for _, _, entity in entry["ner"]}
    entity_to_id = {entity: idx + 1 for idx, entity in enumerate(entity_types)}  # +1 pour que 0 reste le label par défaut (non-entité)

    input_ids, attention_masks, labels = [], [], []

    for entry in tqdm(processed_data, desc="Processing Data", unit="entry"):
        tokenized_text = entry["tokenized_text"]
        ner_spans = entry["ner"]
        
        # Tokenize the entire text
        encoded = tokenizer(" ".join(tokenized_text), padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
        
        # Generate label tensor
        label_tensor = torch.zeros(max_length, dtype=torch.long)
        for start, end, entity_type in ner_spans:
            if start < max_length and end < max_length and entity_type in entity_to_id:
                label_tensor[start:end + 1] = entity_to_id[entity_type]  # Mapper vers un entier correspondant au type d'entité
        
        input_ids.append(encoded["input_ids"][0])
        attention_masks.append(encoded["attention_mask"][0])
        labels.append(label_tensor)

    return torch.stack(input_ids), torch.stack(attention_masks), torch.stack(labels), entity_to_id

# Charger les données générées précédemment
with open('pilener_train.json', 'r') as f:
    processed_data = json.load(f)

# Charger le tokenizer BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Préparer les données avec suivi d'avancement
input_ids, attention_masks, labels, entity_to_id = prepare_data_for_training(processed_data, tokenizer)

# Afficher le mapping des entités
print("Mapping des entités vers des IDs :", entity_to_id)


Processing Data: 100%|██████████| 45889/45889 [02:59<00:00, 256.13entry/s]




Structure des données
Les données retournées par prepare_data_for_training sont des tensors PyTorch. Voici les détails :

input_ids : Contient les identifiants tokenisés correspondant au texte d'entrée.

Taille : (nombre_entrées, max_length)
Type : torch.Tensor (entiers longs)
Exemple : Les premiers identifiants d'entrée (transformés par le tokenizer).
attention_masks : Contient des masques indiquant les positions valides des tokens dans chaque séquence (1 pour les tokens valides, 0 pour le padding).

Taille : (nombre_entrées, max_length)
Type : torch.Tensor (entiers longs)
Exemple : Masque associé à input_ids.
labels : Contient les étiquettes pour les entités NER. Chaque position correspond à une classe (0 pour le non-entité, 1 ou plus pour des entités spécifiques selon vos définitions).

Taille : (nombre_entrées, max_length)
Type : torch.Tensor (entiers longs)
Exemple : Vecteur avec 0 (non-entité) ou 1+ (entités).

In [3]:
# Afficher les formes des tensors
print(f"Forme de input_ids : {input_ids.shape}")
print(f"Forme de attention_masks : {attention_masks.shape}")
print(f"Forme de labels : {labels.shape}")

# Afficher un exemple
print("\nExemple de input_ids (première entrée) :")
print(input_ids[0])

print("\nExemple de attention_masks (première entrée) :")
print(attention_masks[0])

print("\nExemple de labels (première entrée) :")
print(labels[0])


Forme de input_ids : torch.Size([45889, 128])
Forme de attention_masks : torch.Size([45889, 128])
Forme de labels : torch.Size([45889, 128])

Exemple de input_ids (première entrée) :
tensor([  101,  1053,  1024,  2597,  2839,  2241,  2006,  4099, 12093,  1999,
        11320,  2050,  1045,  2031,  2517,  1037,  3853,  2182,  2029,  2323,
         2735,  2026,  2839,  2241,  2006,  4099, 12093,  2021,  2009,  1005,
         1055,  2025,  3819,  2138,  2009,  2515,  2025,  2467,  2735,  2073,
         1045,  2215,  2009,  2000,  1998,  3383,  2045,  2003,  1037,  2488,
         2126,  1997,  3015,  2009,  2334,  2026, 26994,  1027,  1063,  1060,
         1027,  8698,  1010,  1061,  1027, 26271,  1065,  2334,  4099, 26994,
         1027,  1063,  1060,  1027, 28906,  1010,  1061,  1027, 27878,  1065,
         2334, 15566,  2389,  5657,  1010,  1061, 10175,  5657,  1010,  1060,
         4305,  2099,  1010, 21076,  4313,  1010, 16101,  2065,  2026, 26994,
         1012,  1060,  1028,  4099, 2

# Model train & test

In [4]:
proportion = 0.1

In [5]:
from torch.utils.data import DataLoader, Dataset, random_split

class NERDataset(Dataset):
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attention_masks[idx], self.labels[idx]

ind = int(len(input_ids) * proportion)
# Create the dataset
dataset = NERDataset(input_ids[:ind], attention_masks[:ind], labels[:ind])

In [6]:
# Définir les proportions pour le train, validation et test
train_ratio = 0.8  # 80% des données pour l'entraînement
val_ratio = 0.1    # 10% des données pour la validation
test_ratio = 0.1   # 10% des données pour le test

# Calculer les tailles des différents ensembles
train_size = int(train_ratio * len(dataset))
val_size = int(val_ratio * len(dataset))
test_size = len(dataset) - train_size - val_size

# Diviser les données en train, validation, et test
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

# Créer les DataLoaders pour chaque ensemble
batch_size = 1  # Ajuster selon vos besoins
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Vérification des tailles
print(f"Train size: {len(train_loader.dataset)}")
print(f"Validation size: {len(val_loader.dataset)}")
print(f"Test size: {len(test_loader.dataset)}")


Train size: 3670
Validation size: 458
Test size: 460


In [7]:
from transformers import BertModel
import torch.nn as nn

class NERModel(nn.Module):
    def __init__(self, bert_model_name, num_labels):
        super(NERModel, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = self.dropout(outputs.last_hidden_state)
        logits = self.classifier(sequence_output)
        return logits


In [None]:
from tqdm import tqdm
from torch.optim import AdamW
import torch
import torch.nn as nn

def train_model(model, train_loader, val_loader, num_epochs, learning_rate, device):
    criterion = nn.CrossEntropyLoss(ignore_index=0)  # 0 pour ignorer les non-entités
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    model.to(device)

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        
        # Progress bar pour le train_loader
        train_progress = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs} - Training", unit="batch")
        for input_ids, attention_mask, labels in train_progress:
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            
            if torch.unique(labels).size(0) == 1 and torch.unique(labels)[0] == 0:
                continue

            optimizer.zero_grad()
            logits = model(input_ids, attention_mask)
            loss = criterion(logits.view(-1, logits.shape[-1]), labels.view(-1))
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            train_progress.set_postfix(loss=loss.item())

            # if torch.isnan(loss):
            #     print("Loss is NaN! Debugging:")
            #     print("Input IDs:", input_ids)
            #     print("Attention Mask:", attention_mask)
            #     print("Labels:", labels)
            #     print("Logits:", logits)
            #     break
                    
        print(f"Epoch {epoch + 1}, Train Loss: {train_loss / len(train_loader):.4f}")
        
        validate_model(model, val_loader, device)

def validate_model(model, val_loader, device):
    model.eval()
    val_loss = 0
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    
    # Progress bar pour le val_loader
    val_progress = tqdm(val_loader, desc="Validation", unit="batch")
    with torch.no_grad():
        for input_ids, attention_mask, labels in val_progress:
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            
            # Vérification des labels pour éviter les NaN
            if torch.unique(labels).size(0) == 1 and torch.unique(labels)[0] == 0:
                continue
            
            logits = model(input_ids, attention_mask)
            loss = criterion(logits.view(-1, logits.shape[-1]), labels.view(-1))
            val_loss += loss.item()
            val_progress.set_postfix(loss=loss.item())
    
    # Calcul de la perte moyenne uniquement si des batchs valides ont été traités
    if len(val_loader) > 0:
        print(f"Validation Loss: {val_loss / len(val_loader):.4f}")
    else:
        print("No valid batches processed in validation.")



In [9]:
from sklearn.metrics import classification_report

def test_model(model, test_loader, entity_to_id, device):
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for input_ids, attention_mask, labels in test_loader:
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            logits = model(input_ids, attention_mask)
            predictions = torch.argmax(logits, dim=-1)
            
            # Collecter les prédictions et labels non masqués
            mask = attention_mask.bool()
            all_preds.extend(predictions[mask].cpu().numpy())
            all_labels.extend(labels[mask].cpu().numpy())

    # Identifier les classes réellement utilisées dans vos données
    used_classes = sorted(set(all_labels))
    used_target_names = [list(entity_to_id.keys())[cls] for cls in used_classes]

    # Passer les classes utilisées à classification_report
    print(classification_report(
        all_labels,
        all_preds,
        target_names=used_target_names,
        labels=used_classes
    ))



In [10]:
num_labels = len(entity_to_id) + 1  # inclure le label de fond
model = NERModel('bert-base-uncased', num_labels)

train_model(model, train_loader, test_loader, num_epochs=1, learning_rate=5e-5, device='cuda')

  attn_output = torch.nn.functional.scaled_dot_product_attention(
Epoch 1/1 - Training: 100%|██████████| 3670/3670 [03:23<00:00, 18.05batch/s, loss=9.4]  


Epoch 1, Train Loss: 5.5037


Validation: 100%|██████████| 460/460 [00:04<00:00, 100.68batch/s, loss=8.77] 


Validation Loss: nan


In [None]:
test_model(model, test_loader, entity_to_id, device='cuda')


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                                                     precision    recall  f1-score   support

                                    EventDispatcher       0.00      0.00      0.00    361472
                                   allele frequency       0.00      0.00      0.00        10
                                       network data       0.00      0.00      0.00        66
                                          amplitude       0.00      0.00      0.00        12
                                         dictionary       0.00      0.00      0.00         2
                                             spacer       0.00      0.00      0.00         3
                                  Mythical Creature       0.00      0.00      0.00         1
                            environmental condition       0.00      0.00      0.00         2
                                unit test framework       0.00      0.00      0.00        11
                                      Type of Study       0.00      0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
