# Get Data

téléchargez le json https://huggingface.co/datasets/Universal-NER/Pile-NER-type/blob/main/train.json
mettez le au meme niveau que ce script
puis éxécutez une fois le bloc suivant puis commentez le

In [1]:
import json
import re
import ast
from tqdm import tqdm

def load_data(filepath):
    """Loads data from a JSON file."""
    with open(filepath, 'r') as f:
        data = json.load(f)
    return data

def tokenize_text(text):
    """Tokenizes the input text into a list of tokens."""
    return re.findall(r'\w+(?:[-_]\w+)*|\S', text)

def extract_entity_spans(entry):
    """Extracts entity spans from an entry."""
    len_start = len("What describes ")
    len_end = len(" in the text?")
    entity_types, entity_texts, negative = [], [], []

    for c in entry['conversations']:
        if c['from'] == 'human' and c['value'].startswith('Text: '):
            text = c['value'][len('Text: '):]
            tokenized_text = tokenize_text(text)
        elif c['from'] == 'human' and c['value'].startswith('What describes '):
            entity_type = c['value'][len_start:-len_end]
            entity_types.append(entity_type)
        elif c['from'] == 'gpt' and c['value'].startswith('['):
            if c['value'] == '[]':
                negative.append(entity_types.pop())
                continue
            texts_ents = ast.literal_eval(c['value'])
            entity_texts.extend(texts_ents)
            num_repeat = len(texts_ents) - 1
            entity_types.extend([entity_types[-1]] * num_repeat)

    entity_spans = []
    for j, entity_text in enumerate(entity_texts):
        entity_tokens = tokenize_text(entity_text)
        matches = []
        for i in range(len(tokenized_text) - len(entity_tokens) + 1):
            if " ".join(tokenized_text[i:i + len(entity_tokens)]).lower() == " ".join(entity_tokens).lower():
                matches.append((i, i + len(entity_tokens) - 1, entity_types[j]))
        if matches:
            entity_spans.extend(matches)

    return {"tokenized_text": tokenized_text, "ner": entity_spans, "negative": negative}

def process_data(data):
    """Processes a list of data entries to extract entity spans."""
    all_data = [extract_entity_spans(entry) for entry in tqdm(data)]
    return all_data

def save_data_to_file(data, filepath):
    """Saves the processed data to a JSON file."""
    with open(filepath, 'w') as f:
        json.dump(data, f)

if __name__ == "__main__":
    # download the pile-ner data: "wget https://huggingface.co/datasets/Universal-NER/Pile-NER-type/blob/main/train.json"
    path_pile_ner = 'train.json'
    data = load_data(path_pile_ner)
    processed_data = process_data(data)
    save_data_to_file(processed_data, 'pilener_train.json')
    print(data[0])
    print("dataset size:", len(processed_data))

100%|██████████| 45889/45889 [01:08<00:00, 672.41it/s]


{'id': 'ner_0', 'conversations': [{'from': 'human', 'value': 'Text: Q:\n\nPosition character based on enemy coordinates in lua\n\nI have written a function here which should turn my character based on enemy coordinates but it\'s not perfect because it does not always turn where I want it to and perhaps there is a better way of writing it\nlocal myPosition = {x = 350, y = 355}\nlocal enemyPosition = {x = 352, y = 354}\nlocal xValue, yValue, xDir, yDir, dir\n\nif myPosition.x > enemyPosition.x then\n    xValue = myPosition.x - enemyPosition.x\nelseif myPosition.x < enemyPosition.x then\n    xValue = myPosition.x - enemyPosition.x\nelse\n    xValue = 0\nend\n\nif myPosition.y > enemyPosition.y then\n    yValue = myPosition.y - enemyPosition.y\nelseif myPosition.y < enemyPosition.y then\n    yValue = myPosition.y - enemyPosition.y\nelse\n    yValue = 0\nend\n\nif xValue < 0 then\n    xDir = "TURN RIGHT"\nelseif xValue > 0 then\n    xDir = "TURN LEFT"\nend\n\nif yValue < 0 then\n    yDir = 

# Prepare data

In [1]:
from transformers import BertTokenizer
import torch
from tqdm import tqdm
import json

def prepare_data_for_training(processed_data, tokenizer, max_length=128):
    input_ids, attention_masks, labels = [], [], []

    for entry in tqdm(processed_data, desc="Processing Data", unit="entry"):
        tokenized_text = entry["tokenized_text"]
        ner_spans = entry["ner"]
        
        # Tokenize the entire text
        encoded = tokenizer(" ".join(tokenized_text), padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
        
        # Generate label tensor
        label_tensor = torch.zeros(max_length, dtype=torch.long)
        for start, end, entity_type in ner_spans:
            if start < max_length and end < max_length:
                label_tensor[start:end + 1] = 1  # Mark entity spans as positive (or specific to entity types if needed)
        
        input_ids.append(encoded["input_ids"][0])
        attention_masks.append(encoded["attention_mask"][0])
        labels.append(label_tensor)

    return torch.stack(input_ids), torch.stack(attention_masks), torch.stack(labels)

# Charger les données générées précédemment
with open('pilener_train.json', 'r') as f:
    processed_data = json.load(f)

# Charger le tokenizer BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Préparer les données avec suivi d'avancement
input_ids, attention_masks, labels = prepare_data_for_training(processed_data, tokenizer)


Processing Data: 100%|██████████| 45889/45889 [02:58<00:00, 256.69entry/s]


Structure des données
Les données retournées par prepare_data_for_training sont des tensors PyTorch. Voici les détails :

input_ids : Contient les identifiants tokenisés correspondant au texte d'entrée.

Taille : (nombre_entrées, max_length)
Type : torch.Tensor (entiers longs)
Exemple : Les premiers identifiants d'entrée (transformés par le tokenizer).
attention_masks : Contient des masques indiquant les positions valides des tokens dans chaque séquence (1 pour les tokens valides, 0 pour le padding).

Taille : (nombre_entrées, max_length)
Type : torch.Tensor (entiers longs)
Exemple : Masque associé à input_ids.
labels : Contient les étiquettes pour les entités NER. Chaque position correspond à une classe (0 pour le non-entité, 1 ou plus pour des entités spécifiques selon vos définitions).

Taille : (nombre_entrées, max_length)
Type : torch.Tensor (entiers longs)
Exemple : Vecteur avec 0 (non-entité) ou 1+ (entités).

In [2]:
# Afficher les formes des tensors
print(f"Forme de input_ids : {input_ids.shape}")
print(f"Forme de attention_masks : {attention_masks.shape}")
print(f"Forme de labels : {labels.shape}")

# Afficher un exemple
print("\nExemple de input_ids (première entrée) :")
print(input_ids[0])

print("\nExemple de attention_masks (première entrée) :")
print(attention_masks[0])

print("\nExemple de labels (première entrée) :")
print(labels[0])


Forme de input_ids : torch.Size([45889, 128])
Forme de attention_masks : torch.Size([45889, 128])
Forme de labels : torch.Size([45889, 128])

Exemple de input_ids (première entrée) :
tensor([  101,  1053,  1024,  2597,  2839,  2241,  2006,  4099, 12093,  1999,
        11320,  2050,  1045,  2031,  2517,  1037,  3853,  2182,  2029,  2323,
         2735,  2026,  2839,  2241,  2006,  4099, 12093,  2021,  2009,  1005,
         1055,  2025,  3819,  2138,  2009,  2515,  2025,  2467,  2735,  2073,
         1045,  2215,  2009,  2000,  1998,  3383,  2045,  2003,  1037,  2488,
         2126,  1997,  3015,  2009,  2334,  2026, 26994,  1027,  1063,  1060,
         1027,  8698,  1010,  1061,  1027, 26271,  1065,  2334,  4099, 26994,
         1027,  1063,  1060,  1027, 28906,  1010,  1061,  1027, 27878,  1065,
         2334, 15566,  2389,  5657,  1010,  1061, 10175,  5657,  1010,  1060,
         4305,  2099,  1010, 21076,  4313,  1010, 16101,  2065,  2026, 26994,
         1012,  1060,  1028,  4099, 2

# Model train & test

In [17]:
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
import torch
from transformers import BertTokenizer, BertModel
from tqdm import tqdm  # Import de tqdm

class GLiNERModel(nn.Module):
    def __init__(self, pretrained_model='bert-base-uncased'):
        super(GLiNERModel, self).__init__()
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_model)
        self.bert = BertModel.from_pretrained(pretrained_model)
        
        # Ajuster la couche pour gérer les dimensions concaténées (2 * hidden_size)
        self.ffn = nn.Sequential(
            nn.Linear(2 * self.bert.config.hidden_size, 768),  # 2x car on concatène deux embeddings
            nn.ReLU(),
            nn.Linear(768, 1)  # Pour générer un score pour chaque span
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state
        span_scores = self.compute_span_scores(hidden_states)  # Tenseur (batch_size, seq_len, seq_len)
        return span_scores

    def compute_span_scores(self, hidden_states):
        batch_size, seq_len, hidden_size = hidden_states.size()
        max_span_length = 12  # Limite de la longueur du span
        scores = torch.full((batch_size, seq_len, seq_len), float(0), device=hidden_states.device)

        for b in range(batch_size):  # Parcourir chaque exemple dans le batch
            for i in range(seq_len):  # Pour chaque token dans la séquence
                for j in range(i + 1, min(i + max_span_length + 1, seq_len)):  # Limiter la longueur du span
                    span_embedding = torch.cat((hidden_states[b, i], hidden_states[b, j]), dim=-1)
                    span_score = self.ffn(span_embedding)  # Calculer le score du span
                    scores[b, i, j] = span_score.squeeze(-1)  # Stocker le score dans le tenseur
        return scores


# Créer un DataLoader (assurez-vous que input_ids, attention_masks et labels sont définis)
dataset = TensorDataset(input_ids, attention_masks, labels)
dataloader = DataLoader(dataset, batch_size=1, shuffle=False)

# Charger uniquement une portion des données
portion = 0.005  # Utilisez 10% des données
subset_size = int(len(dataset) * portion)
mini_dataset = TensorDataset(
    input_ids[:subset_size], attention_masks[:subset_size], labels[:subset_size]
)

# Créer un DataLoader pour le mini-dataset
mini_dataloader = DataLoader(mini_dataset, batch_size=1, shuffle=False)


# Initialiser le modèle et les paramètres d'entraînement
model = GLiNERModel(pretrained_model='bert-base-uncased')
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Boucle d'entraînement ajustée
epochs = 10
# Boucle d'entraînement ajustée avec mini_dataloader
for epoch in range(epochs):
    model.train()
    total_loss = 0

    print(f"Epoch {epoch + 1}/{epochs}:")
    with tqdm(total=len(mini_dataloader), desc="Training") as pbar:
        for batch in mini_dataloader:
            batch_input_ids, batch_attention_masks, batch_labels = [b.to(device) for b in batch]
            
            # Forward pass
            outputs = model(batch_input_ids, batch_attention_masks)
            
            # Réorganiser les dimensions pour correspondre aux labels
            outputs = outputs.view(-1, outputs.size(-1))
            batch_labels = batch_labels.view(-1)
            
            # Calculer la perte
            loss = criterion(outputs, batch_labels)
            total_loss += loss.item()
            
            # Backward pass et optimisation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            # Mettre à jour la barre de progression
            pbar.update(1)
            pbar.set_postfix({"loss": loss.item()})

    print(f"Epoch {epoch + 1}/{epochs} - Average Loss: {total_loss / len(mini_dataloader)}")

print("Entraînement terminé !")


Epoch 1/10:


Training: 100%|██████████| 229/229 [03:36<00:00,  1.06it/s, loss=4.76]


Epoch 1/10 - Average Loss: 4.759223673541473
Epoch 2/10:


Training: 100%|██████████| 229/229 [03:36<00:00,  1.06it/s, loss=4.76]


Epoch 2/10 - Average Loss: 4.752854284761254
Epoch 3/10:


Training: 100%|██████████| 229/229 [03:36<00:00,  1.06it/s, loss=4.76]


Epoch 3/10 - Average Loss: 4.752221913316885
Epoch 4/10:


Training: 100%|██████████| 229/229 [03:36<00:00,  1.06it/s, loss=4.76]


Epoch 4/10 - Average Loss: 4.7512025000226545
Epoch 5/10:


Training: 100%|██████████| 229/229 [03:37<00:00,  1.05it/s, loss=4.76]


Epoch 5/10 - Average Loss: 4.753131791493778
Epoch 6/10:


Training: 100%|██████████| 229/229 [03:38<00:00,  1.05it/s, loss=4.76]


Epoch 6/10 - Average Loss: 4.7504627152821906
Epoch 7/10:


Training: 100%|██████████| 229/229 [03:36<00:00,  1.06it/s, loss=4.76]


Epoch 7/10 - Average Loss: 4.75010921861407
Epoch 8/10:


Training: 100%|██████████| 229/229 [03:32<00:00,  1.08it/s, loss=4.76]


Epoch 8/10 - Average Loss: 4.749693093862075
Epoch 9/10:


Training: 100%|██████████| 229/229 [03:31<00:00,  1.08it/s, loss=4.76]


Epoch 9/10 - Average Loss: 4.749861021749838
Epoch 10/10:


Training: 100%|██████████| 229/229 [03:32<00:00,  1.08it/s, loss=4.76]

Epoch 10/10 - Average Loss: 4.749651105122795
Entraînement terminé !





In [18]:
def test_model(model, dataloader, device):
    model.eval()  # Passer en mode évaluation
    print("\n=== Test du modèle ===")
    
    with torch.no_grad():  # Désactiver la backpropagation
        for batch in dataloader:
            batch_input_ids, batch_attention_masks, batch_labels = [b.to(device) for b in batch]
            
            # Obtenir les scores des spans
            outputs = model(batch_input_ids, batch_attention_masks)
            
            # Afficher les dimensions des scores
            print("Dimensions des scores :", outputs.size())  # (batch_size, seq_len, seq_len)
            
            # Afficher quelques exemples de scores
            batch_size, seq_len, _ = outputs.size()
            for b in range(batch_size):
                print(f"\nExemple {b+1} :")
                print("Scores des spans (troncature) :")
                for i in range(min(seq_len, 5)):  # Limiter l'affichage à 5 tokens pour la lisibilité
                    print(outputs[b, i, :min(seq_len, 5)].cpu().numpy())
            
            # Tester une prédiction simple : trouver le span avec le score maximal
            max_scores, indices = torch.max(outputs.view(batch_size, -1), dim=1)
            start_indices = indices // seq_len
            end_indices = indices % seq_len
            print("\nIndices prédits pour les spans :")
            for b in range(batch_size):
                print(f"Exemple {b+1} : Start={start_indices[b].item()}, End={end_indices[b].item()}, Score={max_scores[b].item()}")
            
            break  # Tester uniquement sur le premier batch pour l'instant

# Appel du test après l'entraînement
test_model(model, mini_dataloader, device)
predicted_text = model.tokenizer.decode(batch_input_ids[0, 80:91].tolist())
print(f"Texte prédit pour le meilleur span : {predicted_text}")



=== Test du modèle ===
Dimensions des scores : torch.Size([1, 128, 128])

Exemple 1 :
Scores des spans (troncature) :
[ 0.        -4.4619265 -4.7122035 -4.380763  -4.728676 ]
[  0.         0.       -14.446016 -14.31452  -14.511815]
[  0.          0.          0.        -14.1396475 -14.337201 ]
[  0.         0.         0.         0.       -14.261844]
[0. 0. 0. 0. 0.]

Indices prédits pour les spans :
Exemple 1 : Start=0, End=0, Score=0.0
Texte prédit pour le meilleur span : ##f ) or azathioprine, often associated with


In [None]:
# import torch
# from transformers import BertTokenizer, BertModel
# import torch.nn as nn

# class GLiNERModel(nn.Module):
#     def __init__(self, pretrained_model='bert-base-uncased'):
#         super(GLiNERModel, self).__init__()
#         self.tokenizer = BertTokenizer.from_pretrained(pretrained_model)
#         self.bert = BertModel.from_pretrained(pretrained_model)
        
#         # Ajuster la couche pour gérer les dimensions concaténées (2 * hidden_size)
#         self.ffn = nn.Sequential(
#             nn.Linear(2 * self.bert.config.hidden_size, 768),  # 2x car on concatène deux embeddings
#             nn.ReLU(),
#             nn.Linear(768, 1)  # Pour générer un score pour chaque span
#         )

#     def forward(self, input_ids, attention_mask):
#         # Encodage du texte avec BERT
#         outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
#         hidden_states = outputs.last_hidden_state  # (batch_size, seq_len, hidden_size)
        
#         # Représentation des spans
#         span_scores = self.compute_span_scores(hidden_states)
        
#         return span_scores

#     def compute_span_scores(self, hidden_states):
#         """
#         Cette fonction calcule la représentation des spans et leur score.
#         Pour simplifier, nous utilisons les premiers et derniers tokens du span pour calculer le score.
#         """
#         batch_size, seq_len, hidden_size = hidden_states.size()
#         span_scores = []
        
#         for b in range(batch_size):  # Parcourir chaque exemple dans le batch
#             example_scores = []
#             for i in range(seq_len - 1):  # Pour chaque token dans la séquence
#                 for j in range(i + 1, min(i + 13, seq_len)):  # Limiter la longueur du span
#                     span_embedding = torch.cat((hidden_states[b, i], hidden_states[b, j]), dim=-1)
#                     span_score = self.ffn(span_embedding)  # Calculer le score du span
#                     example_scores.append(span_score)
#             span_scores.append(torch.stack(example_scores))
        
#         return span_scores

# # Exemple d'utilisation du modèle avec un input fictif
# model = GLiNERModel()

# # Exemple d'input
# text = ["Alain Farley works at McGill University"]
# inputs = model.tokenizer(text, return_tensors='pt', padding=True, truncation=True)

# # Forward pass
# span_scores = model(inputs['input_ids'], inputs['attention_mask'])

# print(span_scores)
# # Afficher les scores des spans
# for i, scores in enumerate(span_scores):
#     print(f"Example {i}: {scores.shape}")




[tensor([[-0.1311],
        [-0.0504],
        [-0.0652],
        [-0.2085],
        [-0.1837],
        [-0.1692],
        [-0.1602],
        [-0.1907],
        [-0.0938],
        [-0.1070],
        [-0.2364],
        [-0.2298],
        [-0.1192],
        [-0.1750],
        [-0.2772],
        [-0.0874],
        [-0.2078],
        [-0.2143],
        [-0.1640],
        [-0.2102],
        [-0.2847],
        [-0.1154],
        [-0.0959],
        [-0.0901],
        [-0.0634],
        [-0.1718],
        [-0.0674],
        [-0.0672],
        [-0.0182],
        [-0.1385],
        [-0.1543],
        [-0.0863],
        [-0.1317],
        [-0.1460],
        [-0.1849],
        [-0.1721]], grad_fn=<StackBackward0>)]
Example 0: torch.Size([36, 1])
