# Importation des modules

In [84]:
from transformers import pipeline
import csv
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.preprocessing import LabelEncoder

checkpoint = "bert-base-uncased"

# Pre-processing

## Importation des données

In [75]:
tokens = []
labels = []

with open("../data/fake_dataset.csv") as csvfile:
    
    reader = csv.reader(csvfile, delimiter=",")
    
    temp_tokens = []
    temp_labels = []
    
    next(reader) # Ignore première ligne (en-tête)
    
    for row in reader:
        
        if row == ["#", "#"]: # Nouvelle phrase
            
            # Ajout des listes tampon
            tokens.append(temp_tokens)
            labels.append(temp_labels)
            
            # On vide les listes tampon
            temp_tokens = []
            temp_labels = []
        
        else:
            temp_tokens.append(row[0])
            temp_labels.append(row[1])

In [88]:
label_encoder = LabelEncoder()
label_encoder.fit(["O", "B-NPI", "I-NPI"])

label_ids = [[label_encoder.transform(label) for label in sent_labels] for sent_labels in labels]

ValueError: y should be a 1d array, got an array of shape () instead.

## Mise en forme des données dans le bon format pour BERT

In [80]:
torch.tensor(labels)

ValueError: too many dimensions 'str'

In [78]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

processed_tokens = []

for sentence in tokens:
    
    # Conversion des tokens en IDs
    # Et ajout des special tokens [CLS] et [SEP]
    input_ids = [tokenizer.cls_token_id] + [tokenizer.convert_tokens_to_ids(token) for token in sentence] + [tokenizer.sep_token_id]
    
    processed_tokens.append(torch.tensor(input_ids))
    
# Application du padding (ajout de tokens spéciaux pour que toutes les phrases aient la même longueur)
padded_tokens = torch.nn.utils.rnn.pad_sequence(processed_tokens, 
                                                batch_first=True,
                                                padding_value=tokenizer.pad_token_id)

# Application du padding (ajout de tokens spéciaux pour que toutes les phrases aient la même longueur)
padded_labels = torch.nn.utils.rnn.pad_sequence(torch.tensor(labels), 
                                                batch_first=True,
                                                padding_value=tokenizer.pad_token_id)

# Création de l'attention mask
attention_mask = torch.zeros_like(padded_tokens) # Création d'un tensor rempli de 0 avec la même shape que padded_tokens
attention_mask[padded_tokens != tokenizer.pad_token_id] = 1 # Si le token est différent du token de padding, on met 1

ValueError: too many dimensions 'str'

In [73]:
padded_labels

NameError: name 'padded_labels' is not defined

# BERT

In [70]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, 
                                                           num_labels=3)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [72]:
batch_size = 3

data = list(zip(processed_tokens, labels, attention_mask))
dataloader = torch.utils.data.DataLoader(data, batch_size=batch_size, shuffle=True)

# Set up the loss function
loss_fn = torch.nn.CrossEntropyLoss()

# Put the model into training mode
model.train()


# Loop through the batches
for batch in dataloader:

    # Extract the input IDs, token type IDs, and attention masks for this batch
    input_ids = batch[0]['input_ids']
    token_type_ids = batch[0]['token_type_ids']
    attention_mask = batch[2]
    
    # Pass the inputs through the BERT model to obtain hidden representations for each token
    with torch.no_grad():
        outputs = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
        hidden_states = outputs[0]
    
    # Flatten the hidden states tensor and extract the hidden states for non-padding tokens
    flattened_hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
    non_padding_mask = attention_mask.view(-1) == 1
    token_hidden_states = flattened_hidden_states[non_padding_mask]
    
    # Classify the tokens into one of the three classes
    logits = model.classifier(token_hidden_states)
    probabilities = torch.softmax(logits, dim=-1)
    predicted_labels = torch.argmax(probabilities, dim=-1)
    
    # Compute the loss
    true_labels = batch[1]
    non_padding_labels = true_labels[non_padding_mask]
    loss = loss_fn(logits, non_padding_labels)
    
    # Print the loss
    print(loss)

RuntimeError: stack expects each tensor to be equal size, but got [8] at entry 0 and [12] at entry 1

In [None]:
unmasker = pipeline("fill-mask", model = "bert-base-uncased")