# Importation des modules

In [17]:
from transformers import pipeline
import csv
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Hugging Face model identifier; the tokenizer and the classification model
# below are both loaded from this same checkpoint name.
checkpoint = "bert-base-uncased"

# Pre-processing

## Importation des données

In [2]:
# Parallel per-sentence lists: tokens[i][j] is the first column and
# labels[i][j] the second column of the same CSV row, grouped by sentence.
tokens = []
labels = []

# newline="" lets the csv module handle line endings itself, as recommended
# by the csv documentation for files handed to csv.reader.
with open("../data/fake_dataset.csv", newline="") as csvfile:

    reader = csv.reader(csvfile, delimiter=",")

    # Buffers holding the sentence currently being read.
    temp_tokens = []
    temp_labels = []

    # Skip the first line (header); the default avoids StopIteration on an
    # empty file.
    next(reader, None)

    for row in reader:

        if not row:
            # csv.reader yields [] for blank lines; they carry no token.
            continue

        if row == ["#", "#"]:  # Sentence boundary marker

            # Only flush non-empty buffers, so that a marker placed before
            # the first sentence (or repeated markers) does not create
            # empty sentences.
            if temp_tokens:
                tokens.append(temp_tokens)
                labels.append(temp_labels)

                # Start fresh buffers for the next sentence.
                temp_tokens = []
                temp_labels = []

        else:
            temp_tokens.append(row[0])
            temp_labels.append(row[1])

    # Flush the last sentence: without this, a file that does not end with a
    # "#,#" marker would silently lose its final sentence.
    if temp_tokens:
        tokens.append(temp_tokens)
        labels.append(temp_labels)

## Mise en forme des données au format attendu par BERT

In [65]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Fail early with an explicit message instead of an opaque error raised from
# inside pad_sequence when there is nothing to encode.
if not tokens:
    raise ValueError("`tokens` is empty: no sentence to encode.")

processed_tokens = []

for sentence in tokens:

    # Map every token to its vocabulary ID and wrap the sentence with the
    # special [CLS] and [SEP] tokens.
    # NOTE(review): this is a direct vocabulary lookup (one ID per input
    # token, no sub-word splitting, no lowercasing); tokens absent from the
    # vocabulary presumably map to the unknown-token ID — confirm the dataset
    # is already in the form the checkpoint expects.
    input_ids = (
        [tokenizer.cls_token_id]
        + [tokenizer.convert_tokens_to_ids(token) for token in sentence]
        + [tokenizer.sep_token_id]
    )

    processed_tokens.append(torch.tensor(input_ids))

# Pad every sentence on the right with the pad token so that all rows have
# the same length (batch dimension first).
padded_tokens = torch.nn.utils.rnn.pad_sequence(processed_tokens,
                                                batch_first=True,
                                                padding_value=tokenizer.pad_token_id)

# Attention mask: 1 on real positions, 0 on padding. It is derived from the
# true sequence lengths rather than by comparing IDs with pad_token_id, so a
# genuine token whose ID happens to equal the pad ID is never masked out.
sequence_lengths = torch.tensor([len(ids) for ids in processed_tokens])
positions = torch.arange(padded_tokens.size(1))
attention_mask = (positions.unsqueeze(0) < sequence_lengths.unsqueeze(1)).to(padded_tokens.dtype)

# BERT

In [16]:
# Load the pretrained checkpoint with a sequence-classification head
# configured for 3 output labels.
# NOTE(review): the dataset read earlier stores one label per token, whereas
# this head presumably predicts one label per sequence — confirm whether a
# token-classification model was intended.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=3,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# Fill-mask pipeline built from the same checkpoint as the tokenizer and the
# model above, instead of repeating the model name as a second literal.
unmasker = pipeline("fill-mask", model=checkpoint)