In [1]:
from transformers import BertTokenizer, BertForMaskedLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
import re
import torch
import random 

# Charger le tokenizer et le modèle BERT pré-entraîné pour MLM
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name)

# Charger votre dataset
# Remplacez par le chargement de votre propre dataset
dataset = load_dataset('SALT-NLP/silent_signals_detection')

# Tokeniser le dataset
def clean_text(text):
    # Convertir en minuscules
    text = text.lower()
    # Supprimer les caractères spéciaux et les chiffres
    text = re.sub(r'[^a-zA-Z\s]', '', text) # ? 
    # Supprimer les espaces superflus
    text = re.sub(r'\s+', ' ', text).strip()
    return text
    
def tokenize_function(examples):
    # Appliquer le nettoyage du texte
    examples['example'] = [clean_text(text) for text in examples['example']]
    # Tokeniser le texte nettoyé
    return tokenizer(examples['example'], padding="max_length", truncation=True)

def transform_label(example):
    example['label'] = 1 if example['label'] == "coded" else 0
    return example


def mask_tokens(examples, mlm_probability=0.15):
    examples['example'] = [clean_text(text) for text in examples['example']]

    tokenized_inputs = tokenizer(examples['example'], padding="max_length", truncation=True, return_tensors="pt")

    for i, tokens in enumerate(tokenized_inputs['input_ids']):
        # Masquer les "dog whistles"
        dw = examples['dog_whistle'][i]
        dw_ids = tokenizer(dw, add_special_tokens=False)['input_ids']
        for start_idx in range(len(tokens) - len(dw_ids) + 1):
            if tokens[start_idx:start_idx+len(dw_ids)].equal(torch.tensor(dw_ids)):
                for idx in range(start_idx, start_idx+len(dw_ids)):
                    tokens[idx] = tokenizer.mask_token_id

        # Masquer des tokens aléatoires
        for idx in range(len(tokens)):
            if random.random() < mlm_probability and tokens[idx] not in [tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id]:
                tokens[idx] = tokenizer.mask_token_id

    return tokenized_inputs


dog_whistle_list = list(set(dataset["train"]["dog_whistle"]))


#tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = dataset.map(lambda examples: mask_tokens(examples), batched=True)
                                 
if 'label' in tokenized_datasets["train"].column_names:
    tokenized_datasets = tokenized_datasets.remove_columns('label')

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)


# Préparer les données pour l'entraînement
train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(50))  # Exemple avec 1000 échantillons
eval_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(10))  # Exemple avec 500 échantillons

# Configurer les arguments d'entraînement
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
)

# Initialiser le Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator = data_collator,
)

# Lancer l'entraînement
trainer.train()

model.save_pretrained("./fine_tuned_mlm_model")
tokenizer.save_pretrained("./fine_tuned_mlm_model")


BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archite

Epoch,Training Loss,Validation Loss
1,No log,3.672476
2,No log,3.523716
3,No log,3.684331
4,No log,3.732661
5,No log,4.231956


('./fine_tuned_mlm_model/tokenizer_config.json',
 './fine_tuned_mlm_model/special_tokens_map.json',
 './fine_tuned_mlm_model/vocab.txt',
 './fine_tuned_mlm_model/added_tokens.json')

In [5]:
from transformers import BertTokenizer, BertForMaskedLM, pipeline
import torch


# Exemple de phrase
phrase = 'I have bad news for all the [MASK] and enablers—By the time I am done reporting on these “all ages” and “family friendly” drag shows in TX, they’re going to be illegal, and there is nothing you can do about it.'

# Tokeniser la phrase
inputs = tokenizer(phrase, return_tensors="pt")
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
inputs = inputs.to(device)
# Obtenir les prédictions du modèle
with torch.no_grad():
    outputs = model(**inputs)

print(outputs)
# Obtenir les prédictions pour le token masqué
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
predicted_token_ids = torch.argmax(outputs.logits[0, mask_token_index], dim=1)
predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_token_ids)

# Afficher les prédictions
print("Phrase d'origine :", phrase)
print("Prédictions :", predicted_tokens)

# Utiliser la pipeline pour une utilisation plus simple
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
print(fill_mask(phrase))



Device set to use cuda:0


MaskedLMOutput(loss=None, logits=tensor([[[ -6.8968,  -6.7949,  -6.8031,  ...,  -6.2194,  -6.1713,  -4.3155],
         [-12.8394, -12.3245, -12.4436,  ..., -10.3723, -10.9423, -10.4785],
         [-12.4126, -11.7922, -12.3494,  ...,  -9.5282, -12.4392, -10.6429],
         ...,
         [-10.0694, -10.0425,  -9.7964,  ...,  -9.1855,  -8.8852, -11.6839],
         [-15.9588, -15.5124, -15.5810,  ..., -11.4256, -12.2536, -13.1669],
         [-13.0300, -13.0519, -13.0548,  ...,  -9.5997,  -9.9166, -10.5098]]],
       device='cuda:0'), hidden_states=None, attentions=None)
Phrase d'origine : I have bad news for all the [MASK] and enablers—By the time I am done reporting on these “all ages” and “family friendly” drag shows in TX, they’re going to be illegal, and there is nothing you can do about it.
Prédictions : ['promoters']
[{'score': 0.25800225138664246, 'token': 26512, 'token_str': 'promoters', 'sequence': 'i have bad news for all the promoters and enablers — by the time i am done reporti

In [6]:
#MLM + classifier ? 
from datasets import load_dataset

dataset = load_dataset('SALT-NLP/silent_signals_detection')
list(set(dataset["train"]["dog_whistle"]))


['norm17',
 '#Milk',
 'norm24',
 'gibsmedat',
 'magapede',
 '#GenderWooWoo',
 'norm23',
 'norm20',
 'groomer',
 'norm8',
 'norm14',
 'soy',
 'coincidence',
 'norm4',
 'Zioworld',
 '109',
 'fatherless child',
 'Durden',
 'Rothschilds',
 'norm9',
 'alarmist',
 'jogger',
 'norm0',
 'norm15',
 'grooming',
 'bop',
 'fatherless',
 'steroids',
 'norm5',
 'majority-minority',
 'alt-right',
 'cabal',
 'terrorist',
 'norm10',
 'norm16',
 'norm2',
 'groomers',
 'norm19',
 'norm13',
 'anointed',
 'clownfish',
 'hygienic',
 'windmill',
 'nibba',
 'octopus',
 'norm22',
 'based',
 'joggers',
 'norm3',
 'Google',
 'globalism',
 'norm1',
 'norm7',
 'thug',
 'globalist',
 'autogynephilia',
 'genderfree',
 'Aiden',
 'terrorists',
 'norm12',
 'Reagan',
 'echo',
 'kosherist',
 'Islamists',
 'norm11',
 'Boogaloo',
 'Judeo-Christian',
 'TRAs',
 'Illuminati',
 'Pajeet',
 'intact',
 'fren',
 'norm6',
 'Kek',
 'YWNBAW',
 'womyn',
 'womanface',
 'Khazars',
 'COIN',
 'XX',
 'shoah',
 '23/16',
 'norm18',
 'identit