In [1]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import re 


model = BertForSequenceClassification.from_pretrained("./fine_tuned_mlm_model", num_labels=2)  # Assurez-vous que num_labels correspond à votre tâche
tokenizer = BertTokenizer.from_pretrained("./fine_tuned_mlm_model")
# Charger votre dataset
# Remplacez par le chargement de votre propre dataset
dataset = load_dataset('SALT-NLP/silent_signals_detection')

# Tokeniser le dataset
def clean_text(text):
    # Convertir en minuscules
    text = text.lower()
    # Supprimer les caractères spéciaux et les chiffres
    text = re.sub(r'[^a-zA-Z\s]', '', text) # ? 
    # Supprimer les espaces superflus
    text = re.sub(r'\s+', ' ', text).strip()
    return text
    
def tokenize_function(examples):
    # Appliquer le nettoyage du texte
    examples['example'] = [clean_text(text) for text in examples['example']]
    # Tokeniser le texte nettoyé
    return tokenizer(examples['example'], padding="max_length", truncation=True)

def transform_label(example):
    example['label'] = 1 if example['label'] == "coded" else 0
    return example

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.map(transform_label)



# Préparer les données pour l'entraînement
train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(50))  # Exemple avec 1000 échantillons
eval_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(10))  # Exemple avec 500 échantillons

# Configurer les arguments d'entraînement
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialiser le Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Lancer l'entraînement
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./fine_tuned_mlm_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 9.50 GiB of which 4.94 MiB is free. Process 2908356 has 76.00 MiB memory in use. Process 2966018 has 644.00 MiB memory in use. Process 2967470 has 9.11 GiB memory in use. Process 2968448 has 366.00 MiB memory in use. Of the allocated memory 272.28 MiB is allocated by PyTorch, and 19.72 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [95]:
filtered_dataset = tokenized_datasets.filter(lambda example: example["label"] == 1)
dict_ingroup = {k:i for i,k in enumerate(set(filtered_dataset["train"]["ingroup"]))}
def update_label(example):
    example["label"] = dict_ingroup[example["ingroup"]]
    return example

filtered_dataset = filtered_dataset.map(update_label)

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

In [110]:
# Charger le tokenizer et le modèle BERT pré-entraîné
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

# Charger votre dataset

filtered_dataset = tokenized_datasets.filter(lambda example: example["label"] == 1)
dict_ingroup = {k:i for i,k in enumerate(set(filtered_dataset["train"]["ingroup"]))}
inv_dict_ingroup ={v:k for (k,v) in dict_ingroup.items()}

def update_label(example):
    example["label"] = dict_ingroup[example["ingroup"]]
    return example

filtered_dataset = filtered_dataset.map(update_label)
num_classes = len(dict_ingroup)

model_2 = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)  # Assurez-vous que num_labels correspond à votre tâche



# Préparer les données pour l'entraînement
train_dataset = filtered_dataset["train"].shuffle(seed=42).select(range(50))  # Exemple avec 1000 échantillons
eval_dataset = filtered_dataset["train"].shuffle(seed=42).select(range(10))  # Exemple avec 500 échantillons

# Configurer les arguments d'entraînement
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialiser le Trainer
trainer = Trainer(
    model=model_2,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Lancer l'entraînement
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,2.025656
2,No log,1.981113
3,No log,1.960831


TrainOutput(global_step=12, training_loss=2.1223891576131186, metrics={'train_runtime': 24.0284, 'train_samples_per_second': 6.243, 'train_steps_per_second': 0.499, 'total_flos': 39469138790400.0, 'train_loss': 2.1223891576131186, 'epoch': 3.0})

In [126]:
import torch
from transformers import pipeline

# Exemple de phrase à tester
test_sentence = "Lesbian GC stuff will get wiped as well because queer content gets flagged as NSFW. Idiots. They’d cut off their own nose to spite the face."

test_sentence = clean_text(test_sentence)
# Créer une pipeline de classification
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer)

# Faire une prédiction
prediction = classifier(test_sentence)

# Afficher le résultat


if prediction[0]["label"]=="LABEL_1":
    print("The sentence seems to be using a dog-whistle")
    classifier2 = pipeline('text-classification', model=model_2, tokenizer=tokenizer)
    pred2 = classifier2(test_sentence)
    label = int(pred2[0]["label"][-1])
    print("The dogwhistle been used is :",inv_dict_ingroup[label])

else:
    print("There does not seem to be a dog-whistle")


Device set to use cuda:0


There does not seem to be a dog-whistle
