In [1]:
import numpy as np
import torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import load_dataset

In [2]:
# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Load dataset
dataset = load_dataset("Hate-speech-CNERG/hatexplain", trust_remote_code=True)

In [4]:
# Process the dataset
def process_example(example):
    # Use majority voting for the label
    label_counts = {}
    for label in example['annotators']['label']:
        label_counts[label] = label_counts.get(label, 0) + 1
    majority_label = max(label_counts, key=label_counts.get)
    
    # Combine tokens into a single string
    text = " ".join(example['post_tokens'])
    return {"text": text, "label": majority_label}

In [5]:
# Apply processing to the dataset
processed_dataset = dataset.map(process_example, remove_columns=dataset['train'].column_names)

In [6]:
# Inspect the processed dataset
print(processed_dataset['train'][0])

{'text': 'u really think i would not have been raped by feral hindu or muslim back in india or bangladesh and a neo nazi would rape me as well just to see me cry', 'label': 2}


In [7]:
# Tokenizer setup
model_name = "cardiffnlp/twitter-roberta-base-hate"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(example):
    return tokenizer(example["text"], padding="max_length", truncation=True)

# Tokenize the dataset
tokenized_datasets = processed_dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets = tokenized_datasets.with_format("torch")

config.json:   0%|          | 0.00/700 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Map:   0%|          | 0/15383 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/1922 [00:00<?, ? examples/s]

Map:   0%|          | 0/1924 [00:00<?, ? examples/s]

In [8]:
# Split datasets
train_dataset = tokenized_datasets["train"]
test_dataset = tokenized_datasets["test"]
label_list = sorted(set(processed_dataset["train"]["label"]))

In [10]:
# Model setup
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_list), ignore_mismatched_sizes=True)
model = model.to(device)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-hate and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([3]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Define evaluation metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred

    # If logits are NumPy arrays:
    # predictions = np.argmax(logits, axis=-1)

    # If logits might be PyTorch tensors, convert them to NumPy:
    # (But usually, Hugging Face Trainer sends them as NumPy already)
    if isinstance(logits, torch.Tensor):
        predictions = torch.argmax(logits, dim=-1).cpu().numpy()
    else:
        predictions = np.argmax(logits, axis=-1)

    # labels is already a NumPy array, so we can directly use it
    # If needed, ensure it's NumPy:
    # if isinstance(labels, torch.Tensor):
    #     labels = labels.cpu().numpy()

    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")
    acc = accuracy_score(labels, predictions)

    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }

In [12]:
# Set up TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # Updated from evaluation_strategy
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    load_best_model_at_end=True,  # Optional for best model selection
    fp16=torch.cuda.is_available(),  # Enable mixed precision if GPU is available
)

In [13]:
# Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [14]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.7255,0.692744,0.7079,0.695825,0.699264,0.7079
2,0.5594,0.719441,0.697505,0.694036,0.692997,0.697505
3,0.5557,0.777436,0.699584,0.694072,0.691456,0.699584


TrainOutput(global_step=2886, training_loss=0.6200791054604703, metrics={'train_runtime': 152.8465, 'train_samples_per_second': 301.93, 'train_steps_per_second': 18.882, 'total_flos': 1476895697391996.0, 'train_loss': 0.6200791054604703, 'epoch': 3.0})

In [15]:
# Evaluate the model
results = trainer.evaluate()
print("Evaluation Results:", results)

Evaluation Results: {'eval_loss': 0.6927439570426941, 'eval_accuracy': 0.7079002079002079, 'eval_f1': 0.6958247005965756, 'eval_precision': 0.6992635581184986, 'eval_recall': 0.7079002079002079, 'eval_runtime': 1.4013, 'eval_samples_per_second': 1373.058, 'eval_steps_per_second': 86.351, 'epoch': 3.0}


In [16]:
# Save the model
save_path = f"./hate-speech-{model_name.replace('/', '-')}"  # Replace slashes in model name with dashes for valid paths
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"Model and tokenizer saved to: {save_path}")

Model and tokenizer saved to: ./hate-speech-cardiffnlp-twitter-roberta-base-hate
