In [None]:
import os
cachedir = './cache'
os.environ["TRANSFORMERS_CACHE"]=cachedir
os.environ["HF_DATASETS_CACHE"]=cachedir
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from transformers import (
    Trainer,
    TrainingArguments,
    default_data_collator,
    BertForSequenceClassification,
    BertTokenizer,
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    RobertaForSequenceClassification,
    RobertaTokenizer,
    AlbertForSequenceClassification,
    AlbertTokenizer,
    ElectraForSequenceClassification,
    ElectraTokenizer
)

# Define the custom dataset class
class HateSpeechDataset(Dataset):
    """Map-style dataset that tokenizes rows of a hate-speech CSV on access.

    The CSV must provide a ``text`` column (the input string) and a
    ``hatespeech`` column (the label).

    Args:
        csv_file: Path or file-like object handed straight to ``pd.read_csv``.
        max_length: Every sample is padded/truncated to this many tokens.
        tokenizer: Callable Hugging Face tokenizer. When omitted, the
            ``bert-base-uncased`` tokenizer is loaded on construction.
    """

    def __init__(self, csv_file, max_length, tokenizer=None):
        self.data = pd.read_csv(csv_file)
        # Resolve the default lazily: a call in the signature would run once
        # at class-definition time, loading the BERT tokenizer even for
        # callers that supply their own.
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.labels = self.data['hatespeech']  # Assuming 'hatespeech' is the column for labels

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return it as a dict of tensors.

        Returns:
            A dict with ``input_ids`` and ``attention_mask`` (each of shape
            ``(max_length,)``) and ``labels`` (a scalar tensor). The keys
            match the model's forward arguments because ``Trainer`` and
            ``default_data_collator`` consume mapping-style samples; a plain
            tuple cannot be collated by them.
        """
        # Positional access so the lookup does not depend on the frame's
        # index labels.
        text = self.data['text'].iloc[idx]
        encoded = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt',
            return_attention_mask=True,
            return_token_type_ids=False,
            truncation=True,
        )
        return {
            # return_tensors='pt' adds a leading batch dim of 1; drop it so
            # the collator can stack samples into (batch, max_length).
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            # NOTE(review): dtype follows the CSV column; classification
            # heads expect integer class ids - confirm the column is integral.
            'labels': torch.tensor(self.labels.iloc[idx]),
        }

# CSV file holding the 'text' and 'hatespeech' columns read by HateSpeechDataset.
path = "./measuring_hate_speech.csv"
max_len = 128  # Maximum sequence length (in tokens) each sample is padded/truncated to

# No tokenizer is passed here, so the dataset falls back to its default one.
hate_speech_dataset = HateSpeechDataset(csv_file=path, max_length=max_len)

# Batching settings for the DataLoader(s) built from the dataset.
batch_size = 32
shuffle = True  # Set to True if the data should be shuffled

In [None]:
# Registry of pretrained tokenizers, keyed by a short model-family name.
tok_dict = {}

# Load one tokenizer per supported architecture from its pretrained checkpoint.
bertok = BertTokenizer.from_pretrained('bert-base-uncased')
distilbertok = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
robertok = RobertaTokenizer.from_pretrained('roberta-base')
albertok = AlbertTokenizer.from_pretrained('albert-base-v2')
electratok = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')

# Register everything in one step; insertion order matches the load order.
tok_dict.update({
    'bert': bertok,
    'distilbert': distilbertok,
    'roberta': robertok,
    'albert': albertok,
    'electra': electratok,
})

In [None]:
# BERT Fine-Tuning
#
# Fine-tunes bert-base-uncased as a 5-way classifier on the hate-speech CSV,
# saves the model and tokenizer to ./model, reloads them, and runs one
# example through the reloaded model.

model_name = 'bert-base-uncased'
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=5)
hate_speech_dataset = HateSpeechDataset(
    csv_file=path,
    max_length=max_len,
    tokenizer=tok_dict['bert'],
)
# NOTE: Trainer builds its own DataLoader from train_dataset; this loader is
# not consumed by the training run below and is only useful for manual
# iteration over batches.
dataloader = DataLoader(
    hate_speech_dataset,
    batch_size=batch_size,
    shuffle=shuffle,
    collate_fn=default_data_collator,
)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model, # type: ignore
    args=training_args,
    train_dataset=hate_speech_dataset,
    data_collator=default_data_collator,
)

trainer.train()

# Save the model (weights + config) together with its tokenizer so the
# directory can be reloaded with from_pretrained.
model.save_pretrained('./model') # type: ignore
tok_dict['bert'].save_pretrained('./model')

# Load the model
model = BertForSequenceClassification.from_pretrained('./model')
tokenizer = BertTokenizer.from_pretrained('./model')

# Inference
text = "I hate you"
inputs = tokenizer(
    text,
    add_special_tokens=True,
    max_length=max_len,
    padding='max_length',
    return_tensors='pt',
    return_attention_mask=True,
    return_token_type_ids=False,
    truncation=True,
)
# Keep the leading batch dimension added by return_tensors='pt': the model
# expects (batch, seq_len) inputs, so squeezing it away breaks the forward pass.
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']
# Placeholder gold label (class 1) for this single example, shaped (batch,);
# it is only passed so the forward pass also returns a loss.
label = torch.tensor([1])

model.eval() # type: ignore
with torch.no_grad():  # no gradients are needed for a prediction
    outputs = model(input_ids, attention_mask=attention_mask, labels=label) # type: ignore
loss = outputs.loss
logits = outputs.logits
# Index of the highest-scoring class for each example in the batch.
pred = torch.argmax(logits, dim=1)