In [None]:
pip install torch torchvision torchaudio transformers datasets scikit-learn

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

#load the dataset
DATA_PATH = 'path/to/your/dataset.csv'  # TODO: point this at the real CSV file
REQUIRED_COLUMNS = ['text', 'label']    # columns read by the later tokenization/dataset cells
TEST_SIZE = 0.2
SEED = 42

df = pd.read_csv(DATA_PATH)

# Fail early with a clear message instead of a KeyError deep inside a later cell.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise KeyError(f"dataset is missing required column(s): {missing_columns}")

# Rows without a text or a label cannot be tokenized / turned into tensors,
# so drop them before splitting and report how many were removed.
df_clean = df.dropna(subset=REQUIRED_COLUMNS).reset_index(drop=True)
dropped_rows = len(df) - len(df_clean)
if dropped_rows:
    print(f"dropped {dropped_rows} row(s) with missing text/label")

#split the dataset into training and testing sets
train_df, test_df = train_test_split(df_clean, test_size=TEST_SIZE, random_state=SEED)

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

#load BERT tokenizer and model from the same pretrained checkpoint
checkpoint_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)
model = BertForSequenceClassification.from_pretrained(
    checkpoint_name,
    num_labels=2,  # two output labels
)

#tokenize the dataset
def tokenize_function(examples):
    """Tokenize the 'text' entry of ``examples`` with the module-level tokenizer.

    Sequences are truncated and padded with ``padding="max_length"``; the
    tokenizer's return value is passed through unchanged.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True, padding="max_length")

#tokenize training and testing data
train_texts = list(train_df['text'])
test_texts = list(test_df['text'])

# each split is truncated and padded by the tokenizer in a single call
train_encodings = tokenizer(train_texts, padding=True, truncation=True)
test_encodings = tokenizer(test_texts, padding=True, truncation=True)

#convert to torch dataset
import torch

class RacismDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with one label per example.

    Args:
        encodings: mapping (anything with ``.items()``) from feature name to a
            sequence holding one entry per example.
        labels: sequence with one label per example.

    Raises:
        ValueError: if any feature in ``encodings`` does not have exactly
            ``len(labels)`` entries.
    """

    def __init__(self, encodings, labels):
        # Validate up front: a length mismatch would otherwise only show up as
        # an IndexError while iterating, or as silently ignored examples.
        expected = len(labels)
        for key, values in encodings.items():
            if len(values) != expected:
                raise ValueError(
                    f"encodings[{key!r}] has {len(values)} entries "
                    f"but {expected} labels were given"
                )
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, including a 'labels' entry."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, defined by the number of labels."""
        return len(self.labels)

# pull the label column of each split into a plain list, then wrap it
# together with the matching encodings
train_labels = list(train_df['label'])
test_labels = list(test_df['label'])

train_dataset = RacismDataset(encodings=train_encodings, labels=train_labels)
test_dataset = RacismDataset(encodings=test_encodings, labels=test_labels)


In [None]:
import inspect

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# The install cell does not pin a version, so pick whichever keyword the
# installed TrainingArguments actually accepts instead of hard-coding one.
_eval_strategy_kwarg = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments).parameters
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    **{_eval_strategy_kwarg: 'epoch'},  # evaluate at the end of every epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# fine-tune on the training split
trainer.train()

# final evaluation on the held-out split; as the last expression of the cell
# the returned metrics are displayed
trainer.evaluate()

In [None]:
def predict(texts):
    """Predict a class index for each input text with the module-level model.

    Args:
        texts: a single string or an iterable of strings.

    Returns:
        A 1-D CPU tensor with one predicted class index (argmax over the
        logits) per input text. An empty input yields an empty long tensor.
    """
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        return torch.empty(0, dtype=torch.long)

    # After training the model may live on an accelerator while freshly
    # tokenized tensors are created on the CPU; move the inputs to the
    # model's device to avoid a device-mismatch error.
    device = next(model.parameters()).device
    encodings = tokenizer(texts, truncation=True, padding=True, return_tensors="pt")
    encodings = {key: val.to(device) for key, val in encodings.items()}

    # Run in eval mode so dropout is disabled, then restore whatever mode the
    # model was in so calling predict() has no lasting side effect.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**encodings)
    finally:
        model.train(was_training)

    return torch.argmax(outputs.logits, dim=-1).cpu()

#example prediction: run two sample comments through predict() and show the result
new_comments = [
    "racist comment example",
    "neutral comment example",
]
predictions = predict(new_comments)
print(predictions)

In [None]:
#save the model and its tokenizer side by side under the preferred path
save_path = 'your path here'
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)