In [39]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader

# Both the tokenizer and the classifier are created from the same
# pre-trained checkpoint name.
checkpoint_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(checkpoint_name)
model = RobertaForSequenceClassification.from_pretrained(
    checkpoint_name,
    num_labels=6,  # Adjust num_labels as needed
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
import pandas as pd

# Read the dataset from the CSV file into a DataFrame.
csv_path = "cyberbullying.csv"
df = pd.read_csv(csv_path)

# Print the first five rows as a quick look at the columns.
print(df.head(n=5))

                                          tweet_text cyberbullying_type
0  In other words #katandandre, your food was cra...  not_cyberbullying
1  Why is #aussietv so white? #MKR #theblock #ImA...  not_cyberbullying
2  @XochitlSuckkks a classy whore? Or more red ve...  not_cyberbullying
3  @Jason_Gio meh. :P  thanks for the heads up, b...  not_cyberbullying
4  @RudhoeEnglish This is an ISIS account pretend...  not_cyberbullying


In [31]:
# Display the label column on its own (all rows, single column).
df.loc[:, 'cyberbullying_type']

0        not_cyberbullying
1        not_cyberbullying
2        not_cyberbullying
3        not_cyberbullying
4        not_cyberbullying
               ...        
47687            ethnicity
47688            ethnicity
47689            ethnicity
47690            ethnicity
47691            ethnicity
Name: cyberbullying_type, Length: 47692, dtype: object

In [None]:
# Integer id assigned to each cyberbullying category name.
label_map = {
    'not_cyberbullying': 0,
    'other_cyberbullying': 1,
    'age': 2,
    'religion': 3,
    'gender': 4,
    'ethnicity': 5
}

label_column = df['cyberbullying_type']

# If every value is already one of the ids above, the column was encoded by
# an earlier run of this cell. Skipping the string clean-up in that case keeps
# the cell re-runnable (the `.str` accessor fails on an integer column).
already_encoded = label_column.isin(list(label_map.values())).all()

if already_encoded:
    encoded_labels = label_column
else:
    # Clean string values first (missing values become the text 'nan' and
    # are reported as unmapped below).
    cleaned_labels = label_column.astype(str).str.lower().str.strip()

    # Then map them to integers
    encoded_labels = cleaned_labels.map(label_map)

    # `.map` turns unknown category names into NaN; fail here with the
    # offending names instead of letting NaN labels reach training.
    unmapped = sorted(cleaned_labels[encoded_labels.isna()].unique().tolist())
    if unmapped:
        raise ValueError(
            f"Unmapped cyberbullying_type values: {unmapped}"
        )

# Store the ids with an explicit integer dtype.
df['cyberbullying_type'] = encoded_labels.astype('int64')

None


In [34]:
# Print the distinct values found in the label column.
distinct_labels = df['cyberbullying_type'].unique()
print(distinct_labels)

[0 4 3 1 2 5]


In [35]:
# Pull the text and label columns out as plain Python lists.
tweet_texts = df['tweet_text'].tolist()
tweet_labels = df['cyberbullying_type'].tolist()

# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    tweet_texts,
    tweet_labels,
    test_size=0.2,
    random_state=42,
)

# Tokenize both splits with the same settings.
tokenizer_options = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(train_texts, **tokenizer_options)
val_encodings = tokenizer(val_texts, **tokenizer_options)

In [36]:
class BullyingDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: Mapping from field name to a per-example sequence. Only
            ``.items()`` and integer indexing of each value are used.
        labels: Iterable with one single-class label per example. Each label
            is converted with ``int()`` when the dataset is constructed.

    Raises:
        ValueError: If a label cannot be converted to ``int`` (for example a
            NaN left behind by a failed label mapping).
        TypeError: If a label is of a type ``int()`` does not accept
            (for example ``None``).
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Coerce once, up front: a bad label fails here with a clear error
        # instead of surfacing later inside the loss computation.
        self.labels = [int(label) for label in labels]

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, including ``labels``."""
        item = {
            key: torch.tensor(val[idx])
            for key, val in self.encodings.items()
        }
        # Explicit dtype so the target is always an integer class index,
        # regardless of the Python type the label originally had.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Number of examples, taken from the label list."""
        return len(self.labels)

# Wrap each split's encodings and labels in a BullyingDataset.
train_dataset = BullyingDataset(encodings=train_encodings, labels=train_labels)
val_dataset = BullyingDataset(encodings=val_encodings, labels=val_labels)

In [37]:
# Make sure every label is a plain Python int.
train_labels = [int(label) for label in train_labels]
val_labels = [int(label) for label in val_labels]

# The datasets already exist at this point and keep their own label list, so
# rebinding the names above does not reach them. Assign the coerced lists to
# the datasets explicitly so the coercion applies to the data being loaded.
train_dataset.labels = train_labels
val_dataset.labels = val_labels

# Create DataLoader (only the training split is shuffled)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16)

In [44]:
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss


# Set up optimizer and loss function
optimizer = AdamW(model.parameters(), lr=5e-5)
# NOTE: the loop below does not call loss_fn; it reads the loss the model
# returns when `labels` is passed. The name is kept in case other cells use it.
loss_fn = CrossEntropyLoss()

# Train the model
epochs = 3  # Set the number of epochs you want to train
device = torch.device('cpu')  # hard-coded to CPU
model.to(device)

for epoch in range(epochs):
    model.train()

    # Accumulate the per-batch losses so the value reported for the epoch is
    # the mean over all batches, not just whatever the final batch produced.
    running_loss = 0.0
    num_batches = 0

    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    # An empty dataloader yields NaN rather than an undefined `loss`.
    epoch_loss = running_loss / num_batches if num_batches else float('nan')
    print(f"Epoch {epoch+1}: Loss = {epoch_loss}")

# Optionally, evaluate the model on the validation set

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [45]:
# Inspect the label column: distinct values, count of missing entries, dtype.
label_column = df['cyberbullying_type']
print(label_column.unique())
print(label_column.isna().sum())
print(label_column.dtype)

[0 4 3 1 2 5]
0
int64


In [47]:
# Summarise the label column: distinct values, missing count, dtype.
label_series = df['cyberbullying_type']
print("Unique labels in full dataset:", label_series.unique())
print("Any NaNs?", label_series.isna().sum())
print("Data type:", label_series.dtype)

# Select the rows whose label is not within the inclusive range 0..5.
within_range = label_series.between(0, 5)
invalid_labels = df[~within_range]
print("Invalid label rows:\n", invalid_labels)

Unique labels in full dataset: [0 4 3 1 2 5]
Any NaNs? 0
Data type: int64
Invalid label rows:
 Empty DataFrame
Columns: [tweet_text, cyberbullying_type]
Index: []


In [48]:
# Run one training batch through the model and print the shapes of the
# logits and the labels. Only shapes are inspected, so gradient tracking is
# switched off to avoid building an autograd graph for this check.
with torch.no_grad():
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        print("Logits shape:", outputs.logits.shape)
        print("Labels shape:", labels.shape)
        break  # the first batch is enough

Logits shape: torch.Size([16, 6])
Labels shape: torch.Size([16])
