In [1]:
!pip install sentencepiece
!pip install datasets evaluate transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 KB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.27.4-

In [3]:
import pandas as pd


# Direct-download link (Google Drive) for the CSV this notebook works on.
DATA_URL = "https://drive.google.com/uc?export=download&id=1jT9GZuC2CxjccaFXxh1neGAg9GGnfR5a"

# Read the whole CSV into one DataFrame; every later cell derives from `data`.
data = pd.read_csv(DATA_URL)

In [4]:
from sklearn.model_selection import train_test_split


# Split configuration: fixed seed so the partition is repeatable, and the
# fraction of rows held out for validation.
RANDOM_STATE = 420
TEST_SIZE = 0.2

# Partition the full frame into a training part and a validation part.
train_df, val_df = train_test_split(
    data,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

In [5]:
# Show which columns the training frame carries and how many there are.
train_columns = train_df.columns
print(train_columns)
print(len(train_columns))

Index(['body', 'hate', 'privacy', 'sexual', 'impersonation', 'illegal',
       'advertisement', 'ai', 'neutral'],
      dtype='object')
9


In [6]:
# Report row counts for the full frame and for each side of the split.
n_data, n_train, n_val = len(data), len(train_df), len(val_df)
print(f'Data: {n_data}, Train: {n_train}, Val: {n_val}')

Data: 49098, Train: 39278, Val: 9820


In [7]:
# Label columns whose per-column totals are printed below.
label_columns = [
    "hate", "privacy", "sexual", "impersonation",
    "illegal", "advertisement", "ai", "neutral",
]

# Print the column sum for every label, one line per label.
for col in label_columns:
    positives = data[col].sum()
    print(f'{col}: {positives}')

hate: 19581
privacy: 257
sexual: 468
impersonation: 257
illegal: 274
advertisement: 473
ai: 6043
neutral: 21753


In [8]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig, AdamW, get_linear_schedule_with_warmup

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

class MultilabelDataset(Dataset):
    """Dataset that turns DataFrame rows into tokenized text plus a label vector.

    Each item is built from one row of ``data``: the ``"body"`` column is
    tokenized (truncated and padded to ``max_len`` tokens) and the label
    columns are packed, in order, into a single float tensor.

    Args:
        data: DataFrame with a ``"body"`` column and one column per label.
        tokenizer: Callable tokenizer returning a mapping with ``"input_ids"``
            and ``"attention_mask"`` for a single text.
        max_len: Sequence length every example is truncated/padded to.
        label_cols: Names of the label columns, in output order. Defaults to
            ``MultilabelDataset.LABEL_COLS``.
    """

    # Default label columns, in the order they appear in the "labels" tensor.
    LABEL_COLS = (
        "hate", "privacy", "sexual", "impersonation",
        "illegal", "advertisement", "ai", "neutral",
    )

    def __init__(self, data, tokenizer, max_len=512, label_cols=None):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Copy into a list so it can be used directly as a pandas column selector.
        self.label_cols = list(self.LABEL_COLS if label_cols is None else label_cols)

    def __len__(self):
        """Return the number of rows (examples) in the underlying frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for row ``idx``.

        ``idx`` is positional (``iloc``), so it is independent of the frame's index.
        """
        # Fetch the row once; both the text and the labels come from it.
        row = self.data.iloc[idx]
        text = row["body"]
        labels = row[self.label_cols].values.astype(float)

        # Calling the tokenizer directly replaces the deprecated `encode_plus`.
        # Token type ids are not requested because nothing below reads them.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )

        return {
            "input_ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "labels": torch.tensor(labels, dtype=torch.float),
        }

# Tunables for this cell. NUM_EPOCHS is the epoch cap the LR schedule is
# stretched over, so it has to agree with the cap used by the training loop.
BATCH_SIZE = 16
NUM_EPOCHS = 50

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Wrap both splits; tokenization happens lazily inside __getitem__.
train_dataset = MultilabelDataset(train_df, tokenizer)
val_dataset = MultilabelDataset(val_df, tokenizer)

# Only the training loader shuffles; validation order is kept fixed.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# roberta-base encoder with a classification head producing 8 logits.
config = RobertaConfig.from_pretrained("roberta-base", num_labels=8)
model = RobertaForSequenceClassification.from_pretrained("roberta-base", config=config).to(device)

# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are set explicitly to the values the transformers
# implementation defaulted to, so the optimisation itself is unchanged.
optimizer = optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
# Linear decay to zero over the full epoch budget, with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_loader) * NUM_EPOCHS,
)
# Sigmoid + binary cross-entropy applied independently to each of the 8 logits.
criterion = nn.BCEWithLogitsLoss()

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [None]:
# Fine-tune with early stopping on validation loss.
# NUM_EPOCHS is an upper bound (the LR scheduler's num_training_steps assumes
# the same cap); training stops earlier once validation loss has failed to
# improve for `patience` consecutive epochs.
NUM_EPOCHS = 50
CHECKPOINT_PATH = "roberta.pt"

patience, early_stop_counter = 3, 0
min_val_loss = float("inf")

for epoch in range(NUM_EPOCHS):
    # ---- training pass ----
    model.train()
    train_loss = 0

    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        # The schedule is stepped once per batch, not once per epoch.
        scheduler.step()

        train_loss += loss.item()

    # Mean of the per-batch losses.
    train_loss /= len(train_loader)

    # ---- validation pass ----
    model.eval()
    val_loss, val_correct, val_total = 0, 0, 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = criterion(logits, labels)
            # Threshold each label's probability by rounding to 0 or 1.
            preds = torch.sigmoid(logits).round()
            val_loss += loss.item()
            # Accuracy is counted per (example, label) cell, not per example.
            val_correct += (preds == labels).sum().item()
            val_total += labels.numel()

    val_loss /= len(val_loader)
    val_acc = val_correct / val_total

    print(f"Epoch {epoch + 1}/{NUM_EPOCHS}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

    if val_loss < min_val_loss:
        # New best epoch: remember it, reset the counter, checkpoint the weights.
        min_val_loss = val_loss
        early_stop_counter = 0
        torch.save(model.state_dict(), CHECKPOINT_PATH)
    else:
        early_stop_counter += 1

    if early_stop_counter >= patience:
        print("Early stopping")
        break


# Restore the best checkpoint rather than keeping the last epoch's weights.
# map_location keeps the load working whichever device the tensors were saved from.
model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=device))

In [9]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig, AdamW, get_linear_schedule_with_warmup

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rebuild the 8-label classifier and overwrite its weights with the checkpoint
# written by the training loop.
config = RobertaConfig.from_pretrained("roberta-base", num_labels=8)
model = RobertaForSequenceClassification.from_pretrained("roberta-base", config=config).to(device)
# map_location lets a checkpoint saved on a GPU runtime load on a CPU-only one.
model.load_state_dict(torch.load("roberta.pt", map_location=device))

from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, f1_score
import numpy as np

# Label names in the same order as the columns of the label tensor.
label_names = ["hate", "privacy", "sexual", "impersonation", "illegal", "advertisement", "ai", "neutral"]

model.eval()
all_preds, all_labels = [], []

# NOTE(review): the loss is not reported by this cell, so it is no longer
# computed per batch; `criterion` is still defined in case later cells use it.
criterion = nn.BCEWithLogitsLoss()

# `val_loader` is created in the data/model setup cell, which must have been
# run in this kernel before this cell.
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Threshold each label's probability by rounding to 0 or 1.
        preds = torch.sigmoid(logits).round()

        all_preds.extend(preds.cpu().numpy().tolist())
        all_labels.extend(labels.cpu().numpy().tolist())

# One row per validation example, one column per label.
all_preds = np.array(all_preds)
all_labels = np.array(all_labels)

print("Classification Report for RoBERTa:\n", classification_report(all_labels, all_preds, target_names=label_names, zero_division=0))

# NOTE(review): per the scikit-learn documentation, accuracy_score on 2-D
# multilabel indicator arrays is subset accuracy (a row counts only when all
# labels match) - confirm this is the intended metric.
acc = accuracy_score(all_labels, all_preds)
print(f'Accuracy: {acc:.2f}')

# F1 averaged over labels, weighted by each label's support.
f1 = f1_score(all_labels, all_preds, average="weighted", zero_division=0)
print(f'F1 score: {f1:.2f}')

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

Classification Report for RoBERTa:
                precision    recall  f1-score   support

         hate       0.93      0.95      0.94      3895
      privacy       0.62      0.61      0.62        49
       sexual       0.62      0.39      0.48        84
impersonation       0.56      0.59      0.57        51
      illegal       0.78      0.18      0.29        39
advertisement       0.98      0.44      0.61        95
           ai       0.96      0.98      0.97      1217
      neutral       0.95      0.92      0.93      4394

    micro avg       0.94      0.93      0.93      9824
    macro avg       0.80      0.63      0.68      9824
 weighted avg       0.93      0.93      0.93      9824
  samples avg       0.93      0.93      0.93      9824

Accuracy: 0.92
F1 score: 0.93
