In [1]:
!pip install sentencepiece
!pip install datasets evaluate transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 KB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.27.4-p

In [2]:
from google.colab import drive
drive.mount('/content/drive')
%cd drive/MyDrive/fixjs/transfer

Mounted at /content/drive
/content/drive/MyDrive/fixjs/transfer


In [3]:
import pandas as pd


data = pd.read_csv("https://drive.google.com/uc?export=download&id=19nDp1WqHLUshPP3Gl4VXf_VNHUhnziHJ")

In [4]:
from sklearn.model_selection import train_test_split


RANDOM_STATE = 420
TEST_SIZE = 0.2

train_df, test_df = train_test_split(data, test_size=TEST_SIZE, random_state=RANDOM_STATE)
train_df, val_df = train_test_split(train_df, test_size=0.05, random_state=RANDOM_STATE)

In [5]:
print(f'Data: {len(data)}, Train: {len(train_df)}, Test: {len(test_df)}, Val: {len(val_df)}')

Data: 41203, Train: 31313, Test: 8241, Val: 1649


In [6]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig, AdamW, get_linear_schedule_with_warmup

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class MultilabelDataset(Dataset):
    def __init__(self, data, tokenizer, max_len=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]["body"]
        labels = self.data.iloc[idx][["hate", "privacy", "sexual", "impersonation", "illegal", "advertisement", "ai"]].values.astype(float)

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True
        )

        return {
            "input_ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "labels": torch.tensor(labels, dtype=torch.float)
        }

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

train_dataset = MultilabelDataset(train_df, tokenizer)
val_dataset = MultilabelDataset(val_df, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

config = BertConfig.from_pretrained("bert-base-uncased", num_labels=7)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", config=config).to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * 50)
criterion = nn.BCEWithLogitsLoss()

patience, early_stop_counter = 2, 0
min_val_loss = float("inf")

for epoch in range(50):
    model.train()
    train_loss = 0

    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        train_loss += loss.item()

    train_loss /= len(train_loader)

    model.eval()
    val_loss, val_correct, val_total = 0, 0, 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = criterion(logits, labels)
            preds = torch.sigmoid(logits).round()
            val_loss += loss.item()
            val_correct += (preds == labels).sum().item()
            val_total += labels.numel()

    val_loss /= len(val_loader)
    val_acc = val_correct / val_total

    print(f"Epoch {epoch + 1}/{50}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

    if val_loss < min_val_loss:
        min_val_loss = val_loss
        early_stop_counter = 0
        torch.save(model.state_dict(), "bert.pt")
    else:
        early_stop_counter += 1

    if early_stop_counter >= patience:
        print("Early stopping")
        break

model.load_state_dict(torch.load("bert.pt"))

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch 1/50, Train Loss: 0.0517, Val Loss: 0.0266, Val Acc: 0.9894
Epoch 2/50, Train Loss: 0.0230, Val Loss: 0.0215, Val Acc: 0.9928
Epoch 3/50, Train Loss: 0.0183, Val Loss: 0.0276, Val Acc: 0.9913
Epoch 4/50, Train Loss: 0.0138, Val Loss: 0.0262, Val Acc: 0.9916
Early stopping


<All keys matched successfully>

In [9]:
test_dataset = MultilabelDataset(test_df, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, f1_score
import numpy as np

model.eval()
test_loss, test_correct, test_total = 0, 0, 0
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = criterion(logits, labels)
        preds = torch.sigmoid(logits).round()
        
        test_loss += loss.item()
        test_correct += (preds == labels).sum().item()
        test_total += labels.numel()

        all_preds.extend(preds.cpu().numpy().tolist())
        all_labels.extend(labels.cpu().numpy().tolist())

test_loss /= len(test_loader)
test_acc = test_correct / test_total

all_preds = np.array(all_preds)
all_labels = np.array(all_labels)

print("Classification Report for BERT:\n", classification_report(all_labels, all_preds, target_names=["hate", "privacy", "sexual", "impersonation", "illegal", "advertisement", "ai"]))

acc = accuracy_score(all_labels, all_preds)
print(f'Accuracy: {acc:.2f}')

f1 = f1_score(all_labels, all_preds, average="weighted")
print(f'F1 score: {f1:.2f}')

Classification Report for BERT:
                precision    recall  f1-score   support

         hate       0.92      0.95      0.94      3789
      privacy       0.00      0.00      0.00         0
       sexual       0.00      0.00      0.00         3
impersonation       0.00      0.00      0.00         0
      illegal       0.00      0.00      0.00         3
advertisement       0.00      0.00      0.00         6
           ai       0.93      0.94      0.94       234

    micro avg       0.92      0.95      0.94      4035
    macro avg       0.27      0.27      0.27      4035
 weighted avg       0.92      0.95      0.93      4035
  samples avg       0.46      0.46      0.46      4035

Accuracy: 0.94
F1 score: 0.93


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
