<a href="https://colab.research.google.com/github/chitlchow/ai-text-detection-bert/blob/main/Sequence_Classification_with_BERT_RoBERTa_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# @title Modules Import
!pip install datasets transformers
!pip install kaggle torchmetrics
from datasets import load_dataset, load_dataset_builder
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pickle

import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from transformers import BertTokenizer, RobertaTokenizer, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer

import string
import pandas as pd

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:

# Model Choice

In [2]:
from transformers import BertModel, RobertaModel
from torch import nn

model_name = 'bert-base-uncased' # @param ['bert-base-case', 'bert-base-uncased', 'roberta-base'] {'type': 'string'}

class TransformerTextClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.transformer = BertModel.from_pretrained(model_name) # @param ['BertModel.from_pretrained(model_name)', 'RobertaModel.from_pretrained(model_name)']  {'type': 'raw'}
        self.classifier = nn.Sequential(
            nn.Linear(768, 2),
            nn.ReLU()
        )
    def forward(self, input_ids, attention_mask):
        x = self.transformer(input_ids, attention_mask).pooler_output
        output = self.classifier(x)
        return output


# Data Preparation and Tokenizer Settings

In [3]:
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from transformers import BertTokenizer, RobertaTokenizer
ds_name = 'aadityaubhat/GPT-wiki-intro' # @param {'type':'string'}
import random
class TextDataset(Dataset):
    def __init__(self, ds_name:
                 str=ds_name, model_name: str=model_name) -> None:
        self.dataset = load_dataset(ds_name,split='train')
        self.tokenizer = BertTokenizer.from_pretrained(model_name) # @param ['BertTokenizer.from_pretrained(model_name)', 'RobertaTokenizer.from_pretrained(model_name)'] {'type':'raw'}
    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        json_data = self.dataset[idx]
        source = random.choice(['wiki_intro', 'generated_intro'])
        text = json_data[source]
        label = 0 if source == 'wiki_intro' else 1
        tokens = self.tokenizer(
            text, max_length=512, padding='max_length', truncation=True, return_tensors='pt'
            )
        input_ids = tokens['input_ids']
        attention_mask = tokens['attention_mask']
        # Return token, label
        return (input_ids, attention_mask), label

train_dataset = TextDataset()

README.md:   0%|          | 0.00/2.63k [00:00<?, ?B/s]

GPT-wiki-intro.csv.zip:   0%|          | 0.00/127M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/150000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]



# Model Training

In [6]:
# @title Training Parameters
from torchmetrics import Accuracy, Precision, Recall, F1Score
from torch import optim

# Model to CUDA
model = TransformerTextClassifier()

# Initialize bias and weight
weight_init_method = 'zero' # @param ['zero', 'normal', 'None'] {'type':'string'}
if weight_init_method == 'zero':
    nn.init.zeros_(model.transformer.pooler.dense.bias)
    nn.init.zeros_(model.transformer.pooler.dense.weight)
elif weight_init_method == 'normal':
    nn.init.normal_(model.transformer.pooler.dense.bias)
    nn.init.normal_(model.transformer.pooler.dense.weight)
else:
    pass

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training parameters
num_epochs = 3 # @param {"type":"number"}
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW( # @param ["optim.AdamW("] {"type":"raw"}
    model.parameters(),
lr = 2e-5 # @param {"type":"number"}
)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=32 # @param ['4', '8', '12', '16', '32']{'type': 'raw'}
    ,shuffle=True
)
print(f'Training Steps: {len(train_dataloader)}')

# Setup metrics
acc = Accuracy(task='multiclass', num_classes=2, average='micro').to(device)
precision = Precision(task='multiclass', num_classes=2, average='micro').to(device)
f1 = F1Score(task='multiclass', num_classes=2, average='micro').to(device)
recall = Recall(task='multiclass', num_classes=2, average='micro').to(device)

for epoch in range(num_epochs):
    # Reseting running loss
    running_loss = 0
    # Model to training mode
    model.train()
    for (input_ids, attention_masks), labels in train_dataloader:
        optimizer.zero_grad()
        # Produce output from model
        input_ids, attention_masks = input_ids.squeeze().to(device), attention_masks.squeeze().to(device)
        output = model(input_ids, attention_masks)
        labels = labels.to(device)

        # Compute Metrics
        loss = criterion(output, labels)
        acc(output, labels)
        precision(output, labels)
        f1(output, labels)
        recall(output, labels)
        running_loss += loss.item()

        # Back propagation
        loss.backward()
        optimizer.step()

    # Compute Overall all metrics
    epoch_acc = acc.compute()
    epoch_precision = precision.compute()
    epoch_f1 = f1.compute()
    epoch_recall = recall.compute()
    print(
        f'Epoch: {epoch+1}, Loss: {running_loss:.4f}, Accuracy: {epoch_acc:.4f}, Precision: {epoch_precision:.4f}, F1: {epoch_f1:.4f}, Recall: {epoch_recall:.4f}'
        )

    # Reset all metrics for monitoring
    acc.reset()
    precision.reset()
    f1.reset()
    recall.reset()

print('Training Complete')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training Steps: 4688
Epoch: 1, Loss: 1033.4609, Accuracy: 0.6955, Precision: 0.6955, F1: 0.6955, Recall: 0.6955
Epoch: 2, Loss: 24.1580, Accuracy: 0.9986, Precision: 0.9986, F1: 0.9986, Recall: 0.9986
Epoch: 3, Loss: 13.6294, Accuracy: 0.9992, Precision: 0.9992, F1: 0.9992, Recall: 0.9992
