In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
from tqdm import tqdm

# Data
We will use the Hybrid subset of Vijini et al., in which some sentences of human-written abstracts were replaced by automatically generated text. Experiments on the fully generated subsets (or any other dataset) may earn bonus points.

The paper does not provide a train-test split, but we keep 80% of the data for training and 20% for testing, following Vijini et al.

In [None]:
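# Uncomment to download and extract the dataset (only needed once, before running the cells below).
# !curl -L -o main.zip https://github.com/vijini/GeneratedTextDetection/archive/refs/heads/main.zip
# !tar -xf main.zip
# Note: `tar -xf` can unpack a .zip only with bsdtar (macOS, Windows); with GNU tar use `!unzip -q main.zip` instead.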

In [None]:
from pathlib import Path

In [None]:
# Folder that holds the Hybrid subset's .txt files, given relative to the
# notebook's working directory.
root = Path("GeneratedTextDetection-main/Dataset/Hybrid_AbstractDataset")

In [None]:
# Build the train/test split from the .txt files directly under `root`.
#   * label: 1 when the file name ends with "generatedAbstract.txt", else 0.
#   * doc id: the integer after the last "." in the part of the file name
#     that precedes the first "_".
#   * split: ids below TEST_ID_THRESHOLD go to the test set, the rest to train.
TEST_ID_THRESHOLD = 10522

train_texts, train_labels, test_texts, test_labels = [], [], [], []

# Sort the paths so the order of the examples (and therefore what
# train_texts[0], train_texts[10], ... show) does not depend on the order in
# which the file system happens to list the directory.
abstract_paths = sorted(root.glob("*.txt"))
if not abstract_paths:
    # Fail here with a clear message instead of training on empty lists.
    raise FileNotFoundError(
        f"No .txt files found in {root}; download and extract the dataset first."
    )

for path in abstract_paths:
    # Explicit encoding: the platform default is not the same everywhere.
    # Assumes the abstracts are stored as UTF-8 -- TODO confirm.
    text = path.read_text(encoding="utf-8")
    label = int(path.name.endswith("generatedAbstract.txt"))
    doc_id = int(path.name.split("_")[0].split(".")[-1])
    if doc_id < TEST_ID_THRESHOLD:
        test_texts.append(text)
        test_labels.append(label)
    else:
        train_texts.append(text)
        train_labels.append(label)

In [None]:
# Sanity check: within each split the number of texts must equal the number of labels.
len(train_texts), len(train_labels), len(test_texts), len(test_labels)

In [None]:
# Show the first training text.
train_texts[0]

In [None]:
# Label of the first training text (1 = file name ends with "generatedAbstract.txt", 0 = otherwise).
train_labels[0]

In [None]:
# Show another training text (index 10) for comparison.
train_texts[10]

In [None]:
# Label of the training text at index 10 (1 = file name ends with "generatedAbstract.txt", 0 = otherwise).
train_labels[10]

In [None]:
class TextDetectionDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Sequence of documents; each item is passed through ``str()``
            before tokenization.
        labels: Sequence of integer class labels aligned with ``texts``.
        tokenizer: Callable tokenizer (for example a Hugging Face tokenizer)
            that accepts the keyword arguments used in ``__getitem__`` and
            returns a mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors.
        max_length: Length every sequence is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        # A length mismatch would otherwise surface only later, as an
        # IndexError in the middle of an epoch or as silently unused labels.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized example at ``idx``.

        Returns:
            dict: ``'input_ids'`` and ``'attention_mask'`` flattened to 1-D
            tensors, and ``'labels'`` as a scalar ``torch.long`` tensor.
        """
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Call the tokenizer directly; ``encode_plus`` is deprecated in
        # Transformers in favour of ``__call__`` with the same arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # return_tensors='pt' adds a leading batch axis for the single text;
        # flatten() drops it so the DataLoader can stack the items itself.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def _run_epoch(model, loader, device, desc, optimizer=None):
    """Run one pass over ``loader``; update the weights only if ``optimizer`` is given.

    Returns:
        tuple: ``(mean batch loss, accuracy over all examples of the pass)``.
    """
    training = optimizer is not None
    if training:
        model.train()
    else:
        model.eval()

    losses, preds, targets = [], [], []

    # Gradients are tracked only during a training pass.
    with torch.set_grad_enabled(training):
        for batch in tqdm(loader, desc=desc):
            if training:
                optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss

            if training:
                loss.backward()
                optimizer.step()

            losses.append(loss.item())
            preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
            targets.extend(labels.cpu().numpy())

    accuracy = np.mean(np.array(preds) == np.array(targets))
    return np.mean(losses), accuracy


def train_model(model, train_loader, val_loader, device, epochs=3):
    """Fine-tune ``model`` and print loss and accuracy after every epoch.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        train_loader: Iterable of batches (dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors) used for the
            weight updates.
        val_loader: Iterable of batches in the same format, scored without
            gradient tracking at the end of every epoch.
        device: Device every batch tensor is moved to.
        epochs: Number of passes over ``train_loader``.

    Returns:
        float: Highest validation accuracy seen over all epochs (``0.0`` when
        no epoch ran).
    """
    # torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
    # weight_decay are pinned to the defaults of the deprecated class, because
    # the PyTorch defaults (1e-8 and 1e-2) would change the optimisation.
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0
    )

    best_accuracy = 0.0

    for epoch in range(epochs):
        print(f'\nEpoch {epoch + 1}/{epochs}')

        train_loss, train_accuracy = _run_epoch(
            model, train_loader, device, 'Training', optimizer
        )
        val_loss, val_accuracy = _run_epoch(
            model, val_loader, device, 'Validation'
        )

        print(f'\nTraining Loss: {train_loss:.4f}')
        print(f'Training Accuracy: {train_accuracy:.4f}')
        print(f'Validation Loss: {val_loss:.4f}')
        print(f'Validation Accuracy: {val_accuracy:.4f}')

        if val_accuracy > best_accuracy:
            best_accuracy = float(val_accuracy)
            # Save the best model if needed
            # torch.save(model.state_dict(), 'best_model.pth')

    return best_accuracy

def evaluate_model(model, test_loader, device):
    """Score ``model`` on ``test_loader`` and print the resulting metrics.

    The model is switched to evaluation mode, every batch is run without
    gradient tracking, and the arg-max class of the logits is compared with
    the batch labels. A classification report and a confusion matrix are
    printed; nothing is returned.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits``.
        test_loader: Iterable of batches (dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors).
        device: Device every batch tensor is moved to.
    """
    model.eval()
    predicted, expected = [], []
    tensor_keys = ('input_ids', 'attention_mask', 'labels')

    with torch.no_grad():
        for batch in tqdm(test_loader, desc='Testing'):
            # Move the three tensors of the batch to the target device.
            moved = {key: batch[key].to(device) for key in tensor_keys}

            # Labels are kept out of the forward pass: only logits are needed.
            outputs = model(
                input_ids=moved['input_ids'],
                attention_mask=moved['attention_mask']
            )

            batch_preds = outputs.logits.argmax(dim=1).cpu().numpy()
            predicted.extend(batch_preds)
            expected.extend(moved['labels'].cpu().numpy())

    print("\nClassification Report:")
    print(classification_report(expected, predicted))
    print("\nConfusion Matrix:")
    print(confusion_matrix(expected, predicted))

def main(seed=42):
    """Fine-tune ``bert-base-uncased`` on the training split and report test metrics.

    Reads the notebook-level ``train_texts``, ``train_labels``, ``test_texts``
    and ``test_labels`` lists, so the data-loading cell must have run first.

    Args:
        seed: Seed applied to the torch and NumPy random generators before the
            model and the loaders are created, so that random initialisation
            and the shuffling of the training batches are repeatable between
            runs. Pass ``None`` to leave the generators untouched.
    """
    # Seed first: everything random below draws from these generators.
    if seed is not None:
        torch.manual_seed(seed)
        np.random.seed(seed)

    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Initialize tokenizer and model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=2
    ).to(device)

    # Create datasets
    train_dataset = TextDetectionDataset(train_texts, train_labels, tokenizer)
    test_dataset = TextDetectionDataset(test_texts, test_labels, tokenizer)

    # Create data loaders (only the training data is shuffled)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=8)

    # Train the model
    # NOTE(review): the test loader doubles as the validation loader, so the
    # per-epoch "validation" numbers are test-set numbers. Carve a validation
    # split out of the training data before using them for model selection.
    train_model(model, train_loader, test_loader, device)

    # Evaluate the model
    evaluate_model(model, test_loader, device)

if __name__ == "__main__":
    main()