In this notebook we are fine-tuning DistilBERT models on a single text source from M4 out of Arxiv, Reddit, Wikihow and Wikipedia. We then test each model's performance on the rest of the text sources. Each text source dataset contains 5 subsets of LLM generated text and 1 subset of human-written text.

In [1]:
!pip install torcheval

Collecting torcheval
  Downloading torcheval-0.0.7-py3-none-any.whl.metadata (8.6 kB)
Downloading torcheval-0.0.7-py3-none-any.whl (179 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m174.1/179.2 kB[0m [31m8.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.2/179.2 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torcheval
Successfully installed torcheval-0.0.7


In [2]:
import pandas as pd
import numpy as np
import torch
import json
import os
import torch.nn.functional as F
from torcheval.metrics.functional import binary_f1_score
from google.colab import files

from tqdm import tqdm
from torch import nn
from transformers import DistilBertForSequenceClassification, AdamW, DistilBertTokenizer
from torch.utils.data import Dataset, DataLoader, TensorDataset, SequentialSampler
from timeit import default_timer as timer
from os import walk
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, roc_curve, auc, brier_score_loss

In [3]:
ROOT_DATA = './'
M4_DATA_FOLDER_PATH = f'{ROOT_DATA}/m4-unified'

In [4]:
from google.colab import drive
drive.mount('/content/gdrive')
!unzip /content/gdrive/MyDrive/uni/iioz/llm-detect/m4-unified.zip

Mounted at /content/gdrive
Archive:  /content/gdrive/MyDrive/uni/iioz/llm-detect/m4-unified.zip
   creating: m4-unified/arxiv/
  inflating: m4-unified/arxiv/arxiv_bloomz.jsonl  
  inflating: m4-unified/arxiv/arxiv_chatGPT.jsonl  
  inflating: m4-unified/arxiv/arxiv_cohere.jsonl  
  inflating: m4-unified/arxiv/arxiv_davinci.jsonl  
  inflating: m4-unified/arxiv/arxiv_flant5.jsonl  
  inflating: m4-unified/arxiv/arxiv_human.jsonl  
   creating: m4-unified/reddit/
  inflating: m4-unified/reddit/reddit_bloomz.jsonl  
  inflating: m4-unified/reddit/reddit_chatGPT.jsonl  
  inflating: m4-unified/reddit/reddit_cohere.jsonl  
  inflating: m4-unified/reddit/reddit_davinci.jsonl  
  inflating: m4-unified/reddit/reddit_dolly.jsonl  
  inflating: m4-unified/reddit/reddit_flant5.jsonl  
  inflating: m4-unified/reddit/reddit_human.jsonl  
   creating: m4-unified/wikihow/
  inflating: m4-unified/wikihow/wikihow_bloomz.jsonl  
  inflating: m4-unified/wikihow/wikihow_chatGPT.jsonl  
  inflating: m4-uni

In [5]:
TEST_SET_FRACTION = 0.3
BATCH_SIZE = 32
LLMS = ['bloomz', 'chatgpt', 'cohere', 'davinci', 'human']

In [6]:
df = pd.DataFrame(columns=['text', 'is_llm', 'domain', 'dataset_name'])

In [7]:
dir_path, dir_names, file_names = next(walk(M4_DATA_FOLDER_PATH))

for dir in dir_names:
    dataset_folder_path, _, dataset_names = next(walk(os.path.join(dir_path, dir)))
    for dataset_name in dataset_names:
        temp_df = pd.read_json(path_or_buf=f'{dataset_folder_path}/{dataset_name}', lines=True)
        temp_df['domain'] = dir
        temp_df['dataset_name'] = Path(dataset_name).stem
        if Path(dataset_name).stem.split('_')[1].lower() not in LLMS:
          continue
        temp_df['is_llm'] = 0 if 'human' in dataset_name else 1
        print(dataset_name, 0 if 'human' in dataset_name else 1 )
        df = pd.concat([df, temp_df], ignore_index=True)

wikihow_cohere.jsonl 1
wikihow_davinci.jsonl 1
wikihow_human.jsonl 0
wikihow_bloomz.jsonl 1
wikihow_chatGPT.jsonl 1
reddit_bloomz.jsonl 1
reddit_chatGPT.jsonl 1
reddit_cohere.jsonl 1
reddit_davinci.jsonl 1
reddit_human.jsonl 0
wikipedia_cohere.jsonl 1
wikipedia_human.jsonl 0
wikipedia_bloomz.jsonl 1
wikipedia_davinci.jsonl 1
arxiv_cohere.jsonl 1
arxiv_bloomz.jsonl 1
arxiv_chatGPT.jsonl 1
arxiv_davinci.jsonl 1
arxiv_human.jsonl 0


In [8]:
df[df["is_llm"] == 1]

Unnamed: 0,text,is_llm,domain,dataset_name
0,\n\nHow to Play Forza Motorsport\nThis wikihow...,1,wikihow,wikihow_cohere
1,\n\nHow to Buy Virtual Console Games for Ninte...,1,wikihow,wikihow_cohere
2,\n\nInstalling Windows NT 4.0 Workstation\n\nW...,1,wikihow,wikihow_cohere
3,\n\nHow to Make Perfume\n\nMaking your own per...,1,wikihow,wikihow_cohere
4,\n\nHow to Convert Song Lyrics to a Song\nTher...,1,wikihow,wikihow_cohere
...,...,...,...,...
53326,"\nA Suzaku observation of Centaurus A, a radio...",1,arxiv,arxiv_davinci
53327,"\nIn this article, significant alignment and s...",1,arxiv,arxiv_davinci
53328,We have conducted an extensive spectral analy...,1,arxiv,arxiv_davinci
53329,\nThis paper reviews the theory surrounding th...,1,arxiv,arxiv_davinci


In [9]:
df = df.astype({'is_llm': 'int64'})

In [10]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

In [11]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [12]:
def tokenize_texts(df, tokenizer):
    input_ids = []
    attention_masks = []
    for text in df.text.values:
        # `encode_plus` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        #   (5) Pad or truncate the sentence to `max_length`
        #   (6) Create attention masks for [PAD] tokens.
        encoded_dict = tokenizer.encode_plus(
                            text,                      # Sentence to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                            max_length = 512,           # Pad & truncate all sentences.
                            padding='max_length',
                            truncation=True,
                            return_attention_mask = True,   # Construct attn. masks.
                            return_tensors = 'pt',     # Return pytorch tensors.
                       )

        # Add the encoded sentence to the list.
        input_ids.append(encoded_dict['input_ids'])

        # And its attention mask (simply differentiates padding from non-padding).
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.from_numpy(df['is_llm'].values)

    return input_ids, attention_masks, labels

In [13]:
def get_tensor_dataset_from_df(df, tokenizer):
    input_ids, attention_masks, labels = tokenize_texts(df, tokenizer)
    #input_ids = torch.flatten(input_ids)
    #attention_masks = torch.flatten(attention_mask)
    print(input_ids.shape, attention_masks.shape, labels.shape)

    return TensorDataset(input_ids, attention_masks, labels)

In [14]:
def train_step(model, dataloader, loss_fn, optimizer, device):

    model.train()
    train_loss = 0
    steps = 0

    for batch in dataloader:
        # Unpack this training batch from our dataloader.
        #
        # As we unpack the batch, we'll also copy each tensor to the device using the
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        optimizer.zero_grad()
        y_hat = model(b_input_ids,
                             attention_mask=b_input_mask,
                             labels=b_labels)
        loss = y_hat.loss
        train_loss += loss.item()
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()
        steps += 1

    # Calculate the average loss over all of the batches.
    return train_loss / steps

In [15]:
def test_step(model, dataloader, device):

    model.eval()

    all_y_true = []
    all_y_hat = []

    test_loss = 0
    steps = 0

    with torch.inference_mode():
        for batch in dataloader:
            batch_input_ids = batch[0].to(device)
            batch_input_mask = batch[1].to(device)
            batch_labels = batch[2].to(device)

            y_hat = model(batch_input_ids,
                          attention_mask=batch_input_mask,
                          labels=batch_labels)

            loss = y_hat.loss
            test_loss += loss.item()

            batch_labels = batch_labels.to('cpu').tolist()
            logits = y_hat.logits.detach().cpu().numpy()
            y_hat = np.argmax(logits, axis=1).flatten().tolist()

            all_y_true.extend(batch_labels)
            all_y_hat.extend(y_hat)

            steps += 1

        #test_f1 = binary_f1_score(all_y_true, all_y_hat)

    return all_y_true, all_y_hat, test_loss / steps

In [None]:
def train(model,
          train_dataloader,
          test_dataloader,
          optimizer,
          loss_fn,
          epochs,
          device):

    results = {
        "y_true": [],
        "y_hat": [],
    }

    model.to(device)

    y, y_hat = None, None

    for epoch in tqdm(range(epochs)):

        start_time = timer()
        train_loss = train_step(
            model=model,
            dataloader=train_dataloader,
            loss_fn=loss_fn,
            optimizer=optimizer,
            device=device,
        )
        end_time = timer()

        y, y_hat, test_loss = test_step(
            model=model,
            dataloader=test_dataloader,
            device=device,
        )
        #test_f1 = binary_f1_score(y, y_hat)

        #results["y_true"].append(y)
        #results["y_hat"].append(y_hat)

        print(
            f"Epoch: {epoch+1} | "
            f"train_loss: {train_loss:.4f} | "
            f"test_loss: {test_loss:.4f} | "
            f"time: {(end_time-start_time):.4f}"
        )

    #torch.save(model.state_dict(), './model.pt')
    #from google.colab import files
    #files.download('./model.pt')
    return model, y, y_hat

In [17]:
def train_and_test_against_all(column, value):
  results = {}

  curr_df = df.loc[df[column] == value]
  train_df, test_df = train_test_split(curr_df, test_size=TEST_SET_FRACTION)
  train_dataset = get_tensor_dataset_from_df(train_df, tokenizer)
  test_dataset = get_tensor_dataset_from_df(test_df, tokenizer)

  train_dataloader = DataLoader(
      train_dataset,
      batch_size=BATCH_SIZE,
      shuffle=True,
      drop_last=True,
  )
  test_dataloader = DataLoader(
      test_dataset,
      batch_size=BATCH_SIZE,
      shuffle=False,
      drop_last=True,
  )
  model = DistilBertForSequenceClassification.from_pretrained(
          "distilbert-base-cased",
          num_labels = 2,
          output_attentions = False, # Whether the model returns attentions weights.
          output_hidden_states = False, # Whether the model returns all hidden-states.
      )

  if device == "cuda:0":
    model = model.cuda()

  model = model.to(device)
  optimizer = AdamW(model.parameters(),
                    lr =5e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                    eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                  )

  loss_fn = torch.nn.BCEWithLogitsLoss()

  model, y, y_hat = train(model, train_dataloader, test_dataloader, optimizer, loss_fn, 1, device)

  results['self'] = {'y': y, 'y_hat': y_hat}

  for col_value in df[column].unique():
    if col_value == value:
      continue

    curr_df = df[df[column] == col_value]

    test_dataset = get_tensor_dataset_from_df(curr_df, tokenizer)

    dataloader = DataLoader(
      test_dataset,
      batch_size=BATCH_SIZE,
      shuffle=False,
      drop_last=True,
    )

    y, y_hat, _ = test_step(model, dataloader, device)
    results[f"{column}-{col_value}"] = {'y': y, 'y_hat': y_hat}

  return results

In [18]:
# Create a JSON Encoder class
class json_serialize(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)


In [19]:
#t = torch.tensor([5,-5])
#t
#x = torch.argmax(t)
#x.item()

In [None]:
all_results = []
file_names = []

for domain in df['domain'].unique():
  results = train_and_test_against_all('domain', domain)
  all_results.append(results)
  file_name = f"./domain-{domain}.json"
  with open(file_name, 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=4, cls=json_serialize)
  file_names.extend(file_name)

torch.Size([10500, 512]) torch.Size([10500, 512]) torch.Size([10500])
torch.Size([4500, 512]) torch.Size([4500, 512]) torch.Size([4500])


model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1/1 [08:49<00:00, 529.28s/it]

Epoch: 1 | train_loss: 0.0563 | test_loss: 0.0206 | time: 459.2743





torch.Size([15000, 512]) torch.Size([15000, 512]) torch.Size([15000])
torch.Size([11331, 512]) torch.Size([11331, 512]) torch.Size([11331])
torch.Size([15000, 512]) torch.Size([15000, 512]) torch.Size([15000])
torch.Size([10500, 512]) torch.Size([10500, 512]) torch.Size([10500])
torch.Size([4500, 512]) torch.Size([4500, 512]) torch.Size([4500])


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1/1 [08:46<00:00, 526.10s/it]

Epoch: 1 | train_loss: 0.1031 | test_loss: 0.0340 | time: 456.4229





torch.Size([15000, 512]) torch.Size([15000, 512]) torch.Size([15000])
torch.Size([11331, 512]) torch.Size([11331, 512]) torch.Size([11331])
torch.Size([15000, 512]) torch.Size([15000, 512]) torch.Size([15000])
torch.Size([7931, 512]) torch.Size([7931, 512]) torch.Size([7931])
torch.Size([3400, 512]) torch.Size([3400, 512]) torch.Size([3400])


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1/1 [06:38<00:00, 398.54s/it]

Epoch: 1 | train_loss: 0.1664 | test_loss: 0.1147 | time: 345.5041





torch.Size([15000, 512]) torch.Size([15000, 512]) torch.Size([15000])
torch.Size([15000, 512]) torch.Size([15000, 512]) torch.Size([15000])


In [None]:
for file_name in file_names:
  files.download(file_name)