#### In this notebook we test several LSTM models that we trained on different PAN datasets: 'gemini-pro', 'gpt-4-turbo-preview', 'gpt-3.5-turbo-0125', 'mistralai-mixtral-8x7b-instruct-v0.1', and 'all_vs_all_epoch_16'.
#### Each model is tested against M4 texts produced by different generators across four domains: arxiv, reddit, wikihow and wikipedia. For each test, the human-written texts of a domain are combined with the texts of one LLM dataset from that same domain.

In [8]:
import pandas as pd
import numpy as np
import torch
import json
import os

from tqdm import tqdm
from torch import nn
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from timeit import default_timer as timer
from os import walk
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, recall_score, precision_score, roc_curve, auc, brier_score_loss

In [9]:
# Root directory of the project data; the two folder paths below are derived from it.
ROOT_DATA = './'
# os.path.join avoids the doubled separator ('.//raw/...') that plain string
# interpolation produces because ROOT_DATA already ends with a slash.
# Folder walked by the loading cell: one sub-directory per domain.
M4_DATA_FOLDER_PATH = os.path.join(ROOT_DATA, 'raw', 'm4-unified')
# Folder holding the trained '<model_name>.pt' checkpoints.
BASELINE_MODELS_FOLDER_PATH = os.path.join(ROOT_DATA, 'baseline', 'models')

In [10]:
# Number of samples per batch in the evaluation DataLoader.
BATCH_SIZE = 32
# The three values below are passed to RNN when each checkpoint is rebuilt.
# NOTE(review): presumably they must match the values used at training time,
# otherwise load_state_dict would reject the checkpoint - confirm.
# Passed to RNN as hidden_size.
LSTM_UNITS = 256
# Passed to RNN as layers_num.
LSTM_LAYERS = 5
# Passed to RNN as embedding_size.
EMBEDDING_SIZE = 512

In [11]:
# Used as test_size when the human-written rows are split with train_test_split.
TEST_SET_FRACTION = 0.3

In [12]:
# Empty frame with the expected columns; the loading cell concatenates every
# M4 dataset onto it.
df = pd.DataFrame(columns=['text', 'is_llm', 'domain', 'dataset_name'])

In [13]:
# Walk the M4 folder: each immediate sub-directory is treated as a domain and
# each file inside it is read as one JSON-lines dataset of that domain.
dir_path, dir_names, file_names = next(walk(M4_DATA_FOLDER_PATH))

# Collect the per-file frames and concatenate once at the end; calling
# pd.concat inside the loop re-copies every accumulated row on each iteration.
loaded_frames = []

# Named `domain_name` rather than `dir` so the built-in dir() is not shadowed
# for the rest of the notebook.
for domain_name in dir_names:
    dataset_folder_path, _, dataset_names = next(walk(os.path.join(dir_path, domain_name)))

    for dataset_name in dataset_names:
        temp_df = pd.read_json(path_or_buf=f'{dataset_folder_path}/{dataset_name}', lines=True)
        temp_df['domain'] = domain_name
        temp_df['dataset_name'] = Path(dataset_name).stem

        # Files whose name contains 'human' are labelled 0; every other file
        # is labelled 1 (LLM-generated).
        temp_df['is_llm'] = 0 if 'human' in dataset_name else 1

        loaded_frames.append(temp_df)

# The existing `df` stays first so rows and columns come out in the same order
# as with the incremental concatenation.
df = pd.concat([df, *loaded_frames], ignore_index=True)

In [14]:
# Rich display of the combined frame as a sanity check of the loaded data.
display(df)

Unnamed: 0,text,is_llm,domain,dataset_name
0,"In this paper, we investigate the continuum li...",1,arxiv,arxiv_chatGPT
1,"In this paper, we present the results of our a...",1,arxiv,arxiv_chatGPT
2,"In this work, we present the results of high-r...",1,arxiv,arxiv_chatGPT
3,"In this work, we present a new method of integ...",1,arxiv,arxiv_chatGPT
4,"In this study, we present the use of Atacama L...",1,arxiv,arxiv_chatGPT
...,...,...,...,...
74023,"Well, to be honest, it's kind of complicated h...",1,reddit,reddit_chatGPT
74024,The prevalence of draws in chess among the top...,1,reddit,reddit_chatGPT
74025,"So, you know how when you're driving with your...",1,reddit,reddit_chatGPT
74026,Nightmares are disturbing and distressing drea...,1,reddit,reddit_chatGPT


In [15]:
# Name of the pretrained checkpoint whose tokenizer is loaded; only the
# tokenizer is used here (its vocab_size later sizes the RNN embedding table).
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [16]:
# Convert every text to its list of token ids. No truncation is requested, so
# long texts keep all their tokens; the ids are later fed to the RNN defined
# in this notebook.
df['tokenized_text'] = tokenizer(df['text'].to_list())['input_ids']

Token indices sequence length is longer than the specified maximum sequence length for this model (559 > 512). Running this sequence through the model will result in indexing errors


In [17]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [18]:
class RNN(nn.Module):
    """LSTM-based classifier over batches of token-id sequences.

    Pipeline: token embedding -> stacked LSTM -> sum of the LSTM outputs over
    the time axis divided by the given sequence lengths -> linear layer that
    produces raw logits (no sigmoid is applied here).

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_size: Dimension of each token embedding (LSTM input size).
        hidden_size: Number of features in the LSTM hidden state.
        layers_num: Number of stacked LSTM layers.
        device: Device on which every sub-module is created.
        output_size: Number of logits produced per sequence.
        dropout: Dropout probability passed to nn.LSTM.
    """

    def __init__(self, vocab_size, embedding_size, hidden_size, layers_num, device, output_size=1, dropout=0):
        super().__init__()

        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.layers_num = layers_num
        self.output_size = output_size
        self.dropout = dropout
        self.device = device

        self.embed = nn.Embedding(self.vocab_size, self.embedding_size, device=self.device)

        self.lstm = nn.LSTM(
            input_size=self.embedding_size,
            hidden_size=self.hidden_size,
            num_layers=self.layers_num,
            batch_first=True,
            dropout=self.dropout,
            device=self.device
        )

        # Created on the same device as embed/lstm. Without `device=` this
        # layer stayed on the CPU unless the caller remembered `.to(device)`.
        self.fc = nn.Linear(
            self.hidden_size,
            self.output_size,
            device=self.device
        )

    def forward(self, X, lengths):
        """Return one row of logits per sequence in the batch.

        Args:
            X: Integer token ids, batch-first: (batch, seq_len).
            lengths: 1-D tensor with one length per sequence; used only as
                the divisor of the summed LSTM outputs.

        Returns:
            Tensor of shape (batch, output_size) holding raw logits.
        """
        embeddings = self.embed(X)

        # Only the per-step outputs are needed; the final (h_n, c_n) state is discarded.
        seq_output, _ = self.lstm(embeddings)

        # NOTE(review): the LSTM runs over the whole (unpacked) input, so when X
        # is a padded batch the outputs at padding positions are part of this
        # sum while the divisor is the given length. Left unchanged because
        # trained checkpoints presumably depend on it - confirm before altering.
        out = seq_output.sum(dim=1).div(lengths.float().unsqueeze(dim=1))
        logits = self.fc(out)
        return logits

In [19]:
class TextDataset(Dataset):
    """Map-style dataset pairing each input sequence with its label.

    Args:
        X: Indexable collection of inputs (e.g. a pandas Series of token-id lists).
        y: Indexable collection of labels, aligned with `X` by position.
    """

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        # len() works for pandas Series, NumPy arrays and plain lists alike,
        # whereas `.size` only exists on the first two.
        return len(self.X)

    def __getitem__(self, index):
        """Return the `(input, label)` pair at position `index`."""
        return (
            self._item_at(self.X, index),
            self._item_at(self.y, index)
        )

    @staticmethod
    def _item_at(values, index):
        """Fetch the element at position `index`, ignoring pandas index labels."""
        # A Series indexed with `[]` looks labels up, so a frame that was
        # filtered without resetting its index would raise KeyError (or return
        # the wrong row). `.iloc` always addresses by position.
        positional = getattr(values, 'iloc', None)
        if positional is not None:
            return positional[index]
        return values[index]

In [20]:
def collate_fn(batch):
  """Turn a list of `(token_ids, label)` samples into batch tensors.

  The samples are ordered longest-first, the token-id sequences are padded to
  a common length with `pad_sequence` (batch-first), and everything is moved
  to the notebook-level `device`.

  Returns:
    `(padded_input, lengths, y)`: the padded id matrix, the original length of
    each sequence, and the labels as a float column of shape (batch, 1).
  """
  # Longest sequence first; labels stay attached to their sequence.
  ordered = sorted(batch, key=lambda sample: len(sample[0]), reverse=True)
  token_seqs = [sample[0] for sample in ordered]
  labels = [sample[1] for sample in ordered]

  padded_input = nn.utils.rnn.pad_sequence(
      [torch.LongTensor(seq).to(device) for seq in token_seqs],
      batch_first=True,
  )

  lengths = torch.LongTensor([len(seq) for seq in token_seqs]).to(device)

  y = torch.FloatTensor(np.array(labels)).reshape(-1, 1).to(device)

  return padded_input, lengths, y

In [21]:
def calculate_accuracy(y_true, y_hat):
    """Return the percentage of predictions that equal the labels.

    `y_hat` holds raw logits: they are passed through a sigmoid and rounded
    before being compared element-wise with `y_true`.
    """
    predicted_labels = y_hat.sigmoid().round()
    matches = (predicted_labels == y_true).sum().item()
    return (matches / len(y_hat)) * 100

def calculate_f1(y_true, y_hat):
    """Return sklearn's F1 score for the labels obtained by rounding sigmoid(y_hat)."""
    predicted_labels = y_hat.sigmoid().round()
    return f1_score(y_true, predicted_labels)

def calculate_recall(y_true, y_hat):
    """Return sklearn's recall for the labels obtained by rounding sigmoid(y_hat)."""
    predicted_labels = y_hat.sigmoid().round()
    return recall_score(y_true, predicted_labels)

def calculate_precision(y_true, y_hat):
    """Return sklearn's precision for the labels obtained by rounding sigmoid(y_hat)."""
    predicted_labels = y_hat.sigmoid().round()
    return precision_score(y_true, predicted_labels)

def calculate_brier(y_true, y_hat):
    """Return sklearn's Brier score loss of sigmoid(y_hat) against `y_true`."""
    probabilities = y_hat.sigmoid()
    return brier_score_loss(y_true, probabilities)

def calculate_auc(y_true, y_hat):
    """Compute the ROC curve of sigmoid(y_hat) and the area under it.

    Returns:
        `(roc_auc, false_positive_rates, true_positive_rates)`; the two rate
        arrays are the ones returned by `roc_curve`.
    """
    fpr, tpr, _thresholds = roc_curve(y_true, y_hat.sigmoid())
    return auc(fpr, tpr), fpr, tpr

In [22]:
def test_step(model, dataloader, loss_fn, device):
    """Run `model` over every batch of `dataloader` and compute the test metrics.

    Args:
        model: Module called as `model(X, lengths)` that returns logits.
        dataloader: Iterable yielding `(X, lengths, y)` batches.
        loss_fn: Callable `loss_fn(logits, y)` returning a scalar tensor.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Tuple `(loss, accuracy, f1, recall, precision, brier, auc_tuple)` where
        `loss` is the mean of the per-batch losses and `auc_tuple` is the value
        returned by `calculate_auc`.

    Raises:
        ValueError: If `dataloader` yields no batches.
    """
    model.eval()

    y_true_batches = []
    y_hat_batches = []

    test_loss = 0
    steps = 0

    with torch.inference_mode():
        for X, lengths, y in dataloader:

            # `lengths` is moved together with X and y so the forward pass
            # never mixes devices, whatever the collate function did.
            X, lengths, y = X.to(device), lengths.to(device), y.to(device)

            y_hat = model(X, lengths)

            # Keep whole batches on the CPU; they are joined once after the
            # loop instead of being rebuilt from a list of one-element tensors.
            y_true_batches.append(y.detach().cpu())
            y_hat_batches.append(y_hat.detach().cpu())

            loss = loss_fn(y_hat, y)
            test_loss += loss.item()

            steps += 1

        if steps == 0:
            # Previously this surfaced as an opaque error further down.
            raise ValueError('test_step received a dataloader that yields no batches')

        # Flatten to 1-D float tensors, the layout the metric helpers receive.
        all_y_true = torch.cat(y_true_batches).reshape(-1).float()
        all_y_hat = torch.cat(y_hat_batches).reshape(-1).float()

        test_accuracy = calculate_accuracy(all_y_true, all_y_hat)
        test_f1 = calculate_f1(all_y_true, all_y_hat)
        test_recall = calculate_recall(all_y_true, all_y_hat)
        test_precision = calculate_precision(all_y_true, all_y_hat)
        test_brier = calculate_brier(all_y_true, all_y_hat)
        test_auc_tuple = calculate_auc(all_y_true, all_y_hat)

    return test_loss / steps, test_accuracy, test_f1, test_recall, test_precision, test_brier, test_auc_tuple

def test_against_all(model, df, loss_fn, device):
    """Evaluate `model` on each LLM dataset of `df` mixed with same-domain human text.

    Every distinct `dataset_name` that does not contain 'human' is tested once:
    its rows labelled `is_llm == 1` are concatenated with all `is_llm == 0`
    rows of the same domain and passed to `test`.

    Returns:
        List with one single-key dict per tested dataset, mapping the dataset
        name to the metrics dict returned by `test`.
    """
    generated_mask = df['is_llm'] == 1
    human_mask = df['is_llm'] == 0

    llm_dataset_names = [
        name for name in df['dataset_name'].unique() if 'human' not in name
    ]

    all_results = []
    for dataset_name in llm_dataset_names:
        llm_df = df.loc[(df['dataset_name'] == dataset_name) & generated_mask]

        # The domain of the first row is taken as the domain of the whole dataset.
        domain = llm_df['domain'].iloc[0]
        human_df = df.loc[(df['domain'] == domain) & human_mask]

        test_df = pd.concat([llm_df, human_df], ignore_index=True)

        all_results.append(
            {dataset_name: test(model, loss_fn, device, dataset_name, test_df)}
        )

    return all_results

def test(model, loss_fn, device, dataset_name, test_df):
    """Evaluate `model` on `test_df`, print a one-line summary and return the metrics.

    Args:
        model, loss_fn, device: Forwarded unchanged to `test_step`.
        dataset_name: Label shown in the printed summary.
        test_df: Frame providing the 'tokenized_text' inputs and 'is_llm' labels.

    Returns:
        Dict with the keys test_loss, test_acc, test_f1, test_recall,
        test_precision, test_brier and test_auc_tuple.
    """
    # Unshuffled loader over the whole frame; the last partial batch is kept.
    loader = DataLoader(
        TextDataset(test_df['tokenized_text'], test_df['is_llm']),
        batch_size=BATCH_SIZE,
        shuffle=False,
        drop_last=False,
        collate_fn=collate_fn,
    )

    started_at = timer()
    loss, acc, f1, recall, precision, brier, auc_tuple = test_step(
        model, loader, loss_fn, device
    )
    elapsed = timer() - started_at

    results_formatted = {
        "test_loss": loss,
        "test_acc": acc,
        "test_f1": f1,
        "test_recall": recall,
        "test_precision": precision,
        "test_brier": brier,
        "test_auc_tuple": auc_tuple,
    }

    # The AUC tuple is returned but not printed.
    print(
        f"against: {dataset_name} | "
        f"test_loss: {loss:.4f} | "
        f"test_acc: {acc:.4f} | "
        f"test_f1: {f1:.4f} | "
        f"test_recall: {recall:.4f} | "
        f"test_precision: {precision:.4f} | "
        f"test_brier: {brier:.4f} | "
        f"time: {elapsed:.4f}"
    )

    return results_formatted

In [None]:
# Split the human-written rows (is_llm == 0) into train/test parts with a fixed seed.
# NOTE(review): human_train_df / human_test_df are not referenced by any other
# cell visible in this notebook - test_against_all works on the full `df` -
# confirm whether this split is still needed.
human_df = df.loc[df['is_llm'] == 0]
human_train_df, human_test_df = train_test_split(human_df, test_size=TEST_SET_FRACTION, random_state=69)

In [23]:
# Checkpoint names to evaluate; each entry is interpolated into
# f'{BASELINE_MODELS_FOLDER_PATH}/{model_name}.pt' by the evaluation loop.
models_to_test = [
    'gemini-pro',
    'gpt-4-turbo-preview',
    'gpt-3.5-turbo-0125',
    'mistralai-mixtral-8x7b-instruct-v0.1',
    # Lives in a sub-folder; the leading '/' yields a doubled separator in the
    # built path and is also stored verbatim as 'base_model' in the results.
    '/all_vs_all/all_vs_all_epoch_16',
]

In [24]:
# One entry per checkpoint: {'base_model': name, 'results_against_all_llms': [...]}.
final_results = []

# The loss is identical for every checkpoint, so a single instance is shared.
loss_fn = nn.BCEWithLogitsLoss()

for model_name in models_to_test:
    model_path = f'{BASELINE_MODELS_FOLDER_PATH}/{model_name}.pt'

    # Rebuild the architecture, then restore the trained weights.
    model = RNN(
        vocab_size=tokenizer.vocab_size,
        embedding_size=EMBEDDING_SIZE,
        hidden_size=LSTM_UNITS,
        layers_num=LSTM_LAYERS,
        device=device,
        dropout=0.6,
    ).to(device)
    # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)

    print(f'Testing against M4 data for LSTM {model_name}...')

    results = test_against_all(model=model, df=df, loss_fn=loss_fn, device=device)

    final_results.append({
        'base_model': model_name,
        'results_against_all_llms': results,
    })

    print(f'Finished testing against M4 data for LSTM {model_name}')


Testing against M4 data for LSTM gemini-pro...
against: arxiv_chatGPT | test_loss: 1.2772 | test_acc: 59.2833 | test_f1: 0.3801 | test_recall: 0.2497 | test_precision: 0.7960 | test_brier: 0.3465 | time: 8.5515
against: arxiv_bloomz | test_loss: 2.0658 | test_acc: 48.0667 | test_f1: 0.0465 | test_recall: 0.0253 | test_precision: 0.2836 | test_brier: 0.4755 | time: 8.1541
against: arxiv_flant5 | test_loss: 2.0157 | test_acc: 65.5667 | test_f1: 0.5215 | test_recall: 0.3753 | test_precision: 0.8543 | test_brier: 0.3308 | time: 6.5020
against: arxiv_davinci | test_loss: 2.0704 | test_acc: 50.2000 | test_f1: 0.1201 | test_recall: 0.0680 | test_precision: 0.5152 | test_brier: 0.4647 | time: 7.0294
against: arxiv_cohere | test_loss: 1.7320 | test_acc: 54.7833 | test_f1: 0.2610 | test_recall: 0.1597 | test_precision: 0.7139 | test_brier: 0.4076 | time: 8.0169
against: wikihow_cohere | test_loss: 2.6330 | test_acc: 51.9333 | test_f1: 0.0745 | test_recall: 0.0387 | test_precision: 1.0000 | test_

In [25]:
# JSON encoder used when the evaluation results are written to disk.
class json_serialize(json.JSONEncoder):
    """`json.JSONEncoder` that also serialises NumPy scalars and arrays.

    NumPy integers become `int`, NumPy floats become `float` and arrays become
    nested lists; any other unsupported object is handed to the base encoder,
    which raises `TypeError`.
    """

    # (NumPy type, converter to a JSON-compatible built-in), checked in order.
    _CONVERTERS = (
        (np.integer, int),
        (np.floating, float),
        (np.ndarray, lambda array: array.tolist()),
    )

    def default(self, obj):
        for numpy_type, convert in self._CONVERTERS:
            if isinstance(obj, numpy_type):
                return convert(obj)
        return super().default(obj)

# Write the results of every model to ./vs_m4.json as indented UTF-8 JSON;
# json_serialize handles the NumPy values the default encoder rejects.
with open('./vs_m4.json', 'w', encoding='utf-8') as f:
    json.dump(final_results, f, ensure_ascii=False, indent=4, cls=json_serialize)