## Imports

In [20]:
import pandas as pd
import numpy as np
import torch
import json
import os

from tqdm import tqdm
from torch import nn
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from timeit import default_timer as timer
from os import walk
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_curve, auc, brier_score_loss

## Data

In [21]:
ROOT_DATA = './'
ROOT_DATA_RAW = f'{ROOT_DATA}'
HUMAN_JSON_FILE_NAME = 'human.jsonl'
HUMAN_JSON_PATH = f'{ROOT_DATA_RAW}/{HUMAN_JSON_FILE_NAME}'
MODELS_JSON_FOLDER_PATH = f'{ROOT_DATA_RAW}/machines'

In [22]:
BATCH_SIZE = 32
LSTM_UNITS = 256
LSTM_LAYERS = 5
EMBEDDING_SIZE = 512

In [23]:
TEST_SET_FRACTION = 0.3

In [24]:
df = pd.read_json(path_or_buf=HUMAN_JSON_PATH, lines=True)
df['text_index'] = df.index
df['is_llm'] = 0
df['dataset_name'] = Path(HUMAN_JSON_FILE_NAME).stem

In [25]:
dir_path, dir_names, file_names = next(walk(MODELS_JSON_FOLDER_PATH))

for file_name in file_names:
    temp_df = pd.read_json(path_or_buf=f'{MODELS_JSON_FOLDER_PATH}/{file_name}', lines=True)
    temp_df['text_index'] = temp_df.index
    temp_df['is_llm'] = 1
    temp_df['dataset_name'] = Path(file_name).stem

    df = pd.concat([df, temp_df], ignore_index=True)

df.drop(labels=['id'], inplace=True, axis='columns')

In [26]:
df.tail()

Unnamed: 0,text,text_index,is_llm,dataset_name
15213,Gabby Petito's Disappearance: How Social Media...,1082,1,meta-llama-llama-2-70b-chat-hf
15214,"MISSING: Gabby Petito, 22, Last Seen Traveling...",1083,1,meta-llama-llama-2-70b-chat-hf
15215,UW Oshkosh Student Claims Giving Ride to Brian...,1084,1,meta-llama-llama-2-70b-chat-hf
15216,The Gabby Petito Case: How Social Media Shaped...,1085,1,meta-llama-llama-2-70b-chat-hf
15217,Gabby Petito Remembered as 'Super Kind-Hearted...,1086,1,meta-llama-llama-2-70b-chat-hf


### Tokenize

In [27]:
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [28]:
df['tokenized_text'] = tokenizer(list(df['text'].to_list()))['input_ids']

Token indices sequence length is longer than the specified maximum sequence length for this model (843 > 512). Running this sequence through the model will result in indexing errors


## Model

In [29]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [30]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, layers_num, device, output_size=1, dropout=0):
        super().__init__()

        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.layers_num = layers_num
        self.output_size= output_size
        self.dropout = dropout
        self.device = device

        self.embed = nn.Embedding(self.vocab_size, self.embedding_size, device=self.device)

        self.lstm = nn.LSTM(
            input_size=self.embedding_size,
            hidden_size=self.hidden_size,
            num_layers=self.layers_num,
            batch_first=True,
            dropout=self.dropout,
            device=self.device
        )

        self.fc = nn.Linear(
            self.hidden_size,
            self.output_size
        )

    def forward(self, X, lengths):
        embeddings = self.embed(X)

        seq_output, (h_n, c_n) = self.lstm(embeddings)

        out = seq_output.sum(dim=1).div(lengths.float().unsqueeze(dim=1))
        logits = self.fc(out)
        return logits

## Dataset

In [31]:
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return self.X.size

    def __getitem__(self, index):
        return (
            self.X.to_numpy()[index],
            self.y.to_numpy()[index]
        )

In [32]:
def collate_fn(batch):
  # We want to sort the batch by seq length,
  # in order to make the computation more efficient
  batch = sorted(batch, key=lambda x: len(x[0]), reverse=True)

  inputs = [torch.LongTensor(x[0]).to(device) for x in batch]
  padded_input = nn.utils.rnn.pad_sequence(inputs, batch_first=True)

  lengths = torch.LongTensor([len(x[0]) for x in batch]).to(device)

  y = torch.FloatTensor(np.array([x[1] for x in batch])).reshape(-1, 1).to(device)

  return padded_input, lengths, y

## Data loaders

In [33]:
X, y = df['tokenized_text'], df['is_llm']

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SET_FRACTION, random_state=69, stratify=y)

In [35]:
train_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=False, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=False, collate_fn=collate_fn)

## Train and test functions

In [36]:
def calculate_accuracy(y_true, y_hat):
    correct_pred = torch.eq(torch.sigmoid(y_hat).round(), y_true).sum().item()
    return (correct_pred / len(y_hat)) * 100

def calculate_f1(y_true, y_hat):
    y_pred = torch.sigmoid(y_hat).round()
    return f1_score(y_true, y_pred)

def calculate_brier(y_true, y_hat):
    y_prob = torch.sigmoid(y_hat)
    return brier_score_loss(y_true, y_prob)

def calculate_auc(y_true, y_hat):
    y_prob = torch.sigmoid(y_hat)

    false_positive_rates, true_positive_rates, _ = roc_curve(y_true, y_prob)
    roc_auc = auc(false_positive_rates, true_positive_rates)

    return roc_auc, false_positive_rates, true_positive_rates

def train_step(model, dataloader, loss_fn, optimizer, device):

    model.train()

    train_loss, train_acc = 0, 0
    steps = 0

    for X, lengths, y in dataloader:
        X, y = X.to(device), y.to(device)

        y_hat = model(X, lengths)

        loss = loss_fn(y_hat, y)
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_acc += calculate_accuracy(y_true=y, y_hat=y_hat)
        steps += 1

    return train_loss / steps, train_acc / steps


def test_step(model, dataloader, loss_fn, device):

    model.eval()

    all_y_true = []
    all_y_hat = []

    test_loss = 0
    steps = 0

    with torch.inference_mode():
        for X, lengths, y in dataloader:

            X, y = X.to(device), y.to(device)

            y_hat = model(X, lengths)

            all_y_true.extend(y)
            all_y_hat.extend(y_hat)

            loss = loss_fn(y_hat, y)
            test_loss += loss.item()

            steps += 1

        all_y_true = torch.FloatTensor(all_y_true)
        all_y_hat = torch.FloatTensor(all_y_hat)

        test_accuracy = calculate_accuracy(all_y_true, all_y_hat)
        test_f1 = calculate_f1(all_y_true, all_y_hat)
        test_brier = calculate_brier(all_y_true, all_y_hat)
        test_auc_tuple = calculate_auc(all_y_true, all_y_hat)

    return test_loss / steps, test_accuracy, test_f1, test_brier, test_auc_tuple

def train(model,
          train_dataloader,
          test_dataloader,
          optimizer,
          loss_fn,
          epochs,
          device):

    results = {
        "train_loss": [],
        "train_acc": [],
        "test_loss": [],
        "test_acc": [],
        "test_f1": [],
        "test_brier": [],
        "test_auc_tuple": []
    }

    model.to(device)

    for epoch in tqdm(range(epochs)):

        start_time = timer()
        train_loss, train_acc = train_step(
            model=model,
            dataloader=train_dataloader,
            loss_fn=loss_fn,
            optimizer=optimizer,
            device=device,
        )
        end_time = timer()

        test_loss, test_acc, test_f1, test_brier, test_auc_tuple = test_step(
            model=model,
            dataloader=test_dataloader,
            loss_fn=loss_fn,
            device=device,
        )

        results["train_loss"].append(train_loss)
        results["train_acc"].append(train_acc)
        results["test_loss"].append(test_loss)
        results["test_acc"].append(test_acc)
        results["test_f1"].append(test_f1)
        results["test_brier"].append(test_brier),
        results["test_auc_tuple"].append(test_auc_tuple)

        save_path = os.path.join(MODELS_JSON_FOLDER_PATH, f'all_vs_all_epoch_{epoch}.pt')
        torch.save(model.cpu().state_dict(), save_path) # change model to cpu in order to save it
        model.to(device) # send model back to the device used

        print(
            f"Epoch: {epoch+1} | "
            f"train_loss: {train_loss:.4f} | "
            f"train_acc: {train_acc:.4f} | "
            f"test_loss: {test_loss:.4f} | "
            f"test_acc: {test_acc:.4f} | "
            f"test_f1: {test_f1:.4f} | "
            f"test_brier: {test_brier:.4f} | "
            f"time: {(end_time-start_time):.4f}"
        )

    return results

## Model

In [37]:
model = RNN(tokenizer.vocab_size, EMBEDDING_SIZE, LSTM_UNITS, LSTM_LAYERS, device, dropout=0.6).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.00008)
loss_fn = nn.BCEWithLogitsLoss()

In [38]:
results = train(
    model,
    train_dataloader,
    test_dataloader,
    optimizer,
    loss_fn,
    epochs=20,
    device=device
)

  5%|▌         | 1/20 [01:50<35:03, 110.73s/it]

Epoch: 1 | train_loss: 0.3009 | train_acc: 92.8558 | test_loss: 0.2243 | test_acc: 92.8603 | test_f1: 0.9630 | test_brier: 0.0599 | time: 96.2959


 10%|█         | 2/20 [03:43<33:38, 112.13s/it]

Epoch: 2 | train_loss: 0.2125 | train_acc: 93.1065 | test_loss: 0.1377 | test_acc: 94.6124 | test_f1: 0.9715 | test_brier: 0.0394 | time: 98.6168


 15%|█▌        | 3/20 [05:35<31:41, 111.84s/it]

Epoch: 3 | train_loss: 0.1201 | train_acc: 95.5518 | test_loss: 0.1124 | test_acc: 95.5103 | test_f1: 0.9757 | test_brier: 0.0327 | time: 97.0387


 20%|██        | 4/20 [07:26<29:43, 111.45s/it]

Epoch: 4 | train_loss: 0.0880 | train_acc: 96.9112 | test_loss: 0.0964 | test_acc: 95.9483 | test_f1: 0.9783 | test_brier: 0.0287 | time: 96.5036


 25%|██▌       | 5/20 [09:15<27:41, 110.78s/it]

Epoch: 5 | train_loss: 0.0683 | train_acc: 97.7075 | test_loss: 0.0905 | test_acc: 96.2768 | test_f1: 0.9800 | test_brier: 0.0273 | time: 95.1633


 30%|███       | 6/20 [11:05<25:47, 110.50s/it]

Epoch: 6 | train_loss: 0.0482 | train_acc: 98.4690 | test_loss: 0.1137 | test_acc: 95.7074 | test_f1: 0.9772 | test_brier: 0.0325 | time: 95.3236


 35%|███▌      | 7/20 [12:55<23:55, 110.42s/it]

Epoch: 7 | train_loss: 0.0416 | train_acc: 98.6674 | test_loss: 0.0883 | test_acc: 96.6272 | test_f1: 0.9819 | test_brier: 0.0257 | time: 95.9111


 40%|████      | 8/20 [14:45<22:01, 110.13s/it]

Epoch: 8 | train_loss: 0.0278 | train_acc: 99.1836 | test_loss: 0.1145 | test_acc: 96.0140 | test_f1: 0.9787 | test_brier: 0.0304 | time: 95.1578


 45%|████▌     | 9/20 [16:35<20:09, 109.98s/it]

Epoch: 9 | train_loss: 0.0274 | train_acc: 99.2680 | test_loss: 0.0963 | test_acc: 96.6710 | test_f1: 0.9820 | test_brier: 0.0265 | time: 95.2465


 50%|█████     | 10/20 [18:25<18:20, 110.05s/it]

Epoch: 10 | train_loss: 0.0252 | train_acc: 99.3337 | test_loss: 0.0973 | test_acc: 96.4301 | test_f1: 0.9807 | test_brier: 0.0267 | time: 95.7295


 55%|█████▌    | 11/20 [20:15<16:29, 109.94s/it]

Epoch: 11 | train_loss: 0.0203 | train_acc: 99.4262 | test_loss: 0.1014 | test_acc: 96.6929 | test_f1: 0.9821 | test_brier: 0.0259 | time: 95.2855


 60%|██████    | 12/20 [22:04<14:37, 109.68s/it]

Epoch: 12 | train_loss: 0.0179 | train_acc: 99.4276 | test_loss: 0.1046 | test_acc: 96.6929 | test_f1: 0.9822 | test_brier: 0.0266 | time: 94.5635


 65%|██████▌   | 13/20 [23:54<12:48, 109.78s/it]

Epoch: 13 | train_loss: 0.0121 | train_acc: 99.7185 | test_loss: 0.1061 | test_acc: 96.7587 | test_f1: 0.9826 | test_brier: 0.0257 | time: 95.6321


 70%|███████   | 14/20 [25:43<10:58, 109.75s/it]

Epoch: 14 | train_loss: 0.0067 | train_acc: 99.8498 | test_loss: 0.1133 | test_acc: 96.6272 | test_f1: 0.9819 | test_brier: 0.0261 | time: 95.2709


 75%|███████▌  | 15/20 [27:33<09:09, 109.82s/it]

Epoch: 15 | train_loss: 0.0076 | train_acc: 99.8592 | test_loss: 0.1163 | test_acc: 96.6710 | test_f1: 0.9822 | test_brier: 0.0269 | time: 95.5389


 80%|████████  | 16/20 [29:23<07:18, 109.74s/it]

Epoch: 16 | train_loss: 0.0052 | train_acc: 99.9236 | test_loss: 0.1202 | test_acc: 96.4520 | test_f1: 0.9809 | test_brier: 0.0277 | time: 95.2150


 85%|████████▌ | 17/20 [31:13<05:29, 109.71s/it]

Epoch: 17 | train_loss: 0.0232 | train_acc: 99.3056 | test_loss: 0.1076 | test_acc: 96.8463 | test_f1: 0.9831 | test_brier: 0.0254 | time: 95.2915


 90%|█████████ | 18/20 [33:02<03:39, 109.64s/it]

Epoch: 18 | train_loss: 0.0112 | train_acc: 99.8485 | test_loss: 0.1500 | test_acc: 96.2549 | test_f1: 0.9801 | test_brier: 0.0318 | time: 95.1456


 95%|█████████▌| 19/20 [34:52<01:49, 109.77s/it]

Epoch: 19 | train_loss: 0.0142 | train_acc: 99.5871 | test_loss: 0.1232 | test_acc: 96.6053 | test_f1: 0.9818 | test_brier: 0.0284 | time: 95.5868


100%|██████████| 20/20 [36:42<00:00, 110.11s/it]

Epoch: 20 | train_loss: 0.0076 | train_acc: 99.8123 | test_loss: 0.1258 | test_acc: 96.7806 | test_f1: 0.9827 | test_brier: 0.0273 | time: 95.2565





## Save results

In [41]:
class json_serialize(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)

with open(f'{ROOT_DATA_RAW}/all_vs_all.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=4, cls=json_serialize)

In [44]:
!zip -r ./all_vs_all_model.zip ./machines/

  adding: machines/ (stored 0%)
  adding: machines/alpaca-7b.jsonl (deflated 73%)
  adding: machines/all_vs_all_epoch_12.pt (deflated 8%)
  adding: machines/all_vs_all_epoch_17.pt (deflated 8%)
  adding: machines/bigscience-bloomz-7b1.jsonl (deflated 77%)
  adding: machines/all_vs_all_epoch_5.pt (deflated 8%)
  adding: machines/all_vs_all_epoch_1.pt (deflated 8%)
  adding: machines/mistralai-mixtral-8x7b-instruct-v0.1.jsonl (deflated 69%)
  adding: machines/all_vs_all_epoch_16.pt (deflated 8%)
  adding: machines/gpt-3.5-turbo-0125.jsonl (deflated 69%)
  adding: machines/chavinlo-alpaca-13b.jsonl (deflated 77%)
  adding: machines/all_vs_all_epoch_14.pt (deflated 8%)
  adding: machines/all_vs_all_epoch_7.pt (deflated 8%)
  adding: machines/all_vs_all_epoch_4.pt (deflated 8%)
  adding: machines/meta-llama-llama-2-7b-chat-hf.jsonl (deflated 72%)
  adding: machines/all_vs_all_epoch_6.pt (deflated 8%)
  adding: machines/text-bison-002.jsonl (deflated 70%)
  adding: machines/gemini-pro.jsonl 

In [46]:
from google.colab import drive
drive.mount('/content/gdrive',force_remount=True)

Mounted at /content/gdrive


In [None]:
!cp ./all_vs_all_model.zip '/content/gdrive/My Drive/'
!ls -lt '/content/gdrive/My Drive/'