## Imports

In [32]:
import pandas as pd
import numpy as np

import torch
from tqdm import tqdm
from torch import nn
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from timeit import default_timer as timer

## Load data

In [6]:
ROOT_DATA_RAW = './'
HUMAN_JSON_PATH = f'{ROOT_DATA_RAW}/human.jsonl'
VICGALLE_GPT2_JSON_PATH = f'{ROOT_DATA_RAW}/vicgalle-gpt2-open-instruct-v1.jsonl'
BATCH_SIZE = 32
LSTM_UNITS = 128
LSTM_LAYERS = 5
EMBEDDING_SIZE = 300

In [7]:
human_df = pd.read_json(path_or_buf=HUMAN_JSON_PATH, lines=True)
llm_df = pd.read_json(path_or_buf=VICGALLE_GPT2_JSON_PATH, lines=True)

In [8]:
human_df.head()

Unnamed: 0,id,text
0,articles-cleaned-truncated/news-2021-01-01-202...,Inaugural Address by President Joseph R. Biden...
1,articles-cleaned-truncated/news-2021-01-01-202...,Fact check: Biden inauguration impacted by pan...
2,articles-cleaned-truncated/news-2021-01-01-202...,Highlights from Joe Biden's 2021 inauguration\...
3,articles-cleaned-truncated/news-2021-01-01-202...,"Biden takes the helm, appeals for unity to tak..."
4,articles-cleaned-truncated/news-2021-01-01-202...,'The Hill We Climb': Read Amanda Gorman's inau...


In [9]:
llm_df.head()

Unnamed: 0,id,text
0,vicgalle-gpt2-open-instruct-v1/news-2021-01-01...,"""America's Future: What Happens to the Constit..."
1,vicgalle-gpt2-open-instruct-v1/news-2021-01-01...,President Trump Is Not Present at The 2020 Ina...
2,vicgalle-gpt2-open-instruct-v1/news-2021-01-01...,Trump leaves White House with heightened secur...
3,vicgalle-gpt2-open-instruct-v1/news-2021-01-01...,Joe Biden is the 46th President of the United ...
4,vicgalle-gpt2-open-instruct-v1/news-2021-01-01...,'Amanda Gorman Celebrates New York Times Poet ...


In [10]:
human_df['is_llm'] = 0
llm_df['is_llm'] = 1

human_df.drop(labels=['id'], inplace=True, axis='columns')
llm_df.drop(labels=['id'], inplace=True, axis='columns')

In [11]:
llm_df.head()

Unnamed: 0,text,is_llm
0,"""America's Future: What Happens to the Constit...",1
1,President Trump Is Not Present at The 2020 Ina...,1
2,Trump leaves White House with heightened secur...,1
3,Joe Biden is the 46th President of the United ...,1
4,'Amanda Gorman Celebrates New York Times Poet ...,1


In [12]:
human_df.head()

Unnamed: 0,text,is_llm
0,Inaugural Address by President Joseph R. Biden...,0
1,Fact check: Biden inauguration impacted by pan...,0
2,Highlights from Joe Biden's 2021 inauguration\...,0
3,"Biden takes the helm, appeals for unity to tak...",0
4,'The Hill We Climb': Read Amanda Gorman's inau...,0


In [13]:
df = pd.concat([human_df, llm_df], ignore_index=True)

In [14]:
df.head()

Unnamed: 0,text,is_llm
0,Inaugural Address by President Joseph R. Biden...,0
1,Fact check: Biden inauguration impacted by pan...,0
2,Highlights from Joe Biden's 2021 inauguration\...,0
3,"Biden takes the helm, appeals for unity to tak...",0
4,'The Hill We Climb': Read Amanda Gorman's inau...,0


In [15]:
df.tail()

Unnamed: 0,text,is_llm
2169,'The Disappearance of Gabby Petito' – A Compre...,1
2170,"Utah State Police Search for Gabby Petito, Tra...",1
2171,McKenna's Lost Friend: Debunking the Evidence ...,1
2172,"""Gunshots Found in Florida Nature Preserve: A ...",1
2173,A Very Kind and Sweet Woman in Long Island Sho...,1


## Tokenize

In [16]:
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [17]:
print(tokenizer.vocab_size)

28996


In [18]:
df['tokenized_text'] = tokenizer(list(df['text'].to_list()))['input_ids']
tokenized = tokenizer(list(df['text'].to_list()))

Token indices sequence length is longer than the specified maximum sequence length for this model (843 > 512). Running this sequence through the model will result in indexing errors


In [19]:
test_tokenized = tokenizer(['Michael is good', 'Peter is good'])
print(tokenizer.tokenize(['Michael is good', 'Peter is good']))
print(test_tokenized.word_ids())
print(test_tokenized['input_ids'])
print(test_tokenized['attention_mask'])

['Michael', 'is', 'good', 'Peter', 'is', 'good']
[None, 0, 1, 2, None]
[[101, 1847, 1110, 1363, 102], [101, 1943, 1110, 1363, 102]]
[[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]


In [20]:
df.head()

Unnamed: 0,text,is_llm,tokenized_text
0,Inaugural Address by President Joseph R. Biden...,0,"[101, 1130, 3984, 13830, 4412, 24930, 18380, 1..."
1,Fact check: Biden inauguration impacted by pan...,0,"[101, 143, 11179, 4031, 131, 139, 26859, 20105..."
2,Highlights from Joe Biden's 2021 inauguration\...,0,"[101, 1693, 13231, 1121, 2658, 139, 26859, 112..."
3,"Biden takes the helm, appeals for unity to tak...",0,"[101, 139, 26859, 2274, 1103, 22778, 117, 1599..."
4,'The Hill We Climb': Read Amanda Gorman's inau...,0,"[101, 112, 1109, 2404, 1284, 140, 24891, 1830,..."


## Model

In [21]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [56]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, layers_num, output_size=1, dropout=0):
        super().__init__()

        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.layers_num = layers_num
        self.output_size= output_size
        self.dropout = dropout
        self.device = device

        self.embed = nn.Embedding(self.vocab_size, self.embedding_size, device=self.device)

        self.lstm = nn.LSTM(
            input_size=self.embedding_size,
            hidden_size=self.hidden_size,
            num_layers=self.layers_num,
            batch_first=True,
            dropout=self.dropout,
            device=self.device
        )

        self.fc = nn.Linear(
            self.hidden_size,
            self.output_size
        )

    def forward(self, X, lengths):
        embeddings = self.embed(X)

        # padded_input = nn.utils.rnn.pack_padded_sequence(embeddings, lengths.cpu(), batch_first=True)
        seq_output, (h_n, c_n) = self.lstm(embeddings)
        # seq_output, _ = nn.utils.rnn.pad_packed_sequence(seq_output, batch_first=True)

        out = seq_output.sum(dim=1).div(lengths.float().unsqueeze(dim=1))
        logits = self.fc(out)
        return logits

model = RNN(tokenizer.vocab_size, EMBEDDING_SIZE, LSTM_UNITS, LSTM_LAYERS, dropout=0.5).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
loss_fn = nn.BCEWithLogitsLoss()

## Dataset

In [23]:
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return self.X.size

    def __getitem__(self, index):
        return (
            self.X[index],
            self.y[index]
        )

In [54]:
dataset = TextDataset(df['tokenized_text'], df['is_llm'])
test_set_size = int(dataset.__len__() * 0.3)
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [
    dataset.__len__() - test_set_size,
    test_set_size,
])

In [38]:
def collate_fn(batch):
  # We want to sort the batch by seq length,
  # in order to make the computation more efficient
  batch = sorted(batch, key=lambda x: len(x[0]), reverse=True)

  inputs = [torch.LongTensor(x[0]).to(device) for x in batch]
  padded_input = nn.utils.rnn.pad_sequence(inputs, batch_first=True)
  lengths = torch.LongTensor([len(x[0]) for x in batch]).to(device)
  y = torch.FloatTensor(np.array([x[1] for x in batch])).reshape(-1, 1).to(device)
  return padded_input, lengths, y

In [55]:
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, drop_last=False,
                              collate_fn=collate_fn
                        )

test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                      shuffle=False, drop_last=False,
                        collate_fn=collate_fn
                      )


## Training & evaluation

In [57]:
def calculate_accuracy(y_true, y_hat):
    correct_pred = torch.eq(torch.sigmoid(y_hat).round(), y_true).sum().item()
    return (correct_pred / len(y_hat)) * 100

def evaluate(model, test_dataloader, device):
  model.eval()
  test_accuracy = 0

  for x, l, y in test_dataloader:
    with torch.no_grad():
      x, y = x.to(device), y.to(device)

      y_hat = model(x, l)
      test_accuracy += calculate_accuracy(y, y_hat)

  return test_accuracy / len(test_dataloader)

def train(model, optimizer, train_dataloader, test_dataloader, device, max_epochs=2):
  results = {
    'train_loss': [],
    'train_acc': [],
    'test_acc': []
  }

  for epoch in tqdm(range(1, max_epochs+1)):
    start_time = timer()

    model.train()

    results['train_loss'].append([])
    results['train_acc'].append([])
    results['test_acc'].append([])

    for x, l, y in train_dataloader:
      x, y = x.to(device), y.to(device)

      optimizer.zero_grad()
      y_hat = model(x, l)

      bce = nn.BCEWithLogitsLoss()
      cost = bce(y_hat, y)

      cost.backward()
      optimizer.step()

      train_acc = calculate_accuracy(y, y_hat)

      results['train_loss'][-1].append(cost.item())
      results['train_acc'][-1].append(train_acc)

      end_time = timer()
    # Display logs per each DISPLAY_STEP
    if (epoch) % 1 == 0:
      print(
          "\n"
          f"Epoch: {epoch} | "
          f"train_loss={np.mean(results['train_loss'][-1])} | "
          f"train_acc={np.mean(results['train_acc'][-1])} | "
          f"test_acc={evaluate(model, test_dataloader, device)} | "
          f"time: {(end_time-start_time):.4f}")

  return results

costs = train(model, optimizer, train_dataloader, test_dataloader, device, max_epochs=40)
print ("Optimization Finished!")

  2%|▎         | 1/40 [00:04<02:55,  4.51s/it]


Epoch: 1 | train_loss=0.6943993642926216 | train_acc=50.3833912037037 | test_acc=48.90873015873016 | time: 3.8476


  5%|▌         | 2/40 [00:08<02:49,  4.47s/it]


Epoch: 2 | train_loss=0.6930793796976408 | train_acc=50.5859375 | test_acc=48.90873015873016 | time: 3.8424


  8%|▊         | 3/40 [00:13<02:45,  4.47s/it]


Epoch: 3 | train_loss=0.6930247321724892 | train_acc=50.535300925925924 | test_acc=48.90873015873016 | time: 3.8662


 10%|█         | 4/40 [00:18<02:42,  4.51s/it]


Epoch: 4 | train_loss=0.6942486626406511 | train_acc=50.37615740740741 | test_acc=48.90873015873016 | time: 3.9595


 12%|█▎        | 5/40 [00:22<02:37,  4.50s/it]


Epoch: 5 | train_loss=0.693755234281222 | train_acc=50.43402777777777 | test_acc=48.90873015873016 | time: 3.8529


 15%|█▌        | 6/40 [00:27<02:33,  4.53s/it]


Epoch: 6 | train_loss=0.6924842074513435 | train_acc=50.535300925925924 | test_acc=48.90873015873016 | time: 3.9423


 18%|█▊        | 7/40 [00:31<02:29,  4.52s/it]


Epoch: 7 | train_loss=0.6928347982466221 | train_acc=50.43402777777777 | test_acc=48.90873015873016 | time: 3.8966


 20%|██        | 8/40 [00:36<02:24,  4.52s/it]


Epoch: 8 | train_loss=0.6802679784595966 | train_acc=50.96209490740741 | test_acc=64.98015873015872 | time: 3.8883


 22%|██▎       | 9/40 [00:40<02:18,  4.48s/it]


Epoch: 9 | train_loss=0.5230273421232899 | train_acc=81.94444444444444 | test_acc=81.59722222222223 | time: 3.7385


 25%|██▌       | 10/40 [00:44<02:14,  4.49s/it]


Epoch: 10 | train_loss=0.3978083999827504 | train_acc=90.91435185185185 | test_acc=87.89682539682539 | time: 3.8851


 28%|██▊       | 11/40 [00:49<02:09,  4.46s/it]


Epoch: 11 | train_loss=0.6498852013610303 | train_acc=91.72453703703702 | test_acc=77.9265873015873 | time: 3.7874


 30%|███       | 12/40 [00:53<02:05,  4.47s/it]


Epoch: 12 | train_loss=0.6003240371743838 | train_acc=87.08767361111113 | test_acc=87.25198412698413 | time: 3.8352


 32%|███▎      | 13/40 [00:58<02:00,  4.45s/it]


Epoch: 13 | train_loss=0.4108760394155979 | train_acc=90.26331018518518 | test_acc=85.61507936507937 | time: 3.8073


 35%|███▌      | 14/40 [01:02<01:55,  4.45s/it]


Epoch: 14 | train_loss=0.36085337617745 | train_acc=90.74797453703702 | test_acc=85.46626984126985 | time: 3.8291


 38%|███▊      | 15/40 [01:07<01:50,  4.44s/it]


Epoch: 15 | train_loss=0.37002640931556624 | train_acc=88.34635416666667 | test_acc=88.39285714285714 | time: 3.7982


 40%|████      | 16/40 [01:11<01:46,  4.43s/it]


Epoch: 16 | train_loss=0.227445089413474 | train_acc=94.5457175925926 | test_acc=89.28571428571429 | time: 3.8066


 42%|████▎     | 17/40 [01:15<01:41,  4.43s/it]


Epoch: 17 | train_loss=0.17455443677802882 | train_acc=95.96354166666667 | test_acc=90.625 | time: 3.8237


 45%|████▌     | 18/40 [01:20<01:37,  4.43s/it]


Epoch: 18 | train_loss=0.134260225109756 | train_acc=96.8894675925926 | test_acc=89.28571428571429 | time: 3.8271


 48%|████▊     | 19/40 [01:24<01:32,  4.42s/it]


Epoch: 19 | train_loss=0.12997857426914075 | train_acc=96.484375 | test_acc=89.28571428571429 | time: 3.7684


 50%|█████     | 20/40 [01:29<01:28,  4.44s/it]


Epoch: 20 | train_loss=0.11960807784150045 | train_acc=96.54947916666667 | test_acc=90.32738095238095 | time: 3.8291


 52%|█████▎    | 21/40 [01:33<01:24,  4.44s/it]


Epoch: 21 | train_loss=0.11510474568543334 | train_acc=97.265625 | test_acc=87.10317460317461 | time: 3.8410


 55%|█████▌    | 22/40 [01:38<01:19,  4.44s/it]


Epoch: 22 | train_loss=0.11741897471559544 | train_acc=96.94010416666667 | test_acc=88.14484126984128 | time: 3.8157


 57%|█████▊    | 23/40 [01:42<01:15,  4.46s/it]


Epoch: 23 | train_loss=0.12154127358614157 | train_acc=97.41030092592592 | test_acc=88.98809523809524 | time: 3.8430


 60%|██████    | 24/40 [01:47<01:11,  4.44s/it]


Epoch: 24 | train_loss=0.0937738020826752 | train_acc=97.65625 | test_acc=89.58333333333333 | time: 3.7828


 62%|██████▎   | 25/40 [01:51<01:06,  4.45s/it]


Epoch: 25 | train_loss=0.26633906131610274 | train_acc=91.47135416666667 | test_acc=58.13492063492063 | time: 3.8745


 65%|██████▌   | 26/40 [01:56<01:02,  4.46s/it]


Epoch: 26 | train_loss=0.2644668792684873 | train_acc=92.72280092592592 | test_acc=90.47619047619048 | time: 3.8702


 68%|██████▊   | 27/40 [02:00<00:57,  4.46s/it]


Epoch: 27 | train_loss=0.11255833522106211 | train_acc=97.41030092592592 | test_acc=90.92261904761905 | time: 3.8434


 70%|███████   | 28/40 [02:04<00:53,  4.45s/it]


Epoch: 28 | train_loss=0.09127949054042499 | train_acc=97.4609375 | test_acc=90.625 | time: 3.8142


 72%|███████▎  | 29/40 [02:09<00:49,  4.46s/it]


Epoch: 29 | train_loss=0.08383137849159539 | train_acc=97.59114583333333 | test_acc=91.07142857142857 | time: 3.8709


 75%|███████▌  | 30/40 [02:13<00:44,  4.44s/it]


Epoch: 30 | train_loss=0.11229854207097863 | train_acc=96.94010416666667 | test_acc=91.2202380952381 | time: 3.7760


 78%|███████▊  | 31/40 [02:18<00:40,  4.45s/it]


Epoch: 31 | train_loss=0.11726413562428206 | train_acc=96.63628472222221 | test_acc=77.9265873015873 | time: 3.8195


 80%|████████  | 32/40 [02:22<00:35,  4.45s/it]


Epoch: 32 | train_loss=0.20656966177436212 | train_acc=93.75 | test_acc=90.0297619047619 | time: 3.8356


 82%|████████▎ | 33/40 [02:27<00:31,  4.44s/it]


Epoch: 33 | train_loss=0.11210277210921049 | train_acc=96.82436342592592 | test_acc=91.2202380952381 | time: 3.8071


 85%|████████▌ | 34/40 [02:31<00:26,  4.46s/it]


Epoch: 34 | train_loss=0.1063033650085951 | train_acc=96.8894675925926 | test_acc=90.77380952380952 | time: 3.8240


 88%|████████▊ | 35/40 [02:36<00:22,  4.45s/it]


Epoch: 35 | train_loss=0.08656337337257962 | train_acc=97.80092592592592 | test_acc=91.2202380952381 | time: 3.8094


 90%|█████████ | 36/40 [02:40<00:17,  4.43s/it]


Epoch: 36 | train_loss=0.07473574456525967 | train_acc=98.50260416666667 | test_acc=91.07142857142857 | time: 3.7788


 92%|█████████▎| 37/40 [02:44<00:13,  4.43s/it]


Epoch: 37 | train_loss=0.06324800901347771 | train_acc=98.32175925925925 | test_acc=90.92261904761905 | time: 3.7808


 95%|█████████▌| 38/40 [02:49<00:08,  4.42s/it]


Epoch: 38 | train_loss=0.06504459622859333 | train_acc=98.19155092592592 | test_acc=90.625 | time: 3.7827


 98%|█████████▊| 39/40 [02:53<00:04,  4.43s/it]


Epoch: 39 | train_loss=0.054917101631872356 | train_acc=98.6328125 | test_acc=90.92261904761905 | time: 3.8599


100%|██████████| 40/40 [02:58<00:00,  4.46s/it]


Epoch: 40 | train_loss=0.06442605143335338 | train_acc=98.30729166666667 | test_acc=90.625 | time: 3.8867
Optimization Finished!





## Test with meta-llama-2-7b

In [59]:
META_LLAMA_JSON_PATH = f'{ROOT_DATA_RAW}/meta-llama-llama-2-7b-chat-hf.jsonl'

llama_df = pd.read_json(path_or_buf=META_LLAMA_JSON_PATH, lines=True)
llama_df['is_llm'] = 1

llama_df.drop(labels=['id'], inplace=True, axis='columns')

llama_df['tokenized_text'] = tokenizer(list(llama_df['text'].to_list()))['input_ids']

llama_test_dataset = TextDataset(llama_df['tokenized_text'], llama_df['is_llm'])
llama_test_dataloader = DataLoader(llama_test_dataset, batch_size=BATCH_SIZE,
                      shuffle=False, drop_last=False,
                      collate_fn=collate_fn
                    )

In [62]:
print(f"test_acc={evaluate(model, llama_test_dataloader, device)}")

test_acc=47.64290796963947
