## Imports

In [30]:
import pandas as pd
import numpy as np

import torch
from tqdm import tqdm 
from torch import nn
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader

## Load data

In [2]:
ROOT_DATA_RAW = '../../data/raw'
HUMAN_JSON_PATH = f'{ROOT_DATA_RAW}/human.jsonl'
VICGALLE_GPT2_JSON_PATH = f'{ROOT_DATA_RAW}/machines/vicgalle-gpt2-open-instruct-v1.jsonl'
BATCH_SIZE = 32
LSTM_UNITS = 128
LSTM_LAYERS = 5
EMBEDDING_SIZE = 300

In [3]:
human_df = pd.read_json(path_or_buf=HUMAN_JSON_PATH, lines=True)
llm_df = pd.read_json(path_or_buf=VICGALLE_GPT2_JSON_PATH, lines=True)

In [4]:
human_df.head()

Unnamed: 0,id,text
0,articles-cleaned-truncated/news-2021-01-01-202...,Inaugural Address by President Joseph R. Biden...
1,articles-cleaned-truncated/news-2021-01-01-202...,Fact check: Biden inauguration impacted by pan...
2,articles-cleaned-truncated/news-2021-01-01-202...,Highlights from Joe Biden's 2021 inauguration\...
3,articles-cleaned-truncated/news-2021-01-01-202...,"Biden takes the helm, appeals for unity to tak..."
4,articles-cleaned-truncated/news-2021-01-01-202...,'The Hill We Climb': Read Amanda Gorman's inau...


In [5]:
llm_df.head()

Unnamed: 0,id,text
0,vicgalle-gpt2-open-instruct-v1/news-2021-01-01...,"""America's Future: What Happens to the Constit..."
1,vicgalle-gpt2-open-instruct-v1/news-2021-01-01...,President Trump Is Not Present at The 2020 Ina...
2,vicgalle-gpt2-open-instruct-v1/news-2021-01-01...,Trump leaves White House with heightened secur...
3,vicgalle-gpt2-open-instruct-v1/news-2021-01-01...,Joe Biden is the 46th President of the United ...
4,vicgalle-gpt2-open-instruct-v1/news-2021-01-01...,'Amanda Gorman Celebrates New York Times Poet ...


In [6]:
human_df['is_llm'] = 0
llm_df['is_llm'] = 1

human_df.drop(labels=['id'], inplace=True, axis='columns')
llm_df.drop(labels=['id'], inplace=True, axis='columns')

In [7]:
llm_df.head()

Unnamed: 0,text,is_llm
0,"""America's Future: What Happens to the Constit...",1
1,President Trump Is Not Present at The 2020 Ina...,1
2,Trump leaves White House with heightened secur...,1
3,Joe Biden is the 46th President of the United ...,1
4,'Amanda Gorman Celebrates New York Times Poet ...,1


In [8]:
human_df.head()

Unnamed: 0,text,is_llm
0,Inaugural Address by President Joseph R. Biden...,0
1,Fact check: Biden inauguration impacted by pan...,0
2,Highlights from Joe Biden's 2021 inauguration\...,0
3,"Biden takes the helm, appeals for unity to tak...",0
4,'The Hill We Climb': Read Amanda Gorman's inau...,0


In [9]:
df = pd.concat([human_df, llm_df], ignore_index=True)

In [10]:
df.head()

Unnamed: 0,text,is_llm
0,Inaugural Address by President Joseph R. Biden...,0
1,Fact check: Biden inauguration impacted by pan...,0
2,Highlights from Joe Biden's 2021 inauguration\...,0
3,"Biden takes the helm, appeals for unity to tak...",0
4,'The Hill We Climb': Read Amanda Gorman's inau...,0


In [11]:
df.tail()

Unnamed: 0,text,is_llm
2169,'The Disappearance of Gabby Petito' – A Compre...,1
2170,"Utah State Police Search for Gabby Petito, Tra...",1
2171,McKenna's Lost Friend: Debunking the Evidence ...,1
2172,"""Gunshots Found in Florida Nature Preserve: A ...",1
2173,A Very Kind and Sweet Woman in Long Island Sho...,1


## Tokenize

In [12]:
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [13]:
print(tokenizer.vocab_size)

28996


In [14]:
df['tokenized_text'] = tokenizer(list(df['text'].to_list()))['input_ids']
tokenized = tokenizer(list(df['text'].to_list()))

Token indices sequence length is longer than the specified maximum sequence length for this model (843 > 512). Running this sequence through the model will result in indexing errors


In [15]:
test_tokenized = tokenizer(['Michael is good', 'Peter is good'])
print(tokenizer.tokenize(['Michael is good', 'Peter is good']))
print(test_tokenized.word_ids())
print(test_tokenized['input_ids'])
print(test_tokenized['attention_mask'])

['Michael', 'is', 'good', 'Peter', 'is', 'good']
[None, 0, 1, 2, None]
[[101, 1847, 1110, 1363, 102], [101, 1943, 1110, 1363, 102]]
[[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]


In [16]:
df.head()

Unnamed: 0,text,is_llm,tokenized_text
0,Inaugural Address by President Joseph R. Biden...,0,"[101, 1130, 3984, 13830, 4412, 24930, 18380, 1..."
1,Fact check: Biden inauguration impacted by pan...,0,"[101, 143, 11179, 4031, 131, 139, 26859, 20105..."
2,Highlights from Joe Biden's 2021 inauguration\...,0,"[101, 1693, 13231, 1121, 2658, 139, 26859, 112..."
3,"Biden takes the helm, appeals for unity to tak...",0,"[101, 139, 26859, 2274, 1103, 22778, 117, 1599..."
4,'The Hill We Climb': Read Amanda Gorman's inau...,0,"[101, 112, 1109, 2404, 1284, 140, 24891, 1830,..."


## Model

In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [19]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, layers_num, output_size=1, dropout=0):
        super().__init__()

        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.layers_num = layers_num
        self.output_size= output_size
        self.dropout = dropout
        self.device = device

        self.embed = nn.Embedding(self.vocab_size, self.embedding_size, device=self.device)

        self.lstm = nn.LSTM(
            input_size=self.embedding_size,
            hidden_size=self.hidden_size,
            num_layers=self.layers_num,
            batch_first=True,
            dropout=self.dropout,
            device=self.device        
        )

        self.fc = nn.Linear(
            self.hidden_size,
            self.output_size
        )

    def forward(self, X, lengths):
        embeddings = self.embed(X)
        padded_input = nn.utils.rnn.pack_padded_sequence(embeddings, lengths, batch_first=True)

        seq_output, (h_n, c_n) = self.lstm(padded_input)
        seq_output, _ = nn.utils.rnn.pad_packed_sequence(seq_output, batch_first=True)
        out = seq_output.sum(dim=1).div(lengths.float().unsqueeze(dim=1))
        logits = self.fc(out)
        return logits
    
model = RNN(tokenizer.vocab_size, EMBEDDING_SIZE, LSTM_UNITS, LSTM_LAYERS)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.BCEWithLogitsLoss()

## Dataset

In [20]:
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return self.X.size
    
    def __getitem__(self, index):
        return (
            self.X[index],
            self.y[index]
        )

In [21]:
dataset = TextDataset(df['tokenized_text'], df['is_llm'])
test_set_size = int(dataset.__len__() * 0.2)
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [
    dataset.__len__() - test_set_size,
    test_set_size,
])

## Training & evaluation

In [29]:
def collate_fn(batch):
  # We want to sort the batch by seq length, 
  # in order to make the computation more efficient
  batch = sorted(batch, key=lambda x: len(x[0]), reverse=True)
  
  inputs = [torch.LongTensor(x[0]).to(device) for x in batch]
  padded_input = nn.utils.rnn.pad_sequence(inputs, batch_first=True)
  lengths = torch.LongTensor([len(x[0]) for x in batch]).to(device)
  y = torch.FloatTensor(np.array([x[1] for x in batch])).reshape(-1, 1).to(device)
  return padded_input, lengths, y

def evaluate(model, test_dataset):
  model.eval()
  correct_pred = 0
  dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                      shuffle=False, drop_last=False,
                        collate_fn=collate_fn
                      )
  
  for x, l, y in tqdm(dataloader, total=len(dataloader)):
    with torch.no_grad():
      y_hat = model(x, l)
      correct_pred += torch.eq(torch.sigmoid(y_hat).round(), y).sum().item()
      
  return correct_pred / len(dataset)

def train(model, optimizer, dataset, test_dataset, max_epochs=2):
  costs = []
  for epoch in range(1, max_epochs+1):
    model.train()
    costs.append([])
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE,
                        shuffle=True, drop_last=False,
                        collate_fn=collate_fn
                        )
    for x, l, y in tqdm(dataloader, total=len(dataloader)):
      optimizer.zero_grad()
      y_hat = model(x, l)

      bce = nn.BCEWithLogitsLoss()
      cost = bce(y_hat, y)

      cost.backward()
      optimizer.step()

      costs[-1].append(cost.item())
    # Display logs per each DISPLAY_STEP
    if (epoch) % 1 == 0:
      print("Epoch: {:04d} mean cost={:.9f}".format(epoch, np.mean(costs[-1])))
      print(f"Accuracy on test: {evaluate(model, test_dataset)}")
  return costs

costs = train(model, optimizer, train_dataset, test_dataset, max_epochs=2)
print ("Optimization Finished!")

100%|██████████| 55/55 [3:13:57<00:00, 211.59s/it]    


Epoch: 0001 mean cost=0.560837176


100%|██████████| 14/14 [00:18<00:00,  1.29s/it]


Accuracy on test: 0.16191352345906163


  7%|▋         | 4/55 [05:16<1:07:13, 79.09s/it]


KeyboardInterrupt: 

In [None]:
pd.DataFrame(chain(*costs), columns=['loss']).rolling(10).mean().plot()