## Imports

In [51]:
import pandas as pd

from torch import nn
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader

## Load data

In [3]:
# Locations of the raw corpora, relative to this notebook's directory.
ROOT_DATA_RAW = '../../data/raw'
# JSON Lines file with the human-written articles (label 0 further down).
HUMAN_JSON_PATH = f'{ROOT_DATA_RAW}/human.jsonl'
# JSON Lines file with the texts generated by vicgalle/gpt2-open-instruct-v1 (label 1).
VICGALLE_GPT2_JSON_PATH = f'{ROOT_DATA_RAW}/machines/vicgalle-gpt2-open-instruct-v1.jsonl'

In [4]:
# Both corpora are stored as JSON Lines (one record per line), hence lines=True.
# The human file is read first, then the machine-generated one.
human_df, llm_df = (
    pd.read_json(path_or_buf=jsonl_path, lines=True)
    for jsonl_path in (HUMAN_JSON_PATH, VICGALLE_GPT2_JSON_PATH)
)

In [5]:
# Preview the raw human-written records (columns: id, text).
human_df.head()

Unnamed: 0,id,text
0,articles-cleaned-truncated/news-2021-01-01-202...,Inaugural Address by President Joseph R. Biden...
1,articles-cleaned-truncated/news-2021-01-01-202...,Fact check: Biden inauguration impacted by pan...
2,articles-cleaned-truncated/news-2021-01-01-202...,Highlights from Joe Biden's 2021 inauguration\...
3,articles-cleaned-truncated/news-2021-01-01-202...,"Biden takes the helm, appeals for unity to tak..."
4,articles-cleaned-truncated/news-2021-01-01-202...,'The Hill We Climb': Read Amanda Gorman's inau...


In [6]:
# Preview the raw machine-generated records (columns: id, text).
llm_df.head()

Unnamed: 0,id,text
0,vicgalle-gpt2-open-instruct-v1/news-2021-01-01...,"""America's Future: What Happens to the Constit..."
1,vicgalle-gpt2-open-instruct-v1/news-2021-01-01...,President Trump Is Not Present at The 2020 Ina...
2,vicgalle-gpt2-open-instruct-v1/news-2021-01-01...,Trump leaves White House with heightened secur...
3,vicgalle-gpt2-open-instruct-v1/news-2021-01-01...,Joe Biden is the 46th President of the United ...
4,vicgalle-gpt2-open-instruct-v1/news-2021-01-01...,'Amanda Gorman Celebrates New York Times Poet ...


In [9]:
# Attach the binary target (0 = human-written, 1 = LLM-generated) and discard
# the `id` column, which is not used as a feature.
#
# Built as non-mutating chains instead of `drop(..., inplace=True)` so the cell
# is idempotent: re-running it after `id` has already been removed no longer
# raises KeyError (errors='ignore'), and `assign` simply rewrites `is_llm`.
human_df = human_df.assign(is_llm=0).drop(columns=['id'], errors='ignore')
llm_df = llm_df.assign(is_llm=1).drop(columns=['id'], errors='ignore')

In [10]:
# Check that `id` is gone and every machine-generated row carries is_llm == 1.
llm_df.head()

Unnamed: 0,text,is_llm
0,"""America's Future: What Happens to the Constit...",1
1,President Trump Is Not Present at The 2020 Ina...,1
2,Trump leaves White House with heightened secur...,1
3,Joe Biden is the 46th President of the United ...,1
4,'Amanda Gorman Celebrates New York Times Poet ...,1


In [11]:
# Check that `id` is gone and every human-written row carries is_llm == 0.
human_df.head()

Unnamed: 0,text,is_llm
0,Inaugural Address by President Joseph R. Biden...,0
1,Fact check: Biden inauguration impacted by pan...,0
2,Highlights from Joe Biden's 2021 inauguration\...,0
3,"Biden takes the helm, appeals for unity to tak...",0
4,'The Hill We Climb': Read Amanda Gorman's inau...,0


In [16]:
# Stack the human rows on top of the LLM rows and renumber the index from 0,
# so the combined frame has a single contiguous RangeIndex.
df = pd.concat(
    objs=(human_df, llm_df),
    axis=0,
    ignore_index=True,
)

In [18]:
# Head of the combined frame: human-written rows (is_llm == 0) come first.
df.head()

Unnamed: 0,text,is_llm
0,Inaugural Address by President Joseph R. Biden...,0
1,Fact check: Biden inauguration impacted by pan...,0
2,Highlights from Joe Biden's 2021 inauguration\...,0
3,"Biden takes the helm, appeals for unity to tak...",0
4,'The Hill We Climb': Read Amanda Gorman's inau...,0


In [19]:
# Tail of the combined frame: LLM-generated rows (is_llm == 1) come last.
df.tail()

Unnamed: 0,text,is_llm
2169,'The Disappearance of Gabby Petito' – A Compre...,1
2170,"Utah State Police Search for Gabby Petito, Tra...",1
2171,McKenna's Lost Friend: Debunking the Evidence ...,1
2172,"""Gunshots Found in Florida Nature Preserve: A ...",1
2173,A Very Kind and Sweet Woman in Long Island Sho...,1


## Tokenize

In [21]:
# Name of the pretrained checkpoint whose tokenizer (vocabulary) is reused.
checkpoint = 'distilbert-base-cased'
# Loads the tokenizer files for `checkpoint`.
# NOTE(review): presumably downloaded and cached on first use — confirm the
# environment has network access or a warm cache.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [47]:
# Vocabulary size of the tokenizer; the `RNN` class below takes a `vocab_size`
# argument for its embedding table.
print(tokenizer.vocab_size)

28996


In [46]:
# Tokenize the whole corpus exactly once and reuse the result: the previous
# version ran the tokenizer over every text twice (once for the column, once
# for `tokenized`) and wrapped an existing list in a redundant `list(...)`.
tokenized = tokenizer(df['text'].to_list())

# One list of token ids per row, in the same order as `df`.
df['tokenized_text'] = tokenized['input_ids']

Unnamed: 0,text,is_llm,tokenized_text
0,Inaugural Address by President Joseph R. Biden...,0,"[101, 1130, 3984, 13830, 4412, 24930, 18380, 1..."
1,Fact check: Biden inauguration impacted by pan...,0,"[101, 143, 11179, 4031, 131, 139, 26859, 20105..."
2,Highlights from Joe Biden's 2021 inauguration\...,0,"[101, 1693, 13231, 1121, 2658, 139, 26859, 112..."
3,"Biden takes the helm, appeals for unity to tak...",0,"[101, 139, 26859, 2274, 1103, 22778, 117, 1599..."
4,'The Hill We Climb': Read Amanda Gorman's inau...,0,"[101, 112, 1109, 2404, 1284, 140, 24891, 1830,..."


In [42]:
# Sanity check of the tokenizer on two tiny sentences.
test_tokenized = tokenizer(['Michael is good', 'Peter is good'])
# String tokens; in the output below the two sentences come back as one flat list.
print(tokenizer.tokenize(['Michael is good', 'Peter is good']))
# Word index per token; the output below shows a single list with None at both
# ends. NOTE(review): this looks like only the first sentence of the batch —
# confirm against the `word_ids` documentation.
print(test_tokenized.word_ids())
# Token ids per sentence; both start with 101 and end with 102 in the output below.
print(test_tokenized['input_ids'])
# Attention mask per sentence (all ones in the output below).
print(test_tokenized['attention_mask'])

['Michael', 'is', 'good', 'Peter', 'is', 'good']
[None, 0, 1, 2, None]
[[101, 1847, 1110, 1363, 102], [101, 1943, 1110, 1363, 102]]
[[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]


In [48]:
# Confirm the new `tokenized_text` column sits next to `text` and `is_llm`.
df.head()

Unnamed: 0,text,is_llm,tokenized_text
0,Inaugural Address by President Joseph R. Biden...,0,"[101, 1130, 3984, 13830, 4412, 24930, 18380, 1..."
1,Fact check: Biden inauguration impacted by pan...,0,"[101, 143, 11179, 4031, 131, 139, 26859, 20105..."
2,Highlights from Joe Biden's 2021 inauguration\...,0,"[101, 1693, 13231, 1121, 2658, 139, 26859, 112..."
3,"Biden takes the helm, appeals for unity to tak...",0,"[101, 139, 26859, 2274, 1103, 22778, 117, 1599..."
4,'The Hill We Climb': Read Amanda Gorman's inau...,0,"[101, 112, 1109, 2404, 1284, 140, 24891, 1830,..."


## Model

In [None]:
class RNN(nn.Module):
    """Skeleton of an LSTM text classifier: embedding -> LSTM -> linear head.

    Only the layers are built here; ``forward`` is still a stub.

    Args:
        vocab_size: Number of entries in the embedding table.
        embedding_size: Dimension of each embedding vector; also the LSTM
            input size.
        hidden_size: LSTM hidden size; also the input size of the linear head.
        layers_num: Number of stacked LSTM layers.
        device: Device on which the parameters of every layer are created.
        output_size: Number of outputs of the linear head. Defaults to 1.
        dropout: Value forwarded to ``nn.LSTM``'s ``dropout`` argument.
            Defaults to 0.
    """

    def __init__(self, vocab_size, embedding_size, hidden_size, layers_num, device, output_size=1, dropout=0):
        super().__init__()

        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.layers_num = layers_num
        self.output_size = output_size
        self.dropout = dropout
        self.device = device

        # `device` must be passed by keyword: the third positional parameter of
        # nn.Embedding is `padding_idx`, so passing the device positionally
        # made construction fail instead of placing the weights.
        self.embed = nn.Embedding(
            self.vocab_size,
            self.embedding_size,
            device=self.device
        )

        # batch_first=True: the LSTM expects the batch dimension first.
        self.lstm = nn.LSTM(
            input_size=self.embedding_size,
            hidden_size=self.hidden_size,
            num_layers=self.layers_num,
            batch_first=True,
            dropout=self.dropout,
            device=self.device
        )

        # Created on the same device as the other layers so that all
        # parameters of the module live together.
        self.fc = nn.Linear(
            self.hidden_size,
            self.output_size,
            device=self.device
        )

    def forward(self, X, y):
        """Forward pass (not implemented yet; currently returns ``None``)."""
        # TODO: implement embed -> lstm -> fc.
        pass

## Dataset

In [None]:
class TextDataset(Dataset):
    """Map-style dataset pairing each sample in ``X`` with its label in ``y``.

    Args:
        X: Sized, indexable collection of samples (list, array, tensor,
            pandas Series, ...).
        y: Labels aligned position-by-position with ``X``.
    """

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of samples."""
        # len() is used instead of `.size`: lists have no `.size`, on torch
        # tensors it is a method rather than an int, and on multi-dimensional
        # arrays it counts every element instead of the number of rows.
        return len(self.X)

    @staticmethod
    def _item_at(container, index):
        """Return the element at position ``index`` of ``container``."""
        # pandas objects resolve plain `[]` by label, which breaks as soon as
        # the index is not 0..n-1 (e.g. after a shuffle or a train/test
        # split); `.iloc` is always positional.
        positional = getattr(container, 'iloc', None)
        if positional is not None:
            return positional[index]
        return container[index]

    def __getitem__(self, index):
        """Return the ``(sample, label)`` pair at position ``index``."""
        return (
            self._item_at(self.X, index),
            self._item_at(self.y, index)
        )