In [None]:
import pandas as pd
from glob import glob
import random
# Seeds only Python's built-in `random` module; pandas sampling draws from
# NumPy's random state and is not affected by this call.
random.seed(0)

# Data Loading

Loading and cleaning up the GPT-3 samples first.

In [None]:
# Read the raw GPT-3 samples; the sample strings are taken from column 0.
gpt3 = pd.read_json("./data/gpt3_175b_samples.jsonl", lines=True)

# The raw samples contain literal "<|endoftext|>" markers, so every sample is
# split on that marker and each resulting piece becomes its own entry.
gpt3_rebuild = [
    piece
    for sample in gpt3[0]
    for piece in sample.split("<|endoftext|>")
]

# One row per piece, in a single column labelled 0.
gpt3_df = pd.DataFrame(gpt3_rebuild)

In [None]:
# Display the rebuilt GPT-3 frame (one row per split piece, single column 0).
gpt3_df

Now we have the GPT-3 samples, but there aren't really enough of them for training.  We're just going to use them as test data.

Now we move on to GPT-2 datasets.

In [None]:
# Collect every top-k=40 GPT-2 training file. `glob` returns matches in
# arbitrary (filesystem-dependent) order, so the list is sorted to make the
# listing reproducible across machines and runs.
gpt2_k40_files = sorted(glob("./data/gpt-2-output-dataset/data/*k40.train.jsonl"))

In [None]:
# Show the list of matched k40 training file paths.
gpt2_k40_files

In [None]:
# Load the GPT-2 XL (1542M) k40 training samples from JSON Lines.
gpt2_1542m = pd.read_json(
    path_or_buf="./data/gpt-2-output-dataset/data/xl-1542M-k40.train.jsonl",
    lines=True,
)

And finally we load the regular webtext dataset.

In [None]:
# Load the WebText training split from JSON Lines.
webtext_train = pd.read_json(
    path_or_buf="./data/gpt-2-output-dataset/data/webtext.train.jsonl",
    lines=True,
)

In [None]:
# Peek at the `text` value of the first WebText training row.
webtext_train['text'].head(1)

In [None]:
# Show the whole `text` column of the WebText training split.
webtext_train.loc[:, 'text']

Everything works properly.  We'll use this code for loading different datasets for experiments.

# Create GPT2-355M parameter sample

In [None]:
# Load the train and test splits for GPT-2 medium (k40) and for WebText.
# Note: the files on disk are named `medium-345M`, while the variables here
# use `355m`.
gpt2_355m_k40 = pd.read_json(
    path_or_buf="./data/gpt-2-output-dataset/data/medium-345M-k40.train.jsonl",
    lines=True,
)
webtext = pd.read_json(
    path_or_buf="./data/gpt-2-output-dataset/data/webtext.train.jsonl",
    lines=True,
)
gpt2_355m_k40_test = pd.read_json(
    path_or_buf="./data/gpt-2-output-dataset/data/medium-345M-k40.test.jsonl",
    lines=True,
)
webtext_test = pd.read_json(
    path_or_buf="./data/gpt-2-output-dataset/data/webtext.test.jsonl",
    lines=True,
)

We need to format the data like this template for Grover to use it:

In [None]:
# Load a Grover-formatted file to use as a reference for the target layout.
template = pd.read_json(
    path_or_buf="./data/grover/generator=mega~dataset=p1.00.jsonl",
    lines=True,
)

In [None]:
# Show the first template row to see which columns Grover's format uses.
template.iloc[:1]

In [None]:
# Tag every GPT-2 generated test row as machine-written.
gpt2_355m_k40_test = gpt2_355m_k40_test.assign(label='machine')

In [None]:
# Tag every WebText test row as human-written.
webtext_test = webtext_test.assign(label='human')

In [None]:
# Stack the machine-labelled and human-labelled test rows into one frame with
# a fresh 0..n-1 index. `DataFrame.append` was removed in pandas 2.0, so
# `pd.concat` is used instead; the resulting frame is the same.
gpt2_355_test_set = pd.concat([gpt2_355m_k40_test, webtext_test], ignore_index=True)

In [None]:
# Rename the `text` column to `article`.
gpt2_355_test_set = gpt2_355_test_set.rename({"text": "article"}, axis="columns")

In [None]:
# Mark every row as belonging to the test split.
gpt2_355_test_set = gpt2_355_test_set.assign(split='test')

In [None]:
# Inspect the combined test set (with `article`, `label` and `split` columns) before writing it out.
gpt2_355_test_set

In [None]:
# Write the test set as JSON Lines: one record (row) per line.
gpt2_355_test_set.to_json(
    path_or_buf="./classification_data/gpt2_355m_test_set_grover.jsonl",
    orient="records",
    lines=True,
)

# Create GPT-2 1.5B Sample
For Grover

In [None]:
# Build the test set for the GPT-2 XL k40 samples plus the WebText test split.
# Note: the variables and the output file say "1532M" while the source file is
# named `xl-1542M`; the names are kept unchanged so anything that already
# refers to them keeps working.
gpt2_1532M_test = pd.read_json("./data/gpt-2-output-dataset/data/xl-1542M-k40.test.jsonl", lines=True)
webtext_test = pd.read_json("./data/gpt-2-output-dataset/data/webtext.test.jsonl", lines=True)

# Label the two sources.
gpt2_1532M_test["label"] = 'machine'
webtext_test["label"] = 'human'

# `DataFrame.append` was removed in pandas 2.0; `pd.concat` produces the same
# stacked frame with a fresh 0..n-1 index. The `text` column is then renamed
# to `article` and every row is marked as part of the test split.
gpt2_1532M_test_set = (
    pd.concat([gpt2_1532M_test, webtext_test], ignore_index=True)
    .rename(columns={"text": "article"})
    .assign(split='test')
)

# Write the result as JSON Lines, one record per line.
gpt2_1532M_test_set.to_json("./classification_data/gpt2_1532m_test_set_grover.jsonl", lines=True, orient="records")

# Create GPT-3 175B Sample

In [None]:
# Re-read the raw GPT-3 samples; the sample strings are taken from column 0.
gpt3 = pd.read_json("./data/gpt3_175b_samples.jsonl", lines=True)

# The samples contain literal "<|endoftext|>" markers. Split each sample on
# the marker and keep every resulting piece as a separate entry.
gpt3_rebuild = []
for sample in gpt3[0]:
    gpt3_rebuild.extend(sample.split("<|endoftext|>"))

# One row per piece, in a single column labelled 0.
gpt3_df = pd.DataFrame(gpt3_rebuild)

In [None]:
# Give the single column (labelled 0) the name `text`, matching the `text`
# column that is later renamed to `article`.
gpt3_df = gpt3_df.rename({0: "text"}, axis="columns")

In [None]:
# Build the GPT-3 test set: all GPT-3 pieces plus an equally sized random
# subsample of the WebText test split.
# `gpt3_test` is the same object as `gpt3_df`, so the label column added below
# also appears on `gpt3_df`.
gpt3_test = gpt3_df
webtext_test = pd.read_json("./data/gpt-2-output-dataset/data/webtext.test.jsonl", lines=True)
gpt3_test["label"] = 'machine'
webtext_test["label"] = 'human'

# Fixes applied here:
# * `DataFrame.append` was removed in pandas 2.0 -> use `pd.concat`.
# * `sample()` draws from NumPy's random state, which `random.seed` does not
#   control, so a fixed `random_state` is passed to make the human subsample
#   (and therefore the written file) reproducible.
# Sampling is without replacement, so it raises if the GPT-3 frame has more
# rows than the WebText test split.
gpt3_test_set = (
    pd.concat(
        [gpt3_test, webtext_test.sample(n=gpt3_test.shape[0], random_state=0)],
        ignore_index=True,
    )
    .rename(columns={"text": "article"})
    .assign(split='test')
)

# Write the result as JSON Lines, one record per line.
gpt3_test_set.to_json("./classification_data/gpt3_test_set_grover.jsonl", lines=True, orient="records")

In [None]:
# Inspect the combined GPT-3 / WebText test set that was just written to disk.
gpt3_test_set