In [2]:
!ls *.csv

GPT-2-arxiv_generate.csv                opus-es-en-scielo-backtranslations.csv
bloom_generate.csv                      pegasus-xsum-finetuned-paws-parasci.csv
bloom_passages.csv                      pegasus-xsum-finetuned-paws.csv
distilgpt2_generate.csv                 real_passages_100k.csv
google_zhen_backtranslate.csv           scigen_generate.csv
gpt2_generate.csv                       spinbot_paraphrases.csv
opus-es-en-backtranslations.csv


In [1]:
import pandas as pd

In [4]:
real_passages_100k = pd.read_csv('real_passages_100k.csv')

In [6]:
real_passages_100k['fake'] = 0
real_passages_100k['tool'] = 'real'
real_passages_100k['model'] = 'real'
real_passages_100k['type'] = 'real'

In [9]:
generations = [
    {"fake": 1, "type": "gpt2", "tool": "generate", "model": "GPT-2-arxiv_generate", "file": "GPT-2-arxiv_generate.csv"},
    {"fake": 1, "type": "opus", "tool": "translate", "model": "opus-es-en-scielo", "file": "opus-es-en-scielo-backtranslations.csv"},
    {"fake": 1, "type": "bloom", "tool": "generate", "model": "bloom", "file": "bloom_generate.csv"},
    {"fake": 1, "type": "pegasus", "tool": "paraphrase", "model": "pegasus-xsum-finetuned-paws-parasci", "file": "pegasus-xsum-finetuned-paws-parasci.csv"},
    {"fake": 1, "type": "pegasus", "tool": "paraphrase", "model": "pegasus-xsum-finetuned-paws", "file": "pegasus-xsum-finetuned-paws.csv"},
    {"fake": 1, "type": "gpt2", "tool": "generate", "model": "distilgpt2", "file": "distilgpt2_generate.csv"                 },
    {"fake": 1, "type": "google_translate", "tool": "translate", "model": "google_translate", "file": "google_zhen_backtranslate.csv"},
    {"fake": 1, "type": "scigen", "tool": "generate", "model": "scigen", "file": "scigen_generate.csv"},
    {"fake": 1, "type": "gpt2", "tool": "generate", "model": "gpt2-medium", "file": "gpt2_generate.csv"},
    {"fake": 1, "type": "spinbot","tool": "paraphrase", "model": "spinbot", "file": "spinbot_paraphrases.csv"},
    {"fake": 1, "type": "opus", "tool": "translate", "model": "opus-es-en", "file": "opus-es-en-backtranslations.csv"},
]

In [11]:
dfs = []
for generation in generations:
    df = pd.read_csv(
        generation['file']
    )
    df['fake'] = generation['fake']
    df['type'] = generation['type']
    df['model'] = generation['model']
    df['tool'] = generation['tool']
    dfs.append(
        df
    )

In [13]:
master_df = pd.concat(
    dfs + [real_passages_100k]
)

In [30]:
from datasets import Dataset
dataset = Dataset.from_pandas(master_df).remove_columns('__index_level_0__')
dataset = dataset.train_test_split(test_size=0.2, seed=42)
eval_dataset = dataset['test'].train_test_split(test_size=0.5)
dataset['test'] = eval_dataset['test']
dataset['validation'] = eval_dataset['train']

In [34]:
import pandas as pd
df_test = pd.DataFrame( dataset['test'] )
df_test = df_test.dropna()
df_val = pd.DataFrame( dataset['validation'] )
df_val = df_val.dropna()
df_train = pd.DataFrame( dataset['train'] )
df_train = df_train.dropna()
df_train.to_csv('../data/real_and_fake_passages_dataset_train.csv', index=None)
df_test.to_csv('../data/real_and_fake_passages_dataset_test.csv', index=None)
df_val.to_csv('../data/real_and_fake_passages_dataset_validation.csv', index=None)

In [14]:
master_df.to_csv('./real_and_fake_passages_dataset.csv')

## Characterization

In [2]:
master_df = pd.read_csv('./real_and_fake_passages_dataset.csv')

In [3]:
len(master_df)

110474

In [4]:
len(master_df[master_df['fake'] == 1])

10485

In [5]:
len(master_df[master_df['fake'] == 0])

99989

In [6]:
master_df.groupby(by='type').count()

Unnamed: 0_level_0,Unnamed: 0,passages,fake,model,tool
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bloom,1073,1073,1073,1073,1073
google_translate,901,901,901,901,901
gpt2,2994,2994,2994,2994,2994
opus,1695,1695,1695,1695,1695
pegasus,2000,2000,2000,2000,2000
real,99989,99064,99989,99989,99989
scigen,822,822,822,822,822
spinbot,1000,990,1000,1000,1000


In [7]:
master_df.groupby(by='tool').count()

Unnamed: 0_level_0,Unnamed: 0,passages,fake,type,model
tool,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
generate,4889,4889,4889,4889,4889
paraphrase,3000,2990,3000,3000,3000
real,99989,99064,99989,99989,99989
translate,2596,2596,2596,2596,2596


In [8]:
master_df.groupby(by='model').count()

Unnamed: 0_level_0,Unnamed: 0,passages,fake,type,tool
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GPT-2-arxiv_generate,998,998,998,998,998
bloom,1073,1073,1073,1073,1073
distilgpt2,998,998,998,998,998
google_translate,901,901,901,901,901
gpt2-medium,998,998,998,998,998
opus-es-en,794,794,794,794,794
opus-es-en-scielo,901,901,901,901,901
pegasus-xsum-finetuned-paws,1000,1000,1000,1000,1000
pegasus-xsum-finetuned-paws-parasci,1000,1000,1000,1000,1000
real,99989,99064,99989,99989,99989


In [11]:
master_df.groupby(by=['tool', 'type', 'model']).count()[['passages']].to_csv('./dataset_characterization.csv')