In [2]:
import torch
import pandas as pd
import numpy as np
from tqdm.auto import tqdm



### RuEn Data Filtering

In [4]:
# RuEn evaluation tables (tab-separated), one per split: train, valid, test.
ruen_train, ruen_dev, ruen_test = [
    pd.read_csv(eval_path, sep='\t')
    for eval_path in (
        '../data/ruen_train_eval.txt',
        '../data/ruen_valid_eval.txt',
        '../data/ruen_test_eval.txt',
    )
]

In [5]:
# Keep RuEn rows where preds == 0 and sem_similarity is above a per-split
# cutoff (train > 0.5, dev > 0.1, test > 0.25).
ruen_train_sel = ruen_train.loc[
    ruen_train['preds'].eq(0) & ruen_train['sem_similarity'].gt(.5)
]
ruen_dev_sel = ruen_dev.loc[
    ruen_dev['preds'].eq(0) & ruen_dev['sem_similarity'].gt(.1)
]
ruen_test_sel = ruen_test.loc[
    ruen_test['preds'].eq(0) & ruen_test['sem_similarity'].gt(.25)
]


### FrEn Data Filtering

In [7]:
# FrEn evaluation tables (tab-separated), one per split: train, valid, test.
fren_train, fren_dev, fren_test = [
    pd.read_csv(eval_path, sep='\t')
    for eval_path in (
        '../data/fren_train_eval.txt',
        '../data/fren_valid_eval.txt',
        '../data/fren_test_eval.txt',
    )
]

In [8]:
# Keep FrEn rows where preds == 0 and sem_similarity is above a per-split
# cutoff (train > 0.5, dev > 0.27, test > 0.25).
fren_train_sel = fren_train.loc[
    fren_train['preds'].eq(0) & fren_train['sem_similarity'].gt(.5)
]
fren_dev_sel = fren_dev.loc[
    fren_dev['preds'].eq(0) & fren_dev['sem_similarity'].gt(.27)
]
fren_test_sel = fren_test.loc[
    fren_test['preds'].eq(0) & fren_test['sem_similarity'].gt(.25)
]

### EsEn Data Filtering

In [9]:
# EsEn evaluation tables (tab-separated), one per split: train, valid, test.
esen_train, esen_dev, esen_test = [
    pd.read_csv(eval_path, sep='\t')
    for eval_path in (
        '../data/esen_train_eval.txt',
        '../data/esen_valid_eval.txt',
        '../data/esen_test_eval.txt',
    )
]

In [10]:
# Keep EsEn rows where preds == 0 and sem_similarity is above a per-split
# cutoff (train > 0.5, dev > 0.3, test > 0.3).
esen_train_sel = esen_train.loc[
    esen_train['preds'].eq(0) & esen_train['sem_similarity'].gt(.5)
]
esen_dev_sel = esen_dev.loc[
    esen_dev['preds'].eq(0) & esen_dev['sem_similarity'].gt(.3)
]
esen_test_sel = esen_test.loc[
    esen_test['preds'].eq(0) & esen_test['sem_similarity'].gt(.3)
]

### Synthetic Paraphrase Data Creation

In [111]:
# Stack the (source, backtranslate) columns of every filtered split row-wise:
# RuEn, then FrEn, then EsEn, each in train/dev/test order. ignore_index is
# not passed, so each row keeps the index label it had in its input frame.
df_paraphrase_ref = pd.concat(
    [
        selected_split[['source', 'backtranslate']]
        for selected_split in (
            ruen_train_sel, ruen_dev_sel, ruen_test_sel,
            fren_train_sel, fren_dev_sel, fren_test_sel,
            esen_train_sel, esen_dev_sel, esen_test_sel,
        )
    ],
    axis=0,
)

In [115]:
# Write the combined paraphrase pairs as a TSV (header row, no index column);
# this file is the training input for the GPT-2 paraphraser.
# NOTE(review): this writes under '../../data/' while the inputs are read from
# '../data/' -- confirm the two-levels-up path is intended.
df_paraphrase_ref.to_csv(
    '../../data/paraphrase/paraphrase_ref.txt',
    header=True,
    index=False,
    sep='\t',
)

In [122]:
# Export the preds == 1 rows of each RuEn split, keeping only the source and
# backtranslate columns, as TSV files with a header row and no index column.
ruen_train.loc[ruen_train['preds'].eq(1), ['source', 'backtranslate']].to_csv(
    '../../data/paraphrase/train_ref.txt', header=True, index=False, sep='\t'
)
ruen_dev.loc[ruen_dev['preds'].eq(1), ['source', 'backtranslate']].to_csv(
    '../../data/paraphrase/dev_ref.txt', header=True, index=False, sep='\t'
)
ruen_test.loc[ruen_test['preds'].eq(1), ['source', 'backtranslate']].to_csv(
    '../../data/paraphrase/test_ref.txt', header=True, index=False, sep='\t'
)


In [135]:
# Fixed-seed sample of 20,000 RuEn training rows with preds == 1; these are the
# sentences to be paraphrased at inference time (for training the BART
# parallel model). Order is filter -> sample -> column selection.
train_paraphrase_sample = (
    ruen_train
    .loc[ruen_train['preds'].eq(1)]
    .sample(n=20000, random_state=42)[['source', 'backtranslate']]
)
train_paraphrase_sample.to_csv(
    '../../data/paraphrase/train_ref_20k.txt',
    header=True,
    index=False,
    sep='\t',
)

In [136]:
# Row count of the sampled frame (displayed as the cell output).
train_paraphrase_sample.shape[0]

20000

#### Export Individual Translation Data

In [13]:
# Write every filtered split to its own TSV (index column and header row
# included); these files will be used for preparing the parallel dataset.
for _sel_frame, _sel_path in (
    (ruen_train_sel, '../data/ruen_train_sel.txt'),
    (ruen_dev_sel, '../data/ruen_valid_sel.txt'),
    (ruen_test_sel, '../data/ruen_test_sel.txt'),
    (fren_train_sel, '../data/fren_train_sel.txt'),
    (fren_dev_sel, '../data/fren_valid_sel.txt'),
    (fren_test_sel, '../data/fren_test_sel.txt'),
    (esen_train_sel, '../data/esen_train_sel.txt'),
    (esen_dev_sel, '../data/esen_valid_sel.txt'),
    (esen_test_sel, '../data/esen_test_sel.txt'),
):
    _sel_frame.to_csv(_sel_path, sep='\t', index=True, header=True)