In [1]:
import pandas as pd
import numpy as np



#### Load and Filter Paraphrase Data

In [2]:
train_pair = pd.read_csv('../data/paraphrase/output/train_gen_pair_sampled_eval.txt', sep='\t')
train20k_pair = pd.read_csv('../data/paraphrase/output/train_gen_pair_20k_eval.txt', sep='\t')
dev_pair = pd.read_csv('../data/paraphrase/output/val_gen_pair_eval.txt', sep='\t')
test_pair = pd.read_csv('../data/paraphrase/output/test_gen_pair_eval.txt', sep='\t')

In [4]:
train_pair_fil = train_pair[(~train_pair.gen.isna()) & \
                            (train_pair['preds'] == 0) & \
                            (train_pair.sem_similarity > 0.25)]
train20k_pair_fil = train20k_pair[(~train20k_pair.gen.isna()) & \
                                  (train20k_pair['preds'] == 0) & \
                                  (train20k_pair.sem_similarity > 0.25)]
dev_pair_fil = dev_pair[(~dev_pair.gen.isna()) & (dev_pair['preds'] == 0)]
test_pair_fil = test_pair[(~test_pair.gen.isna()) & (test_pair['preds'] == 0)]

In [5]:
train_pair_fil.rename(columns={'ori': 'source'}, inplace=True)
train20k_pair_fil.rename(columns={'ori': 'source'}, inplace=True)
dev_pair_fil.rename(columns={'ori': 'source'}, inplace=True)
test_pair_fil.rename(columns={'ori': 'source'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [6]:
train_pair_fil = train_pair_fil[['source', 'gen']]
train20k_pair_fil = train20k_pair_fil[['source', 'gen']]
dev_pair_fil = dev_pair_fil[['source', 'gen']]
test_pair_fil = test_pair_fil[['source', 'gen']]

In [7]:
# train_pair_fil['gen'] = train_pair_fil.gen.apply(lambda x: [x])
# train20k_pair_fil['gen'] = train20k_pair_fil.gen.apply(lambda x: [x])
dev_pair_fil['gen'] = dev_pair_fil.gen.apply(lambda x: [x])
test_pair_fil['gen'] = test_pair_fil.gen.apply(lambda x: [x])

#### Load Backtranslation Data

In [10]:
def make_reference(row):
    refs = [row['backtranslate']]
    if not pd.isnull(row['backtranslate_ru']):
        refs.append(row['backtranslate_ru'])
    if not pd.isnull(row['backtranslate_es']):
        refs.append(row['backtranslate_es'])
        
    return str(refs)

In [11]:
# load train data
import os

train_bt = None
for path in os.listdir('../data/backtranslation/'):
    if 'train_sel' in path:
        if train_bt is None:
            train_bt = pd.read_csv(f'../data/backtranslation/{path}', sep='\t', index_col=0)
        else:
            train_bt = pd.concat([train_bt,
                                 pd.read_csv(f'../data/backtranslation/{path}', sep='\t', index_col=0)])
        
        print(path, len(pd.read_csv(f'../data/backtranslation/{path}', sep='\t', index_col=0)))

train_bt.rename(columns={'backtranslate': 'gen'}, inplace=True)
train_bt = train_bt[['source', 'gen']]

esen_train_sel.txt 4625
ruen_train_sel.txt 7954
fren_train_sel.txt 9360


In [44]:
len(train_bt), 4625 + 7954 + 9360

(21939, 21939)

In [31]:
# working with validation data
dev_fr = pd.read_csv('../data/backtranslation/fren_dev_sel.txt', sep='\t', index_col=0)[['source', 'backtranslate']]
dev_ru = pd.read_csv('../data/backtranslation/ruen_dev_sel.txt', sep='\t', index_col=0)[['source', 'backtranslate']]
dev_es = pd.read_csv('../data/backtranslation/esen_dev_sel.txt', sep='\t', index_col=0)[['source', 'backtranslate']]

dev_frru = dev_fr.join(dev_ru, how='left', rsuffix='_ru')
dev_frrues = dev_frru.join(dev_es, how='left', rsuffix='_es')

dev_frrues['backtranslate'] = dev_frrues.apply(make_reference, axis=1)
dev_frrues = dev_frrues[['source', 'backtranslate']]

In [45]:
len(dev_fr), len(dev_ru), len(dev_es)

(509, 484, 295)

In [32]:
dev_ru_left = dev_ru.loc[[i for i in dev_ru.index if i not in dev_fr.index]]
dev_es_left = dev_es.loc[[i for i in dev_es.index if i not in dev_fr.index]]

dev_ru_left['backtranslate'] = dev_ru_left['backtranslate'].apply(lambda x: [x])
dev_es_left['backtranslate'] = dev_es_left['backtranslate'].apply(lambda x: [x])

dev_bt = pd.concat([dev_frrues, dev_ru_left, dev_es_left])
dev_bt.rename(columns={'backtranslate': 'gen'}, inplace=True)

In [33]:
# working with test data
test_fr = pd.read_csv('../data/backtranslation/fren_test_sel.txt', sep='\t', index_col=0)[['source', 'backtranslate']]
test_ru = pd.read_csv('../data/backtranslation/ruen_test_sel.txt', sep='\t', index_col=0)[['source', 'backtranslate']]
test_es = pd.read_csv('../data/backtranslation/esen_test_sel.txt', sep='\t', index_col=0)[['source', 'backtranslate']]

test_frru = test_fr.join(test_ru, how='left', rsuffix='_ru')
test_frrues = test_frru.join(test_es, how='left', rsuffix='_es')

test_frrues['backtranslate'] = test_frrues.apply(make_reference, axis=1)
test_frrues = test_frrues[['source', 'backtranslate']]

In [46]:
len(test_fr), len(test_ru), len(test_es)

(528, 385, 202)

In [34]:
test_ru_left = test_ru.loc[[i for i in test_ru.index if i not in test_fr.index]]
test_es_left = test_es.loc[[i for i in test_es.index if i not in test_fr.index]]

test_ru_left['backtranslate'] = test_ru_left['backtranslate'].apply(lambda x: [x])
test_es_left['backtranslate'] = test_es_left['backtranslate'].apply(lambda x: [x])

test_bt = pd.concat([test_frrues, test_ru_left, test_es_left])
test_bt.rename(columns={'backtranslate': 'gen'}, inplace=True)

#### Concat Dataset

In [35]:
train_all = pd.concat([train_pair_fil, train_bt])
train20k_all = pd.concat([train20k_pair_fil, train_bt])
dev_all = pd.concat([dev_pair_fil, dev_bt])
test_all = pd.concat([test_pair_fil, test_bt])

#### Export Dataset

In [36]:
train_all.to_csv('../data/parallel/train_pair_all.txt', sep='\t', index=False, header=True)
train20k_all.to_csv('../data/parallel/train20k_pair_all.txt', sep='\t', index=False, header=True)
dev_all.to_csv('../data/parallel/valid_pair_all.txt', sep='\t', index=False, header=True)
test_all.to_csv('../data/parallel/test_pair_all.txt', sep='\t', index=False, header=True)

In [13]:
# train_pair_fil.to_csv('../data/parallel/train_gpt_pair.txt', sep='\t', index=False, header=True)
train20k_pair_fil.to_csv('../data/parallel/train20k_gpt_pair.txt', sep='\t', index=False, header=True)
dev_pair_fil.to_csv('../data/parallel/valid_gpt_pair.txt', sep='\t', index=False, header=True)
test_pair_fil.to_csv('../data/parallel/test_gpt_pair.txt', sep='\t', index=False, header=True)

In [38]:
train_bt.to_csv('../data/parallel/train_bt_pair.txt', sep='\t', index=False, header=True)
dev_bt.to_csv('../data/parallel/valid_bt_pair.txt', sep='\t', index=False, header=True)
test_bt.to_csv('../data/parallel/test_bt_pair.txt', sep='\t', index=False, header=True)