In [2]:
import os

os.listdir("../data_parallel")

['synthetic',
 'gold',
 'fce',
 '1_billion_words',
 'wi+locness',
 'lang8',
 'nucle',
 '.ipynb_checkpoints',
 '1bw']

In [58]:
def read_lines(fn):
    """Load a UTF-8 text file as a list of lines without line terminators.

    The content is split on the newline character only, so other Unicode
    line separators stay inside their line. The single empty item produced
    by a trailing newline (or by an empty file) is dropped; every other
    empty line is kept.

    Args:
        fn: Path of the file to read.

    Returns:
        A list of strings, or an empty list when ``fn`` does not exist.
    """
    if os.path.exists(fn):
        with open(fn, 'r', encoding='utf-8') as handle:
            rows = handle.read().split("\n")
        # A file that ends with a newline yields one empty tail item; discard it.
        if rows[-1] == '':
            rows.pop()
        return rows
    # Missing files are treated as empty rather than as an error.
    return []

def write_lines(fn, lines, mode='w'):
    """Write ``lines`` to ``fn`` as UTF-8 text, one newline-terminated line each.

    Every line, including the last one, is followed by a newline. This keeps
    the file a faithful inverse of ``read_lines``: a trailing empty line
    survives a write/read round trip, and appending with ``mode='a'`` starts
    on a fresh line instead of being glued to the previous last line.

    Args:
        fn: Destination path.
        lines: Iterable of strings, none of which should contain a newline.
        mode: File mode passed to ``open`` (``'w'`` to overwrite, ``'a'`` to
            append).

    Raises:
        TypeError: If an item of ``lines`` is not a string.
    """
    # Terminate (rather than separate) lines so the file always ends in "\n".
    text_to_write = "".join(line + "\n" for line in lines)
    with open(fn, encoding='utf-8', mode=mode) as f:
        f.write(text_to_write)

In [59]:
import pandas as pd
from sklearn.model_selection import train_test_split

### Lang8

In [60]:
os.listdir("../data_parallel/lang8")

['lang8_tgt', 'lang8_src']

In [61]:
lang8_src = read_lines("../data_parallel/lang8/lang8_src")
lang8_tgt = read_lines("../data_parallel/lang8/lang8_tgt")

In [62]:
len(lang8_src)

1037561

In [63]:
(pd.Series(lang8_src) != pd.Series(lang8_tgt)).sum()

499205

In [140]:
(pd.Series(lang8_src) != pd.Series(lang8_tgt)).mean()

0.48113315747218716

In [64]:
def get_train_dev_splits(source, target, test_size=0.02, random_state=4, stratify=True):
    """Split aligned source/target sentences into train and dev portions.

    Args:
        source: Sequence of source sentences.
        target: Sequence of target sentences, aligned index-by-index with
            ``source``.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.
        stratify: When truthy, the split is stratified on whether a pair's
            source differs from its target.

    Returns:
        Tuple ``(source_train, source_dev, target_train, target_dev)`` holding
        the ``.values`` of the four resulting Series.
    """
    src_series = pd.Series(source)
    tgt_series = pd.Series(target)

    split_kwargs = {"test_size": test_size, "random_state": random_state}
    if stratify:
        # Boolean label per pair: True when the pair contains a change.
        split_kwargs["stratify"] = src_series != tgt_series

    src_train, src_dev, tgt_train, tgt_dev = train_test_split(
        src_series, tgt_series, **split_kwargs
    )
    return src_train.values, src_dev.values, tgt_train.values, tgt_dev.values

In [66]:
source_train_lang8, source_dev_lang8, target_train_lang8, target_dev_lang8 = get_train_dev_splits(lang8_src, lang8_tgt)

### FCE

In [68]:
fce_train_src = read_lines("../data_parallel/fce/fce_train_src")
fce_train_tgt = read_lines("../data_parallel/fce/fce_train_tgt")

In [69]:
len(pd.Series(fce_train_src))

28350

In [70]:
(pd.Series(fce_train_src) != pd.Series(fce_train_tgt)).sum()

17742

In [71]:
fce_dev_src = read_lines("../data_parallel/fce/fce_dev_src")
fce_dev_tgt = read_lines("../data_parallel/fce/fce_dev_tgt")

fce_test_src = read_lines("../data_parallel/fce/fce_test_src")
fce_test_tgt = read_lines("../data_parallel/fce/fce_test_tgt")

In [72]:
# Pool the FCE train, dev and test portions (in that order) into one corpus;
# source and target use the same order so the pairs stay aligned.
all_fce_src = [*fce_train_src, *fce_dev_src, *fce_test_src]

all_fce_tgt = [*fce_train_tgt, *fce_dev_tgt, *fce_test_tgt]

In [141]:
len(all_fce_src)

33236

In [142]:
(pd.Series(all_fce_src) != pd.Series(all_fce_tgt)).sum()

20905

In [143]:
(pd.Series(all_fce_src) != pd.Series(all_fce_tgt)).mean()

0.6289866409916958

In [74]:
source_train_fce, source_dev_fce, target_train_fce, target_dev_fce = get_train_dev_splits(all_fce_src, all_fce_tgt)

### Nucle

In [75]:
nucle_src = read_lines("../data_parallel/nucle/nucle_src")
nucle_tgt = read_lines("../data_parallel/nucle/nucle_tgt")

In [76]:
(pd.Series(nucle_src) != pd.Series(nucle_tgt)).sum()

21834

In [144]:
(pd.Series(nucle_src) != pd.Series(nucle_tgt)).mean()

0.3820405592203111

In [77]:
len(pd.Series(nucle_src))

57151

In [78]:
source_train_nucle, source_dev_nucle, target_train_nucle, target_dev_nucle = get_train_dev_splits(nucle_src, nucle_tgt)

### WI+Locness

In [79]:
os.listdir("../data_parallel/wi+locness")

['dev_src', 'dev_tgt', 'train_src', 'train_tgt']

In [80]:
wl_src = read_lines("../data_parallel/wi+locness/train_src")
wl_tgt = read_lines("../data_parallel/wi+locness/train_tgt")

In [145]:
len(wl_src)

34308

In [148]:
# Total sentence pairs: Lang8 + FCE (train+dev+test) + NUCLE + WI+Locness train,
# using the lengths displayed by the corresponding cells of this notebook.
1037561+33236+57151+34308

1162256

In [149]:
# Pairs whose source differs from the target, summed over the same four
# corpora (Lang8, FCE, NUCLE, WI+Locness) from their displayed counts.
499205+20905+21834+22744

564688

In [150]:
# Overall share of pairs with a change: changed pairs / total pairs.
564688/1162256

0.485855095607164

In [147]:
(pd.Series(wl_src) != pd.Series(wl_tgt)).sum()

22744

In [146]:
(pd.Series(wl_src) != pd.Series(wl_tgt)).mean()

0.6629357584236912

In [81]:
source_train_wl, source_dev_wl, target_train_wl, target_dev_wl = get_train_dev_splits(wl_src, wl_tgt)

In [139]:
# Persist the WI+Locness train/dev split. The "98"/"02" in the file names
# presumably reflect the default test_size=0.02 of get_train_dev_splits.
write_lines("../data_parallel/wi+locness/train98_src", source_train_wl)
write_lines("../data_parallel/wi+locness/train98_tgt", target_train_wl)
write_lines("../data_parallel/wi+locness/dev02_src", source_dev_wl)
write_lines("../data_parallel/wi+locness/dev02_tgt", target_dev_wl)

### 1 Billion words

In [51]:
os.listdir("../data_parallel/1bw/")

['test_source', 'train_target', 'train_source', 'test_target']

In [82]:
syn_train_src = read_lines("../data_parallel/1bw/train_source")
syn_train_tgt = read_lines("../data_parallel/1bw/train_target")

syn_dev_src = read_lines("../data_parallel/1bw/test_source")
syn_dev_tgt = read_lines("../data_parallel/1bw/test_target")

In [94]:
len(syn_train_src)

2616302

### Generate gold

In [135]:
# Build the combined "gold" sets from the per-corpus splits. The corpus order
# (Lang8, FCE, NUCLE, WI+Locness) is identical in all four lists so that
# source and target lines stay aligned.
gold_train_src = [
    *source_train_lang8,
    *source_train_fce,
    *source_train_nucle,
    *source_train_wl,
]
gold_train_tgt = [
    *target_train_lang8,
    *target_train_fce,
    *target_train_nucle,
    *target_train_wl,
]

gold_dev_src = [
    *source_dev_lang8,
    *source_dev_fce,
    *source_dev_nucle,
    *source_dev_wl,
]
gold_dev_tgt = [
    *target_dev_lang8,
    *target_dev_fce,
    *target_dev_nucle,
    *target_dev_wl,
]

In [118]:
len(gold_train_src)

1139008

In [119]:
len(gold_dev_src)

23248

In [120]:
(pd.Series(gold_train_src) != pd.Series(gold_train_tgt)).sum()

553394

In [122]:
(pd.Series(gold_train_src) != pd.Series(gold_train_tgt)).mean()

0.48585611338989715

In [136]:
(pd.Series(gold_dev_src) != pd.Series(gold_dev_tgt)).sum()

11294

In [137]:
(pd.Series(gold_dev_src) != pd.Series(gold_dev_tgt)).mean()

0.4858052305574673

In [93]:
# Persist the combined gold train/dev sets. The output directory is created
# first so the cell also works when "gold" does not exist yet.
os.makedirs("../data_parallel/gold", exist_ok=True)
write_lines("../data_parallel/gold/train_src", gold_train_src)
write_lines("../data_parallel/gold/train_tgt", gold_train_tgt)
write_lines("../data_parallel/gold/dev_src", gold_dev_src)
write_lines("../data_parallel/gold/dev_tgt", gold_dev_tgt)

### Gold + 0.5M 1BW sentence pairs

In [92]:
#try_gold_train_src = read_lines("../data_parallel/gold/train_src")

In [110]:
500000*0.02

10000.0

In [111]:
# Show how many 1BW training sentences the slice yields; display the count,
# not the half-million-element list itself.
len(syn_train_src[:500000])

500000

In [127]:
# Build the "gold + 1BW" mix: the first 500000 1BW training pairs (and the
# first 10000 1BW test pairs, i.e. 500000*0.02, for dev) followed by the same
# four corpora as the plain gold set, in the same order for source and target.
# NOTE: this reuses the gold_* names, so it overwrites the plain gold lists.
gold_train_src = [
    *syn_train_src[:500000],
    *source_train_lang8,
    *source_train_fce,
    *source_train_nucle,
    *source_train_wl,
]
gold_train_tgt = [
    *syn_train_tgt[:500000],
    *target_train_lang8,
    *target_train_fce,
    *target_train_nucle,
    *target_train_wl,
]

gold_dev_src = [
    *syn_dev_src[:10000],
    *source_dev_lang8,
    *source_dev_fce,
    *source_dev_nucle,
    *source_dev_wl,
]
gold_dev_tgt = [
    *syn_dev_tgt[:10000],
    *target_dev_lang8,
    *target_dev_fce,
    *target_dev_nucle,
    *target_dev_wl,
]

In [128]:
# Persist the gold + 1BW train/dev sets held in the gold_* lists at this
# point. "gold+1bw" is not among the directories listed at the top of the
# notebook, so create it before writing to avoid a FileNotFoundError.
os.makedirs("../data_parallel/gold+1bw", exist_ok=True)
write_lines("../data_parallel/gold+1bw/train_src", gold_train_src)
write_lines("../data_parallel/gold+1bw/train_tgt", gold_train_tgt)
write_lines("../data_parallel/gold+1bw/dev_src", gold_dev_src)
write_lines("../data_parallel/gold+1bw/dev_tgt", gold_dev_tgt)

In [129]:
len(gold_train_src)

1639008

In [130]:
len(gold_dev_src)

33248

In [131]:
(pd.Series(gold_train_src) != pd.Series(gold_train_tgt)).sum()

1053394

In [132]:
(pd.Series(gold_train_src) != pd.Series(gold_train_tgt)).mean()

0.6427021710693297

In [133]:
(pd.Series(gold_dev_src) != pd.Series(gold_dev_tgt)).sum()

21294

In [134]:
(pd.Series(gold_dev_src) != pd.Series(gold_dev_tgt)).mean()

0.6404595765158807