<a href="https://colab.research.google.com/github/gorkemozkaya/nmt-en-tr/blob/master/data_prep/blended_dataset_prep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Turkish-English blended dataset generation for tensorflow_datasets
We blend four different English-Turkish parallel corpora together and create train and dev datasets, to be used later with the `tensorflow_datasets` Python package.

In [None]:
!wget https://github.com/gorkemozkaya/nmt-en-tr/releases/download/bianet_ted_corpora/bianet_and_ted_corpora.zip
!wget https://github.com/gorkemozkaya/nmt-en-tr/releases/download/raw_data/nmt_en_tr_raw_data.zip
!unzip -n -qq bianet_and_ted_corpora.zip 
!unzip -n -qq nmt_en_tr_raw_data.zip

In [None]:
import numpy as np
from itertools import chain

def generate_lines(en_file_name, tr_file_name, frac_include, frac_dev, rng=None):
  """Yield randomly sampled, split-labelled sentence pairs from two parallel files.

  The two files are read in lockstep: line i of the English file is paired with
  line i of the Turkish file. For every pair one uniform random number decides
  whether the pair is kept, and, for kept pairs only, a second one decides which
  split it belongs to.

  Args:
    en_file_name: Path of the English side of the corpus (read as UTF-8).
    tr_file_name: Path of the Turkish side of the corpus (read as UTF-8).
    frac_include: A pair is skipped when its first draw is greater than this
      value, so 1.0 keeps everything and 0.1 keeps roughly 10% of the pairs.
    frac_dev: A kept pair is labelled "train" when its second draw is greater
      than this value and "dev" otherwise.
    rng: Optional object exposing ``uniform()``, e.g. ``np.random.RandomState(seed)``
      or ``np.random.default_rng(seed)``, for reproducible sampling. Defaults to
      the global ``np.random`` state.

  Yields:
    ``(en_line, tr_line, group)`` tuples. The lines are returned exactly as read
    (including their trailing newline, if any); ``group`` is "train" or "dev".

  Raises:
    ValueError: If the Turkish file has fewer lines than the English file.
  """
  if rng is None:
    rng = np.random
  # Explicit UTF-8 so Turkish characters decode the same way on every platform
  # instead of depending on the locale's default encoding.
  with open(en_file_name, encoding="utf-8") as en_file, \
      open(tr_file_name, encoding="utf-8") as tr_file:
    for line_no, en_line in enumerate(en_file, start=1):
      # A bare next() here would leak StopIteration out of the generator, which
      # Python reports as an opaque RuntimeError; fail with a clear message instead.
      tr_line = next(tr_file, None)
      if tr_line is None:
        raise ValueError(
            "%s has no line %d to pair with %s"
            % (tr_file_name, line_no, en_file_name))
      if rng.uniform() > frac_include:
        continue
      # The second draw is only made for pairs that survived the sampling step.
      group = "train" if rng.uniform() > frac_dev else "dev"
      yield (en_line, tr_line, group)

In [None]:
# One entry per corpus. Each entry is unpacked positionally into generate_lines:
#   [English file, Turkish file, frac_include, frac_dev]
lang_list = [
    [
        'SETIMES2.en-tr.en',
        'SETIMES2.en-tr.tr',
        1.0,
        0.03,
    ],
    [
        # The only corpus that is subsampled (frac_include = 0.1).
        'OpenSubtitles.en-tr.en',
        'OpenSubtitles.en-tr.tr',
        0.1,
        0.03,
    ],
    [
        'bianet_and_ted_corpora.txt/TED2013.en-tr.en',
        'bianet_and_ted_corpora.txt/TED2013.en-tr.tr',
        1.0,
        0.03,
    ],
    [
        'bianet_and_ted_corpora.txt/bianet-entr-en.txt',
        'bianet_and_ted_corpora.txt/bianet-entr-tr.txt',
        1.0,
        0.03,
    ],
]

In [None]:
# Build one generator per corpus and concatenate them in lang_list order.
# The result is a single-use iterator: once it has been consumed, this cell
# must be re-run to obtain a fresh one.
combined_generator = chain.from_iterable(
    [generate_lines(*corpus_spec) for corpus_spec in lang_list]
)

In [None]:
# Route every sampled sentence pair to the output files of its split.
# combined_generator is consumed by this loop, so the cell that builds it has to
# be re-run before this cell is executed again; otherwise the outputs are
# truncated to empty files.
# The outputs are written as UTF-8 explicitly rather than with the platform default.
with open("train.en", "w", encoding="utf-8") as train_en, \
  open("train.tr", "w", encoding="utf-8") as train_tr, \
  open("dev.en", "w", encoding="utf-8") as dev_en, \
  open("dev.tr", "w", encoding="utf-8") as dev_tr:
  for line_en, line_tr, group in combined_generator:
    # Force exactly one trailing newline: the last line of a corpus file may lack
    # one, which would glue it to the next sentence and shift the English and
    # Turkish files out of alignment.
    line_en = line_en.rstrip("\n") + "\n"
    line_tr = line_tr.rstrip("\n") + "\n"
    if group == 'dev':
      dev_en.write(line_en)
      dev_tr.write(line_tr)
    elif group == 'train':
      train_en.write(line_en)
      train_tr.write(line_tr)