# Data collection

## Set-up

In [None]:
# Name of the data directory. Used as-is (relative to the working directory)
# outside Colab, and as a sub-folder of the Drive project folder inside Colab.
DATA_DIR = "data"

In [None]:
# Detect whether the notebook runs inside Google Colab and choose where the
# data lives accordingly. Defines FULL_DATA_DIR and IN_COLAB for later cells.
try:
    # These modules are only importable inside Colab.
    import google.colab
    from google.colab import drive
except ImportError:
    # Not running in Colab: use DATA_DIR as given (relative to the cwd).
    FULL_DATA_DIR = DATA_DIR

    IN_COLAB = False
else:
    # Running in Colab: keep the data on Google Drive. The mount is performed
    # outside the `try` body on purpose, so that a failed mount surfaces as an
    # error instead of silently falling back to the local directory.
    drive.mount('/content/drive')
    FULL_DATA_DIR = f'/content/drive/My Drive/mbr-reranking/{DATA_DIR}'

    IN_COLAB = True

In [None]:
# mtdata provides the `mtdata` command-line tool that a later cell runs to
# download the dataset.
!pip install mtdata
# NOTE(review): presumably mtdata's downloader needs a recent `requests` — TODO confirm.
!pip install --upgrade requests

## Download the dataset

Download the recipe:

In [None]:
# Fetch the WMT22 constrained-track recipe file into the current working
# directory; the next cell opens it there under the same file name.
!wget https://www.statmt.org/wmt22/mtdata/mtdata.recipes.wmt22-constrained.yml

Modify the recipe so that it includes only a subset of the sources (some are very large):

In [None]:
import yaml

recipe_file = 'mtdata.recipes.wmt22-constrained.yml'

# Load every recipe from the downloaded file. The loop below iterates the
# result as a sequence of mappings, each carrying an 'id' key.
with open(recipe_file, 'r', encoding='utf-8') as file:
    recipes = yaml.safe_load(file)

# Replacement dataset lists per split. Entries commented out inside the YAML
# text are the sources deliberately left out of the download.
good_recipe = yaml.safe_load('''
dev:
  - Statmt-newstest_deen-2020-deu-eng
  - Statmt-newstest_ende-2020-eng-deu
test:
  - Statmt-newstest_deen-2021-deu-eng
  - Statmt-newstest_ende-2021-eng-deu
train:
  #- Statmt-europarl-10-deu-eng
  #- ParaCrawl-paracrawl-9-eng-deu
  # - Statmt-commoncrawl_wmt13-1-deu-eng
  - Statmt-news_commentary-16-deu-eng
  #- Statmt-wikititles-3-deu-eng
  #- Facebook-wikimatrix-1-deu-eng
  #- Tilde-eesc-2017-deu-eng
  # - Tilde-ema-2016-deu-eng
  - Tilde-airbaltic-1-deu-eng
  - Tilde-czechtourism-1-deu-eng
  - Tilde-ecb-2017-deu-eng
  #- Tilde-rapid-2016-deu-eng
  - Tilde-rapid-2019-deu-eng
''')

# Overwrite the dev/test/train lists of the 'wmt22-deen' recipe (the id that
# the download cell requests) and leave all other recipes untouched.
recipe_found = False
for recipe in recipes:
    if recipe['id'] != 'wmt22-deen':
        continue
    for split in good_recipe:
        recipe[split] = good_recipe[split]
    recipe_found = True

# Fail loudly instead of silently writing the file back unchanged: without
# the override, the download would use the original, much larger source list.
if not recipe_found:
    raise ValueError(f"No recipe with id 'wmt22-deen' found in {recipe_file}")

# Write the edited recipes back in place, under the same file name.
with open(recipe_file, 'w', encoding='utf-8') as file:
    yaml.safe_dump(recipes, file)

Download the dataset:

In [None]:
%%time
%%sh
# Download the datasets of recipe id wmt22-deen into the ./wmt22-deen directory,
# which the copy cell below reads from.
# NOTE(review): presumably mtdata picks up the edited recipe file from the working
# directory and -pb turns on its progress bar — confirm with `mtdata --help`.
mtdata -pb get-recipe -ri wmt22-deen -o wmt22-deen

Copy into the data path:

In [None]:
# Create the target directory if needed (quoted because the Colab path contains a space).
!mkdir -p "{FULL_DATA_DIR}"
# The full training files; a later cell reads them and overwrites them with a subsample.
!cp wmt22-deen/train.deu "{FULL_DATA_DIR}"
!cp wmt22-deen/train.eng "{FULL_DATA_DIR}"
# The recipe's dev split is stored as test.*; dev.* files are instead written
# later from sentences sampled out of the training data.
!cp wmt22-deen/dev.deu "{FULL_DATA_DIR}/test.deu"
!cp wmt22-deen/dev.eng "{FULL_DATA_DIR}/test.eng"

## Create the different splits

In [None]:
# Load the parallel training corpus, one whitespace-stripped sentence per line:
# train.eng is the source side, train.deu is the target side.
with open(f"{FULL_DATA_DIR}/train.eng", 'r', encoding="utf8") as fp:
    train_sentences_src = [line.strip() for line in fp]

with open(f"{FULL_DATA_DIR}/train.deu", 'r', encoding="utf8") as fp:
    train_sentences_dst = [line.strip() for line in fp]

# Both sides must stay line-aligned for index-based sampling later on.
assert len(train_sentences_src) == len(train_sentences_dst)

In [None]:
# Draw a training subset and a disjoint dev subset from the loaded corpus.

import random
random.seed(0)  # fixed seed: both draws are reproducible

total_pairs = len(train_sentences_src)

# --- Training subset: at most 300k sentence pairs ---

num_samples = min(300_000, total_pairs)

indices = list(range(total_pairs))
print(len(indices))
sampled_indices = random.sample(indices, num_samples)

new_train_sentences_src = []
new_train_sentences_dst = []
for idx in sampled_indices:
    new_train_sentences_src.append(train_sentences_src[idx])
    new_train_sentences_dst.append(train_sentences_dst[idx])

# --- Dev subset: at most 2k pairs taken from what the training draw left over ---

num_dev_samples = 2_000
num_samples = min(num_dev_samples, total_pairs - num_samples)

# A set gives constant-time membership checks while filtering.
train_picked = set(sampled_indices)
indices = [idx for idx in indices if idx not in train_picked]
print(len(indices))
sampled_indices = random.sample(indices, num_samples)

new_dev_sentences_src = list(map(train_sentences_src.__getitem__, sampled_indices))
new_dev_sentences_dst = list(map(train_sentences_dst.__getitem__, sampled_indices))

In [None]:
# Write each sampled split to disk, one sentence per line.
# train.* replaces the full training files read earlier; dev.* is newly created.
for out_path, sentences in (
    (f'{FULL_DATA_DIR}/train.eng', new_train_sentences_src),
    (f'{FULL_DATA_DIR}/train.deu', new_train_sentences_dst),
    (f'{FULL_DATA_DIR}/dev.eng', new_dev_sentences_src),
    (f'{FULL_DATA_DIR}/dev.deu', new_dev_sentences_dst),
):
    with open(out_path, 'w', encoding="utf8") as file:
        file.writelines(sentence + '\n' for sentence in sentences)

## End

If in Google Colab, kill the session:

In [None]:
# When running in Colab, release the runtime once the notebook is done.
if IN_COLAB:
    from time import sleep
    from google.colab import runtime

    # NOTE(review): presumably a grace period so pending Drive writes can finish — confirm.
    sleep(15)
    runtime.unassign()