In [1]:
import os
# Keep Hugging Face downloads/caches in a project-local folder. This is set
# before any Hugging Face library is imported so that the setting is picked up.
os.environ["HF_HOME"] = r"./.cache"

## Download and Preprocess

In [2]:
# JESC: print the corpus description, create its CSV, then display the first
# sentence pair as a sanity check.
from utils.dataset import JESC

# Prints the webpage, paper and summary lines.
JESC.info()
# In the recorded run this step was skipped because jesc.csv already existed.
JESC.create_csv()
# First record: a dict with 'en_sentence' and 'ja_sentence'.
JESC.load()[0]

Webpage: https://nlp.stanford.edu/projects/jesc/
Paper  : https://arxiv.org/abs/1710.10639
Summary: Japanese-English Subtitle Corpus (2.8M sentences)
skipped: jesc.csv file already exists!


{'en_sentence': "you are back, aren't you, harold?",
 'ja_sentence': 'あなたは戻ったのね ハロルド?'}

In [3]:
# WikiCorpus: describe the corpus, create its CSV, and preview the first pair.
from utils.dataset import WikiCorpus

# Prints the webpage and summary lines.
WikiCorpus.info()
# In the recorded run this step was skipped because wiki_corpus.csv already existed.
WikiCorpus.create_csv()
# First record: a dict with 'en_sentence' and 'ja_sentence'.
WikiCorpus.load()[0]

Webpage : https://github.com/venali/BilingualCorpus/
Summary : a large scale corpus of manually translated Japanese sentences
          extracted from Wikipedia's Kyoto Articles (~500k sentences)
skipped: wiki_corpus.csv file already exists!


{'en_sentence': 'Sesshu', 'ja_sentence': '雪舟'}

In [4]:
# Tatoeba: describe the corpus, create its CSV, and preview the first pair.
from utils.dataset import Tatoeba

# Prints the webpage links and summary.
Tatoeba.info()
# In the recorded run this step was skipped because tatoeba.csv already existed.
Tatoeba.create_csv()
# First record: a dict with 'en_sentence' and 'ja_sentence'.
Tatoeba.load()[0]

Webpage    : https://opus.nlpl.eu/Tatoeba.php
Webpage(HF): https://huggingface.co/datasets/tatoeba
Summary    : a collection of sentences from https://tatoeba.org/en/, contains
             over 400 languages ([en-ja] 200k sentences)
skipped: tatoeba.csv file already exists!


{'en_sentence': "Let's try something.", 'ja_sentence': '何かしてみましょう。'}

In [5]:
# SnowSimplified: describe the corpus, create its CSV, and preview the first pair.
from utils.dataset import SnowSimplified

# Prints the webpage and summary lines.
SnowSimplified.info()
# In the recorded run this step was skipped because snow_simplified.csv already existed.
SnowSimplified.create_csv()
# First record: a dict with 'en_sentence' and 'ja_sentence'.
SnowSimplified.load()[0]

Webpage: https://huggingface.co/datasets/snow_simplified_japanese_corpus
Summary: Japanese-English sentence pairs, all Japanese sentences have
         a simplified counterpart (85k(x2) sentences)
skipped: snow_simplified.csv file already exists!


{'en_sentence': "i can 't tell who will arrive first .",
 'ja_sentence': '誰が一番に着くか私には分かりません。'}

In [6]:
# MassiveTranslation: describe the corpus, create its CSV, and preview the first pair.
from utils.dataset import MassiveTranslation

# Prints the webpage and summary lines.
MassiveTranslation.info()
# In the recorded run this step was skipped because massive_translation.csv already existed.
MassiveTranslation.create_csv()
# First record: a dict with 'en_sentence' and 'ja_sentence'.
MassiveTranslation.load()[0]

Webpage: https://huggingface.co/datasets/Amani27/massive_translation_dataset
Summary: dataset derived from AmazonScience/MASSIVE for translation
         (16k sentences in 10 languages)
skipped: massive_translation.csv file already exists!


{'en_sentence': 'wake me up at nine am on friday',
 'ja_sentence': '金曜日の午前九時に起こしてください'}

In [7]:
# IWSLT2017: describe the corpus, create its CSV, and preview the first pair.
from utils.dataset import IWSLT2017

# Prints the webpage links and summary.
IWSLT2017.info()
# In the recorded run this step was skipped because iwslt2017.csv already existed.
IWSLT2017.create_csv()
# First record: a dict with 'en_sentence' and 'ja_sentence'.
IWSLT2017.load()[0]

Webpage    : https://sites.google.com/site/iwsltevaluation2017/TED-tasks
Webpage(HF): https://huggingface.co/datasets/iwslt2017
Summary    : a collection of multilingual tasks, one of which is a bilingual
             corpus of 230k [en-ja] sentences.
skipped: iwslt2017.csv file already exists!


{'en_sentence': "Thank you so much, Chris. And it's truly a great honor to have the opportunity to come to this stage twice; I'm extremely grateful.",
 'ja_sentence': 'どうもありがとう クリス このステージに立てる機会を 2度もいただけるというのは実に光栄なことで とてもうれしく思っています'}

In [8]:
# OPUS100: describe the corpus, create its CSV, and preview the first pair.
from utils.dataset import OPUS100

# Prints the webpage links and summary.
OPUS100.info()
# In the recorded run this step was skipped because opus100.csv already existed.
OPUS100.create_csv()
# First record: a dict with 'en_sentence' and 'ja_sentence'.
OPUS100.load()[0]

Webpage    : https://github.com/EdinburghNLP/opus-100-corpus
Webpage(HF): https://huggingface.co/datasets/opus100
Summary    : a multilingual corpus with 1M [en-ja] sentences,
             of various origins.
skipped: opus100.csv file already exists!


{'en_sentence': 'Yeah, Vincent Hanna.',
 'ja_sentence': '- ラウール - ラウールに ヴィンセント・ハンナだ'}

In [9]:
# Flores (evaluation data): describe the corpus, create its CSVs, and preview
# the first pair.
from utils.dataset import Flores

# Prints the webpage, paper and summary lines.
Flores.info()
# The recorded run reports two files, flores.dev.csv and flores.devtest.csv,
# both already present (step skipped).
Flores.create_csv()
# "dev" presumably selects the flores.dev.csv split — verify against utils.dataset.
Flores.load("dev")[0]

Webpage: https://github.com/facebookresearch/flores/tree/main/flores200
Paper  : https://arxiv.org/abs/2207.04672
Summary: Professional translation in over 200 languages, including
         en-ja, for evaluation tasks.
skipped: ('flores.dev.csv', 'flores.devtest.csv') file already exists!


{'ja_sentence': '月曜日にスタンフォード大学医学部の科学者たちは、細胞を種類別に分類できる新しい診断ツールを発明したと発表しました。それは標準的なインクジェットプリンタで印刷して製造できる小型チップであり、原価は1枚あたり1円ほどす。',
 'en_sentence': 'On Monday, scientists from the Stanford University School of Medicine announced the invention of a new diagnostic tool that can sort cells by type: a tiny printable chip that can be manufactured using standard inkjet printers for possibly about one U.S. cent each.'}

In [10]:
# WMTvat (evaluation data): describe the corpus, create its CSVs, and preview
# the first pair.
from utils.dataset import WMTvat

# Prints the webpage, paper and summary lines.
WMTvat.info()
# The recorded run reports two files, wmt_vat.en.ja.csv and wmt_vat.ja.en.csv,
# both already present (step skipped).
WMTvat.create_csv()
# NOTE(review): in the recorded output for "en-ja" the English text appears under
# 'ja_sentence' and the Japanese text under 'en_sentence' — the columns look
# swapped; confirm the column mapping in WMTvat before using it for evaluation.
WMTvat.load("en-ja")[0]

Webpage    : https://huggingface.co/datasets/gsarti/wmt_vat
Paper      : https://openreview.net/forum?id=hhKA5k0oVy5
Summary    : A filtered version of WMT dataset increasing correlation with human
             judgement. Contains ja-en, en-ja professional translations for evaluation tasks
skipped: ('wmt_vat.en.ja.csv', 'wmt_vat.ja.en.csv') file already exists!


{'ja_sentence': 'Suzuki Nana, 2nd place in the ranking of disliked women, talks about her various gaffes, such as extremely rude body touching (The Television) - Yahoo!News',
 'en_sentence': '"嫌いな女ランキング"2位の鈴木奈々 、 " 失礼すぎるボディタッチ"などしくじりエピソードを告白（ザテレビジョン ） - Yahoo!ニュース'}

## Combining Datasets Examples

In [11]:
import os
# Same project-local Hugging Face cache as in the first cell; presumably repeated
# so this section can be run on its own after a kernel restart.
os.environ["HF_HOME"] = r"./.cache"

In [12]:
from tokenizers import processors
from transformers import AutoTokenizer

# Translation direction for the BERT-encoder / GPT-2-decoder example.
# "en" selects en -> ja; any other value selects ja -> en.
source_lng = "ja"

# The encoder checkpoint matches the source language, the decoder checkpoint
# matches the target language.
if source_lng == "en":
    target_lng = "ja"
    encoder = "bert-base-uncased"
    decoder = "rinna/japanese-gpt2-small"
else:
    target_lng = "en"
    encoder = "cl-tohoku/bert-base-japanese-v3"
    decoder = "gpt2"

encoder_tokenizer = AutoTokenizer.from_pretrained(encoder, use_fast=True)
decoder_tokenizer = AutoTokenizer.from_pretrained(decoder, use_fast=True)

# Some decoder tokenizers (e.g. GPT-2's) define no padding token; fall back to
# EOS so that padded batches can be built.
if decoder_tokenizer.pad_token_id is None:
    decoder_tokenizer.pad_token_id = decoder_tokenizer.eos_token_id

# Append an EOS token to every encoded target sentence. The post-processor is
# set through `backend_tokenizer`, the public handle on the underlying
# `tokenizers.Tokenizer`, rather than through the private `_tokenizer` attribute.
decoder_tokenizer.backend_tokenizer.post_processor = processors.TemplateProcessing(
    single="$A " + decoder_tokenizer.eos_token,
    special_tokens=[(decoder_tokenizer.eos_token, decoder_tokenizer.eos_token_id)],
)

In [13]:
from utils.dataset import EnJaDatasetMaker, EnJaDatasetSample, SnowSimplified, MassiveTranslation

# Small BERT/GPT-2 test set mixing two corpora.
dataset = EnJaDatasetMaker.prepare_dataset(
    "ja-en-BERT-GPT2-test",
    [
        # ntokens bounds: lower is inclusive, upper is exclusive, e.g. (0, 32) -> [0, 31]
        EnJaDatasetSample(dataset=SnowSimplified, nsample=124, ntokens=(0, 64)),
        EnJaDatasetSample(dataset=MassiveTranslation, nsample=50, ntokens=(0, 32)),
    ],
    source_language=source_lng,
    model_type="BERT-GPT2",
    encoder_tokenizer=encoder_tokenizer,
    decoder_tokenizer=decoder_tokenizer,
    num_proc=6,
    seed=42,
)

skipped: loaded dataset with id="ja-en-BERT-GPT2-test" from existing cache.


In [14]:
# Load the dataset by the id used when preparing it, then display its summary
# (feature names and row count).
dataset = EnJaDatasetMaker.load_dataset("ja-en-BERT-GPT2-test")
dataset

Dataset({
    features: ['target', 'source', 'length', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 174
})

In [15]:
# Inspect the first example: the raw source/target text plus the tokenized
# input_ids, attention_mask and labels (see the recorded output).
dataset[0]

{'target': 'upon my word i will do it .',
 'source': '誓って私はそれをします。',
 'length': tensor(11),
 'input_ids': tensor([    2, 29062,   456,  4262,   465, 12546,   500,   441, 12995,   385,
             3]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'labels': tensor([27287,   616,  1573,  1312,   481,   466,   340,   764, 50256])}

In [16]:
from transformers import MBart50TokenizerFast

# Translation direction for the mBART example: "en" selects en -> ja,
# any other value selects ja -> en.
source_lng = "en"
target_lng = "ja" if source_lng == "en" else "en"

# One tokenizer load; the mBART-50 language codes follow the chosen direction.
tokenizer = MBart50TokenizerFast.from_pretrained(
    "facebook/mbart-large-50",
    src_lang="en_XX" if source_lng == "en" else "ja_XX",
    tgt_lang="ja_XX" if source_lng == "en" else "en_XX",
)

In [17]:
from utils.dataset import EnJaDatasetMaker, EnJaDatasetSample, SnowSimplified, MassiveTranslation

# Small mBART test set mixing two corpora.
dataset = EnJaDatasetMaker.prepare_dataset(
    "en-ja-mBART-test",
    [
        # ntokens bounds: lower is inclusive, upper is exclusive, e.g. (0, 32) -> [0, 31]
        EnJaDatasetSample(dataset=SnowSimplified, nsample=124, ntokens=(0, 64)),
        EnJaDatasetSample(dataset=MassiveTranslation, nsample=50, ntokens=(0, 32)),
    ],
    source_language=source_lng,
    model_type="mBART",
    tokenizer=tokenizer,
    num_proc=6,
    seed=42,
)

skipped: loaded dataset with id="en-ja-mBART-test" from existing cache.


In [18]:
# Load the dataset by the id used when preparing it, then display its summary
# (feature names and row count).
dataset = EnJaDatasetMaker.load_dataset("en-ja-mBART-test")
dataset

Dataset({
    features: ['source', 'target', 'length', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 174
})

In [19]:
# Inspect the first example: the raw source/target text plus the tokenized
# input_ids, attention_mask and labels (see the recorded output).
dataset[0]

{'source': 'upon my word i will do it .',
 'target': '誓って私はそれをします。',
 'length': tensor(11),
 'input_ids': tensor([250004,  54799,    759,   2565,     17,   1221,     54,    442,      6,
              5,      2]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'labels': tensor([250012,      6, 111891,   2995,  25711,  37741,   5182,     30,      2])}

## Real Dataset

In [20]:
import os
# Project-local Hugging Face cache; set before the Hugging Face imports below so
# that the setting is picked up.
os.environ["HF_HOME"] = r"./.cache"

from utils.dataset import EnJaDatasetMaker, EnJaDatasetSample, JESC, SnowSimplified, MassiveTranslation, Tatoeba, IWSLT2017, OPUS100

from transformers import MBart50TokenizerFast
def get_mBART_tokenizer(source_language):
    """Load the mBART-50 tokenizer configured for one en<->ja direction.

    Args:
        source_language: "en" for English -> Japanese, or "ja" for
            Japanese -> English.

    Returns:
        The ``MBart50TokenizerFast`` loaded from ``facebook/mbart-large-50``
        with ``src_lang``/``tgt_lang`` set for the requested direction.

    Raises:
        ValueError: if ``source_language`` is neither "en" nor "ja". Previously
            any value other than "en" silently produced the ja -> en tokenizer.
    """
    # (src_lang, tgt_lang) mBART-50 language codes for each supported source language.
    language_codes = {
        "en": ("en_XX", "ja_XX"),
        "ja": ("ja_XX", "en_XX"),
    }
    if source_language not in language_codes:
        raise ValueError(
            f"unsupported source_language {source_language!r}; expected 'en' or 'ja'"
        )

    src_lang, tgt_lang = language_codes[source_language]
    return MBart50TokenizerFast.from_pretrained(
        "facebook/mbart-large-50", src_lang=src_lang, tgt_lang=tgt_lang
    )

In [21]:
SOURCE_LANGUAGE, TARGET_LANGUAGE = "en", "ja"
tokenizer = get_mBART_tokenizer(SOURCE_LANGUAGE)

# Final en -> ja mix, listed as (corpus, number of sampled pairs); every corpus
# uses the same ntokens bounds of (0, 128).
dataset = EnJaDatasetMaker.prepare_dataset(
    f"{SOURCE_LANGUAGE}-{TARGET_LANGUAGE}-final",
    [
        EnJaDatasetSample(dataset=corpus, nsample=count, ntokens=(0, 128))
        for corpus, count in (
            (OPUS100, 50_000),
            (JESC, 150_000),
            (MassiveTranslation, 20_000),
            (SnowSimplified, 30_000),
            (Tatoeba, 125_000),
            (IWSLT2017, 175_000),
        )
    ],
    source_language=SOURCE_LANGUAGE,
    model_type="mBART",
    tokenizer=tokenizer,
    num_proc=8,
    seed=42,
    splits=(1, 0.002, 0.01),  # rescaled to 1
)

skipped: loaded dataset with id="en-ja-final" from existing cache.


In [22]:
SOURCE_LANGUAGE, TARGET_LANGUAGE = "ja", "en"
tokenizer = get_mBART_tokenizer(SOURCE_LANGUAGE)

# Final ja -> en mix, listed as (corpus, number of sampled pairs); every corpus
# uses the same ntokens bounds of (0, 128).
dataset = EnJaDatasetMaker.prepare_dataset(
    f"{SOURCE_LANGUAGE}-{TARGET_LANGUAGE}-final",
    [
        EnJaDatasetSample(dataset=corpus, nsample=count, ntokens=(0, 128))
        for corpus, count in (
            (OPUS100, 50_000),
            (JESC, 150_000),
            (MassiveTranslation, 20_000),
            (SnowSimplified, 30_000),
            (Tatoeba, 125_000),
            (IWSLT2017, 175_000),
        )
    ],
    source_language=SOURCE_LANGUAGE,
    model_type="mBART",
    tokenizer=tokenizer,
    num_proc=8,
    seed=42,
    splits=(1, 0.002, 0.01),  # rescaled to 1
)

skipped: loaded dataset with id="ja-en-final" from existing cache.
