# English-Japanese Translation Model

## 1. Installing necessary packages

In [1]:
!pip install datasets transformers sentencepiece
!pip install evaluate sacrebleu
!pip install accelerate -U

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.5-py3-none-any.whl (7.8 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Installin

## 2. Preparing dataset

In [2]:
from datasets import load_dataset

# Load the "snow_t15" configuration of the SNOW simplified-Japanese corpus.
# Each row pairs an English sentence with an original and a simplified Japanese sentence.
dataset = load_dataset("snow_simplified_japanese_corpus", "snow_t15")

Downloading builder script:   0%|          | 0.00/6.95k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/8.04k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.50k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.63M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [3]:
# Display the available splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['ID', 'original_ja', 'simplified_ja', 'original_en'],
        num_rows: 50000
    })
})

### Split this dataset into a train & a test set

In [4]:
# Hold out 20% of the rows as a test set. The fixed seed makes the shuffle, and therefore
# every downstream number, reproducible after a kernel restart.
dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)

In [5]:
# Look at one raw training example before tokenization.
dataset['train'][0]

{'ID': '45204',
 'original_ja': '彼は何をすべきか当惑した。',
 'simplified_ja': '彼は何をするべきか困った。',
 'original_en': 'he was at a loss what to do .'}

## 2. Tokenizing

Here, I will use the tokenizer and the pretrained model from mBART-large-50 (`facebook/mbart-large-50`).

In [6]:
from transformers import MBart50TokenizerFast

# Load the tokenizer of the "facebook/mbart-large-50" checkpoint, configured with
# English ("en_XX") as the source language and Japanese ("ja_XX") as the target language.
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ja_XX")

tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

I will create a function to map so that we can tokenize our whole dataset.

In [7]:
def tokenize_data(batch):
    """Tokenize a batch of English sources together with their Japanese targets.

    Args:
        batch: Mapping with an ``'original_en'`` and a ``'simplified_ja'`` entry,
            each an iterable of sentences.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch; inputs and
        targets are truncated to at most 128 tokens.
    """
    english_sources = list(batch['original_en'])
    japanese_targets = list(batch['simplified_ja'])
    return tokenizer(
        english_sources,
        text_target=japanese_targets,
        max_length=128,
        truncation=True,
    )

In [8]:
# Tokenize both splits in batches. Only 'original_ja' and 'ID' are dropped; the raw
# 'original_en' / 'simplified_ja' text columns are kept next to the token ids.
tokenized_dataset = dataset.map(tokenize_data, batched=True, remove_columns=['original_ja','ID'])

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [9]:
# Confirm which columns each split has after tokenization.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['simplified_ja', 'original_en', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 40000
    })
    test: Dataset({
        features: ['simplified_ja', 'original_en', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 10000
    })
})

In [10]:
# Inspect one tokenized training example (text columns plus input_ids / attention_mask / labels).
tokenized_dataset['train'][20]

{'simplified_ja': '花はもう最も元気な状態を過ぎている。',
 'original_en': 'the flowers are already out of bloom .',
 'input_ids': [250004, 70, 189067, 621, 21771, 1810, 111, 29695, 306, 6, 5, 2],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [250012,
  6,
  2603,
  342,
  20333,
  85147,
  112861,
  1308,
  37157,
  251,
  56613,
  7826,
  30,
  2]}

### 3. Creating a performance benchmark

In [11]:
class PerformanceBenchmark:
    """Gather size, latency and quality metrics for a single pipeline.

    ``compute_accuracy``, ``compute_size`` and ``time_pipeline`` are placeholders
    that return ``None``; ``run_benchmark`` only works once they have been
    replaced by implementations that return dictionaries.

    Attributes:
        pipeline: The pipeline under test.
        dataset: The examples used for evaluation.
        optim_type: Key under which the collected metrics are reported.
    """

    def __init__(self, pipeline, dataset, optim_type="facebook/mbart-large-50"):
        self.pipeline, self.dataset, self.optim_type = pipeline, dataset, optim_type

    def compute_accuracy(self):
        """Placeholder; we'll define this later."""

    def compute_size(self):
        """Placeholder; we'll define this later."""

    def time_pipeline(self):
        """Placeholder; we'll define this later."""

    def run_benchmark(self):
        """Run the size, latency and accuracy measurements, in that order.

        Returns:
            A dict ``{optim_type: merged_metrics}`` where ``merged_metrics`` is the
            dict returned by ``compute_size`` updated in place with the results of
            ``time_pipeline`` and then ``compute_accuracy``.
        """
        combined = self.compute_size()
        metrics = {self.optim_type: combined}
        combined.update(self.time_pipeline())
        combined.update(self.compute_accuracy())
        return metrics

In [12]:
import numpy as np
from time import perf_counter

def time_pipeline(self, query="What is the pin number for my account?", n_warmup=10, n_runs=100):
    """This overrides the PerformanceBenchmark.time_pipeline() method.

    Measures the wall-clock latency of ``self.pipeline`` on a single query.

    Args:
        query: Text passed to the pipeline on every call.
        n_warmup: Number of untimed calls made before measuring.
        n_runs: Number of timed calls; must be at least 1.

    Returns:
        A dict with the mean (``"time_avg_ms"``) and standard deviation
        (``"time_std_ms"``) of the per-call latency in milliseconds.

    Raises:
        ValueError: If ``n_runs`` is smaller than 1 (no latency could be computed).
    """
    if n_runs < 1:
        raise ValueError(f"n_runs must be at least 1, got {n_runs}")
    # Warmup: these calls are deliberately left out of the statistics
    for _ in range(n_warmup):
        self.pipeline(query)
    # Timed run
    latencies = []
    for _ in range(n_runs):
        start_time = perf_counter()
        self.pipeline(query)
        latencies.append(perf_counter() - start_time)
    # Compute run statistics (seconds -> milliseconds)
    time_avg_ms = 1000 * np.mean(latencies)
    time_std_ms = 1000 * np.std(latencies)
    # "+/-" replaces the former "+\-", which was an invalid escape sequence and
    # printed a stray backslash.
    print(f"Average latency (ms) - {time_avg_ms:.2f} +/- {time_std_ms:.2f}")
    return {"time_avg_ms": time_avg_ms, "time_std_ms": time_std_ms}

# Attach the implementation above to the class, replacing its placeholder method.
PerformanceBenchmark.time_pipeline = time_pipeline

In [13]:
import torch
from pathlib import Path

def compute_size(self):
    """This overrides the PerformanceBenchmark.compute_size() method.

    Serializes the state dict of ``self.pipeline.model`` to disk and reports the
    size of the resulting file.

    Returns:
        A dict ``{"size_mb": <file size in MiB>}``.
    """
    # Local import so this cell does not depend on an import elsewhere in the notebook.
    import tempfile

    state_dict = self.pipeline.model.state_dict()
    # Save inside a private temporary directory: this never overwrites or deletes a
    # "model.pt" that already exists in the working directory, and the file is
    # removed even if saving fails half-way.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = Path(tmp_dir) / "model.pt"
        torch.save(state_dict, tmp_path)
        # Calculate size in megabytes
        size_mb = tmp_path.stat().st_size / (1024 * 1024)
    print(f"Model size (MB) - {size_mb:.2f}")
    return {"size_mb": size_mb}

# Attach the implementation above to the class, replacing its placeholder method.
PerformanceBenchmark.compute_size = compute_size

In [14]:
import evaluate
#hide_output
# Load the SacreBLEU metric; it is used as a module-level object by compute_accuracy().
sacrebleu = evaluate.load("sacrebleu")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [15]:
def compute_accuracy(self):
    """This overrides the PerformanceBenchmark.compute_accuracy() method.

    Translates every ``"original_en"`` sentence in ``self.dataset`` with
    ``self.pipeline`` and scores the translations against the matching
    ``"simplified_ja"`` sentence using the module-level ``sacrebleu`` metric.

    Returns:
        The result dict produced by ``sacrebleu.compute`` (it contains ``"score"``).
    """
    preds, references = [], []
    for example in self.dataset:
        # The pipeline returns a list with one dict per input sentence.
        output = self.pipeline(example["original_en"])
        preds.append(output[0]["translation_text"])
        # The metric compares text with text, so the reference is the Japanese
        # sentence (one reference per prediction), not the token ids in "labels".
        references.append([example["simplified_ja"]])
    # NOTE(review): the pipeline is called without src_lang/tgt_lang here, so it is
    # presumably expected to be constructed with them - confirm against the caller.
    # NOTE(review): the metric's default tokenizer is not designed for Japanese;
    # consider passing a Japanese-aware `tokenize` option - confirm before comparing scores.
    results = sacrebleu.compute(predictions=preds, references=references)
    print(f"Sacrebleu score on test set - {results['score']:.3f}")
    return results

# Attach the implementation above to the class, replacing its placeholder method.
PerformanceBenchmark.compute_accuracy = compute_accuracy

### 4. Training
Our dataset now contains token IDs for both the inputs and the labels, so we can train the model.

First, we will initialize our model from the pretrained mbart-large-50 checkpoint.

In [16]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Load the pretrained weights of the same checkpoint the tokenizer was loaded from.
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")

pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

Use DataCollatorForSeq2Seq to create a batch of examples. It will also dynamically pad your text and labels to the length of the longest element in its batch, so they are a uniform length.

In [17]:
from transformers import DataCollatorForSeq2Seq

# Collator that builds the training batches from the tokenized examples.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [18]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Directory name used for the training outputs.
model_name = "EN-JA_Translation_with_MBart"
# Hyperparameters for fine-tuning.
# NOTE(review): the recorded training output reports 'epoch': 2.0 although
# num_train_epochs is 5 - confirm which value the saved model was trained with.
training_args = Seq2SeqTrainingArguments(
    output_dir=model_name,
    evaluation_strategy="epoch",  # evaluate on the eval set once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=1,  # limit how many checkpoints are kept in output_dir
    num_train_epochs=5,
    disable_tqdm=False,  # keep progress bars enabled
    fp16=True,  # half-precision training; presumably requires a GPU runtime - TODO confirm
)

# Train on the tokenized train split and evaluate on the tokenized test split.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [19]:
# Run fine-tuning; the returned TrainOutput (steps, loss, runtime metrics) is displayed below.
trainer.train()

You're using a MBart50TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,1.0959,1.037365
2,0.8332,0.986975


TrainOutput(global_step=5000, training_loss=1.1169255920410157, metrics={'train_runtime': 2013.2555, 'train_samples_per_second': 39.737, 'train_steps_per_second': 2.484, 'total_flos': 2704239336357888.0, 'train_loss': 1.1169255920410157, 'epoch': 2.0})

In [20]:
# Save the fine-tuned model and the tokenizer files into the same directory so the
# result can later be loaded from that single path.
trainer.save_model("./translation-output")
tokenizer.save_pretrained("./translation-output")

('./translation-output/tokenizer_config.json',
 './translation-output/special_tokens_map.json',
 './translation-output/sentencepiece.bpe.model',
 './translation-output/added_tokens.json',
 './translation-output/tokenizer.json')

### 5. Testing model

In [23]:
from transformers import pipeline

# Build a translation pipeline from the directory the model and tokenizer were saved to.
translator = pipeline("translation", "./translation-output")

In [26]:
# Smoke test: translate one English sentence to Japanese. The language codes are
# passed per call because the pipeline above was created without them.
translator('I love cat and dog', src_lang="en_XX", tgt_lang="ja_XX")

[{'translation_text': '私は猫と犬が好きです。'}]