In [1]:
# Mount Google Drive at /gdrive (the google.colab module is provided by the Colab runtime).
# NOTE(review): no other cell visible in this notebook reads from /gdrive — confirm the
# mount is still needed.
from google.colab import drive
drive.mount('/gdrive',force_remount=True)

Mounted at /gdrive


### Translation
This notebook fine-tunes a pretrained translation checkpoint to translate from Korean to English. The checkpoint was trained by Jörg Tiedemann, professor at the Department of Digital Humanities.

![Jörg Tiedemann](https://researchportal.helsinki.fi/files-asset/56125518/Tiedemann.png?w=160&f=webp)

#### MarianMT
Models were originally trained by Jörg Tiedemann using the Marian C++ library, which supports fast training and translation.

Since Marian models are smaller than many other translation models available in the library, they can be useful for fine-tuning experiments and integration tests.

#### Multilingual Models

- All model names use the following format: `Helsinki-NLP/opus-mt-{src}-{tgt}`.
- If a model can output multiple languages, you should specify a language code by prepending the desired output language to the `src_text`.
- You can see a model's supported language codes in its model card, under target constituents, like in `opus-mt-en-roa`.
- Note that if a model is only multilingual on the source side, like Helsinki-NLP/opus-mt-roa-en, no language codes are required.

In [2]:
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.21.0


In [1]:
!pip install transformers datasets evaluate sentencepiece

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m53.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m58.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━

In [2]:
!pip install huggingface_hub



In [3]:
# Interactive Hugging Face Hub login. Credentials are needed later because the
# training arguments set push_to_hub=True and the trained model is pushed to the Hub.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# Load the English/Korean language pair of the "kde4" dataset.
from datasets import load_dataset
raw_ds = load_dataset("kde4",lang1="en",lang2="ko")

Downloading builder script:   0%|          | 0.00/4.25k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/8.45k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.66M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
# Keep 90% of the "train" split for training and hold out the rest; the fixed seed
# makes the split reproducible across runs.
split_datasets = raw_ds["train"].train_test_split(train_size=0.9,seed=20)
split_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 69037
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 7671
    })
})

In [6]:
# Rename the held-out split from "test" to "validation".
# Guarded so that re-running this cell is a no-op instead of raising KeyError
# (after the first run the "test" key no longer exists).
if "test" in split_datasets:
    split_datasets["validation"] = split_datasets.pop("test")

In [None]:
# Peek at one raw example: a dict with the "en" and "ko" versions of the same string.
split_datasets["train"][1]["translation"]

{'en': 'Please add the output filename (%f) to the command line.',
 'ko': '명령 라인에 출력될 파일 이름 (% f) 을( 를) 추가하십시오.'}

In [7]:
# Tokenizer of the pretrained Korean -> English checkpoint. The same `model_ckpt`
# is reused further down to load the model, so tokenizer and model stay in sync.
from transformers import AutoTokenizer
model_ckpt = "Helsinki-NLP/opus-mt-ko-en"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

Downloading (…)okenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/842k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/813k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]



In [None]:
# Tokenize a single pair to inspect the encoding: the Korean sentence is the source
# text and the English sentence is passed as `text_target`, whose token ids are
# returned under the "labels" key.
en_sentence = split_datasets["train"][1]["translation"]["en"]
ko_sentence = split_datasets["train"][1]["translation"]["ko"]

inputs = tokenizer(ko_sentence, text_target=en_sentence)
inputs

{'input_ids': [15073, 16203, 296, 17223, 1700, 18914, 299, 21235, 1700, 17884, 25088, 23124, 17254, 17761, 15265, 17941, 300, 14338, 236, 301, 240, 27141, 21235, 1700, 15190, 20461, 19465, 25674, 29144, 245], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [19899, 14560, 18288, 14175, 10314, 9910, 29504, 14897, 14338, 236, 17254, 240, 15309, 239, 15715, 240, 14927, 13586, 20108, 245]}

In [None]:
# Contrast two ways of tokenizing the English sentence: as a plain input (no
# `text_target`, hence the name `wrong_targets`) versus the "labels" produced via
# `text_target` in the previous cell.
# NOTE(review): the recorded output of this cell shows identical Korean tokens on both
# lines, which does not match decoding English text; the cell has no execution count,
# so the output is presumably stale — re-run to confirm.
wrong_targets = tokenizer(en_sentence)
print(tokenizer.convert_ids_to_tokens(wrong_targets["input_ids"]))
print(tokenizer.convert_ids_to_tokens(inputs["labels"]))

['▁명령', '▁라', '인에', '▁출', '력', '될', '▁파일', '▁이름', '▁(', '%', '▁f', ')', '▁을', '(', '▁를', ')', '▁추가', '하', '십시오', '.']
['▁명령', '▁라', '인에', '▁출', '력', '될', '▁파일', '▁이름', '▁(', '%', '▁f', ')', '▁을', '(', '▁를', ')', '▁추가', '하', '십시오', '.']


In [8]:
max_length = 128


def preprocess_function(examples):
    """Tokenize a batch of Korean -> English translation pairs.

    Args:
        examples: Batch mapping whose "translation" entry is a sequence of
            dicts, each holding the Korean text under "ko" and the English
            text under "en".

    Returns:
        Whatever the module-level ``tokenizer`` returns when called with the
        Korean texts as inputs and the English texts as ``text_target``,
        truncated to ``max_length`` tokens.
    """
    korean_texts, english_texts = [], []
    for pair in examples["translation"]:
        korean_texts.append(pair["ko"])
        english_texts.append(pair["en"])

    return tokenizer(
        korean_texts,
        text_target=english_texts,
        max_length=max_length,
        truncation=True,
    )

In [9]:
# Apply preprocess_function to every split in batches. The original columns are
# removed so that only the fields returned by preprocess_function remain.
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)

Map:   0%|          | 0/69037 [00:00<?, ? examples/s]

Map:   0%|          | 0/7671 [00:00<?, ? examples/s]

In [10]:
# Load the pretrained sequence-to-sequence model from the same checkpoint as the tokenizer.
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)

Downloading pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [11]:
# Collator used to assemble training/evaluation batches. Per the transformers
# documentation, passing `model` lets the collator also prepare `decoder_input_ids`
# from the labels (they appear in the batch keys inspected in the next cell).
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Collate two training examples (indices 1 and 2) to inspect what the collator produces.
batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])
batch.keys()

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

In [None]:
# Collated labels: in the recorded output the shorter row is padded with -100.
batch["labels"]

tensor([[19899, 14560, 18288, 14175, 10314,  9910, 29504, 14897, 14338,   236,
         17254,   240, 15309,   239, 15715,   240, 14927, 13586, 20108,   245],
        [26755, 11973, 26052,   299, 23590, 15555,   315, 16203, 14338,   307,
         24508,   316, 19650, 20280,  -100,  -100,  -100,  -100,  -100,  -100]])

In [None]:
# Decoder inputs: in the recorded output these are the labels shifted right by one
# position, starting with id 1 and padded with id 3 instead of -100.
batch["decoder_input_ids"]

tensor([[    1, 19899, 14560, 18288, 14175, 10314,  9910, 29504, 14897, 14338,
           236, 17254,   240, 15309,   239, 15715,   240, 14927, 13586, 20108],
        [    1, 26755, 11973, 26052,   299, 23590, 15555,   315, 16203, 14338,
           307, 24508,   316, 19650, 20280,     3,     3,     3,     3,     3]])

In [None]:
# Print the un-collated label ids of the same two examples, for comparison with the
# padded tensors shown above.
for i in range(1, 3):
    print(tokenized_datasets["train"][i]["labels"])

[19899, 14560, 18288, 14175, 10314, 9910, 29504, 14897, 14338, 236, 17254, 240, 15309, 239, 15715, 240, 14927, 13586, 20108, 245]
[26755, 11973, 26052, 299, 23590, 15555, 315, 16203, 14338, 307, 24508, 316, 19650, 20280]


In [12]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-2.7.0 sacrebleu-2.3.1


In [13]:
# Load the SacreBLEU metric; compute_metrics below calls `metric.compute(...)`.
import evaluate

metric = evaluate.load("sacrebleu")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [14]:
import numpy as np


def compute_metrics(eval_preds):
    """Decode predictions and labels and score them with the module-level ``metric``.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may itself be a tuple, in which case only its
            first element is used. ``-100`` entries in either array are
            treated as padding.

    Returns:
        dict: ``{"bleu": score}`` where ``score`` is ``result["score"]`` as
        returned by ``metric.compute``.
    """
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is not a real token id and cannot be decoded. Labels always use it for
    # padding, and generated predictions can contain it as well once batches of
    # different lengths have been concatenated, so both arrays are mapped to the
    # pad token before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing; each prediction gets a list of references
    # (here exactly one).
    predictions = [pred.strip() for pred in decoded_preds]
    references = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=predictions, references=references)
    return {"bleu": result["score"]}

In [15]:
from transformers import Seq2SeqTrainingArguments

# Per-device batch sizes, named so the values passed below cannot drift from the
# constants. `batch_size` keeps its original name and value (it matches the
# evaluation batch size).
train_batch_size = 32
batch_size = 64
args = Seq2SeqTrainingArguments(
    "kd4_opus-mt-ko-en",  # output directory; the recorded run pushes to a Hub repo of the same name
    evaluation_strategy="no",  # no evaluation during training; trainer.evaluate() is called explicitly
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,  # evaluation produces generated token ids, which compute_metrics decodes
    fp16=True,
    push_to_hub=True,  # requires the Hub login performed earlier in the notebook
)

In [16]:
# Wire model, arguments, tokenized splits, collator and metric function together.
# Because `args` sets push_to_hub=True, constructing the trainer already contacts the
# Hub (the recorded output shows the repository being cloned into the output directory).
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Cloning https://huggingface.co/chunwoolee0/kd4_opus-mt-ko-en into local empty directory.


In [17]:
# Baseline: score the pretrained checkpoint on the validation split before any fine-tuning.
trainer.evaluate(max_length=max_length)

{'eval_loss': 1.916103720664978,
 'eval_bleu': 39.49937667454841,
 'eval_runtime': 630.9821,
 'eval_samples_per_second': 12.157,
 'eval_steps_per_second': 0.19}

In [18]:
# Fine-tune with the settings in `args` (3 epochs, checkpoint saved every epoch).
trainer.train()



Step,Training Loss
500,1.8585
1000,1.7814
1500,1.7152
2000,1.6781
2500,1.5466
3000,1.4887
3500,1.5035
4000,1.4551
4500,1.4191
5000,1.3934


Adding files tracked by Git LFS: ['source.spm', 'target.spm']. This may take a bit of time if the files are large.


TrainOutput(global_step=6474, training_loss=1.532715692246148, metrics={'train_runtime': 1035.7775, 'train_samples_per_second': 199.957, 'train_steps_per_second': 6.25, 'total_flos': 2551308264603648.0, 'train_loss': 1.532715692246148, 'epoch': 3.0})

In [19]:
# Score the fine-tuned model on the same validation split as the baseline.
# NOTE(review): in the recorded run eval_loss improved (1.916 -> 1.392) but BLEU
# dropped (39.50 -> 32.12) compared with the pre-training evaluation — this
# discrepancy should be investigated before relying on the fine-tuned model.
trainer.evaluate(max_length=max_length)

{'eval_loss': 1.392415165901184,
 'eval_bleu': 32.11616746914562,
 'eval_runtime': 770.9612,
 'eval_samples_per_second': 9.95,
 'eval_steps_per_second': 0.156,
 'epoch': 3.0}

In [20]:
# Push the fine-tuned model to the Hub, tagged as "translation", with an explicit commit message.
trainer.push_to_hub(tags="translation", commit_message="Training complete")

Upload file runs/Aug09_03-16-28_046c211e60de/events.out.tfevents.1691551636.046c211e60de.1101.0: 100%|########…

Upload file runs/Aug09_03-16-28_046c211e60de/events.out.tfevents.1691553444.046c211e60de.1101.1: 100%|########…

To https://huggingface.co/chunwoolee0/kd4_opus-mt-ko-en
   e1a7b81..c38eb39  main -> main

   e1a7b81..c38eb39  main -> main

To https://huggingface.co/chunwoolee0/kd4_opus-mt-ko-en
   c38eb39..f683442  main -> main

   c38eb39..f683442  main -> main



'https://huggingface.co/chunwoolee0/kd4_opus-mt-ko-en/commit/c38eb395d9cd33c4d02dbea390f210186f7626e7'

In [21]:
# Load the pushed model back from the Hub and try it on a Korean sentence.
# Input gloss: "Let's go for a walk after lunch."
from transformers import pipeline

translator = pipeline("translation", model="chunwoolee0/kd4_opus-mt-ko-en")
translator("점심 식사 후에 산책가자.")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/310M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/258 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/282 [00:00<?, ?B/s]

Downloading source.spm:   0%|          | 0.00/842k [00:00<?, ?B/s]

Downloading target.spm:   0%|          | 0.00/813k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.85M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]



[{'translation_text': "Let's go for a walk after noon."}]

In [22]:
# Input gloss: "This course was made by Hugging Face."
translator("이 강좌는 허깅페이스가 만든 거야.")

[{'translation_text': 'This is a course by Huggingspace.'}]

In [23]:
# Input gloss: "I got up late today."
translator("오늘은 늦게 일어났다.")

[{'translation_text': "I'm up late today."}]