<a href="https://colab.research.google.com/github/dotsnangles/en2ko-ko2en-translator-mT5-small/blob/master/training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [52]:
!nvidia-smi

Mon Aug  1 15:14:32 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   77C    P0    36W /  70W |  11034MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Set notebook parameters

In [24]:
project_name = 'en2ko-translator-mt5-small'
run_name = 'baseline'

### Prerequisites

In [25]:
# !rm -rf /content/mt5-small-finetuned-en-to-ko_original
# !rm -rf /content/wandb

In [26]:
!pip install -q datasets transformers sentencepiece sacrebleu folium==0.2.1 wandb

In [27]:
# id = "1J21-T8wYjlj-91CxtxEzrcE34CDt7CM3"
# gdown.download_folder(id=id, quiet=True, use_cookies=False)
# !unzip -q /content/data/TS1.zip -d /content/data
# !unzip -q /content/data/VS1.zip -d /content/data

### Set WandB 

In [28]:
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mdotsnangles[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [29]:
%env WANDB_PROJECT=$project_name
%env WANDB_LOG_MODEL=true
%env WANDB_WATCH=all

env: WANDB_PROJECT=en2ko-translator-mt5-small
env: WANDB_LOG_MODEL=true
env: WANDB_WATCH=all


### Model Selection

In [30]:
model_ckpt = 'google/mt5-small'

### Import stuff

In [31]:
import json, gdown
import pandas as pd
from datasets import Dataset, load_metric
from transformers import AutoConfig, AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer


In [32]:
train_df = pd.read_csv('./data/train.csv')
val_df = pd.read_csv('./data/val.csv')

In [33]:
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=False)

### Measure token length

In [34]:
def measure_len(sample):
    return len(tokenizer.encode(sample))

In [35]:
src_prefix = "translate English to Korean: "

print('length of src_prefix:', measure_len(src_prefix))
print(tokenizer.encode(src_prefix))
with tokenizer.as_target_tokenizer():
    print(tokenizer.encode(src_prefix))

length of src_prefix: 7
[37194, 5413, 288, 259, 37209, 267, 1]
[37194, 5413, 288, 259, 37209, 267, 1]


In [36]:
train_df_en_len = train_df['en'].apply(measure_len)
train_df_ko_len = train_df['ko_original'].apply(measure_len)
val_df_en_len = val_df['en'].apply(measure_len)
val_df_ko_len = val_df['ko_original'].apply(measure_len)

In [37]:
max(train_df_en_len)+7, max(train_df_ko_len), max(val_df_en_len)+7, max(val_df_ko_len)

(117, 154, 99, 102)

### df to ds

In [38]:
train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)
train_ds, val_ds

(Dataset({
     features: ['en', 'ko_original'],
     num_rows: 79979
 }), Dataset({
     features: ['en', 'ko_original'],
     num_rows: 10131
 }))

In [39]:
idx = 0
for e in train_ds:
    print(e)
    idx += 1
    if idx == 2:
        break

{'en': 'The comparators 1235 and 1237 may be expressed as a Relu activation function or a sigmoid function according to a setting.', 'ko_original': '비교기(1235 및 1237)는 설정에 따라 Relu 활성함수로 나타낼 수 있으며, 시그모이드 함수로 나타낼 수도 있다.'}
{'en': 'The server 320 may input a source image to the analysis model DB 325 and receive object information output from the training model.', 'ko_original': '서버(320)는 분석 모델 DB(325)에 소스 영상을 입력하고, 학습 모델에서 출력하는 객체 정보를 수신할 수 있다.'}


### Preprocess

In [40]:
source_lang = "en"
target_lang = "ko_original"
prefix = "translate English to Korean: "


def preprocess_function(examples):
    inputs = [prefix + example for example in examples[source_lang]]
    targets = [example for example in examples[target_lang]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=160, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

### Test preprocess_function

In [41]:
train_ds[:3]

{'en': ['The comparators 1235 and 1237 may be expressed as a Relu activation function or a sigmoid function according to a setting.',
  'The server 320 may input a source image to the analysis model DB 325 and receive object information output from the training model.',
  'The block shape and the split shape may be differently determined for each picture or slice, or differently determined for each largest coding unit.'],
 'ko_original': ['비교기(1235 및 1237)는 설정에 따라 Relu 활성함수로 나타낼 수 있으며, 시그모이드 함수로 나타낼 수도 있다.',
  '서버(320)는 분석 모델 DB(325)에 소스 영상을 입력하고, 학습 모델에서 출력하는 객체 정보를 수신할 수 있다.',
  '블록 형태 및 분할 형태는 픽처 또는 슬라이스마다 상이하게 결정되거나, 각각의 최대 부호화 단위마다 상이하게 결정될 수도 있다.']}

In [42]:
preprocess_test = preprocess_function(train_ds[:3])
print('input id', preprocess_test.input_ids[0])
print(tokenizer.decode(preprocess_test.input_ids[0]), '\n')
print('attention mask', preprocess_test.attention_mask[0], '\n')
print('label', preprocess_test.labels[0])
print(tokenizer.decode(preprocess_test.labels[0]))

input id [37194, 5413, 288, 259, 37209, 267, 486, 39959, 19002, 259, 175510, 305, 259, 162249, 1432, 390, 17385, 345, 527, 259, 262, 788, 1696, 259, 97359, 2835, 631, 259, 262, 2002, 1233, 525, 2835, 259, 18775, 288, 259, 262, 36577, 260, 1]
translate English to Korean: The comparators 1235 and 1237 may be expressed as a Relu activation function or a sigmoid function according to a setting.</s> 

attention mask [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 

label [259, 53789, 1622, 312, 175510, 259, 5593, 644, 101294, 988, 30957, 118645, 259, 18490, 788, 1696, 63019, 3353, 12482, 2277, 1235, 49303, 125462, 1566, 3083, 19023, 261, 6463, 11051, 6763, 63362, 15331, 2277, 1235, 49303, 125462, 259, 44830, 3632, 260, 1]
비교기(1235 및 1237)는 설정에 따라 Relu 활성함수로 나타낼 수 있으며, 시그모이드 함수로 나타낼 수도 있다.</s>


In [43]:
tokenized_train = train_ds.map(preprocess_function, batched=True)
tokenized_val = val_ds.map(preprocess_function, batched=True)
tokenized_train, tokenized_val

  0%|          | 0/80 [00:00<?, ?ba/s]

  0%|          | 0/11 [00:00<?, ?ba/s]

(Dataset({
     features: ['en', 'ko_original', 'input_ids', 'attention_mask', 'labels'],
     num_rows: 79979
 }), Dataset({
     features: ['en', 'ko_original', 'input_ids', 'attention_mask', 'labels'],
     num_rows: 10131
 }))

### Load metric

In [44]:
metric = load_metric("sacrebleu")
fake_preds = ["hello there", "general kenobi"]
fake_labels = [["hello there"], ["general kenobi"]]
metric.compute(predictions=fake_preds, references=fake_labels)

{'bp': 1.0,
 'counts': [4, 2, 0, 0],
 'precisions': [100.0, 100.0, 0.0, 0.0],
 'ref_len': 4,
 'score': 0.0,
 'sys_len': 4,
 'totals': [4, 2, 0, 0]}

### Check and Load model

In [45]:
config = AutoConfig.from_pretrained(model_ckpt)
config

MT5Config {
  "_name_or_path": "google/mt5-small",
  "architectures": [
    "MT5ForConditionalGeneration"
  ],
  "d_ff": 1024,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "mt5",
  "num_decoder_layers": 8,
  "num_heads": 6,
  "num_layers": 8,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "tokenizer_class": "T5Tokenizer",
  "transformers_version": "4.21.0",
  "use_cache": true,
  "vocab_size": 250112
}

In [46]:
model = AutoModelForSeq2SeqLM.from_config(config)
model

MT5ForConditionalGeneration(
  (shared): Embedding(250112, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(250112, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo

In [None]:
# output_dir = 
# overwrite_output_dir = 
# do_train = 
# do_eval = 
# do_predict = 
# evaluation_strategy = 
# prediction_loss_only = 
# per_device_train_batch_size = 
# per_device_eval_batch_size = 
# per_gpu_train_batch_size = 
# per_gpu_eval_batch_size = 
# gradient_accumulation_steps = 
# eval_accumulation_steps = 
# eval_delay = 
# learning_rate = 
# weight_decay = 
# adam_beta1 = 
# adam_beta2 = 
# adam_epsilon = 
# max_grad_norm = 
# num_train_epochs = 
# max_steps = 
# lr_scheduler_type = 
# warmup_ratio = 
# warmup_steps = 
# log_level = 
# log_level_replica = 
# log_on_each_node = 
# logging_dir = 
# logging_strategy = 
# logging_first_step = 
# logging_steps = 
# logging_nan_inf_filter = 
# save_strategy = 
# save_steps = 
# save_total_limit = 
# save_on_each_node = 
# no_cuda = 
# seed = 
# data_seed = 
# jit_mode_eval = 
# use_ipex = 
# bf16 = 
# fp16 = 
# fp16_opt_level = 
# half_precision_backend = 
# bf16_full_eval = 
# fp16_full_eval = 
# tf32 = 
# local_rank = 
# xpu_backend = 
# tpu_num_cores = 
# tpu_metrics_debug = 
# debug = 
# dataloader_drop_last = 
# eval_steps = 
# dataloader_num_workers = 
# past_index = 
# run_name = 
# disable_tqdm = 
# remove_unused_columns = 
# label_names = 
# load_best_model_at_end = 
# metric_for_best_model = 
# greater_is_better = 
# ignore_data_skip = 
# sharded_ddp = 
# fsdp = 
# fsdp_min_num_params = 
# fsdp_transformer_layer_cls_to_wrap = 
# deepspeed = 
# label_smoothing_factor = 
# optim = 
# adafactor = 
# group_by_length = 
# length_column_name = 
# report_to = 
# ddp_find_unused_parameters = 
# ddp_bucket_cap_mb = 
# dataloader_pin_memory = 
# skip_memory_metrics = 
# use_legacy_prediction_loop = 
# push_to_hub = 
# resume_from_checkpoint = 
# hub_model_id = 
# hub_strategy = 
# hub_token = 
# hub_private_repo = 
# gradient_checkpointing = 
# include_inputs_for_metrics = 
# fp16_backend = 
# push_to_hub_model_id = 
# push_to_hub_organization = 
# push_to_hub_token = 
# mp_parameters = 
# auto_find_batch_size = 
# full_determinism = 
# torchdynamo = 
# ray_scope = 
# sortish_sampler = 
# predict_with_generate = 
# generation_max_length = 
# generation_num_beams = 

In [47]:
batch_size = 8
model_name = model_ckpt.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=False,
    report_to='wandb',
    run_name=run_name
)

In [48]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [49]:
import numpy as np

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [50]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [51]:
trainer.train()
wandb.finish()

The following columns in the training set don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: ko_original, en. If ko_original, en are not expected by `MT5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 79979
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 9998
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,22.9561,17.465996,1.0082,18.9956


Saving model checkpoint to mt5-small-finetuned-en-to-ko_original/checkpoint-500
Configuration saved in mt5-small-finetuned-en-to-ko_original/checkpoint-500/config.json
Model weights saved in mt5-small-finetuned-en-to-ko_original/checkpoint-500/pytorch_model.bin
tokenizer config file saved in mt5-small-finetuned-en-to-ko_original/checkpoint-500/tokenizer_config.json
Special tokens file saved in mt5-small-finetuned-en-to-ko_original/checkpoint-500/special_tokens_map.json
Saving model checkpoint to mt5-small-finetuned-en-to-ko_original/checkpoint-1000
Configuration saved in mt5-small-finetuned-en-to-ko_original/checkpoint-1000/config.json
Model weights saved in mt5-small-finetuned-en-to-ko_original/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in mt5-small-finetuned-en-to-ko_original/checkpoint-1000/tokenizer_config.json
Special tokens file saved in mt5-small-finetuned-en-to-ko_original/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to mt5-small-finetuned-

VBox(children=(Label(value='1149.259 MB of 1149.259 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.…

0,1
eval/bleu,▁
eval/gen_len,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇███
train/global_step,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇███
train/learning_rate,██▇▇▆▆▆▅▅▅▄▄▃▃▃▂▂▁▁
train/loss,█▅▄▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁

0,1
eval/bleu,1.0082
eval/gen_len,18.9956
eval/loss,17.466
eval/runtime,690.8411
eval/samples_per_second,14.665
eval/steps_per_second,1.834
train/epoch,1.0
train/global_step,9998.0
train/learning_rate,0.0
train/loss,22.9561
