<a href="https://colab.research.google.com/github/dotsnangles/NMT-with-transformers/blob/master/training_mT5-small.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !rm -rf /content/mt5-small-finetuned-en-to-ko_original /content/wandb

In [1]:
!nvidia-smi

Wed Aug  3 02:27:25 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Set notebook parameters

In [2]:
run_name = 'test run 005'

project_name = 'en2ko-translator-mt5-small'

# val_ds_len = 256 # max 10131

num_train_epochs = 100
batch_size = 4
gradient_accumulation_steps = 4

learning_rate = 2e-5
weight_decay = 0.01

lr_scheduler_type = 'cosine'
warmup_ratio = 0.1

predict_with_generate = False
generation_max_length = 256

early_stopping_patience = 5
save_total_limit = 7

load_best_model_at_end = True
metric_for_best_model='eval_loss'

save_strategy = "steps"
evaluation_strategy = "steps"
save_steps = 1250
eval_steps = 1250

logging_strategy = "steps"
logging_first_step = True 
logging_steps = 500

fp16 = False

### Prerequisites

In [3]:
!pip install -q datasets transformers sentencepiece sacrebleu folium wandb pandas

[K     |████████████████████████████████| 365 kB 4.3 MB/s 
[K     |████████████████████████████████| 4.7 MB 69.3 MB/s 
[K     |████████████████████████████████| 1.2 MB 49.7 MB/s 
[K     |████████████████████████████████| 116 kB 70.6 MB/s 
[K     |████████████████████████████████| 1.8 MB 44.7 MB/s 
[K     |████████████████████████████████| 101 kB 10.0 MB/s 
[K     |████████████████████████████████| 115 kB 71.8 MB/s 
[K     |████████████████████████████████| 212 kB 86.5 MB/s 
[K     |████████████████████████████████| 141 kB 90.9 MB/s 
[K     |████████████████████████████████| 596 kB 60.2 MB/s 
[K     |████████████████████████████████| 127 kB 92.9 MB/s 
[K     |████████████████████████████████| 6.6 MB 66.4 MB/s 
[K     |████████████████████████████████| 156 kB 79.3 MB/s 
[K     |████████████████████████████████| 181 kB 99.2 MB/s 
[K     |████████████████████████████████| 63 kB 2.1 MB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


In [4]:
import gdown
id = "1J21-T8wYjlj-91CxtxEzrcE34CDt7CM3"
gdown.download_folder(id=id, quiet=True, use_cookies=False)
!unzip -q /content/data/TS1.zip -d /content/data
!unzip -q /content/data/VS1.zip -d /content/data

### Set WandB 

In [5]:
import wandb
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
%env WANDB_PROJECT=$project_name
%env WANDB_LOG_MODEL=true
%env WANDB_WATCH=all

env: WANDB_PROJECT=en2ko-translator-mt5-small
env: WANDB_LOG_MODEL=true
env: WANDB_WATCH=all


### Model Selection

In [5]:
model_ckpt = 'google/mt5-small'

### Import stuff

In [6]:
import json, gdown
import pandas as pd
from datasets import Dataset, load_metric
from transformers import AutoConfig, AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback


In [7]:
train_df = pd.read_csv('./data/train.csv')
val_df = pd.read_csv('./data/val.csv')

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=False)

Downloading tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/4.11M [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

### Measure token length

In [9]:
def measure_len(sample):
    return len(tokenizer.encode(sample))

In [10]:
src_prefix = "translate English to Korean: "

print('length of src_prefix:', measure_len(src_prefix))
print(tokenizer.encode(src_prefix))
with tokenizer.as_target_tokenizer():
    print(tokenizer.encode(src_prefix))

length of src_prefix: 7
[37194, 5413, 288, 259, 37209, 267, 1]
[37194, 5413, 288, 259, 37209, 267, 1]


In [11]:
train_df_en_len = train_df['en'].apply(measure_len)
train_df_ko_len = train_df['ko_original'].apply(measure_len)
val_df_en_len = val_df['en'].apply(measure_len)
val_df_ko_len = val_df['ko_original'].apply(measure_len)

In [12]:
max(train_df_en_len)+7, max(train_df_ko_len), max(val_df_en_len)+7, max(val_df_ko_len)

(117, 154, 99, 102)

### df to ds

In [13]:
train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)
# .shuffle(seed=42)[:val_ds_len]
# val_ds = Dataset.from_dict(val_ds)
train_ds, val_ds

(Dataset({
     features: ['en', 'ko_original'],
     num_rows: 79979
 }), Dataset({
     features: ['en', 'ko_original'],
     num_rows: 10131
 }))

In [14]:
idx = 0
for e in train_ds:
    print(e)
    idx += 1
    if idx == 2:
        break

{'en': 'The comparators 1235 and 1237 may be expressed as a Relu activation function or a sigmoid function according to a setting.', 'ko_original': '비교기(1235 및 1237)는 설정에 따라 Relu 활성함수로 나타낼 수 있으며, 시그모이드 함수로 나타낼 수도 있다.'}
{'en': 'The server 320 may input a source image to the analysis model DB 325 and receive object information output from the training model.', 'ko_original': '서버(320)는 분석 모델 DB(325)에 소스 영상을 입력하고, 학습 모델에서 출력하는 객체 정보를 수신할 수 있다.'}


### Preprocess

In [15]:
source_lang = "en"
target_lang = "ko_original"
prefix = "translate English to Korean: "

def preprocess_function(examples):
    inputs = [prefix + example for example in examples[source_lang]]
    targets = [example for example in examples[target_lang]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=160, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

### Test preprocess_function

In [16]:
train_ds[:3]

{'en': ['The comparators 1235 and 1237 may be expressed as a Relu activation function or a sigmoid function according to a setting.',
  'The server 320 may input a source image to the analysis model DB 325 and receive object information output from the training model.',
  'The block shape and the split shape may be differently determined for each picture or slice, or differently determined for each largest coding unit.'],
 'ko_original': ['비교기(1235 및 1237)는 설정에 따라 Relu 활성함수로 나타낼 수 있으며, 시그모이드 함수로 나타낼 수도 있다.',
  '서버(320)는 분석 모델 DB(325)에 소스 영상을 입력하고, 학습 모델에서 출력하는 객체 정보를 수신할 수 있다.',
  '블록 형태 및 분할 형태는 픽처 또는 슬라이스마다 상이하게 결정되거나, 각각의 최대 부호화 단위마다 상이하게 결정될 수도 있다.']}

In [17]:
preprocess_test = preprocess_function(train_ds[:3])
print('input id', preprocess_test.input_ids[0])
print(tokenizer.decode(preprocess_test.input_ids[0]), '\n')
print('attention mask', preprocess_test.attention_mask[0], '\n')
print('label', preprocess_test.labels[0])
print(tokenizer.decode(preprocess_test.labels[0]))

input id [37194, 5413, 288, 259, 37209, 267, 486, 39959, 19002, 259, 175510, 305, 259, 162249, 1432, 390, 17385, 345, 527, 259, 262, 788, 1696, 259, 97359, 2835, 631, 259, 262, 2002, 1233, 525, 2835, 259, 18775, 288, 259, 262, 36577, 260, 1]
translate English to Korean: The comparators 1235 and 1237 may be expressed as a Relu activation function or a sigmoid function according to a setting.</s> 

attention mask [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 

label [259, 53789, 1622, 312, 175510, 259, 5593, 644, 101294, 988, 30957, 118645, 259, 18490, 788, 1696, 63019, 3353, 12482, 2277, 1235, 49303, 125462, 1566, 3083, 19023, 261, 6463, 11051, 6763, 63362, 15331, 2277, 1235, 49303, 125462, 259, 44830, 3632, 260, 1]
비교기(1235 및 1237)는 설정에 따라 Relu 활성함수로 나타낼 수 있으며, 시그모이드 함수로 나타낼 수도 있다.</s>


In [18]:
tokenized_train = train_ds.map(preprocess_function, batched=True)
tokenized_val = val_ds.map(preprocess_function, batched=True)
tokenized_train, tokenized_val

  0%|          | 0/80 [00:00<?, ?ba/s]

  0%|          | 0/11 [00:00<?, ?ba/s]

(Dataset({
     features: ['en', 'ko_original', 'input_ids', 'attention_mask', 'labels'],
     num_rows: 79979
 }), Dataset({
     features: ['en', 'ko_original', 'input_ids', 'attention_mask', 'labels'],
     num_rows: 10131
 }))

### Load metric

In [19]:
metric = load_metric("sacrebleu")
# fake_preds = ["hello there", "general kenobi"]
# fake_labels = [["hello there"], ["general kenobi"]]
# metric.compute(predictions=fake_preds, references=fake_labels)

Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

In [20]:
import numpy as np

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

### Check and Load model

In [21]:
config = AutoConfig.from_pretrained(model_ckpt)

In [22]:
model = AutoModelForSeq2SeqLM.from_config(config)

In [23]:
# Run Specifics
# run_name = 
# seed = 
# data_seed = 

# Dir
# report_to = 
# output_dir = 
# logging_dir = 
# overwrite_output_dir = 

# Hyper Parameters
# num_train_epochs = 
# per_device_train_batch_size = 
# per_device_eval_batch_size = 
# gradient_accumulation_steps = 
# learning_rate = 
# weight_decay = 
# adam_beta1 = 
# adam_beta2 = 
# adam_epsilon = 
# max_grad_norm = 
# lr_scheduler_type = 
# warmup_ratio = 
# warmup_steps = 
# optim = 
# adafactor = 

# Eval
# predict_with_generate = 
# generation_max_length = 
# generation_num_beams = 
# evaluation_strategy = 
# eval_delay = 
# eval_steps = 

# Logging
# logging_strategy = 
# logging_first_step = 
# logging_steps = 

# Archving
# save_strategy = 
# save_steps = 
# save_total_limit = 
# load_best_model_at_end = 
# metric_for_best_model = 

# etc.
# resume_from_checkpoint = 
# remove_unused_columns = 
# label_names = 
# group_by_length = 

# for System
# fp16 = 
# gradient_checkpointing = 



# not in use

# do_train = 
# do_eval = 
# do_predict = 
# prediction_loss_only = 

# half_precision_backend = 
# no_cuda = 
# jit_mode_eval = 
# use_ipex = 
# bf16 = 
# fp16_opt_level = 
# bf16_full_eval = 
# fp16_full_eval = 
# tf32 = 
# local_rank = 
# xpu_backend = 
# tpu_num_cores = 
# tpu_metrics_debug = 
# debug = 
# dataloader_drop_last = 

# dataloader_num_workers = 
# past_index = 
# disable_tqdm = 
# greater_is_better = 
# ignore_data_skip = 
# sharded_ddp = 
# fsdp = 
# fsdp_min_num_params = 
# fsdp_transformer_layer_cls_to_wrap = 
# deepspeed = 
# label_smoothing_factor = 

# length_column_name = 
# ddp_find_unused_parameters = 
# ddp_bucket_cap_mb = 
# dataloader_pin_memory = 
# skip_memory_metrics = 
# use_legacy_prediction_loop = 

# include_inputs_for_metrics = 
# fp16_backend = 
# mp_parameters = 
# auto_find_batch_size = 
# full_determinism = 
# torchdynamo = 
# ray_scope = 
# sortish_sampler = 

# per_gpu_train_batch_size = 
# per_gpu_eval_batch_size = 

# max_steps = 
# log_level = 
# log_level_replica = 
# log_on_each_node = 
# logging_nan_inf_filter = 
# save_on_each_node = 
# push_to_hub = 
# hub_model_id = 
# hub_strategy = 
# hub_token = 
# hub_private_repo = 
# push_to_hub_model_id = 
# push_to_hub_organization = 
# push_to_hub_token = 
# eval_accumulation_steps = 

In [24]:
model_name = model_ckpt.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
    
    report_to='wandb',
    run_name=run_name,

    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,

    learning_rate=learning_rate,
    weight_decay=weight_decay,

    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,

    evaluation_strategy=evaluation_strategy,
    predict_with_generate=predict_with_generate,
    generation_max_length=generation_max_length,
    
    save_strategy=save_strategy,
    save_total_limit=save_total_limit,
    load_best_model_at_end=load_best_model_at_end,
    metric_for_best_model=metric_for_best_model,

    save_steps=save_steps,
    eval_steps=eval_steps,

    logging_strategy=logging_strategy,
    logging_first_step=logging_first_step, 
    logging_steps=logging_steps,
    
    fp16=fp16,
)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

es = EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)

In [25]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    # compute_metrics=compute_metrics,
    callbacks=[es],
)

In [26]:
trainer.train()
wandb.finish()

The following columns in the training set don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: ko_original, en. If ko_original, en are not expected by `MT5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 79979
  Num Epochs = 100
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 4
  Total optimization steps = 499800
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss


KeyboardInterrupt: ignored

In [27]:
trainer.save_model('./save_model')

Saving model checkpoint to ./save_model
Configuration saved in ./save_model/config.json
Model weights saved in ./save_model/pytorch_model.bin
tokenizer config file saved in ./save_model/tokenizer_config.json
Special tokens file saved in ./save_model/special_tokens_map.json


In [28]:
tokenizer = AutoTokenizer.from_pretrained('./save_model')

Didn't find file ./save_model/tokenizer.json. We won't load it.
Didn't find file ./save_model/added_tokens.json. We won't load it.
loading file ./save_model/spiece.model
loading file None
loading file None
loading file ./save_model/special_tokens_map.json
loading file ./save_model/tokenizer_config.json
  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"


In [29]:
model = AutoModelForSeq2SeqLM.from_pretrained('./save_model')

loading configuration file ./save_model/config.json
Model config MT5Config {
  "_name_or_path": "./save_model",
  "architectures": [
    "MT5ForConditionalGeneration"
  ],
  "d_ff": 1024,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "mt5",
  "num_decoder_layers": 8,
  "num_heads": 6,
  "num_layers": 8,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "tokenizer_class": "T5Tokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.21.0",
  "use_cache": true,
  "vocab_size": 250112
}

loading weights file ./save_model/pytorch_model.bin
All model checkpoint weights were used when initializing MT5ForConditionalGeneration.

All the weights 

In [30]:
args = Seq2SeqTrainingArguments(
    'eval',
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=256,
)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [31]:
trainer = Seq2SeqTrainer(
    model,
    args,
    # train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [32]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: ko_original, en. If ko_original, en are not expected by `MT5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10131
  Batch size = 8


KeyboardInterrupt: ignored