<a href="https://colab.research.google.com/github/dotsnangles/NMT-with-transformers/blob/master/training_mT5-small.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Show the GPU visible to this runtime (model, driver/CUDA version, memory in use).
!nvidia-smi

Thu Aug  4 04:18:45 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A10G         On   | 00000000:00:1E.0 Off |                    0 |
|  0%   29C    P8    15W / 300W |      0MiB / 23028MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

### Set notebook parameters

In [2]:
# --- Run identity (used for the WandB project / run name) ---
project_name = "en2ko-translator-mt5-small"
run_name = "en2ko run on aws ec2 with NVIDIA A10G"

# --- Training schedule ---
num_train_epochs = 100
batch_size = 16  # used for both the per-device train and eval batch size
gradient_accumulation_steps = 1

# --- Optimiser and learning-rate schedule ---
learning_rate = 2e-5
weight_decay = 0.01
lr_scheduler_type = "cosine"
warmup_ratio = 0.1

# --- Generation ---
# Defined here, but the training-arguments cell leaves both options commented out.
predict_with_generate = False
generation_max_length = 256

# --- Checkpointing and evaluation cadence ---
save_strategy = evaluation_strategy = "epoch"
save_total_limit = 5
load_best_model_at_end = True
metric_for_best_model = "eval_loss"
# Disabled alternatives, kept for reference:
# early_stopping_patience = 5
# save_steps = 1250
# eval_steps = 1250

# --- Logging ---
logging_strategy = "steps"
logging_steps = 500
logging_first_step = True

# --- Precision ---
fp16 = False

### Prerequisites

In [3]:
# !conda install -c conda-forge datasets transformers sentencepiece sacrebleu folium wandb pandas gdown jupyterlab ipywidgets

In [4]:
# import gdown
# id = "1J21-T8wYjlj-91CxtxEzrcE34CDt7CM3"
# gdown.download_folder(id=id, quiet=True, use_cookies=False)

### Configure Weights & Biases (WandB)

In [5]:
# Environment variables read by the WandB integration.
# NOTE(review): WANDB_NOTEBOOK_NAME is an absolute path on one specific machine;
# adjust it when running elsewhere (e.g. on Colab).
%env WANDB_NOTEBOOK_NAME=/home/ubuntu/codes/NMT-with-transformers/training_mT5-small.ipynb
# $project_name is expanded from the Python variable set in the parameters cell.
%env WANDB_PROJECT=$project_name
# Presumably enables checkpoint upload and gradient/parameter logging in WandB --
# confirm against the WandB integration docs for the installed version.
%env WANDB_LOG_MODEL=true
%env WANDB_WATCH=all

env: WANDB_NOTEBOOK_NAME=/home/ubuntu/codes/NMT-with-transformers/training_mT5-small.ipynb
env: WANDB_PROJECT=en2ko-translator-mt5-small
env: WANDB_LOG_MODEL=true
env: WANDB_WATCH=all


In [6]:
# Authenticate with WandB before the Trainer starts reporting to it.
import wandb
wandb.login()



[34m[1mwandb[0m: Currently logged in as: [33mdotsnangles[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

### Model Selection

In [7]:
# Hub identifier used below to load the tokenizer and the model configuration.
model_ckpt = 'google/mt5-small'

### Imports

In [8]:
import pandas as pd
from datasets import Dataset, load_metric
from transformers import AutoConfig, AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback

In [9]:
# Load the parallel corpus splits. Later cells read the 'en' and 'ko' columns,
# so both CSV files must provide them. Paths are relative to the notebook's
# working directory.
train_df = pd.read_csv('./data/train.csv')
val_df = pd.read_csv('./data/val.csv')

In [10]:
# Tokenizer matching the chosen checkpoint; use_fast=False asks for the
# non-"fast" tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=False)

### Measure token length

In [11]:
def measure_len(sample):
    """Return how many token ids the global ``tokenizer`` produces for ``sample``.

    The count is the length of whatever ``tokenizer.encode`` returns, so any
    special tokens it appends are included.
    """
    token_ids = tokenizer.encode(sample)
    return len(token_ids)

In [12]:
# Task prefix string (the same text is assigned to `prefix` in the Preprocess cell).
src_prefix = "translate English to Korean: "

# Token count of the prefix on its own; the trailing id 1 in the printed ids is </s>.
print('length of src_prefix:', measure_len(src_prefix))
print(tokenizer.encode(src_prefix))
# Encode once more in target-tokenizer mode for comparison; the printed ids are identical.
with tokenizer.as_target_tokenizer():
    print(tokenizer.encode(src_prefix))

length of src_prefix: 7
[37194, 5413, 288, 259, 37209, 267, 1]
[37194, 5413, 288, 259, 37209, 267, 1]


In [13]:
# Per-sentence token counts for every split/language pair. Each row is
# tokenised individually through measure_len, so this cell scales with corpus size.
train_df_en_len = train_df['en'].apply(measure_len)
train_df_ko_len = train_df['ko'].apply(measure_len)
val_df_en_len = val_df['en'].apply(measure_len)
val_df_ko_len = val_df['ko'].apply(measure_len)

In [14]:
# Longest tokenised sequence per split and language. The English figures add the
# token length of the task prefix; measure_len(src_prefix) replaces the former
# hard-coded 7 so the value follows the prefix and tokenizer automatically.
# Both counts include the end-of-sequence token, so the English values are a
# slightly conservative upper bound. The max_length settings in the Preprocess
# cell (128 / 160) should stay above these maxima.
src_prefix_len = measure_len(src_prefix)
(
    max(train_df_en_len) + src_prefix_len,
    max(train_df_ko_len),
    max(val_df_en_len) + src_prefix_len,
    max(val_df_ko_len),
)

(117, 154, 99, 102)

### Convert DataFrames to Datasets

In [15]:
# Wrap the two text columns of each split in a `datasets.Dataset`.
train_ds = Dataset.from_pandas(train_df[['en', 'ko']])
val_ds = Dataset.from_pandas(val_df[['en', 'ko']])
# Disabled experiment: subsample the validation split, i.e.
#   .shuffle(seed=42)[:val_ds_len] followed by Dataset.from_dict(val_ds)
# The full validation split is used as-is.
train_ds, val_ds

(Dataset({
     features: ['en', 'ko'],
     num_rows: 79979
 }),
 Dataset({
     features: ['en', 'ko'],
     num_rows: 10131
 }))

In [16]:
# Print the first two training examples as a quick sanity check.
for position, example in enumerate(train_ds, start=1):
    print(example)
    if position == 2:
        break

{'en': 'The comparators 1235 and 1237 may be expressed as a Relu activation function or a sigmoid function according to a setting.', 'ko': '비교기(1235 및 1237)는 설정에 따라 Relu 활성함수로 나타낼 수 있으며, 시그모이드 함수로 나타낼 수도 있다.'}
{'en': 'The server 320 may input a source image to the analysis model DB 325 and receive object information output from the training model.', 'ko': '서버(320)는 분석 모델 DB(325)에 소스 영상을 입력하고, 학습 모델에서 출력하는 객체 정보를 수신할 수 있다.'}


### Preprocess

In [17]:
source_lang = "en"
target_lang = "ko"
prefix = "translate English to Korean: "

def preprocess_function(examples, max_input_length=128, max_target_length=160):
    """Tokenise a batch of translation pairs for seq2seq training.

    Args:
        examples: Mapping holding a list of source sentences under
            ``source_lang`` and a parallel list of target sentences under
            ``target_lang``.
        max_input_length: Truncation limit (in tokens) for the prefixed
            source text. Defaults to the previously hard-coded 128.
        max_target_length: Truncation limit (in tokens) for the target text.
            Defaults to the previously hard-coded 160.

    Returns:
        The tokenizer output for the prefixed inputs, with the target token
        ids stored under the ``"labels"`` key.
    """
    inputs = [prefix + text for text in examples[source_lang]]
    targets = list(examples[target_lang])
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # NOTE(review): newer transformers releases deprecate as_target_tokenizer()
    # in favour of tokenizer(..., text_target=targets). It is kept here so the
    # cell keeps working on the version this notebook was run with -- confirm
    # the installed version before switching.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

#### Test preprocess_function

In [18]:
# Slicing a Dataset returns a dict of column lists -- the same batch layout
# that preprocess_function receives.
train_ds[:3]

{'en': ['The comparators 1235 and 1237 may be expressed as a Relu activation function or a sigmoid function according to a setting.',
  'The server 320 may input a source image to the analysis model DB 325 and receive object information output from the training model.',
  'The block shape and the split shape may be differently determined for each picture or slice, or differently determined for each largest coding unit.'],
 'ko': ['비교기(1235 및 1237)는 설정에 따라 Relu 활성함수로 나타낼 수 있으며, 시그모이드 함수로 나타낼 수도 있다.',
  '서버(320)는 분석 모델 DB(325)에 소스 영상을 입력하고, 학습 모델에서 출력하는 객체 정보를 수신할 수 있다.',
  '블록 형태 및 분할 형태는 픽처 또는 슬라이스마다 상이하게 결정되거나, 각각의 최대 부호화 단위마다 상이하게 결정될 수도 있다.']}

In [19]:
# Run the preprocessing on the first three rows and inspect the first encoded example.
preprocess_test = preprocess_function(train_ds[:3])
first_input_ids = preprocess_test["input_ids"][0]
first_attention_mask = preprocess_test["attention_mask"][0]
first_labels = preprocess_test["labels"][0]

print('input id', first_input_ids)
print(tokenizer.decode(first_input_ids), '\n')
print('attention mask', first_attention_mask, '\n')
print('label', first_labels)
print(tokenizer.decode(first_labels))

input id [37194, 5413, 288, 259, 37209, 267, 486, 39959, 19002, 259, 175510, 305, 259, 162249, 1432, 390, 17385, 345, 527, 259, 262, 788, 1696, 259, 97359, 2835, 631, 259, 262, 2002, 1233, 525, 2835, 259, 18775, 288, 259, 262, 36577, 260, 1]
translate English to Korean: The comparators 1235 and 1237 may be expressed as a Relu activation function or a sigmoid function according to a setting.</s> 

attention mask [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 

label [259, 53789, 1622, 312, 175510, 259, 5593, 644, 101294, 988, 30957, 118645, 259, 18490, 788, 1696, 63019, 3353, 12482, 2277, 1235, 49303, 125462, 1566, 3083, 19023, 261, 6463, 11051, 6763, 63362, 15331, 2277, 1235, 49303, 125462, 259, 44830, 3632, 260, 1]
비교기(1235 및 1237)는 설정에 따라 Relu 활성함수로 나타낼 수 있으며, 시그모이드 함수로 나타낼 수도 있다.</s>


In [None]:
# Tokenise both splits. batched=True hands preprocess_function batches of rows
# (a dict of column lists) rather than single examples.
tokenized_train = train_ds.map(preprocess_function, batched=True)
tokenized_val = val_ds.map(preprocess_function, batched=True)
tokenized_train, tokenized_val

### Load metric

In [None]:
# sacreBLEU metric used by compute_metrics in the final evaluation.
# NOTE(review): datasets.load_metric is deprecated in newer `datasets` releases
# in favour of the separate `evaluate` package -- confirm the installed version.
metric = load_metric("sacrebleu")

In [None]:
import numpy as np

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Returns a ``(predictions, references)`` pair where every reference is
    wrapped in its own single-element list (one reference per prediction).
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Compute corpus BLEU and mean generated length for an evaluation run.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is a tuple, only its first element is used.

    Returns:
        Dict with ``"bleu"`` (the ``score`` reported by the sacreBLEU metric)
        and ``"gen_len"`` (mean number of non-padding tokens per prediction),
        both rounded to 4 decimal places.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding sentinel, not a real token id, so it cannot be decoded.
    # It was previously replaced only in the labels; predictions can carry it
    # too when batches are padded to a common length, so both are sanitised.
    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Strip whitespace and wrap each reference in a list, as sacreBLEU expects.
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Length is counted on the sanitised ids so padding sentinels are not
    # mistaken for generated tokens.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {key: round(value, 4) for key, value in result.items()}

### Check and Load model

In [None]:
# Fetch only the architecture configuration of the checkpoint (no weights).
config = AutoConfig.from_pretrained(model_ckpt)

In [None]:
# Build the model from the configuration alone. Per the transformers docs,
# from_config() does NOT load the checkpoint's pretrained weights, so this model
# starts from a fresh initialisation.
# NOTE(review): the output directory below is named "...-finetuned-..."; if
# fine-tuning the pretrained checkpoint was intended, this should be
# AutoModelForSeq2SeqLM.from_pretrained(model_ckpt) -- confirm the intent.
model = AutoModelForSeq2SeqLM.from_config(config)

In [None]:
# Training configuration. Every value comes from the parameters cell at the top
# of the notebook; the output directory is derived from the checkpoint name.
model_name = model_ckpt.split("/")[-1]
args = Seq2SeqTrainingArguments(
    # Output directory for checkpoints.
    f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
    report_to='wandb',
    run_name=run_name,

    num_train_epochs=num_train_epochs,
    # One batch_size setting drives both training and evaluation.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,

    learning_rate=learning_rate,
    weight_decay=weight_decay,

    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,

    # Generation stays off during training; the final evaluation cell below
    # enables it with its own arguments.
    # predict_with_generate=predict_with_generate,
    # generation_max_length=generation_max_length,

    save_total_limit=save_total_limit,

    # Best checkpoint is chosen by metric_for_best_model ('eval_loss' in the
    # parameters cell).
    load_best_model_at_end=load_best_model_at_end,
    metric_for_best_model=metric_for_best_model,
    
    save_strategy=save_strategy,
    evaluation_strategy=evaluation_strategy,
    # save_steps=save_steps,
    # eval_steps=eval_steps,

    logging_strategy=logging_strategy,
    logging_first_step=logging_first_step, 
    logging_steps=logging_steps,
    
    fp16=fp16,
)
# Collates tokenised examples into batches for the model.
# Presumably pads labels with -100 (compute_metrics undoes that) -- confirm in the docs.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Early stopping is currently disabled (see the commented callback in the trainer cell).
# es = EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)

In [None]:
# Trainer for the training run. No compute_metrics is passed here, so
# evaluation during training does not report BLEU; that is computed by the
# separate evaluation trainer at the end of the notebook.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    # Early-stopping callback is disabled.
    # callbacks=[es],
)

In [None]:
# Run training, then close the WandB run.
trainer.train()
wandb.finish()

In [None]:
# Persist the trained model to a local directory for the evaluation cells below.
trainer.save_model('./save_model')

In [None]:
# Reload the tokenizer from the saved directory (rebinds the global `tokenizer`).
# Assumes trainer.save_model also wrote the tokenizer files there, since the
# trainer was given the tokenizer -- TODO confirm.
tokenizer = AutoTokenizer.from_pretrained('./save_model')

In [None]:
# Reload the trained weights from disk (rebinds the global `model`).
model = AutoModelForSeq2SeqLM.from_pretrained('./save_model')

In [None]:
# Evaluation-only arguments: generation is enabled so compute_metrics can score
# the produced translations. The batch size and generation length reuse the
# notebook parameters (16 and 256) instead of repeating the literals, so the
# parameters cell stays the single place to tune them.
# Note: this rebinds `args` and `data_collator` from the training cells.
args = Seq2SeqTrainingArguments(
    'eval',
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    generation_max_length=generation_max_length,
)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Evaluation-only trainer built around the reloaded model (rebinds `trainer`).
# No training set is attached; compute_metrics adds BLEU and generation length.
trainer = Seq2SeqTrainer(
    model,
    args,
    # train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Evaluate on the validation split; the returned metrics are shown as the cell output.
trainer.evaluate()