In [1]:
import os

os.environ["HF_HOME"] = r"./.cache"
from transformers import EncoderDecoderModel, AutoTokenizer, GenerationConfig
from datasets import load_dataset

- Encoders
    - BERT_JA : `cl-tohoku/bert-base-japanese-v3`
    - BERT_EN : `bert-base-uncased`, `prajjwal1/bert-tiny`
- Decorders
    - GPT_JA : `rinna/japanese-gpt2-xsmall`
    - GPT_EN : `gpt2`

In [2]:
source_lng = "ja"
target_lng = "en"

if source_lng == "en":
    encoder = "bert-base-uncased"
    decoder = "rinna/japanese-gpt2-xsmall"
else: 
    encoder = "cl-tohoku/bert-base-japanese-v3"
    decoder = "gpt2"

model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder, decoder, encoder_add_pooling_layer=False
)
model.cuda()

encoder_tokenizer = AutoTokenizer.from_pretrained(encoder, use_fast=True)
decoder_tokenizer = AutoTokenizer.from_pretrained(decoder, use_fast=True)
decoder_tokenizer.pad_token_id = 0

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.3.crossattention.c_attn.weight', 'h.2.ln_cross_attn.weight', 'h.2.crossattention.c_attn.bias', 'h.3.ln_cross_attn.bias', 'h.5.crossattention.c_proj.bias', 'h.3.ln_cross_attn.weight', 'h.3.crossattention.c_proj.bias', 'h.6.crossattention.c_attn.bias', 'h.10.crossattention.q_attn.weight', 'h.6.ln_cross_attn.bias', 'h.11.crossattention.q_attn.bias', 'h.11.ln_cross_attn.bias', 'h.8.crossattention.c_attn.weight', 'h.7.ln_cross_attn.weight', 'h.3.crossattention.q_attn.bias', 'h.4.crossattention.q_attn.weight', 'h.0.crossattention.q_attn.weight', 'h.6.crossattention.q_attn.weight', 'h.1.crossattention.q_attn.weight', 'h.5.crossattention.c_attn.bias', 'h.8.ln_cross_attn.bias', 'h.4.ln_cross_attn.weight', 'h.5.crossattention.q_attn.weight', 'h.0.crossattention.q_attn.bias', 'h.9.crossattention.c_attn.bias', 'h.0.ln_cross_attn.bias', 'h.10.crossattention.q_attn.bias', 'h.7.ln_cro

In [3]:
print(model)

EncoderDecoderModel(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32768, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [4]:
model.config

EncoderDecoderConfig {
  "_commit_hash": null,
  "decoder": {
    "_name_or_path": "gpt2",
    "activation_function": "gelu_new",
    "add_cross_attention": true,
    "architectures": [
      "GPT2LMHeadModel"
    ],
    "attn_pdrop": 0.1,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": 50256,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "early_stopping": false,
    "embd_pdrop": 0.1,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": 50256,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "initializer_range": 0.02,
    "is_decoder": true,
    "is_encoder_decoder": false,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1
    },
    "layer_norm_epsilon": 1

In [5]:
def print_model_parameters():
    print(f"Number of parameters: {sum(p.numel() for p in model.parameters())}")

    c_attn_pars = 0
    for layer in model.decoder.transformer.h:
        c_attn_pars += sum(p.numel() for p in layer.crossattention.parameters())
        c_attn_pars += sum(p.numel() for p in layer.ln_cross_attn.parameters())

    print(f"Number of cross-attention parameters: {c_attn_pars}")


print_model_parameters()

Number of parameters: 263423232
Number of cross-attention parameters: 28366848


In [6]:
def print_model_size():
    param_size = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()
    buffer_size = 0
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()

    size_all_mb = (param_size + buffer_size) / 1024**2
    print("model size: {:.1f}MB".format(size_all_mb))


print_model_size()

model size: 1028.9MB


In [7]:
dataset = load_dataset("csv", data_files=r"./data-csv/snow_simplified.csv")
data_sample = dataset["train"]
data_sample = data_sample.select(range(4))

In [8]:
data_sample

Dataset({
    features: ['en_sentence', 'ja_sentence'],
    num_rows: 4
})

In [9]:
data_sample.data

MemoryMappedTable
en_sentence: string
ja_sentence: string
----
en_sentence: [["i can 't tell who will arrive first .","i can 't tell who will arrive first .","many animals have been destroyed by men .","many animals have been destroyed by men ."]]
ja_sentence: [["誰が一番に着くか私には分かりません。","誰が一番に着くか私には分かりません。","多くの動物が人間によって滅ぼされた。","多くの動物が人間によって殺された。"]]

In [10]:
def preprocess_data(batch):
    inputs = encoder_tokenizer(
    batch[f"{source_lng}_sentence"],
    padding="max_length",
    max_length=512,
    truncation=True,
    return_tensors="pt",
    )

    labels = decoder_tokenizer(
        batch[f"{target_lng}_sentence"],
        padding="max_length",
        max_length=128,
        truncation=True,
        return_tensors="pt",
    ).input_ids

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["labels"] = labels
    batch["labels"][batch["labels"]==decoder_tokenizer.pad_token_id] = -100
    return batch

In [11]:
train_data = data_sample.map(preprocess_data, batched=True, remove_columns=["en_sentence", "ja_sentence"])

In [12]:
decoder_tokenizer.eos_token_id

50256

In [13]:
train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [14]:
train_data["labels"]

tensor([[   72,   460,   705,    83,  1560,   508,   481,  9240,   717,   764,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  

In [15]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

In [16]:
train_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    output_dir="./",
    num_train_epochs=15,
    logging_steps=1
)

In [17]:
trainer = Seq2SeqTrainer(model, args=train_args, train_dataset=train_data)

In [18]:
model.config.decoder_start_token_id = decoder_tokenizer.bos_token_id
# model.config.eos_token_id = decoder_tokenizer.eos_token_id
model.config.pad_token_id = decoder_tokenizer.pad_token_id

In [19]:
model.train()
trainer.train()



Step,Training Loss
1,5.8352
2,5.0046
3,3.786
4,2.9636
5,2.3338
6,1.694
7,1.7467
8,1.5744
9,1.38
10,1.2128


TrainOutput(global_step=15, training_loss=2.008200122912725, metrics={'train_runtime': 36.1549, 'train_samples_per_second': 1.66, 'train_steps_per_second': 0.415, 'total_flos': 36583621263360.0, 'train_loss': 2.008200122912725, 'epoch': 15.0})

In [20]:
gen_config = GenerationConfig()


def set_decoder_configuration(gen_config):
    # gen_config.no_repeat_ngram_size = 3
    gen_config.length_penalty = 2.0
    gen_config.num_beams = 4
    gen_config.max_new_tokens = 128
    gen_config.min_tokens = 0
    gen_config.early_stopping = True
    gen_config.bos_token_id = decoder_tokenizer.bos_token_id
    gen_config.eos_token_id = decoder_tokenizer.eos_token_id
    return gen_config


gen_config = set_decoder_configuration(gen_config)

In [21]:
model.cuda()
model.eval()
output = model.generate(
    train_data["input_ids"].cuda(),
    attention_mask=train_data["attention_mask"].cuda(),
    generation_config=gen_config
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [22]:
for i in range(4):
    print(output[i].size())

torch.Size([129])
torch.Size([129])
torch.Size([129])
torch.Size([129])


In [23]:
clean_output = decoder_tokenizer.batch_decode(output, skip_special_tokens=False)

In [24]:
clean_output

["<|endoftext|>i can 't tell who will arrive first.......................................................................................................................",
 "<|endoftext|>i can 't tell who will arrive first.......................................................................................................................",
 "<|endoftext|>i can't tell who will arrive first........................................................................................................................",
 "<|endoftext|>i can't tell who will arrive first........................................................................................................................"]