## Parameters

In [None]:
# These parameters can be injected from Papermill
# Text files consumed by the dataset cells below (read one line at a time).
train_file = "data/jawiki/20210301/data/train.txt"
valid_file = "data/jawiki/20210301/data/valid.txt"
# Hyperparameters used by the dataset and training cells below.
epochs = 10
batch_size = 2
max_learning_rate = 1e-4
warmup_steps = 0
clipnorm = 1.0
# When True, set_mixed_precision_policy() is called in the "Configure GPU" section.
fp16 = False
# Directory the tokenizer is loaded from and the trained model is saved to.
model_dir = "output/model"
# NOTE(review): this value is derived from model_dir at the moment this cell runs, so
# with the default it is "output/tensorboard/output/model-tensorboard". Presumably a
# model_dir injected by Papermill afterwards is not reflected here — confirm, and
# inject tensorboard_dir explicitly if needed.
tensorboard_dir = f"output/tensorboard/{model_dir}-tensorboard"

In [None]:
! pip install transformers==4.3.3
! pip install git+https://github.com/colorfulscoop/tfdlg@v0.2.0

In [None]:
# Enable the widgetsnbextension notebook extension to avoid the following error
# when running the GPT2.from_pretrained method:
#     ImportError: IProgress not found. Please update jupyter and ipywidgets.
! jupyter nbextension enable --py widgetsnbextension

## Configure GPU

In [None]:
from tfdlg.utils import set_memory_growth
from tfdlg.utils import set_mixed_precision_policy

In [None]:
# Helper imported from tfdlg.utils; presumably enables TensorFlow GPU memory
# growth — confirm against the tfdlg implementation.
set_memory_growth()

In [None]:
# Apply the mixed-precision policy only when the fp16 parameter is enabled.
if fp16:
    set_mixed_precision_policy()

## Setup tokenizer

In [None]:
import transformers

In [None]:
# Load a pretrained tokenizer from model_dir, the same location the trained model
# is saved to at the end of this notebook.
tokenizer = transformers.BertGenerationTokenizer.from_pretrained(model_dir)

In [None]:
# Show the tokenizer length; the same value is passed as vocab_size to the model config.
len(tokenizer)

## Setup model config

In [None]:
from transformers import GPT2Config

# GPT2 small settings: only the vocabulary size, the tokenizer class name and the
# special-token ids (copied from the tokenizer) are passed explicitly.
model_config = GPT2Config(
    vocab_size=len(tokenizer),
    tokenizer_class="BertGenerationTokenizer",
    **{
        f"{kind}_token_id": getattr(tokenizer, f"{kind}_token_id")
        for kind in ("bos", "pad", "eos", "sep", "cls", "unk")
    }
)

In [None]:
# Display the resulting configuration for inspection.
model_config

## Prepare Dataset

In [None]:
from pathlib import Path
from urllib.request import urlretrieve
import zipfile
import numpy as np


def read_file(_filepath):
    """Yield the lines of a text file one at a time, without newline characters.

    The file is opened explicitly as UTF-8 so that reading does not depend on the
    platform's default encoding, and it is opened inside a ``with`` block so the
    handle is closed once the generator is exhausted or discarded.

    Args:
        _filepath: Path of the text file to read.

    Yields:
        str: Each line of the file with newline characters stripped from both ends.
    """
    with open(_filepath, encoding="utf-8") as lines:
        for line in lines:
            yield line.strip("\n")


In [None]:
from tfdlg.data import BlockDataset


# Keyword arguments that the training and validation datasets have in common.
shared_dataset_kwargs = dict(
    encode_fn=tokenizer.encode,
    block_size=model_config.n_ctx,
    batch_size=batch_size,
)

# Training data: shuffling enabled.
train_dataset = BlockDataset.from_generator(
    generator=lambda: read_file(train_file),
    shuffle=True,
    **shared_dataset_kwargs
)
# Validation data: shuffling disabled.
valid_dataset = BlockDataset.from_generator(
    generator=lambda: read_file(valid_file),
    shuffle=False,
    **shared_dataset_kwargs
)

In [None]:
# Counting the batches requires a full pass over each dataset:
#num_train_steps = sum(1 for _ in train_dataset)
#num_valid_steps = sum(1 for _ in valid_dataset)
# Hard-coded step counts are used instead.
# NOTE(review): presumably measured with the default data files and batch_size —
# re-run the commented-out lines above if either changes, because num_train_steps
# determines the length of the learning-rate schedule built in train().
num_train_steps = 254256
num_valid_steps = 6768
print("Train steps:", num_train_steps)
print("Valid steps:", num_valid_steps)

## Transformers model implementation

In [None]:
from transformers import TFGPT2LMHeadModel
import tensorflow.keras as keras
import tensorflow as tf

In [None]:
# Instantiate the model from model_config (not via from_pretrained).
model = TFGPT2LMHeadModel(model_config)

## Train and save model

In [None]:
from tfdlg.losses import PaddingLoss
from tfdlg.schedules import WarmupLinearDecay
import tensorflow.keras as keras


def train(
    _model,
    _train_dataset,
    _valid_dataset,
    _epochs,
    _warmup_steps,
    _num_train_steps,
    _max_learning_rate,
    _clipnorm,
    _tensorboard_dir
):
    """Compile ``_model`` and fit it on the given datasets.

    Args:
        _model: Model exposing Keras-style ``compile`` and ``fit`` methods.
        _train_dataset: Dataset passed to ``fit`` as training data.
        _valid_dataset: Dataset passed to ``fit`` as ``validation_data``.
        _epochs: Maximum number of epochs to train for.
        _warmup_steps: Number of warmup steps given to ``WarmupLinearDecay``.
        _num_train_steps: Number of training steps in one epoch.
        _max_learning_rate: Peak learning rate given to ``WarmupLinearDecay``.
        _clipnorm: Gradient clipping norm given to the Adam optimizer.
        _tensorboard_dir: Directory the TensorBoard callback writes its logs to.

    Returns:
        The object returned by ``_model.fit``.
    """
    # The schedule covers the whole run: steps per epoch times number of epochs.
    schedule = WarmupLinearDecay(
        max_learning_rate=_max_learning_rate,
        warmup_steps=_warmup_steps,
        training_steps=_num_train_steps*_epochs
    )
    optimizer = keras.optimizers.Adam(
        schedule,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-8,
        clipnorm=_clipnorm
    )
    _model.compile(
        # Ignore the second output for LM logits
        loss=(PaddingLoss(), None),
        optimizer=optimizer
    )

    return _model.fit(
        _train_dataset,
        validation_data=_valid_dataset,
        epochs=_epochs,
        callbacks=[
            # Stop after one epoch without improvement and restore the best weights.
            keras.callbacks.EarlyStopping(patience=1, restore_best_weights=True),
            keras.callbacks.TensorBoard(
                # Use the function argument, not the notebook-level global.
                log_dir=_tensorboard_dir,
                update_freq=100,
                profile_batch=0,
            )
        ],
        verbose=2,
    )


In [None]:
# Run training with the notebook-level parameters, passed by keyword so that
# each value is visibly bound to the argument it is meant for.
train(
    _model=model,
    _train_dataset=train_dataset,
    _valid_dataset=valid_dataset,
    _epochs=epochs,
    _warmup_steps=warmup_steps,
    _num_train_steps=num_train_steps,
    _max_learning_rate=max_learning_rate,
    _clipnorm=clipnorm,
    _tensorboard_dir=tensorboard_dir
)

In [None]:
# Print the model summary after training.
model.summary()

In [None]:
# Save the trained model into model_dir, the directory the tokenizer was loaded from.
model.save_pretrained(model_dir)