## Parameters

In [1]:
# Run parameters. They are plain top-level assignments so that Papermill can
# override them (assuming this cell carries the `parameters` tag, Papermill
# injects its overrides in a new cell placed right after this one).
# Training / validation text files; they are read one line at a time.
train_file = "data/jawiki/20210301/data/train.txt"
valid_file = "data/jawiki/20210301/data/valid.txt"
# Upper bound on epochs; training also uses EarlyStopping, so it may stop sooner.
epochs = 10
batch_size = 2
# Peak learning rate and warm-up length handed to the WarmupLinearDecay schedule.
max_learning_rate = 1e-4
warmup_steps = 0
# Passed to the Adam optimizer as `clipnorm`.
clipnorm = 1.0
# When True, set_mixed_precision_policy() is called before the model is built.
fp16 = False
# Directory the tokenizer is loaded from and the trained model is saved to.
model_dir = "output/model"
# NOTE(review): model_dir contains a slash, so with the defaults this evaluates
# to "output/tensorboard/output/model-tensorboard". Because the path is derived
# in this cell, overriding only model_dir through Papermill presumably leaves it
# built from the default model_dir — confirm, and pass tensorboard_dir
# explicitly if that is the case.
tensorboard_dir = f"output/tensorboard/{model_dir}-tensorboard"

In [2]:
! pip install transformers==4.3.3
! pip install git+https://github.com/colorfulscoop/tfdlg@v0.2.0

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Collecting git+https://github.com/colorfulscoop/tfdlg@v0.2.0
  Cloning https://github.com/colorfulscoop/tfdlg (to revision v0.2.0) to /tmp/pip-req-build-j81q7ar4
Building wheels for collected packages: tfdlg


  Building wheel for tfdlg (setup.py) ... [?25ldone
[?25h  Created wheel for tfdlg: filename=tfdlg-0.1.0-py3-none-any.whl size=15985 sha256=09c4fc79cdd6c2d5199a94920d28ed22d817ae7525e4549aea89ba2ba842f77e
  Stored in directory: /tmp/pip-ephem-wheel-cache-n12n62mk/wheels/04/40/df/1a5f14da9695b730a70f064b87b27c3faa5a8674893745b37c
Successfully built tfdlg
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [3]:
# Enable the widgetsnbextension notebook extension to avoid the following error
# when running the GPT2.from_pretrained method:
#     ImportError: IProgress not found. Please update jupyter and ipywidgets.
! jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


## Configure GPU

In [4]:
from tfdlg.utils import set_memory_growth
from tfdlg.utils import set_mixed_precision_policy

In [5]:
# Call tfdlg's set_memory_growth helper before any model is built; the recorded
# output of this cell shows memory growth being set on the detected GPU device.
set_memory_growth()

Set memory growth to PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


In [6]:
# Apply tfdlg's mixed-precision policy only when the `fp16` run parameter is set.
if fp16:
    set_mixed_precision_policy()

## Setup tokenizer

In [7]:
import transformers

In [8]:
# Load the tokenizer from model_dir; the tokenizer files presumably have to be
# placed in that directory before this notebook is run — TODO confirm.
tokenizer = transformers.BertGenerationTokenizer.from_pretrained(model_dir)

In [9]:
# Show the tokenizer length; the same value is used as `vocab_size` when the
# model config is built.
len(tokenizer)

32000

## Setup model config

In [10]:
from transformers import GPT2Config

# Only the vocabulary size, the tokenizer class name and the special-token ids
# are overridden; every architecture hyperparameter keeps the GPT2Config
# default, which gives the GPT-2 small setup. The six special-token ids are
# copied from the tokenizer attribute of the same name.
model_config = GPT2Config(
    vocab_size=len(tokenizer),
    tokenizer_class="BertGenerationTokenizer",
    **{
        f"{kind}_token_id": getattr(tokenizer, f"{kind}_token_id")
        for kind in ("bos", "pad", "eos", "sep", "cls", "unk")
    }
)

In [11]:
# Display the resulting configuration for inspection.
model_config

GPT2Config {
  "activation_function": "gelu_new",
  "attn_pdrop": 0.1,
  "bos_token_id": 2,
  "cls_token_id": 4,
  "embd_pdrop": 0.1,
  "eos_token_id": 3,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 0,
  "resid_pdrop": 0.1,
  "sep_token_id": 5,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "tokenizer_class": "BertGenerationTokenizer",
  "transformers_version": "4.3.3",
  "unk_token_id": 1,
  "use_cache": true,
  "vocab_size": 32000
}

## Prepare Dataset

In [12]:
from pathlib import Path
from urllib.request import urlretrieve
import zipfile
import numpy as np


def read_file(_filepath, encoding="utf-8"):
    """Yield the lines of a text file with their line breaks removed.

    Args:
        _filepath: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that decoding does not depend on the locale of the machine.

    Yields:
        Each line of the file, in order, with newline characters stripped.
        An empty line is yielded as an empty string.
    """
    # The `with` block closes the file as soon as the generator is exhausted
    # or closed, instead of leaving the handle open until garbage collection.
    # Nothing is opened until iteration starts.
    with open(_filepath, encoding=encoding) as text_file:
        for line in text_file:
            yield line.strip("\n")


In [13]:
from tfdlg.data import BlockDataset


def _block_dataset(_line_source, _shuffle):
    """Wrap a zero-argument line-generator factory in a BlockDataset.

    The encoder, block size and batch size are taken from the notebook-level
    `tokenizer`, `model_config` and `batch_size` at the time of the call.
    """
    return BlockDataset.from_generator(
        generator=_line_source,
        encode_fn=tokenizer.encode,
        block_size=model_config.n_ctx,
        batch_size=batch_size,
        shuffle=_shuffle
    )


# The lambdas postpone reading the files until BlockDataset invokes them.
# Only the training split is shuffled.
train_dataset = _block_dataset(lambda: read_file(train_file), True)
valid_dataset = _block_dataset(lambda: read_file(valid_file), False)

In [14]:
# The number of batches per epoch can be measured with a full pass over each
# dataset:
#num_train_steps = sum(1 for _ in train_dataset)
#num_valid_steps = sum(1 for _ in valid_dataset)
# Here the counts are hard-coded instead of being measured on every run.
# NOTE(review): these numbers presumably correspond to the default train_file,
# valid_file, batch_size and block size — re-measure them if any of those is
# overridden, because num_train_steps feeds the learning-rate schedule.
num_train_steps = 254256
num_valid_steps = 6768
print("Train steps:", num_train_steps)
print("Valid steps:", num_valid_steps)

Train steps: 254256
Valid steps: 6768


## Transformers model implementation

In [15]:
from transformers import TFGPT2LMHeadModel
import tensorflow.keras as keras
import tensorflow as tf

In [16]:
# Build the model from the config object rather than via from_pretrained, so
# presumably the weights start from a fresh initialisation — TODO confirm.
model = TFGPT2LMHeadModel(model_config)

## Prepare Model

In [17]:
from tfdlg.losses import PaddingLoss
from tfdlg.schedules import WarmupLinearDecay
import tensorflow.keras as keras


def train(
    _model,
    _train_dataset,
    _valid_dataset,
    _epochs,
    _warmup_steps,
    _num_train_steps,
    _max_learning_rate,
    _clipnorm,
    _tensorboard_dir
):
    """Compile ``_model`` and fit it with early stopping and TensorBoard logging.

    Args:
        _model: Keras model to train; it is compiled in place.
        _train_dataset: Dataset passed to ``fit`` as the training data.
        _valid_dataset: Dataset passed to ``fit`` as ``validation_data``.
        _epochs: Maximum number of epochs to run.
        _warmup_steps: Warm-up steps of the learning-rate schedule.
        _num_train_steps: Number of training steps in one epoch; the schedule
            is laid out over ``_num_train_steps * _epochs`` steps.
        _max_learning_rate: Peak learning rate of the schedule.
        _clipnorm: Value passed to Adam as ``clipnorm``.
        _tensorboard_dir: Directory the TensorBoard callback writes its logs to.

    Returns:
        The object returned by ``_model.fit``.
    """
    schedule = WarmupLinearDecay(
        max_learning_rate=_max_learning_rate,
        warmup_steps=_warmup_steps,
        training_steps=_num_train_steps*_epochs
    )
    optimizer = keras.optimizers.Adam(
        schedule,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-8,
        clipnorm=_clipnorm
    )
    _model.compile(
        # Apply PaddingLoss to the first model output only; `None` disables
        # the loss for the second output.
        loss=(PaddingLoss(), None),
        optimizer=optimizer
    )

    callbacks = [
        # Stop after one epoch without improvement and keep the best weights.
        keras.callbacks.EarlyStopping(patience=1, restore_best_weights=True),
        keras.callbacks.TensorBoard(
            # Use the directory given by the caller, not the notebook global.
            log_dir=_tensorboard_dir,
            update_freq=100,
            profile_batch=0,
        )
    ]
    return _model.fit(
        _train_dataset,
        validation_data=_valid_dataset,
        epochs=_epochs,
        callbacks=callbacks,
        verbose=2,
    )


In [18]:
# Launch training with the notebook-level run parameters, naming every
# argument so the mapping to the function's parameters is explicit.
train(
    _model=model,
    _train_dataset=train_dataset,
    _valid_dataset=valid_dataset,
    _epochs=epochs,
    _warmup_steps=warmup_steps,
    _num_train_steps=num_train_steps,
    _max_learning_rate=max_learning_rate,
    _clipnorm=clipnorm,
    _tensorboard_dir=tensorboard_dir,
)

Epoch 1/10
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x7fbb447f41d8> is not a module, class, method, function, traceback, frame, or code object
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x7fbb447f41d8> is not a module, class, method, function, traceback, frame, or code object

254256/254256 - 83092s - loss: 4.1398 - logits_loss: 4.1398 - val_loss: 4.0160 - val_logits_loss: 4.0160
Epoch 2/10
254256/254256 - 82693s - loss: 3.5959 - logits_loss: 3.5959 - val_loss: 3.8376 - val_logits_loss: 3.8376
Epoch 3/10
254256/254256 - 82724s - loss: 3.4767 - logits_loss: 3.4767 - val_loss: 3.7474 - val_logits_loss: 3.7474
Epoch 4/10
254256/254256 - 82896s - loss: 3.4099 - logits_loss: 3.4099 - val_loss

In [19]:
# Print the layer list and parameter counts of the model.
model.summary()

Model: "tfgp_t2lm_head_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
transformer (TFGPT2MainLayer multiple                  110418432 
Total params: 110,418,432
Trainable params: 110,418,432
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Save the model to model_dir via save_pretrained; this is the same directory
# the tokenizer was loaded from.
model.save_pretrained(model_dir)