## Parameters

In [1]:
# These parameters can be injected from Papermill
# They are the defaults for a run; any injected overrides live in a separate
# cell placed after this one.

# Architecture to train; checked later against "pre_ln", "post_ln", "transformers".
model_type = "pre_ln"
# Raw-text corpora that are read line by line for training and validation.
train_file = "wikitext-103-raw/wiki.train.raw"
valid_file = "wikitext-103-raw/wiki.valid.raw"
# Upper bound on the number of epochs handed to the training routine.
epochs = 10
batch_size = 2
# Peak learning rate and warm-up length given to the learning-rate schedule.
max_learning_rate = 1e-4
warmup_steps = 0
# Gradient-norm clipping threshold passed to the optimizer.
clipnorm = 1.0
# When True, the mixed-precision policy is enabled before the model is built.
fp16 = False
# NOTE(review): the two paths below are derived from `model_type` right here,
# i.e. before any injected override is applied. Overriding only `model_type`
# leaves both paths carrying the default name - confirm, and inject the paths
# explicitly when changing the model type.
save_model_dir = f"output/tfdlg_train-{model_type}-model"
# NOTE(review): `save_model_dir` already starts with "output/", so this expands
# to "output/tensorboard/output/...-tensorboard" - confirm the nesting is intended.
tensorboard_dir = f"output/tensorboard/{save_model_dir}-tensorboard"

In [2]:
# Parameters
# Overrides for this run (presumably the cell written by Papermill); they
# replace the defaults assigned in the preceding cell.
model_type = "pre_ln"
batch_size = 4
fp16 = True


In [3]:
# Assert parameters
# Stop right away when an unsupported architecture name was supplied.
assert model_type in ("pre_ln", "post_ln", "transformers")

## Configure GPU

In [4]:
from tfdlg.utils import set_memory_growth
from tfdlg.utils import set_mixed_precision_policy

In [5]:
# Configure GPU memory growth through the tfdlg helper before any model is
# created; the recorded run reports it being applied to GPU:0.
set_memory_growth()

Set memory growth to PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


In [6]:
# Switch to mixed precision only when requested. The recorded run reports
# float16 as compute dtype and float32 as variable dtype.
if fp16:
    set_mixed_precision_policy()

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: GeForce RTX 2080 Ti, compute capability 7.5
Instructions for updating:
Use tf.keras.mixed_precision.LossScaleOptimizer instead. LossScaleOptimizer now has all the functionality of DynamicLossScale
Compute dtype: float16
Variable dtype: float32


## Setup tokenizer

In [7]:
# Install transformers by HuggingFace to use GPT2 tokenizer
! pip install transformers==3.4.0
# Enable widgetsnbextention to avoid the following error when running GPT2.from_pretrained method
#     ImportError: IProgress not found. Please update jupyter and ipywidgets.
! jupyter nbextension enable --py widgetsnbextension

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [8]:
# setup tokenizer
from transformers import GPT2Tokenizer

# Load the pretrained "gpt2" tokenizer; it is used further down to encode the
# corpus text and to size the model vocabulary.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

## Prepare model config

In [9]:
from tfdlg.configs import GPT2SmallConfig

# Start from the GPT2SmallConfig defaults provided by tfdlg.
config = GPT2SmallConfig()

# Size the model vocabulary from the GPT2 tokenizer. This is larger than
# 33,278, the vocabulary size of WikiText-2.
config.vocab_size = tokenizer.vocab_size

In [10]:
# Display the effective configuration so it is recorded with the run.
config

GPT2SmallConfig(num_layers=12, d_model=768, num_heads=12, d_ff=3072, vocab_size=50257, context_size=1024, attention_dropout_rate=0.1, residual_dropout_rate=0.1, embedding_dropout_rate=0.1, activation='gelu', kernel_initializer='he_normal', epsilon=1e-06)

## Prepare Dataset

In [11]:
from pathlib import Path
from urllib.request import urlretrieve
import zipfile
import numpy as np


def read_file(_filepath):
    """Lazily yield the lines of a text file without their newline characters.

    The file is opened when iteration starts and is closed by the ``with``
    block once the last line has been consumed or the generator is closed.
    The previous implementation opened the file inside a generator expression
    and never closed the handle.

    Args:
        _filepath: Path of the text file to read.

    Yields:
        str: Each line of the file with "\\n" characters stripped from both ends.

    Raises:
        OSError: If the file cannot be opened; raised on first iteration.
    """
    with open(_filepath) as text_file:
        for raw_line in text_file:
            yield raw_line.strip("\n")


In [12]:
from tfdlg.data import BlockDataset


# Training stream: every call of the lambda creates a fresh line generator over
# `train_file`, lines are encoded with the GPT2 tokenizer, and shuffling is on.
# NOTE(review): presumably BlockDataset packs the token ids into blocks of
# `block_size` tokens and groups them into batches - confirm against tfdlg.data.
train_dataset = BlockDataset.from_generator(
    generator=lambda: read_file(train_file),
    encode_fn=tokenizer.encode,
    block_size=config.context_size,
    batch_size=batch_size,
    shuffle=True
)
# Validation stream: same settings over `valid_file`, but without shuffling.
valid_dataset = BlockDataset.from_generator(
    generator=lambda: read_file(valid_file),
    encode_fn=tokenizer.encode,
    block_size=config.context_size,
    batch_size=batch_size,
    shuffle=False
)

In [13]:
# Walk each dataset once to learn how many batches a single pass contains;
# the training count is later used to size the learning-rate schedule.
num_train_steps = sum(map(lambda _batch: 1, train_dataset))
num_valid_steps = sum(map(lambda _batch: 1, valid_dataset))
print("Train steps:", num_train_steps)
print("Valid steps:", num_valid_steps)

Token indices sequence length is longer than the specified maximum sequence length for this model (1062 > 1024). Running this sequence through the model will result in indexing errors


Train steps: 28504
Valid steps: 59


## Transformers model implementation

In [14]:
from transformers import TFGPT2LMHeadModel
from transformers import GPT2Config
import tensorflow.keras as keras
import tensorflow as tf

In [15]:
class TransformersGPT2(keras.Model):
    """Keras model wrapping HuggingFace's ``TFGPT2LMHeadModel``.

    The wrapped model is configured from a tfdlg-style config object so that
    it can be trained with the same routine as the tfdlg decoders.
    """

    def __init__(self, config):
        """Translate the tfdlg config into a ``GPT2Config`` and build the decoder.

        Args:
            config: Object exposing ``num_layers``, ``d_model``, ``num_heads``,
                ``d_ff``, ``vocab_size``, ``context_size``,
                ``attention_dropout_rate``, ``residual_dropout_rate``,
                ``embedding_dropout_rate`` and ``epsilon``.
        """
        super().__init__()
        tf_config = GPT2Config(
            # GPT2Config names its depth argument `n_layer`. The former
            # `n_layers` keyword was not that argument, so the requested depth
            # was ignored and the library default was used instead.
            n_layer=config.num_layers,
            n_embd=config.d_model,
            n_head=config.num_heads,
            n_inner=config.d_ff,
            vocab_size=config.vocab_size,
            n_ctx=config.context_size,
            n_positions=config.context_size,
            attn_pdrop=config.attention_dropout_rate,
            resid_pdrop=config.residual_dropout_rate,
            embd_pdrop=config.embedding_dropout_rate,
            layer_norm_epsilon=config.epsilon,
            activation_function="gelu_new",  # Default value of transformers implementation
        )
        self._decoder = TFGPT2LMHeadModel(tf_config)

    def call(self, inputs, training):
        """Run the wrapped decoder on a batch of token ids.

        Args:
            inputs: Token ids; cast to ``tf.int32`` before being passed on.
            training: Forwarded to the wrapped model's ``training`` argument.

        Returns:
            The first element of the wrapped model's output (presumably the
            language-model logits - confirm against the transformers version
            in use).
        """
        inputs = tf.cast(inputs, tf.int32)
        x = self._decoder(inputs, training=training)
        return x[0]



## Prepare Model

In [16]:
from tfdlg.losses import PaddingLoss
from tfdlg.schedules import WarmupLinearDecay
import tensorflow.keras as keras



def train(
    _model,
    _train_dataset,
    _valid_dataset,
    _epochs,
    _warmup_steps,
    _num_train_steps,
    _max_learning_rate,
    _clipnorm,
    _tensorboard_dir
):
    """Compile and fit a model with Adam on a warm-up/linear-decay schedule.

    Args:
        _model: Keras model to train; it is compiled in place.
        _train_dataset: Dataset passed to ``fit`` as training data.
        _valid_dataset: Dataset passed to ``fit`` as validation data.
        _epochs: Maximum number of epochs; early stopping may end sooner.
        _warmup_steps: Warm-up steps handed to ``WarmupLinearDecay``.
        _num_train_steps: Number of training steps per epoch; multiplied by
            ``_epochs`` to get the schedule's total number of steps.
        _max_learning_rate: Peak learning rate of the schedule.
        _clipnorm: Gradient-norm clipping threshold passed to Adam.
        _tensorboard_dir: Directory the TensorBoard callback writes to.

    Returns:
        The ``History`` object returned by ``_model.fit``.
    """
    schedule = WarmupLinearDecay(
        max_learning_rate=_max_learning_rate,
        warmup_steps=_warmup_steps,
        training_steps=_num_train_steps*_epochs
    )
    optimizer = keras.optimizers.Adam(schedule, beta_1=0.9, beta_2=0.999, epsilon=1e-8, clipnorm=_clipnorm)
    _model.compile(loss=PaddingLoss(), optimizer=optimizer)

    history = _model.fit(
        _train_dataset,
        validation_data=_valid_dataset,
        epochs=_epochs,
        callbacks=[
            # Stop once the monitored loss fails to improve for one epoch and
            # roll the model back to the best weights seen so far.
            keras.callbacks.EarlyStopping(patience=1, restore_best_weights=True),
            keras.callbacks.TensorBoard(
                # Use the argument rather than the notebook-level global so
                # that the caller actually controls where logs are written.
                log_dir=_tensorboard_dir,
                update_freq=100,
                profile_batch=0,
            )
        ],
        verbose=2,
    )
    # Hand the training history back instead of discarding it.
    return history


In [17]:
# Instantiate the architecture named by `model_type`. The tfdlg decoders are
# imported only inside the branch that actually needs them.
if model_type == "transformers":
    model = TransformersGPT2(config)
elif model_type == "post_ln":
    from tfdlg.models import PostLNDecoder
    model = PostLNDecoder(config)
elif model_type == "pre_ln":
    from tfdlg.models import PreLNDecoder
    model = PreLNDecoder(config)
else:
    raise Exception("Model type is wrong")

In [18]:
# Build the model for inputs of shape (batch, context_size), then print the
# layer and parameter summary.
model.build(input_shape=(None, config.context_size))
model.summary()

Model: "pre_ln_decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
decoder (Decoder)            multiple                  123614976 
Total params: 123,614,976
Trainable params: 123,614,976
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Launch training with the notebook-level parameters, bound by keyword so the
# mapping onto the routine's arguments is explicit.
train(
    _model=model,
    _train_dataset=train_dataset,
    _valid_dataset=valid_dataset,
    _epochs=epochs,
    _warmup_steps=warmup_steps,
    _num_train_steps=num_train_steps,
    _max_learning_rate=max_learning_rate,
    _clipnorm=clipnorm,
    _tensorboard_dir=tensorboard_dir,
)

  opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt)

Epoch 1/10
28504/28504 - 13715s - loss: 4.7492 - val_loss: 3.8091
Epoch 2/10
28504/28504 - 13705s - loss: 3.7234 - val_loss: 3.4365
Epoch 3/10
28504/28504 - 13709s - loss: 3.4467 - val_loss: 3.2843
Epoch 4/10
28504/28504 - 13701s - loss: 3.2927 - val_loss: 3.1959
Epoch 5/10
28504/28504 - 13659s - loss: 3.1860 - val_loss: 3.1417
Epoch 6/10
28504/28504 - 13728s - loss: 3.1045 - val_loss: 3.1050
Epoch 7/10
28504/28504 - 13760s - loss: 3.0388 - val_loss: 3.0735
Epoch 8/10
28504/28504 - 13790s - loss: 2.9839 - val_loss: 3.0507
Epoch 9/10
28504/28504 - 13749s - loss: 2.9378 - val_loss: 3.0340
Epoch 10/10
28504/28504 - 13662s - loss: 2.9006 - val_loss: 3.0232


In [20]:
from tfdlg.eval import perplexity

# Evaluate the trained model on the validation dataset and print the value
# returned by tfdlg's perplexity helper.
print("Validation PPL:", perplexity(model, valid_dataset))

{'loss': 3.0231893, 'perplexity': 20.55675, 'num_batches': 59, 'num_tokens': 241664}
Validation PPL: 20.55675


In [21]:
from tfdlg.utils import save_model

# Hand the trained model and its config to tfdlg's save helper, targeting
# `save_model_dir`.
save_model(save_model_dir, model, config)