## Parameter

In [1]:
# These parameters can be injected from Papermill
# Model variant to train; must be one of "pre_ln", "post_ln", "min_gpt" or "transformers"
model_type = "pre_ln"
# Raw text files that are tokenized into the training and validation sets
train_file = "wikitext-103-raw/wiki.train.raw"
valid_file = "wikitext-103-raw/wiki.valid.raw"
# Maximum number of epochs passed to fit (early stopping may end training sooner)
epochs = 10
# Batch size handed to BlockDataset
batch_size = 2
# Learning-rate schedule settings handed to WarmupLinearDecay
max_learning_rate = 1e-4
warmup_steps = 0
# Directory handed to save_model once training has finished
save_model_dir = "tfchat_model"
# Value handed to the Adam optimizer as its clipnorm argument
clipnorm = 1.0
# When True, the mixed precision policy is set before the model is created
fp16 = False

In [2]:
# Parameters
# The assignments below override the defaults from the previous cell for this run
save_model_dir = "tfchat_model-gelu-lr_e4-clipnorm_none-fp16-batch_size_4"
clipnorm = None
fp16 = True
batch_size = 4


In [3]:
# Assert parameters: fail fast, before anything is installed or tokenized,
# and report the offending value when an unsupported model type was injected
assert model_type in ["pre_ln", "post_ln", "min_gpt", "transformers"], \
    "Unsupported model_type: {!r}".format(model_type)

## Installation

In [4]:
!apt install -y git
!pip install git+https://github.com/noriyukipy/tfchat@ad516f2

Reading package lists... Done
Building dependency tree       
Reading state information... Done
git is already the newest version (1:2.17.1-1ubuntu0.7).
0 upgraded, 0 newly installed, 0 to remove and 30 not upgraded.
Collecting git+https://github.com/noriyukipy/tfchat@ad516f2
  Cloning https://github.com/noriyukipy/tfchat (to revision ad516f2) to /tmp/pip-req-build-36yedm_q
Building wheels for collected packages: tfchat
  Building wheel for tfchat (setup.py) ... done
  Created wheel for tfchat: filename=tfchat-0.1.0-py3-none-any.whl size=14227 sha256=5178f35c56fab31c29f7cca5413e0e945a74c9460b3145f645d0765f14e6ee5b
  Stored in directory: /tmp/pip-ephem-wheel-cache-d_a48qef/wheels/a5/ff/38/d84c5bb187a3949cd9f599b7d967bcc4a6f3fbff2cf029db4b
Successfully built tfchat
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.


## Configure GPU

In [5]:
from tfchat.utils import set_memory_growth
from tfchat.utils import set_mixed_precision_policy

In [6]:
# Configure GPU memory growth before any model is created
set_memory_growth()

Set memory growth to PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


In [7]:
# Switch to the mixed precision policy only when fp16 training was requested
if fp16:
    set_mixed_precision_policy()

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: GeForce RTX 2080 Ti, compute capability 7.5
Compute dtype: float16
Variable dtype: float32


## Setup tokenizer

In [8]:
# Install transformers by HuggingFace to use GPT2 tokenizer
! pip install transformers==3.4.0
# Enable widgetsnbextention to avoid the following error when running GPT2.from_pretrained method
#     ImportError: IProgress not found. Please update jupyter and ipywidgets.
! jupyter nbextension enable --py widgetsnbextension

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.
Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: OK


In [9]:
# Set up the tokenizer: load the pretrained "gpt2" tokenizer from HuggingFace transformers
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

## Prepare model config

In [10]:
from tfchat.configs import GPT2SmallConfig

# Start from the GPT-2 small settings provided by tfchat
config = GPT2SmallConfig()

# Use the tokenizer's vocabulary size so that the model's vocabulary matches
# the tokenizer that encodes the dataset
config.vocab_size = tokenizer.vocab_size

In [11]:
# Display the configuration that will be used to create the model
config

GPT2SmallConfig(num_layers=12, d_model=768, num_heads=12, d_ff=3072, vocab_size=50257, context_size=1024, attention_dropout_rate=0.1, residual_dropout_rate=0.1, embedding_dropout_rate=0.1, activation='gelu', kernel_initializer='he_normal', epsilon=1e-06)

## Prepare Dataset

In [12]:
from pathlib import Path
from urllib.request import urlretrieve
import zipfile
import numpy as np


def encode_file(_tokenizer, _filepath):
    """Tokenize a text file into one flat array of token ids.

    Every line has its newline characters stripped and is encoded on its own
    with ``_tokenizer.encode``; the ids of all lines are concatenated in file
    order. Lines that are empty after stripping contribute whatever the
    tokenizer returns for an empty string.

    Args:
        _tokenizer: Object exposing ``encode(text)`` that returns an iterable
            of integer token ids.
        _filepath: Path of the UTF-8 text file to read.

    Returns:
        One-dimensional ``np.ndarray`` of dtype ``int32`` holding all token
        ids (empty when the file yields no tokens).
    """
    ids = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # locale's default encoding of the machine running the notebook.
    with open(_filepath, encoding="utf-8") as f:
        # Iterate the file lazily instead of loading every line up front
        # with readlines(); the corpus can be hundreds of megabytes.
        for line in f:
            ids.extend(_tokenizer.encode(line.strip("\n")))

    return np.array(ids, dtype=np.int32)

In [13]:
# Tokenize the training and validation files into flat arrays of token ids
train_ids = encode_file(tokenizer, train_file)
valid_ids = encode_file(tokenizer, valid_file)

Token indices sequence length is longer than the specified maximum sequence length for this model (1062 > 1024). Running this sequence through the model will result in indexing errors


In [14]:
# Shapes of the flat id arrays, i.e. the number of tokens in each split
print("Train:", train_ids.shape)
print("Valid:", valid_ids.shape)

Train: (116755111,)
Valid: (244828,)


In [15]:
# NOTE(review): prints the same shapes as the previous cell, only without
# labels; this cell looks redundant and is a candidate for removal
print(train_ids.shape)
print(valid_ids.shape)

(116755111,)
(244828,)


In [16]:
from tfchat.data import BlockDataset


# Dataset builder configured with the model's context size as block size
# and the batch size chosen for this run
dataset = BlockDataset(block_size=config.context_size, batch_size=batch_size)

# Only the training data is shuffled; validation keeps its original order
train_dataset = dataset.build(train_ids, shuffle=True)
valid_dataset = dataset.build(valid_ids, shuffle=False)

In [17]:
# Count the batches by iterating each dataset once. A generator expression is
# used so every batch is discarded right after it is counted, instead of all
# batches being kept alive in a temporary list.
num_train_steps = sum(1 for _ in train_dataset)
num_valid_steps = sum(1 for _ in valid_dataset)
print("Train steps:", num_train_steps)
print("Valid steps:", num_valid_steps)

Train steps: 28504
Valid steps: 59


## Transformers model implementation

In [18]:
from transformers import TFGPT2LMHeadModel
from transformers import GPT2Config
import tensorflow.keras as keras
import tensorflow as tf
from tfchat.models import create_combined_mask

In [19]:
class TransformersGPT2(keras.Model):
    """Keras wrapper around HuggingFace's ``TFGPT2LMHeadModel``.

    The wrapped model is created from a ``GPT2Config`` derived from the given
    config object (it is not loaded with ``from_pretrained``), so it can be
    trained with the same pipeline as the other model types.

    Args:
        config: Object providing ``num_layers``, ``d_model``, ``num_heads``,
            ``d_ff``, ``vocab_size``, ``context_size``,
            ``attention_dropout_rate``, ``residual_dropout_rate``,
            ``embedding_dropout_rate`` and ``epsilon``.
    """

    def __init__(self, config):
        super().__init__()
        tf_config = GPT2Config(
            # GPT2Config calls this parameter ``n_layer``. The previous
            # ``n_layers`` keyword is not a GPT2Config parameter, so the
            # configured depth never reached the model and the library
            # default was used instead.
            n_layer=config.num_layers,
            n_embd=config.d_model,
            n_head=config.num_heads,
            n_inner=config.d_ff,
            vocab_size=config.vocab_size,
            n_ctx=config.context_size,
            n_positions=config.context_size,
            attn_pdrop=config.attention_dropout_rate,
            resid_pdrop=config.residual_dropout_rate,
            embd_pdrop=config.embedding_dropout_rate,
            layer_norm_epsilon=config.epsilon,
            activation_function="gelu_new",  # Default value of transformers implementation
        )
        self._decoder = TFGPT2LMHeadModel(tf_config)

    def call(self, inputs, training):
        """Run the wrapped model and return the first element of its output.

        Args:
            inputs: Token ids; cast to ``tf.int32`` before being passed on.
            training: Forwarded to the wrapped model as ``training``.

        Returns:
            ``x[0]`` of the wrapped model's output (presumably the language
            model logits; confirm against the transformers version in use).
        """
        inputs = tf.cast(inputs, tf.int32)
        x = self._decoder(inputs, training=training)
        return x[0]



## Prepare Model

In [20]:
from tfchat.losses import PaddingLoss
from tfchat.schedules import WarmupLinearDecay
import tensorflow.keras as keras



def train(_model, _train_dataset, _valid_dataset, _epochs, _warmup_steps, _num_train_steps, _max_learning_rate, _clipnorm):
    """Compile ``_model`` and fit it with early stopping.

    Args:
        _model: Keras model to compile and train.
        _train_dataset: Dataset passed to ``fit`` as training data.
        _valid_dataset: Dataset passed to ``fit`` as ``validation_data``.
        _epochs: Maximum number of epochs.
        _warmup_steps: ``warmup_steps`` of the ``WarmupLinearDecay`` schedule.
        _num_train_steps: Number of training steps per epoch; multiplied by
            ``_epochs`` to get the schedule's total ``training_steps``.
        _max_learning_rate: ``max_learning_rate`` of the schedule.
        _clipnorm: Gradient clipping norm for Adam, or ``None`` to train
            without clipping.

    Returns:
        The ``History`` object returned by ``_model.fit``.
    """
    schedule = WarmupLinearDecay(max_learning_rate=_max_learning_rate,
                                 warmup_steps=_warmup_steps,
                                 training_steps=_num_train_steps*_epochs)
    # Forward clipnorm only when clipping is requested. Omitting the keyword
    # leaves clipping disabled and avoids handing ``None`` to optimizer
    # versions that validate this argument numerically.
    optimizer_kwargs = {} if _clipnorm is None else {"clipnorm": _clipnorm}
    optimizer = keras.optimizers.Adam(schedule, beta_1=0.9, beta_2=0.999, epsilon=1e-8, **optimizer_kwargs)
    _model.compile(loss=PaddingLoss(), optimizer=optimizer)

    history = _model.fit(
        _train_dataset,
        validation_data=_valid_dataset,
        epochs=_epochs,
        callbacks=[
            # Stop as soon as the monitored metric fails to improve for one
            # epoch and roll the model back to its best weights
            keras.callbacks.EarlyStopping(patience=1, restore_best_weights=True),
            # If you want to save checkpoints, uncomment the next line
            #keras.callbacks.ModelCheckpoint("keras_model/", save_best_only=True)
        ],
        verbose=2,
    )
    # Hand the per-epoch metrics back to the caller instead of dropping them
    return history


In [21]:
# Instantiate the model selected by `model_type`. Model-specific imports stay
# inside their branch so that only the chosen implementation must be importable.
if model_type == "pre_ln":
    from tfchat.models import PreLNDecoder
    model = PreLNDecoder(config)
elif model_type == "post_ln":
    from tfchat.models import PostLNDecoder 
    model = PostLNDecoder(config)
elif model_type == "transformers":
    model = TransformersGPT2(config)
elif model_type == "min_gpt":
    # NOTE(review): mingpt is not installed anywhere in this notebook, and the
    # following cells call Keras-style build/summary/compile/fit on `model`;
    # confirm that this branch is still expected to work.
    from mingpt.model import GPT, GPTConfig
    mconf = GPTConfig(config.vocab_size, config.context_size,
                      n_layer=config.num_layers, n_head=config.num_heads, n_embd=config.d_model)
    model = GPT(mconf)
else:
    # ValueError (still an Exception subclass) that names the rejected value
    raise ValueError("Model type is wrong: {!r}".format(model_type))

In [22]:
# Build the model for inputs of shape (batch, context_size) so that
# summary() can list the layers and parameter counts
model.build(input_shape=(None, config.context_size))
model.summary()

Model: "pre_ln_decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
decoder (Decoder)            multiple                  123614976 
Total params: 123,614,976
Trainable params: 123,614,976
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Start training with the parameters defined at the top of the notebook
train(
    _model=model,
    _train_dataset=train_dataset,
    _valid_dataset=valid_dataset,
    _epochs=epochs,
    _warmup_steps=warmup_steps,
    _num_train_steps=num_train_steps,
    _max_learning_rate=max_learning_rate,
    _clipnorm=clipnorm,
)

Epoch 1/10


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


28504/28504 - 13045s - loss: 4.7957 - val_loss: 3.8134
Epoch 2/10
28504/28504 - 13035s - loss: 3.7186 - val_loss: 3.4305
Epoch 3/10
28504/28504 - 13140s - loss: 3.4345 - val_loss: 3.2779
Epoch 4/10
28504/28504 - 13147s - loss: 3.2802 - val_loss: 3.1900
Epoch 5/10
28504/28504 - 13150s - loss: 3.1743 - val_loss: 3.1315
Epoch 6/10
28504/28504 - 13128s - loss: 3.0937 - val_loss: 3.0914
Epoch 7/10
28504/28504 - 13139s - loss: 3.0287 - val_loss: 3.0637
Epoch 8/10
28504/28504 - 13065s - loss: 2.9746 - val_loss: 3.0409
Epoch 9/10
28504/28504 - 13115s - loss: 2.9291 - val_loss: 3.0223
Epoch 10/10
28504/28504 - 13202s - loss: 2.8922 - val_loss: 3.0132


In [24]:
from tfchat.eval import perplexity

# Evaluate the trained model on the validation dataset and report the result
print("Validation PPL:", perplexity(model, valid_dataset))

{'loss': 3.0132132, 'perplexity': 20.35269, 'num_batches': 59, 'num_tokens': 241664}
Validation PPL: 20.35269


In [25]:
from tfchat.utils import save_model

# Save the model and its config, using save_model_dir as the destination
save_model(save_model_dir, model, config)