## Parameters

In [1]:
# These parameters can be injected from Papermill
# Architecture to train; must be one of the values checked by the assert cell
# ("pre_ln", "post_ln", "min_gpt", "transformers").
model_type = "pre_ln"
# Raw text files that are tokenized by encode_file() for training/validation.
train_file = "wikitext-103-raw/wiki.train.raw"
valid_file = "wikitext-103-raw/wiki.valid.raw"
# Maximum number of epochs passed to fit(); also scales the LR schedule length.
epochs = 10
# Batch size handed to BlockDataset.
batch_size = 2
# Peak learning rate and warmup length handed to WarmupLinearDecay.
max_learning_rate = 1e-4
warmup_steps = 0
# Destination passed to save_model() at the end of the notebook.
save_model_dir = "tfchat_model"

In [2]:
# Parameters
save_model_dir = "tfchat_model-gelu-lr_e4"


In [3]:
# Assert parameters
# Fail fast, before the installs and tokenization below, if model_type is not
# one of the variants handled by the model-selection cell. The message names
# the rejected value so a bad injected parameter is easy to spot in the logs.
assert model_type in ["pre_ln", "post_ln", "min_gpt", "transformers"], (
    f"Unsupported model_type: {model_type!r}"
)

## Installation

In [4]:
!apt install -y git
!pip install git+https://github.com/noriyukipy/tfchat@change_default_gelu

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  git-man less libbsd0 libcurl3-gnutls libedit2 liberror-perl libssl1.0.0
  libx11-6 libx11-data libxau6 libxcb1 libxdmcp6 libxext6 libxmuu1
  openssh-client xauth
Suggested packages:
  gettext-base git-daemon-run | git-daemon-sysvinit git-doc git-el git-email
  git-gui gitk gitweb git-cvs git-mediawiki git-svn keychain libpam-ssh
  monkeysphere ssh-askpass
The following NEW packages will be installed:
  git git-man less libbsd0 libcurl3-gnutls libedit2 liberror-perl libssl1.0.0
  libx11-6 libx11-data libxau6 libxcb1 libxdmcp6 libxext6 libxmuu1
  openssh-client xauth
0 upgraded, 17 newly installed, 0 to remove and 30 not upgraded.
Need to get 7698 kB of archives.
After this operation, 46.4 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 libxau6 amd64 1:1.0.8-1ubuntu1 [7556 B]
Get:2

## Configure GPU

In [5]:
from tfchat.utils import set_memory_growth

In [6]:
# Enables memory growth on the detected GPU (see the log line this prints).
# NOTE(review): presumably this makes TensorFlow allocate GPU memory on demand
# instead of reserving it all up front — confirm in tfchat.utils.
set_memory_growth()

Set memory growth to PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


## Setup tokenizer

In [7]:
# Install transformers by HuggingFace to use GPT2 tokenizer
! pip install transformers==3.4.0
# Enable widgetsnbextention to avoid the following error when running GPT2.from_pretrained method
#     ImportError: IProgress not found. Please update jupyter and ipywidgets.
! jupyter nbextension enable --py widgetsnbextension

Collecting transformers==3.4.0
  Downloading transformers-3.4.0-py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 10.0 MB/s eta 0:00:01
Collecting sacremoses
  Downloading sacremoses-0.0.43.tar.gz (883 kB)
[K     |████████████████████████████████| 883 kB 11.0 MB/s eta 0:00:01
Collecting tokenizers==0.9.2
  Downloading tokenizers-0.9.2-cp36-cp36m-manylinux1_x86_64.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 10.4 MB/s eta 0:00:01
Collecting filelock
  Downloading filelock-3.0.12-py3-none-any.whl (7.6 kB)
Collecting joblib
  Downloading joblib-0.17.0-py3-none-any.whl (301 kB)
[K     |████████████████████████████████| 301 kB 10.3 MB/s eta 0:00:01
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25ldone
[?25h  Created wheel for sacremoses: filename=sacremoses-0.0.43-py3-none-any.whl size=894090 sha256=31c6b3f012b4b677a022a2208b9b757b4a66d9c55fb398afb3c7deb4d8c72178
  Stored in directory: /roo

In [8]:
# Set up the tokenizer: load the pretrained "gpt2" tokenizer. Its files are
# downloaded on first use (the progress widgets in the output come from that).
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1042301.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=456318.0), HTML(value='')))




## Prepare model config

In [9]:
from tfchat.configs import GPT2SmallConfig

config = GPT2SmallConfig()

# Override the vocabulary size with the GPT-2 tokenizer's, because the corpus is
# encoded with that tokenizer. (The original note compared this value against
# the 33,278-word vocabulary of WikiText-2; it is larger than that.)
config.vocab_size = tokenizer.vocab_size

In [10]:
config

GPT2SmallConfig(num_layers=12, d_model=768, num_heads=12, d_ff=3072, vocab_size=50257, context_size=1024, attention_dropout_rate=0.1, residual_dropout_rate=0.1, embedding_dropout_rate=0.1, activation='gelu', kernel_initializer='he_normal', epsilon=1e-06)

## Prepare Dataset

In [11]:
from pathlib import Path
from urllib.request import urlretrieve
import zipfile
import numpy as np


def encode_file(_tokenizer, _filepath):
    """Tokenize a text file line by line into one flat array of token ids.

    Every line is encoded on its own, with its leading/trailing newline
    characters stripped, and the ids of all lines are concatenated in file
    order. Line boundaries are therefore not represented in the result.

    Args:
        _tokenizer: Object exposing ``encode(text)`` that returns a sequence
            of integer token ids.
        _filepath: Path of the UTF-8 text file to read.

    Returns:
        A 1-D ``np.int32`` array holding the ids of the whole file (empty for
        an empty file).
    """
    ids = []
    # Decode explicitly as UTF-8 so the result does not depend on the locale of
    # the machine running the notebook.
    with open(_filepath, encoding="utf-8") as f:
        # Iterate the file lazily instead of readlines(), so the raw text of a
        # large corpus is never held in memory all at once.
        for line in f:
            ids.extend(_tokenizer.encode(line.strip("\n")))

    return np.array(ids, dtype=np.int32)

In [12]:
# Encode both splits into flat token-id arrays.
# The tokenizer warns (see output) when a single line is longer than its
# 1024-token maximum. NOTE(review): this looks harmless here because the ids are
# later re-cut into context_size blocks by BlockDataset rather than fed to the
# model one line at a time — confirm.
train_ids = encode_file(tokenizer, train_file)
valid_ids = encode_file(tokenizer, valid_file)

Token indices sequence length is longer than the specified maximum sequence length for this model (1062 > 1024). Running this sequence through the model will result in indexing errors


In [13]:
print("Train:", train_ids.shape)
print("Valid:", valid_ids.shape)

Train: (116755111,)
Valid: (244828,)


In [14]:
# NOTE(review): prints the same shapes as the previous cell, just without the
# labels; this cell looks redundant.
print(train_ids.shape)
print(valid_ids.shape)

(116755111,)
(244828,)


In [15]:
from tfchat.data import BlockDataset


# One BlockDataset builder is shared by both splits: blocks are context_size
# tokens long and batched by batch_size.
dataset = BlockDataset(block_size=config.context_size, batch_size=batch_size)

# Only the training split is shuffled; validation keeps a fixed order.
train_dataset = dataset.build(train_ids, shuffle=True)
valid_dataset = dataset.build(valid_ids, shuffle=False)

In [16]:
# Count the batches by streaming through each dataset once. A generator keeps
# only the running count, instead of holding every batch in a list just to
# take its length.
num_train_steps = sum(1 for _ in train_dataset)
num_valid_steps = sum(1 for _ in valid_dataset)
print("Train steps:", num_train_steps)
print("Valid steps:", num_valid_steps)

Train steps: 57009
Valid steps: 119


## Transformers model implementation

In [17]:
from transformers import TFGPT2LMHeadModel
from transformers import GPT2Config
import tensorflow.keras as keras
import tensorflow as tf
from tfchat.models import create_combined_mask

In [18]:
class TransformersGPT2(keras.Model):
    """Keras wrapper around HuggingFace's ``TFGPT2LMHeadModel``.

    Translates a tfchat-style config into a ``GPT2Config`` so the transformers
    implementation can be trained through the same code path as the tfchat
    models.
    """

    def __init__(self, config):
        """Build the wrapped GPT-2 model from ``config``.

        Args:
            config: Object providing ``num_layers``, ``d_model``, ``num_heads``,
                ``d_ff``, ``vocab_size``, ``context_size``,
                ``attention_dropout_rate``, ``residual_dropout_rate``,
                ``embedding_dropout_rate`` and ``epsilon``.
        """
        super().__init__()
        tf_config = GPT2Config(
            # The GPT2Config parameter is `n_layer`; an `n_layers` keyword is
            # accepted silently as an extra attribute and leaves the layer
            # count at the library default.
            n_layer=config.num_layers,
            n_embd=config.d_model,
            n_head=config.num_heads,
            n_inner=config.d_ff,
            vocab_size=config.vocab_size,
            n_ctx=config.context_size,
            n_positions=config.context_size,
            attn_pdrop=config.attention_dropout_rate,
            resid_pdrop=config.residual_dropout_rate,
            embd_pdrop=config.embedding_dropout_rate,
            layer_norm_epsilon=config.epsilon,
            activation_function="gelu_new",  # Default value of transformers implementation
        )
        self._decoder = TFGPT2LMHeadModel(tf_config)

    def call(self, inputs, training):
        """Run the wrapped model and return the first element of its output.

        Args:
            inputs: Token ids; cast to ``tf.int32`` before the forward pass.
            training: Forwarded to the wrapped model as its ``training`` flag.

        Returns:
            ``x[0]`` of the wrapped model's output (the LM logits according to
            the transformers documentation — TODO confirm for the pinned
            version).
        """
        inputs = tf.cast(inputs, tf.int32)
        x = self._decoder(inputs, training=training)
        return x[0]



## Prepare Model

In [19]:
from tfchat.losses import PaddingLoss
from tfchat.schedules import WarmupLinearDecay
import tensorflow.keras as keras



def train(_model, _train_dataset, _valid_dataset, _epochs, _warmup_steps, _num_train_steps, _max_learning_rate):
    """Compile ``_model`` and fit it with a warmup + linear-decay learning rate.

    Args:
        _model: Keras model to train; it is compiled in place.
        _train_dataset: Dataset passed to ``fit`` as the training data.
        _valid_dataset: Dataset passed to ``fit`` as ``validation_data``.
        _epochs: Maximum number of epochs (early stopping may end sooner).
        _warmup_steps: Warmup steps handed to ``WarmupLinearDecay``.
        _num_train_steps: Number of training steps in ONE epoch; the schedule
            spans ``_num_train_steps * _epochs`` steps in total.
        _max_learning_rate: Peak learning rate handed to ``WarmupLinearDecay``.

    Returns:
        The ``History`` object returned by ``fit`` (per-epoch loss values).
    """
    schedule = WarmupLinearDecay(max_learning_rate=_max_learning_rate,
                                 warmup_steps=_warmup_steps,
                                 training_steps=_num_train_steps*_epochs)
    # Gradients are clipped to a norm of 1.0.
    optimizer = keras.optimizers.Adam(schedule, beta_1=0.9, beta_2=0.999, epsilon=1e-8, clipnorm=1.0)
    _model.compile(loss=PaddingLoss(), optimizer=optimizer)

    history = _model.fit(
        _train_dataset,
        validation_data=_valid_dataset,
        epochs=_epochs,
        callbacks=[
            # Stop after one epoch without improvement of the monitored metric
            # (val_loss by default) and roll back to the best weights seen.
            keras.callbacks.EarlyStopping(patience=1, restore_best_weights=True),
            # If you want to save checkpoints, uncomment the next line
            #keras.callbacks.ModelCheckpoint("keras_model/", save_best_only=True)
        ],
        verbose=2,
    )
    # Hand the training history back to the caller instead of discarding it.
    return history


In [20]:
# Instantiate the architecture selected by model_type. The tfchat / mingpt
# classes are imported lazily so only the chosen implementation has to be
# importable.
if model_type == "pre_ln":
    from tfchat.models import PreLNDecoder
    model = PreLNDecoder(config)
elif model_type == "post_ln":
    from tfchat.models import PostLNDecoder
    model = PostLNDecoder(config)
elif model_type == "transformers":
    model = TransformersGPT2(config)
elif model_type == "min_gpt":
    # NOTE(review): mingpt is not installed by any install cell in this
    # notebook, and if this is the PyTorch minGPT package the resulting model
    # would not support the Keras build/compile/fit calls used below — confirm
    # whether this branch is still usable.
    from mingpt.model import GPT, GPTConfig
    mconf = GPTConfig(config.vocab_size, config.context_size,
                      n_layer=config.num_layers, n_head=config.num_heads, n_embd=config.d_model)
    model = GPT(mconf)
else:
    # ValueError is still an Exception for any existing handler, but states the
    # kind of failure and names the rejected value.
    raise ValueError(f"Model type is wrong: {model_type!r}")

In [21]:
# Build the model for inputs of shape (batch, context_size) so that summary()
# can report the layers and parameter counts before training starts.
model.build(input_shape=(None, config.context_size))
model.summary()

Model: "pre_ln_decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
decoder (Decoder)            multiple                  123614976 
Total params: 123,614,976
Trainable params: 123,614,976
Non-trainable params: 0
_________________________________________________________________


In [22]:
train(model, train_dataset, valid_dataset, epochs, warmup_steps, num_train_steps, max_learning_rate)

Epoch 1/10


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


57009/57009 - 24675s - loss: 4.5498 - val_loss: 3.6965
Epoch 2/10
57009/57009 - 24679s - loss: 3.6455 - val_loss: 3.3910
Epoch 3/10
57009/57009 - 24730s - loss: 3.4026 - val_loss: 3.2567
Epoch 4/10
57009/57009 - 24756s - loss: 3.2602 - val_loss: 3.1793
Epoch 5/10
57009/57009 - 24691s - loss: 3.1583 - val_loss: 3.1222
Epoch 6/10
57009/57009 - 24698s - loss: 3.0781 - val_loss: 3.0868
Epoch 7/10
57009/57009 - 24778s - loss: 3.0111 - val_loss: 3.0573
Epoch 8/10
57009/57009 - 24769s - loss: 2.9533 - val_loss: 3.0331
Epoch 9/10
57009/57009 - 24700s - loss: 2.9027 - val_loss: 3.0153
Epoch 10/10
57009/57009 - 24741s - loss: 2.8596 - val_loss: 3.0021


In [23]:
from tfchat.eval import perplexity

# Report perplexity on the validation set. NOTE(review): the dict shown in the
# output (loss, perplexity, num_batches, num_tokens) appears to be printed by
# perplexity() itself, which then returns the perplexity value — confirm.
print("Validation PPL:", perplexity(model, valid_dataset))

{'loss': 3.002081, 'perplexity': 20.127377, 'num_batches': 119, 'num_tokens': 243712}
Validation PPL: 20.127377


In [24]:
from tfchat.utils import save_model

# Save the trained model together with its config under save_model_dir
# (the on-disk format is defined by tfchat.utils.save_model).
save_model(save_model_dir, model, config)