In [1]:
from tfchat.utils import set_memory_growth

In [2]:
# Enable GPU memory growth through the tfchat helper before any model is created
# (the recorded output below reports it being applied to GPU:0).
set_memory_growth()

Set memory growth to PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


## Setup tokenizer

In [3]:
# Install transformers by HuggingFace to use GPT2 tokenizer
! pip install transformers==3.4.0
# Enable widgetsnbextention to avoid the following error when running GPT2.from_pretrained method
#     ImportError: IProgress not found. Please update jupyter and ipywidgets.
! jupyter nbextension enable --py widgetsnbextension

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [4]:
# Set up the tokenizer: load the pretrained "gpt2" tokenizer from HuggingFace transformers.
# Its `vocab_size` is copied into the model config further down.
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

## Prepare model config

In [5]:
from tfchat.configs import GPT2SmallConfig

# Start from the GPT-2 small defaults provided by tfchat.
config = GPT2SmallConfig()

# Size the model vocabulary to the GPT-2 tokenizer's vocabulary, which is larger than
# WikiText-2's own vocabulary of 33,278 words.
config.vocab_size = tokenizer.vocab_size

In [6]:
config

GPT2SmallConfig(num_layers=12, d_model=768, num_heads=12, d_ff=3072, vocab_size=50257, context_size=1024, attention_dropout_rate=0.1, residual_dropout_rate=0.1, embedding_dropout_rate=0.1, epsilon=1e-06)

## Prepare Dataset

In [7]:
from pathlib import Path
from urllib.request import urlretrieve
import zipfile
import numpy as np


def encode_file(tokenizer, filepath):
    """Tokenize a text file line by line into one flat array of token ids.

    Args:
        tokenizer: Object exposing ``encode(text)`` that returns a sequence of
            integer token ids for a string.
        filepath: Path of the text file to read.

    Returns:
        A 1-D ``np.ndarray`` of dtype ``int32`` holding the ids of every line,
        concatenated in file order. An empty file yields an empty array.
    """
    ids = []
    # Decode explicitly as UTF-8: the default encoding of open() depends on the
    # platform locale, so non-ASCII text could be mis-decoded or raise elsewhere.
    with open(filepath, encoding="utf-8") as f:
        # Iterate lazily instead of readlines() so the whole file is not held
        # in memory as a list of lines.
        for line in f:
            # Only newline characters are stripped; other whitespace is kept
            # and passed on to the tokenizer.
            ids.extend(tokenizer.encode(line.strip("\n")))

    return np.array(ids, dtype=np.int32)

In [8]:
# Encode the training and validation splits with the GPT-2 tokenizer.
# NOTE(review): these relative paths assume the dataset has already been extracted into
# ./wikitext-2; no visible cell downloads it — confirm before a Restart & Run All.
train_ids = encode_file(tokenizer, "wikitext-2/wiki.train.tokens")
valid_ids = encode_file(tokenizer, "wikitext-2/wiki.valid.tokens")

In [9]:
print("Train:", train_ids.shape)
print("Valid:", valid_ids.shape)

Train: (2398713,)
Valid: (253600,)


In [10]:
print(train_ids.shape)
print(valid_ids.shape)

(2398713,)
(253600,)


In [11]:
from tfchat.data import BlockDataset


# block_size is tied to the model's context size; batch_size is fixed at 2.
dataset = BlockDataset(block_size=config.context_size, batch_size=2)

# Only the training data is built with shuffling enabled.
train_dataset = dataset.build(train_ids, shuffle=True)
# NOTE: despite its name, `test_dataset` is built from the validation split (valid_ids).
test_dataset = dataset.build(valid_ids, shuffle=False)

In [12]:
#print("Train size:", len(train_dataset))
#print("Test size:", len(test_dataset))

## Prepare Model

In [13]:
from tfchat.metrics import perplexity
from tfchat.losses import PaddingLoss
from tfchat.optimizers import TransformerScheduler
import tensorflow.keras as keras



def train(model, epochs=20, warmup_steps=800, patience=1, checkpoint_path="keras_model"):
    """Compile, fit and evaluate ``model`` on the notebook's datasets.

    Reads the module-level ``config``, ``train_dataset`` and ``test_dataset``
    defined in earlier cells.

    Args:
        model: Keras model to train; it is compiled and built in place.
        epochs: Maximum number of epochs passed to ``model.fit``.
        warmup_steps: Warm-up steps handed to ``TransformerScheduler``.
        patience: ``patience`` of the ``EarlyStopping`` callback.
        checkpoint_path: Destination of the ``ModelCheckpoint`` callback. Pass a
            distinct path per model, otherwise a later run overwrites the
            checkpoint written by an earlier one.

    Returns:
        The value returned by ``perplexity(model, test_dataset)``.
    """
    learning_rate = TransformerScheduler(d_model=config.d_model, warmup_steps=warmup_steps)
    optimizer = keras.optimizers.Adam(
        learning_rate,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-8,
        clipnorm=1.0,
    )
    model.compile(
        loss=PaddingLoss(),
        optimizer=optimizer,
        metrics=[keras.metrics.SparseCategoricalAccuracy()],
    )
    # Build with an unspecified batch dimension so summary() can report parameter counts.
    model.build(input_shape=(None, config.context_size))
    model.summary()

    callbacks = [
        # Stop once the monitored metric stops improving and roll back to the best weights.
        keras.callbacks.EarlyStopping(patience=patience, restore_best_weights=True),
        keras.callbacks.ModelCheckpoint(checkpoint_path, save_best_only=True),
    ]
    # The History object returned by fit() is not used, so it is not kept.
    model.fit(
        train_dataset,
        validation_data=test_dataset,
        epochs=epochs,
        callbacks=callbacks,
    )
    return perplexity(model, test_dataset)

### Train PostLN Model

In [14]:
#from tfchat.models import PostLNDecoder 

#model = PostLNDecoder(config)

In [15]:
#train(model)

### Train PreLN Model

In [16]:
from tfchat.models import PreLNDecoder
model = PreLNDecoder(config)

In [17]:
train(model)

Model: "pre_ln_decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
decoder (Decoder)            multiple                  162299473 
Total params: 162,299,473
Trainable params: 162,299,473
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
{'loss': 4.629918, 'perplexity': 102.50567, 'num_batches': 123, 'num_tokens': 251904}


102.50567

In [20]:
from tfchat.generations import TopKTopPGenerator

# Sample a continuation from whatever `model` is currently bound to.
# NOTE(review): execution counts jump from In [17] to In [20] and `model` is rebound in a
# later cell, so the recorded output depends on run order — confirm it came from the
# PreLN model. max_len=20 presumably caps the generated length; verify against tfchat.
gen = TopKTopPGenerator(model=model, max_len=20)
# Wrap the encoded prompt in a list so the input is a 2-D int32 array with a single row.
inputs = np.array([tokenizer.encode("I am")], dtype=np.int32)

outputs = gen.generate(inputs)
# Decode the first entry of the generator output back to text.
print(tokenizer.decode(outputs[0]))

I am "'s great great feeling the time and <unk>. The use of £ 3 million injuries


### Train with minGPT-TF

In [22]:
# https://github.com/kamalkraj/minGPT-TF
from mingpt.model import GPT, GPTConfig

# Mirror the tfchat config (vocab size, context size, layer count, head count, model
# width) in a minGPT config so both models are trained with the same dimensions.
mconf = GPTConfig(config.vocab_size, config.context_size,
                  n_layer=config.num_layers, n_head=config.num_heads, n_embd=config.d_model)
# NOTE: this rebinds `model`, replacing the PreLNDecoder instance created earlier.
model = GPT(mconf)

In [23]:
train(model)

Model: "gpt"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      multiple                  38597376  
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
encoder_layer (EncoderLayer) multiple                  7087872   
_________________________________________________________________
encoder_layer_1 (EncoderLaye multiple                  7087872   
_________________________________________________________________
encoder_layer_2 (EncoderLaye multiple                  7087872   
_________________________________________________________________
encoder_layer_3 (EncoderLaye multiple                  7087872   
_________________________________________________________________
encoder_layer_4 (EncoderLaye multiple                  7087872 

151.67773