In [22]:
import re
import os
import keras_nlp
import keras
import numpy as np

import tensorflow.data as tf_data
import tensorflow.strings as tf_strings
import tensorflow.keras.optimizers.schedules as schedules
from keras.layers import LeakyReLU

import pickle

In [2]:
# --- Data pipeline ----------------------------------------------------------
# Number of text lines grouped into one batch.
BATCH_SIZE = 16
# A line is kept only when its string length is strictly greater than this.
MIN_STRING_LEN = 32
# Length of training sequences, in tokens.
SEQ_LEN = 512

# --- Model architecture -----------------------------------------------------
# Vocabulary size; limits parameters in model.
VOCAB_SIZE = 50_000
EMBED_DIM = 1024
FEED_FORWARD_DIM = 1024
NUM_HEADS = 16
NUM_LAYERS = 20

# --- Training ---------------------------------------------------------------
EPOCHS = 1



In [3]:
# Training corpus: one example per text line of the SQuAD training file.
# Alternative sources that were left commented out by the author:
#   './dataset/wikitext-103/WikiQA-train.txt'
#   './dataset/wikitext-103/wiki.train300klines.tokens'
raw_train_ds = (
    tf_data.TextLineDataset('./dataset/squadtrain-v2.0.txt')
    # Keep only lines strictly longer than MIN_STRING_LEN.
    .filter(lambda x: tf_strings.length(x) > MIN_STRING_LEN)
    # Shuffle individual lines *before* batching. Shuffling after batching only
    # reorders whole batches, so every batch would contain the same consecutive
    # lines on every epoch. The buffer is scaled by BATCH_SIZE so the shuffle
    # window still spans the same number of lines (256 batches' worth).
    .shuffle(buffer_size=256 * BATCH_SIZE)
    .batch(BATCH_SIZE)
)

# Validation corpus: same length filter and batch size, deliberately unshuffled.
raw_val_ds = (
    tf_data.TextLineDataset('./dataset/WikiQA-dev.txt')
    .filter(lambda x: tf_strings.length(x) > MIN_STRING_LEN)
    .batch(BATCH_SIZE)
)

In [4]:
# Restore the previously pickled tokenizer object from disk.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load this from a trusted, locally produced artifact.
with open('./models/wikitext103_tokenizer50k.pkl', 'rb') as saved_tokenizer:
    tokenizer = pickle.load(saved_tokenizer)

In [5]:
# Packer that prepends the tokenizer's "[BOS]" id as a start token and packs
# every sequence to SEQ_LEN positions.
start_packer = keras_nlp.layers.StartEndPacker(
    sequence_length=SEQ_LEN,
    start_value=tokenizer.token_to_id("[BOS]"),
)


def preprocess(inputs):
    """Turn a batch of raw text into a (features, labels) training pair.

    Args:
        inputs: Batch of text lines to tokenize.

    Returns:
        A tuple ``(features, labels)`` where ``labels`` is the tokenizer
        output and ``features`` is that same output passed through
        ``start_packer`` (i.e. with the start token prepended).

    NOTE(review): features and labels only line up position-by-position if
    the tokenizer already pads/truncates to SEQ_LEN -- confirm against how the
    pickled tokenizer was configured.
    """
    token_ids = tokenizer(inputs)
    return start_packer(token_ids), token_ids


# Tokenize each batch into (features, labels) pairs; the map runs in parallel
# and results are prefetched, both with autotuned settings.
train_ds = (
    raw_train_ds
    .map(preprocess, num_parallel_calls=tf_data.AUTOTUNE)
    .prefetch(tf_data.AUTOTUNE)
)
val_ds = (
    raw_val_ds
    .map(preprocess, num_parallel_calls=tf_data.AUTOTUNE)
    .prefetch(tf_data.AUTOTUNE)
)

In [6]:
# Build and compile a decoder-only transformer language model with the Keras
# functional API. Layer creation order is kept as-is because saved weights are
# restored into this exact topology.
# Token ids arrive as variable-length int32 sequences.
inputs = keras.layers.Input(shape=(None,), dtype="int32")
# Embedding.
# Combined token + position embedding sized by VOCAB_SIZE / SEQ_LEN / EMBED_DIM.
# mask_zero=True: presumably marks id 0 as padding for downstream layers (the
# same id the perplexity metric ignores below) -- confirm.
embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=VOCAB_SIZE,
    sequence_length=SEQ_LEN,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
)
x = embedding_layer(inputs)
# Transformer decoders.
# Stack NUM_LAYERS decoder blocks; each one gets its own LeakyReLU(0.1)
# instance as the feed-forward activation.
for _ in range(NUM_LAYERS):
    decoder_layer = keras_nlp.layers.TransformerDecoder(
        num_heads=NUM_HEADS,
        intermediate_dim=FEED_FORWARD_DIM,
        activation=LeakyReLU(0.1),
    )
    x = decoder_layer(x)  # Giving one argument only skips cross-attention.
# Output.
# Dense projection to VOCAB_SIZE with no activation, i.e. raw logits; this is
# why the loss and the metric below are both configured with from_logits=True.
outputs = keras.layers.Dense(VOCAB_SIZE)(x)
model = keras.Model(inputs=inputs, outputs=outputs)
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Disabled alternative: polynomial learning-rate decay from 5e-5 to 2e-5.
#learningrate = schedules.PolynomialDecay(5e-5, decay_steps=50000, end_learning_rate=2e-5)
# Constant learning rate of 5e-5.
opt = keras.optimizers.Adam(learning_rate=5e-5)
# Perplexity computed from logits, skipping positions whose token id is 0.
perplexity = keras_nlp.metrics.Perplexity(from_logits=True, mask_token_id=0)
model.compile(optimizer=opt, loss=loss_fn, metrics=[perplexity])


In [45]:
# Restore weights from the checkpoint path that a model.save cell in this
# notebook writes to.
model.load_weights('./checkpoints/Model_240M_50kvocab_10EpochSquadfinetune.krs')
# Override the compiled optimizer's learning rate (compiled with 5e-5) to 1e-6
# for any further training.
keras.backend.set_value(model.optimizer.learning_rate, 1e-6)
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 token_and_position_embeddin  (None, None, 1024)       51724288  
 g (TokenAndPositionEmbeddin                                     
 g)                                                              
                                                                 
 transformer_decoder (Transf  (None, None, 1024)       6301696   
 ormerDecoder)                                                   
                                                                 
 transformer_decoder_1 (Tran  (None, None, 1024)       6301696   
 sformerDecoder)                                                 
                                                                 
 transformer_decoder_2 (Tran  (None, None, 1024)       630169

In [8]:
# Train for EPOCHS epochs, evaluating on val_ds during training.
# 733,728 / batch size = num steps
# NOTE(review): 733,728 is presumably a training-file line count -- confirm it
# matches the dataset that is currently loaded.
model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS)



<keras.callbacks.History at 0x265e0617880>

In [9]:
# Persist the full model; this is the same path the load_weights cell reads.
model.save("./checkpoints/Model_240M_50kvocab_10EpochSquadfinetune.krs")




INFO:tensorflow:Assets written to: ./checkpoints/Model_240M_50kvocab_10EpochSquadfinetune.krs\assets


INFO:tensorflow:Assets written to: ./checkpoints/Model_240M_50kvocab_10EpochSquadfinetune.krs\assets


In [10]:
# Longer training run with a hard-coded 15 epochs (not the EPOCHS constant).
# The recorded output shows this run was interrupted manually during epoch 7.
# 733,728 / batch size = num steps
model.fit(train_ds, validation_data=val_ds, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15

KeyboardInterrupt: 

In [11]:
# Persist the model state reached by the interrupted training run under a
# separate checkpoint name.
model.save("./checkpoints/Model_240M_50kvocab_30ksSquadfinetune.krs")
# Disabled example of loading a saved model back:
#loaded_model = keras.saving.load_model("model.keras")



INFO:tensorflow:Assets written to: ./checkpoints/Model_240M_50kvocab_30ksSquadfinetune.krs\assets


INFO:tensorflow:Assets written to: ./checkpoints/Model_240M_50kvocab_30ksSquadfinetune.krs\assets


8

In [57]:
# Tokenize one prompt and pack it (start token first, zero-padded) for sampling.
prompt_tokens = start_packer(
    tokenizer(["What genre of music do the beatles play?"])
)
# Count the non-zero ids: that is the prompt length including the start token,
# and therefore the first position the sampler should fill.
prompt_length = (prompt_tokens.numpy() != 0).sum()
prompt_tokens

<tf.Tensor: shape=(1, 512), dtype=int32, numpy=
array([[   2, 1146, 3178,  984, 1097, 1255,  983, 3703, 1251,   33,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,   

In [53]:
# Inference
# Intended number of tokens to generate.
# NOTE(review): no visible cell references this constant; the sampler output
# shown in this notebook spans the full 512-token prompt width -- confirm
# whether it should be wired into the sampling call.
NUM_TOKENS_TO_GENERATE = 40

In [14]:
def next(prompt, cache, index):
    """Sampler callback: return the logits used to pick the token at ``index``.

    Runs the model on the whole ``prompt`` and selects the logits at position
    ``index - 1``, i.e. the prediction for the token that goes at ``index``.

    Args:
        prompt: Batch of token ids to feed to the model.
        cache: Opaque cache object; returned unchanged (no caching is done).
        index: Position about to be generated.

    Returns:
        A tuple ``(logits, hidden_states, cache)`` where ``hidden_states`` is
        always ``None``.

    Note: this name shadows the builtin ``next`` within the notebook.
    """
    all_logits = model(prompt)
    # Ignore hidden states for now; only needed for contrastive search.
    return all_logits[:, index - 1, :], None, cache

In [58]:
# Generate a continuation of the packed prompt with top-p (nucleus) sampling,
# starting at the first position after the prompt tokens.
sampler = keras_nlp.samplers.TopPSampler(p=0.5)
output_tokens = sampler(
    next=next,
    prompt=prompt_tokens,
    index=prompt_length,
)
txt = tokenizer.detokenize(output_tokens)

# Decode the detokenized bytes properly instead of parsing the bytes repr
# (str(b"...")). The repr switches to double quotes when the text contains an
# apostrophe, renders non-ASCII characters as \x.. escapes, and keeps a
# trailing quote when no padding follows -- all of which corrupted the result.
decoded_txt = txt.numpy()[0].decode("utf-8", errors="replace")
bos_prefix = "[BOS] "
if decoded_txt.startswith(bos_prefix):
    decoded_txt = decoded_txt[len(bos_prefix):]
# Keep only the text before the first padding marker.
output_txt = decoded_txt.split(" [PAD]")[0]
print(f"Top-P search generated text: \n{output_txt}\n")

Top-P search generated text: 
what genre of music do the beatles play ? rock and roll



In [9]:
# Persist the model under a WikiQA-named checkpoint.
# NOTE(review): the training dataset cell currently points at the SQuAD file;
# this cell presumably dates from an earlier WikiQA run -- confirm before
# re-running so the checkpoint name matches the data actually used.
model.save("./checkpoints/Model_240M_50kvocab_10EpochWikiQAfinetune.krs")
# Disabled example of loading a saved model back:
#loaded_model = keras.saving.load_model("model.keras")



INFO:tensorflow:Assets written to: ./checkpoints/Model_240M_50kvocab_10EpochWikiQAfinetune.krs\assets


INFO:tensorflow:Assets written to: ./checkpoints/Model_240M_50kvocab_10EpochWikiQAfinetune.krs\assets


In [42]:
# Debug view: dump the full detokenized sampler output, including the
# [BOS]/[PAD] markers that the text-extraction step strips.
print(txt)

tf.Tensor([b'[BOS] what is the capital city of america ? northwestern [PAD]rt [PAD]n [PAD]d [PAD]nza [PAD] [PAD]n [PAD]n [PAD]nn [PAD]n [PAD]n [PAD]nni [PAD] [PAD]s [PAD]l [PAD]ll - d - nw [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [

'what are the three rock types ?'

In [24]:
# Debug view: inspect the raw token ids returned by the sampler.
output_tokens

<tf.Tensor: shape=(1, 512), dtype=int32, numpy=
array([[    2,  1146,  1014,   983,  1046,  1420,  3009,    33,  4629,
         1420,    14,  1420,    14,   985,  1801,  1097,     0,  1420,
            0,  1020,     0,  1020,     0,   985,  3604,  1294,     0,
         1020,     0,  1420,   985,  1801,     0,     0,     0,  1020,
            0,  1520,   985, 13085,     0,     0,     0,     0,     0,
            0,     0,     0,     0,  1020,     0,  1420,    16,     0,
          985,  1011,  1897,   989,   983,  2971,  1097,  1840,     0,
          985,  1840,     0,     0,     9,    59,  8138,   984,  2852,
         1097,  1840,     0,  1020,     0,     9,    14,   995,  1049,
           41,  2655,  2832,   986,  3193,     0,  1240,     0,  1420,
         1097,     0,  1020,   986,   983,  2380,  1152,    16,     0,
            9, 14808,   985,     9, 14808,     0,  3396, 15896, 11048,
            0,  1020,     0,  1420,     0,  1020,    14,  4013,   985,
         1317,  1097,     0, 