In [1]:
import tensorflow as tf

2023-10-31 12:50:44.883040: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
tf.__version__

'2.12.0'

In [3]:
from tensorflow import keras

In [4]:
import keras_nlp

ModuleNotFoundError: No module named 'keras_nlp'

In [5]:
import numpy as np

In [6]:
crime_and_punishment_url = 'https://www.gutenberg.org/files/2554/2554-0.txt'
brothers_of_karamazov_url = 'https://www.gutenberg.org/files/28054/28054-0.txt'
the_idiot_url = 'https://www.gutenberg.org/files/2638/2638-0.txt'
the_possessed_url = 'https://www.gutenberg.org/files/8117/8117-0.txt'

In [7]:
paths = [crime_and_punishment_url, brothers_of_karamazov_url, the_idiot_url, the_possessed_url]
names = ['Crime and Punishment', 'Brothers of Karamazov', 'The Idiot', 'The Possessed']
texts = ''
for index, path in enumerate(paths):
    filepath = keras.utils.get_file(f'{names[index]}.txt', origin=path)
    text = ''
    with open(filepath, encoding='utf-8') as f:
        text = f.read()
        # First 50 lines are the Gutenberg intro and preface
        # Skipping first 10k characters for each book should be approximately
        # removing the intros and prefaces.
        texts += text[10000:]

In [8]:
texts[25000:25500]

'nd that was why\nI addressed you at once. For in unfolding to you the story of my life, I\ndo not wish to make myself a laughing-stock before these idle listeners,\nwho indeed know all about it already, but I am looking for a man\nof feeling and education. Know then that my wife was educated in a\nhigh-class school for the daughters of noblemen, and on leaving she\ndanced the shawl dance before the governor and other personages for\nwhich she was presented with a gold medal and a certificate of merit.\n'

In [9]:
text_list = texts.split('.')
len(text_list)

69176

In [10]:
len(texts.replace('\n', ' ').split(' '))

1077543

In [11]:
text_list = list(filter(None, text_list))

In [12]:
import random
random.shuffle(text_list)

In [13]:
length = len(text_list)

In [14]:
text_train = text_list[:int(0.7*length)]
text_test = text_list[int(0.7*length):int(0.85*length)]
text_valid = text_list[int(0.85*length):]

In [15]:
from tensorflow.keras.layers import TextVectorization

In [16]:
def custom_standardization(input_string):
    sentence = tf.strings.lower(input_string)
    sentence = tf.strings.regex_replace(sentence, "\n", " ")
    return sentence

In [17]:
maxlen = 50

In [18]:
vectorize_layer = TextVectorization(
    standardize = custom_standardization,
    output_mode = "int",
    output_sequence_length = maxlen + 1,
)

2023-08-31 23:38:41.100313: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-31 23:38:41.404925: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [19]:
vectorize_layer.adapt(text_list)

In [20]:
vocab = vectorize_layer.get_vocabulary()

In [21]:
vocab_size = len(vocab)

In [22]:
vocab_size

49715

In [23]:
index_lookup = dict(zip(range(len(vocab)), vocab))

In [24]:
index_lookup[5]

'of'

In [25]:
vectorize_layer(['hello world!'])

<tf.Tensor: shape=(1, 51), dtype=int64, numpy=
array([[   1, 7509,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0]])>

In [26]:
batch_size = 64

In [27]:
train_dataset = tf.data.Dataset.from_tensor_slices(text_train)
train_dataset = train_dataset.shuffle(buffer_size=256)
train_dataset = train_dataset.batch(batch_size)

In [28]:
test_dataset = tf.data.Dataset.from_tensor_slices(text_test)
test_dataset = test_dataset.shuffle(buffer_size=256)
test_dataset = test_dataset.batch(batch_size)

In [29]:
valid_dataset = tf.data.Dataset.from_tensor_slices(text_valid)
valid_dataset = valid_dataset.shuffle(buffer_size=256)
valid_dataset = valid_dataset.batch(batch_size)

In [30]:
def preprocess_text(text):
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorize_layer(text)
    x = tokenized_sentences[:, :-1]
    y = tokenized_sentences[:, 1:]
    return x, y


train_dataset = train_dataset.map(preprocess_text)
train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)

test_dataset = test_dataset.map(preprocess_text)
test_dataset = test_dataset.prefetch(tf.data.AUTOTUNE)

valid_dataset = valid_dataset.map(preprocess_text)
valid_dataset = valid_dataset.prefetch(tf.data.AUTOTUNE)

In [31]:
for entry in train_dataset.take(1):
    print(entry)

(<tf.Tensor: shape=(64, 50), dtype=int64, numpy=
array([[25061,  2632, 40263, ...,     0,     0,     0],
       [ 3671,     7,    64, ...,     0,     0,     0],
       [  727,   639,  8561, ...,     0,     0,     0],
       ...,
       [   68,   632,     8, ...,     0,     0,     0],
       [   17, 13535,     6, ...,   206,     0,     0],
       [   14,  6417,    42, ...,     0,     0,     0]])>, <tf.Tensor: shape=(64, 50), dtype=int64, numpy=
array([[ 2632, 40263, 13526, ...,     0,     0,     0],
       [    7,    64,    19, ...,     0,     0,     0],
       [  639,  8561,     3, ...,     0,     0,     0],
       ...,
       [  632,     8,   114, ...,     0,     0,     0],
       [13535,     6,    78, ...,     0,     0,     0],
       [ 6417,    42, 37165, ...,     0,     0,     0]])>)


In [32]:
embed_dim = 128
num_heads = 4

In [33]:
def create_model():
    inputs = keras.layers.Input(shape=(maxlen,), dtype=tf.int32)
    x = keras_nlp.layers.TokenAndPositionEmbedding(vocab_size, maxlen, embed_dim)(inputs)
    for i in range(1):
        x = keras_nlp.layers.TransformerDecoder(intermediate_dim=embed_dim*2,
                                                  num_heads=num_heads)(x)
    do = keras.layers.Dropout(0.4)(x)
    outputs = keras.layers.Dense(vocab_size, activation='softmax')(do)
    
    model = keras.Model(inputs=inputs, outputs=outputs)
    
    model.compile(
        optimizer="adam",
        loss='sparse_categorical_crossentropy',
        metrics=[keras_nlp.metrics.Perplexity(), 'accuracy']
    )
    return model

In [34]:
model = create_model()
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 50)]              0         
                                                                 
 token_and_position_embeddi  (None, 50, 128)           6369920   
 ng (TokenAndPositionEmbedd                                      
 ing)                                                            
                                                                 
 transformer_decoder (Trans  (None, 50, 128)           132480    
 formerDecoder)                                                  
                                                                 
 dropout_3 (Dropout)         (None, 50, 128)           0         
                                                                 
 dense_2 (Dense)             (None, 50, 49715)         6413235   
                                                             

In [35]:
class TextSampler(keras.callbacks.Callback):
    def __init__(self, start_prompt, max_tokens):
        self.start_prompt = start_prompt
        self.max_tokens = max_tokens
        
    def sample_token(self, logits):
        logits, indices = tf.math.top_k(logits, k=5, sorted=True)
        indices = np.asarray(indices).astype("int32")
        preds = keras.activations.softmax(tf.expand_dims(logits, 0))[0]
        preds = np.asarray(preds).astype("float32")
        return np.random.choice(indices, p=preds)
    
    def on_epoch_end(self, epoch, logs=None):
        decoded_sample = self.start_prompt
        
        for i in range(self.max_tokens-1):
            tokenized_prompt = vectorize_layer([decoded_sample])[:, :-1]
            predictions = self.model.predict([tokenized_prompt], verbose=0)
            sample_index = len(decoded_sample.strip().split())-1
            
            sampled_token = self.sample_token(predictions[0][sample_index])
            sampled_token = index_lookup[sampled_token]
            decoded_sample += " " + sampled_token
            
        print(f"\nSample text:\n{decoded_sample}...\n")

In [36]:
random_sentence = ' '.join(random.choice(text_valid).replace('\n', ' ').split(' ')[:4])

In [37]:
sampler = TextSampler(random_sentence, 30)

In [38]:
reducelr = keras.callbacks.ReduceLROnPlateau(patience=10, monitor='val_loss')

In [39]:
history = model.fit(train_dataset,
                    validation_data=valid_dataset,
                    epochs=10,
                    callbacks=[sampler, reducelr])

Epoch 1/10
137/633 [=====>........................] - ETA: 21:52 - loss: 4.1537 - perplexity: 63.6664 - accuracy: 0.6518

KeyboardInterrupt: 

In [None]:
def sample_token(logits):
        logits, indices = tf.math.top_k(logits, k=5, sorted=True)
        indices = np.asarray(indices).astype("int32")
        preds = keras.activations.softmax(tf.expand_dims(logits, 0))[0]
        preds = np.asarray(preds).astype("float32")
        return np.random.choice(indices, p=preds)

def generate_text(prompt, response_length=20):
    decoded_sample = prompt
    for i in range(response_length-1):
        tokenized_prompt = vectorize_layer([decoded_sample])[:, :-1]
        predictions = model.predict([tokenized_prompt], verbose=0)
        sample_index = len(decoded_sample.strip().split())-1

        sampled_token = sample_token(predictions[0][sample_index])
        sampled_token = index_lookup[sampled_token]
        decoded_sample += " " + sampled_token
    return decoded_sample

In [None]:
generate_text('the truth ultimately is')