In [1]:
import tensorflow as tf
from tensorflow.keras import layers, Sequential, Model
from tensorflow.keras.layers import Dense, Flatten, Input, Embedding, LayerNormalization, Dropout
import numpy as np
from tensorflow import keras

2024-12-30 10:15:23.124449: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1735533923.137999  545636 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1735533923.141689  545636 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-30 10:15:23.156907: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
with open('training_data.txt', 'r', encoding='utf-8') as f:
    data = f.read().replace('\n', ' ')

In [3]:
print(len(data))

1115394


In [4]:
characters=list(set(list(data)))
print(len(characters))

64


In [5]:
character_to_integer_encoding={}
integer_to_character_encoding={}
for i in range(len(characters)):
    character_to_integer_encoding[characters[i]]=i+1
    integer_to_character_encoding[i+1]=characters[i]

In [6]:
def encode(string):
    global character_to_integer_encoding    
    return [character_to_integer_encoding[char] for char in string]

def decode(lst):
    global integer_to_character_encoding
    return ''.join([integer_to_character_encoding[i] for i in lst])

In [7]:
input_data=encode(data)
train_data=input_data[:int(0.9*len(input_data))]
test_data=input_data[int(0.9*len(input_data)):]

In [8]:
batch_size=32
block_size=128
num_heads=8 # Experiment with other values if you want
num_transformer_blocks = 4
input_vocab_size=len(characters)+1
feed_forward_dim = 256 # I am using the same dimensions for the embedding as well. This may be too high of a dimension, given that there are only 65 characters and 128 positions per block, but it will take a lot of time to test alternate parameters

In [9]:
def causal_attention_mask(batch_size, n_dest, n_src):
    i = tf.range(n_dest)[:, None]
    j = tf.range(n_src)
    m = i >= j - n_src + n_dest
    mask = tf.cast(m, tf.bool)
    mask = tf.reshape(mask, [1, n_dest, n_src])
    mult = tf.concat(
        [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
    )
    return tf.tile(mask, mult)


class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        # Give code for an attention layer, feedforward layers, and normalization layers. The attention layer is first, then normalization and dropout, then forward the data passed through a non-linear function, and call the dropout layer again
        self.att = layers.MultiHeadAttention(num_heads, embed_dim)
        self.feed_forward_network = Sequential(
            [layers.Dense(ff_dim, activation="relu"), Dense(embed_dim),]
        )
        self.normalization_layer_1 = LayerNormalization(epsilon=1e-6)
        self.normalization_layer_2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size = input_shape[0]
        block_size = input_shape[1]
        causal_mask = causal_attention_mask(batch_size, block_size, block_size)
        attention_output = self.att(inputs, inputs, attention_mask=causal_mask)
        attention_output = self.dropout1(attention_output)
        out1 = self.normalization_layer_1(inputs + attention_output)
        feed_forward_output = self.feed_forward_network(out1)
        feed_forward_output = self.dropout2(feed_forward_output)
        return self.normalization_layer_2(out1 + feed_forward_output)

In [10]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_embedding = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_embedding = Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_embedding(positions)
        x = self.token_embedding(x)
        return x + positions


In [11]:
class Transformer(Model):
    def __init__(self, maxlen, vocab_size, embed_dim, num_heads, feed_forward_dim, num_transformer_blocks):
        super().__init__()
        self.inputs = Input(shape=(maxlen,), dtype=tf.int32)
        self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.embedding_dim = embed_dim
        self.num_transformer_blocks = num_transformer_blocks
        self.transformer_blocks = [TransformerBlock(embed_dim, num_heads, feed_forward_dim) for _ in range(num_transformer_blocks)]
        self.dense = Dense(vocab_size)

    def call(self, inputs):
        x = self.embedding_layer(inputs)
        for i in range(self.num_transformer_blocks):
            x = self.transformer_blocks[i](x)
        output = self.dense(x)
        return output

    
def get_transformer_model(
    maxlen, 
    vocab_size, 
    embed_dim, 
    num_heads, 
    feed_forward_dim, 
    num_transformer_blocks=1
):
    inputs = Input(shape=(maxlen,), dtype=tf.int32)
    embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
    x = embedding_layer(inputs)
    for i in range(num_transformer_blocks):
        transformer_block = TransformerBlock(embed_dim, num_heads, feed_forward_dim)
        x = transformer_block(x)
    outputs = Dense(vocab_size)(x)
    model = Model(inputs=inputs, outputs=[outputs])
    return model

In [None]:
model = get_transformer_model(
    block_size, 
    input_vocab_size, 
    feed_forward_dim, 
    num_heads, 
    feed_forward_dim, 
    num_transformer_blocks
)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(
    "adam", 
    loss=[loss_fn],
    metrics=["accuracy"]
)

2024-12-30 10:15:31.340749: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [13]:
inputs = [train_data[i:i+block_size] for i in range(0, len(train_data)-block_size-1)]
targets = [train_data[i+1:i+block_size+1] for i in range(0, len(train_data)-block_size-1)]

inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs, maxlen=block_size, padding='post')
targets = tf.keras.preprocessing.sequence.pad_sequences(targets, maxlen=block_size, padding='post')

inputs = tf.convert_to_tensor(inputs, dtype=tf.int64)
targets = tf.convert_to_tensor(targets, dtype=tf.int64)

dataset= tf.data.Dataset.from_tensor_slices((inputs, targets))
dataset = dataset.shuffle(10000)
dataset = dataset.batch(batch_size, drop_remainder=True)

2024-12-30 10:15:52.501877: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1027814400 exceeds 10% of free system memory.
2024-12-30 10:15:53.591806: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1027814400 exceeds 10% of free system memory.


In [14]:
model.summary()

In [32]:
dataset= tf.data.Dataset.from_tensor_slices((inputs, targets))
dataset=dataset.shuffle(1000)
dataset = dataset.batch(batch_size, drop_remainder=True)
model.fit(dataset, epochs=10)

Epoch 1/10
[1m11106/31366[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m13:55:19[0m 2s/step - accuracy: 0.7607 - loss: 0.8876

KeyboardInterrupt: 

In [34]:
def generate_text(model, start_index, num_generate=1):
    # Ensure train_data[start_index:start_index + block_size] is properly shaped
    input_sequence = train_data[start_index:start_index + block_size]
    generated_text = decode(input_sequence)
    probabilistic_text = decode(input_sequence)
    for i in range(num_generate):
        input_eval = tf.convert_to_tensor([input_sequence], dtype=tf.int32)
        predictions = model.predict(input_eval)
        probabilities = tf.nn.softmax(predictions[0, -1]).numpy()
        next_token = np.random.choice(len(probabilities), p=probabilities)
        next_token = np.argmax(probabilities)
        input_sequence += [next_token]
        input_sequence = input_sequence[1:]
        generated_text += decode([next_token])

    return generated_text


In [35]:
generate_text(model, start_index=len(train_data)-block_size, num_generate=1000)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48

" I will go sit and weep Till I can find occasion of revenge.  BAPTISTA: Was ever gentleman thus grieved as I? But who comes hereouins he a our amserviey, Where: Have thou enemong and do hearth's doy unce By for thy hy fair hearth's son, dead, it to the louch thousay, Have enone thou houngay, him dis carder of Herefords our a clossoon; His endsights all But by brope a poor To by his good By heal by To lastey of wells, you good Gaunt disstoy of well, Have the to have to live. I move thou disd of which the make of head you hereous, our a your a good God Gaunt disd that compares see by Hereford's rights Will griefored good Or evereT his live and suing inston. Thousater not had the to had His roody heartient that heart thou do spiers not our a doth To By homself a good But by be By be majy.  DUKE OF YORK: I'll nobher my liege, in hom and have by to Hereford: Hereford: God you say a good His rights friel will the and suicccess not His frights instorle To that this more suck bleased Haryorse

In [37]:
model.save_weights('transformer_model.weights.h5')

In [40]:
model.save("saved_model.keras")