# **Building plato GPT from scrach using tensor flow**

In [59]:


from google.colab import drive
drive.mount('/content/drive')

file_path = "/content/drive/MyDrive/plato.txt"
with open(file_path, "r", encoding="utf-8") as f:
    text = f.read()

print("length of dataset in characters:", len(text))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
length of dataset in characters: 263400


In [60]:
lines = text.splitlines()

# print 5
for line in lines[:5]:
    print(line)

﻿I went down yesterday to the Piraeus with Glaucon the son of Ariston, that I might offer up my prayers to the goddess (Bendis, the Thracian Artemis.); and also because I wanted to see in what manner they would celebrate the festival, which was a new thing. I was delighted with the procession of the inhabitants; but that of the Thracians was equally, if not more, beautiful. When we had finished our prayers and viewed the spectacle, we turned in the direction of the city; and at that instant Polemarchus the son of Cephalus chanced to catch sight of us from a distance as we were starting on our way home, and told his servant to run and bid us wait for him. The servant took hold of me by the cloak behind, and said: Polemarchus desires you to wait.
I turned round, and asked him where his master was.
There he is, said the youth, coming after you, if you will only wait.
Certainly we will, said Glaucon; and in a few minutes Polemarchus appeared, and with him Adeimantus, Glaucon’s brother, Nic

**# world embedding**

In [61]:
!pip install sentencepiece




In [62]:
import sentencepiece as spm
import os



model_prefix = "plato_bpe"


spm.SentencePieceTrainer.train(
    input=file_path,
    model_prefix=model_prefix,
    vocab_size=8000,        #
    model_type='bpe',       #
    character_coverage=1.0, #
    input_sentence_size=100000,
    shuffle_input_sentence=True
)




In [63]:
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load(f"{model_prefix}.model")

test_str = "Socrates: Let us discuss justice."
tokens = sp.encode(test_str, out_type=str)
print("tokens：", tokens)

print("BPE size：", sp.vocab_size())


tokens： ['▁Socrates', ':', '▁Let', '▁us', '▁discus', 's', '▁justice', '.']
BPE size： 8000


In [64]:
import tensorflow as tf
import numpy as np

# transfer
with open(file_path, "r", encoding="utf-8") as f:
    text_data = f.read()

# tokenize
encoded_ids = sp.encode(text_data, out_type=int)
print("Token总数:", len(encoded_ids))

#numpy
encoded_ids = np.array(encoded_ids, dtype=np.int32)


Token总数: 57064


set up training data

In [65]:
# how many tokens in a sensence
seq_len = 64

# make sure //
total_tokens = len(encoded_ids)
num_subsequences = total_tokens // (seq_len + 1)
trimmed_size = num_subsequences * (seq_len + 1)

# get rid of //
encoded_ids = encoded_ids[:trimmed_size]

# 3.2 reshape
# 我们把它 reshape 成 [num_subsequences, seq_len+1]
subsequences = encoded_ids.reshape((num_subsequences, seq_len + 1))

# 3.3  tf.data.Dataset
def split_input_target(seq):

    input_seq = seq[:-1]
    target_seq = seq[1:]
    return input_seq, target_seq

BATCH_SIZE = 32
BUFFER_SIZE = 10000

dataset = tf.data.Dataset.from_tensor_slices(subsequences)
dataset = dataset.map(split_input_target)
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

print("Dataset structure：", dataset.element_spec)


Dataset structure： (TensorSpec(shape=(32, 64), dtype=tf.int32, name=None), TensorSpec(shape=(32, 64), dtype=tf.int32, name=None))


# **# multi_head**

In [66]:
class CausalSelfAttention(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.embed_dim = embed_dim
        self.projection_dim = embed_dim // num_heads

        self.query_dense = tf.keras.layers.Dense(embed_dim)
        self.key_dense   = tf.keras.layers.Dense(embed_dim)
        self.value_dense = tf.keras.layers.Dense(embed_dim)
        self.out_dense   = tf.keras.layers.Dense(embed_dim)

    def call(self, x):

        batch_size = tf.shape(x)[0]
        seq_len    = tf.shape(x)[1]

        # Q, K, V
        q = self.query_dense(x)
        k = self.key_dense(x)
        v = self.value_dense(x)

        # reshape: (batch, seq_len, num_heads, projection_dim)
        q = tf.reshape(q, (batch_size, seq_len, self.num_heads, self.projection_dim))
        k = tf.reshape(k, (batch_size, seq_len, self.num_heads, self.projection_dim))
        v = tf.reshape(v, (batch_size, seq_len, self.num_heads, self.projection_dim))

        # : (batch, num_heads, seq_len, projection_dim)
        q = tf.transpose(q, [0, 2, 1, 3])
        k = tf.transpose(k, [0, 2, 1, 3])
        v = tf.transpose(v, [0, 2, 1, 3])

        # get weight
        scale = tf.cast(self.projection_dim, tf.float32) ** 0.5
        logits = tf.matmul(q, k, transpose_b=True) / scale  # (batch, heads, seq_len, seq_len)

        # mask
        mask = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)  # 下三角=1，上三角=0
        mask = tf.reshape(mask, (1, 1, seq_len, seq_len))               # (1,1,seq_len,seq_len)

        # prepare for softmax
        logits = logits * mask + (1.0 - mask) * -1e9

        weights = tf.nn.softmax(logits, axis=-1)     # (batch, heads, seq_len, seq_len)
        attention_output = tf.matmul(weights, v)     # (batch, heads, seq_len, projection_dim)

        #
        attention_output = tf.transpose(attention_output, [0, 2, 1, 3])
        #  (batch, seq_len, num_heads, projection_dim)

        concat_output = tf.reshape(attention_output, (batch_size, seq_len, self.embed_dim))

        #  Dense
        out = self.out_dense(concat_output)  # (batch, seq_len, embed_dim)
        return out


# tramsfomer

In [67]:
class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, dropout_rate=0.1):
        super().__init__()
        self.att = CausalSelfAttention(embed_dim, num_heads)
        self.norm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.ff = tf.keras.Sequential([
            tf.keras.layers.Dense(ff_dim, activation='relu'),
            tf.keras.layers.Dense(embed_dim),
        ])
        self.norm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x, training=False):
        #self attention
        attn_output = self.att(x)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.norm1(x + attn_output)

        # feed foward
        ff_output = self.ff(out1)
        ff_output = self.dropout2(ff_output, training=training)
        out2 = self.norm2(out1 + ff_output)

        return out2


In [68]:
class SimpleGPT(tf.keras.Model):
    def __init__(self,
                 vocab_size,
                 max_seq_len=64,
                 embed_dim=128,
                 num_heads=4,
                 ff_dim=256,
                 num_layers=2,
                 dropout_rate=0.1):
        super().__init__()
        # word embedding
        self.token_embed = tf.keras.layers.Embedding(vocab_size, embed_dim)
        # position embedding
        self.pos_embed   = tf.keras.layers.Embedding(input_dim=max_seq_len, output_dim=embed_dim)

        #  TransformerBlock from class TransformerBlock(tf.keras.layers.Layer)
        self.blocks = [
            TransformerBlock(embed_dim, num_heads, ff_dim, dropout_rate)
            for _ in range(num_layers)
        ]

        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.fc_out  = tf.keras.layers.Dense(vocab_size)
        self.max_seq_len = max_seq_len

    def call(self, x, training=False):

        batch_size = tf.shape(x)[0]
        seq_len    = tf.shape(x)[1]

        # 1) token embedding
        token_embeddings = self.token_embed(x)

        # 2) positional embedding
        positions = tf.range(0, seq_len, dtype=tf.int32)[tf.newaxis, :]  # (1, seq_len)
        pos_embeddings = self.pos_embed(positions)                       # (1, seq_len, embed_dim)

        x_embed = token_embeddings + pos_embeddings
        x_embed = self.dropout(x_embed, training=training)

        # TransformerBlock
        for block in self.blocks:
            x_embed = block(x_embed, training=training)

        # feed foward
        logits = self.fc_out(x_embed)  # (batch, seq_len, vocab_size)
        return logits


In [69]:
#
vocab_size = sp.vocab_size()
#build model
model = SimpleGPT(
    vocab_size=vocab_size,
    max_seq_len=seq_len,
    embed_dim=128,
    num_heads=4,
    ff_dim=256,
    num_layers=2,
    dropout_rate=0.1
)

# loss function
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

model.compile(optimizer=optimizer, loss=loss_fn)

model.summary()


In [70]:
# train
EPOCHS = 50
history = model.fit(dataset, epochs=EPOCHS)


Epoch 1/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 4ms/step - loss: 8.1263
Epoch 2/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 6.0619
Epoch 3/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 5.8476
Epoch 4/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 5.3690
Epoch 5/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 4.9267
Epoch 6/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 4.6328
Epoch 7/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 4.4197
Epoch 8/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 4.1943
Epoch 9/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 4.0204
Epoch 10/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 3.8369
Epoch 11

In [74]:
def generate_text_bpe(model, sp_model, start_string, max_new_tokens=100, temperature=0.8):
    """
    Arguments:
        model: The trained GPT model (TensorFlow).
        sp_model: The SentencePieceProcessor (BPE model).
        start_string: The initial text prompt for generation.
        max_new_tokens: The number of tokens to generate.
        temperature: Controls randomness in sampling (>1.0 => more random, <1.0 => more conservative).
    Returns:
        output_text: The final generated text string.
    """
    # Convert the start string to token IDs (ensuring out_type=int)
    input_ids = sp_model.encode(start_string, out_type=int)

    for _ in range(max_new_tokens):
        # Expand to a batch of size 1
        x = tf.expand_dims(input_ids, 0)  # shape: (1, current_seq_len)

        # Forward pass through the model
        logits = model(x)  # shape: (1, current_seq_len, vocab_size)
        logits = logits[:, -1, :]  # Take the logits at the last time step
        logits = logits / temperature  # Scale by temperature

        # Convert logits to probabilities and sample
        probs = tf.nn.softmax(logits, axis=-1)
        next_token_id = tf.random.categorical(tf.math.log(probs), num_samples=1)[0, 0].numpy()

        # Ensure it is an integer before appending
        next_token_id = int(next_token_id)
        input_ids.append(next_token_id)

    # (Optional) Print for debugging:
    # print("input_ids:", input_ids)
    # print("Element types:", [type(e) for e in input_ids])

    # Decode the final sequence of token IDs back into text
    input_ids = [int(i) for i in input_ids]  # Ensure all are ints
    output_text = sp_model.decode_ids(input_ids)
    return output_text


In [75]:
prompt = "what is justice? "
gen_text = generate_text_bpe(model, sp, prompt, max_new_tokens=50, temperature=0.8)
print("result：\n", gen_text)


result：
 what is justice? I am in your duty: There is to be in what you would very true, if you say, if you say, if we were discovered what nature, if we may be the horse or any other States which is to have failed to know what


# **Use pre-train model to get better results**