In [2]:
import tensorflow as tf
from tensorflow.keras.layers import Layer, Embedding, Dense, LayerNormalization, Dropout
from tensorflow.keras import Model

In [3]:
class MultiHeadSelfAtention(Layer):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are each projected to ``embed_dim`` features,
    split into ``num_heads`` heads of ``embed_dim // num_heads`` features,
    attended per head, merged back together and passed through a final
    Dense projection.
    """

    def __init__(self, embed_dim, num_heads):
        """Create the projection layers.

        Args:
            embed_dim: Width of the q/k/v projections and of the output.
            num_heads: Number of attention heads; must be positive and divide
                ``embed_dim`` evenly.

        Raises:
            ValueError: If ``num_heads`` is not positive or does not divide
                ``embed_dim``.
        """
        # Fail early: otherwise a non-divisible width only surfaces later as a
        # reshape error (or a silently wrong reshape) inside split_heads.
        if num_heads <= 0 or embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by "
                f"num_heads ({num_heads})"
            )
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.attn_head_size = embed_dim // num_heads

        # Separate learned projections for queries, keys and values.
        self.wq = Dense(embed_dim)
        self.wk = Dense(embed_dim)
        self.wv = Dense(embed_dim)

        # Output projection applied after the heads are merged.
        self.dense = Dense(embed_dim)

    def split_heads(self, x, batch_size):
        """Reshape to (batch, num_heads, seq, attn_head_size).

        The last axis of ``x`` is split into heads and the head axis is moved
        in front of the sequence axis.
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.attn_head_size))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, q, k, v, mask=None):
        """Apply attention and return a (batch, seq, embed_dim) tensor.

        Args:
            q, k, v: Query, key and value inputs; the batch size is read from
                axis 0 of ``q``.
            mask: Optional tensor broadcastable to the attention logits
                (batch, num_heads, seq_q, seq_k). Non-zero entries mark
                positions to suppress: they receive a -1e9 offset before the
                softmax. ``None`` disables masking.
        """
        batch_size = tf.shape(q)[0]

        q = self.split_heads(self.wq(q), batch_size)
        k = self.split_heads(self.wk(k), batch_size)
        v = self.split_heads(self.wv(v), batch_size)

        # Scaled dot-product: divide by sqrt(head size) before the softmax.
        matmul_qk = tf.matmul(q, k, transpose_b=True)
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

        if mask is not None:
            # Cast so boolean / integer masks can be scaled like float ones.
            mask = tf.cast(mask, scaled_attention_logits.dtype)
            scaled_attention_logits += mask * -1e9

        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
        output = tf.matmul(attention_weights, v)

        # Undo split_heads: (batch, heads, seq, head) -> (batch, seq, embed_dim).
        output = tf.transpose(output, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(output, (batch_size, -1, self.embed_dim))
        return self.dense(concat_attention)


In [4]:
class FeedForwardNetwork(Layer):
  """Two-layer position-wise feed-forward network.

  A GELU-activated Dense layer of width ``dff`` followed by a linear Dense
  layer that projects back to ``embed_dim``.
  """

  def __init__(self, embed_dim, dff) -> None:
    """Build the two Dense layers.

    Args:
      embed_dim: Output width of the second (linear) Dense layer.
      dff: Width of the first, GELU-activated Dense layer.
    """
    super().__init__()
    self.dense1 = Dense(units=dff, activation='gelu')
    self.dense2 = Dense(units=embed_dim)

  def call(self, x):
    """Apply the expansion layer, then the projection layer, to ``x``."""
    hidden = self.dense1(x)
    projected = self.dense2(hidden)
    return projected

In [5]:
class TransformerBlock(Layer):
  """Transformer block with self-attention and feed-forward sub-layers.

  Each sub-layer output goes through dropout, is added to the sub-layer's
  input (residual connection) and is then layer-normalised.
  """

  def __init__(self, embed_dim, num_heads, dff, dropout_rate=0.1) -> None:
    """Build the sub-layers.

    Args:
      embed_dim: Feature width of the block's input and output.
      num_heads: Number of attention heads.
      dff: Hidden width of the feed-forward network.
      dropout_rate: Dropout rate applied after each sub-layer.
    """
    super().__init__()
    self.att = MultiHeadSelfAtention(embed_dim, num_heads)
    self.ffn = FeedForwardNetwork(embed_dim, dff)
    self.norm1 = LayerNormalization(epsilon=1e-6)
    self.norm2 = LayerNormalization(epsilon=1e-6)
    self.dropout1 = Dropout(dropout_rate)
    self.dropout2 = Dropout(dropout_rate)

  def call(self, x, training=None, mask=None):
    """Run the block on ``x``.

    Args:
      x: Input tensor; used as query, key and value for self-attention.
      training: Training-mode flag forwarded to both dropout layers. ``None``
        leaves the decision to Keras.
      mask: Optional attention mask passed through to the attention layer.

    Returns:
      The output of the second layer normalisation.
    """
    attn_output = self.att(x, x, x, mask)
    # Forward the flag explicitly so dropout never depends on implicit
    # propagation of the training state.
    attn_output = self.dropout1(attn_output, training=training)
    out1 = self.norm1(x + attn_output)

    ffn_output = self.ffn(out1)
    ffn_output = self.dropout2(ffn_output, training=training)
    return self.norm2(out1 + ffn_output)


In [6]:
class GPT2(Model):
  """Stack of TransformerBlock layers over token + learned position embeddings.

  The model maps integer token IDs of shape (batch, seq_len) to unnormalised
  scores over the vocabulary of shape (batch, seq_len, vocab_size). Attention
  is causally masked so a position cannot attend to later positions.
  """

  def __init__(self, vocab_size, max_length, embed_dim = 768, num_heads = 12, dff = 3072, num_layer = 12, dropout_rate = 0.1):
    """Build embeddings, transformer blocks and the output projection.

    Args:
      vocab_size: Number of token IDs; also the width of the output layer.
      max_length: Number of rows in the position embedding table, i.e. the
        longest sequence length that has a position embedding.
      embed_dim: Embedding / hidden width.
      num_heads: Attention heads per block.
      dff: Hidden width of each block's feed-forward network.
      num_layer: Number of transformer blocks.
      dropout_rate: Dropout rate used inside each block.
    """
    super().__init__()

    self.token_embed = Embedding(vocab_size, embed_dim)
    self.position_embed = Embedding(max_length, embed_dim)

    self.transposed_blocks = [TransformerBlock(embed_dim, num_heads, dff, dropout_rate) for _ in range(num_layer)]

    self.norm = LayerNormalization(epsilon=1e-6)
    self.dense = Dense(vocab_size)

  def create_casual_mask(self, seq_len):
    """Return the (seq_len, seq_len) causal (look-ahead) mask.

    Entry [i, j] is 1.0 when j > i (a future position that must be hidden
    from query i) and 0.0 otherwise. This matches the attention layer, which
    adds ``mask * -1e9`` to the logits, so 1.0 means "blocked".
    """
    # band_part(-1, 0) keeps the lower triangle incl. the diagonal, i.e. the
    # positions that ARE allowed; invert it to mark the blocked ones.
    allowed = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
    return 1.0 - allowed

  def call(self, x, training=False, mask=None):
    """Compute vocabulary scores for every position of ``x``.

    Args:
      x: Integer token IDs of shape (batch, seq_len); seq_len should not
        exceed the ``max_length`` given at construction, since positions
        0..seq_len-1 are looked up in the position embedding table.
      training: Training-mode flag forwarded to every transformer block.
      mask: Accepted for Keras API compatibility but not used; the causal
        mask built from the sequence length is always applied.

    Returns:
      Tensor of shape (batch, seq_len, vocab_size).
    """
    seq_len = tf.shape(x)[1]
    mask = self.create_casual_mask(seq_len)

    token_embeddings = self.token_embed(x)
    # Shape (1, seq_len) -> embeddings (1, seq_len, embed_dim), so the single
    # leading axis broadcasts over the batch. A (seq_len, 1) index would
    # instead broadcast positions along the batch axis.
    positions = tf.range(seq_len)[tf.newaxis, :]
    pos_embeddings = self.position_embed(positions)
    x = token_embeddings + pos_embeddings

    for transformer in self.transposed_blocks:
      x = transformer(x, training=training, mask=mask)

    x = self.norm(x)
    return self.dense(x)

In [7]:
# Hyper-parameters used to instantiate the network below.
VOCAB_SIZE = 50257
MAX_LENGTH = 1024

# Wrap the subclassed GPT2 in a functional Model with a fixed-length int32 input.
inputs = tf.keras.layers.Input(dtype=tf.int32, shape=(MAX_LENGTH,))
outputs = GPT2(vocab_size=VOCAB_SIZE, max_length=MAX_LENGTH)(inputs)
gpt2 = Model(inputs=inputs, outputs=outputs)

gpt2.build((1, MAX_LENGTH))

gpt2.summary()

In [None]:
import numpy as np

# Create vocabulary from your training text.
# IDs start at 1: ID 0 is what tokenize() uses for padding and unknown
# characters and what detokenize() drops, so no real character may own it.
training_text = "your training corpus here..."
chars = sorted(set(training_text))
vocab_dict = {char: idx for idx, char in enumerate(chars, start=1)}
reverse_vocab = {idx: char for char, idx in vocab_dict.items()}

def tokenize(text, vocab, max_length):
    """Encode ``text`` character by character into an int32 ID array.

    Only the first ``max_length`` characters are used. Characters missing
    from ``vocab`` become 0, and the result is right-padded with 0.
    """
    clipped = text[:max_length]
    ids = list(map(lambda ch: vocab.get(ch, 0), clipped))
    # Right-pad with zeros; a non-positive count adds nothing.
    ids.extend([0] * (max_length - len(ids)))
    return np.array(ids, dtype=np.int32)

def detokenize(tokens, reverse_vocab):
    """Decode token IDs to text, skipping ID 0 and IDs not in ``reverse_vocab``."""
    pieces = []
    for token_id in tokens:
        if token_id != 0:
            pieces.append(reverse_vocab.get(token_id, ''))
    return ''.join(pieces)


# Encode the prompt and add a leading batch axis of size 1.
prompt = "what is Neural networks?"
input_tokens = np.expand_dims(tokenize(prompt, vocab_dict, MAX_LENGTH), axis=0)

# Single forward pass, then pick the highest-scoring ID at every position
# of the first (and only) batch entry.
predictions = gpt2.predict(input_tokens, verbose=0)
output_tokens = np.argmax(predictions[0], axis=-1)
generated_text = detokenize(output_tokens, reverse_vocab)
print(generated_text)
