In [38]:
from MyTorch import Model, Tensor
from MyTorch.activations import ReLU, Softmax, GeLU
from MyTorch.layers import Linear, Dropout, MultiheadAttention, LayerNorm, Embedding
import numpy as np

class GPT2_Layer(Model):
    """One pre-LayerNorm transformer block: self-attention, then a feed-forward MLP.

    Both sub-layers follow the same pattern::

        x = x + Dropout(SubLayer(LayerNorm(x)))

    i.e. the residual branch always carries the *un-normalised* input of the
    sub-layer.

    Args:
        d_model: Feature size handed to the LayerNorm, attention and Linear layers.
        num_heads: Number of heads handed to ``MultiheadAttention``.
        d_ff: Hidden width of the feed-forward MLP (``d_model -> d_ff -> d_model``).
        dropout: Rate handed to ``MultiheadAttention`` and to both ``Dropout`` layers.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        # Attention sub-layer.
        self.ln1 = LayerNorm(d_model)
        self.attn = MultiheadAttention(d_model, num_heads, dropout)
        self.drop1 = Dropout(dropout)
        # Feed-forward sub-layer.
        self.ln2 = LayerNorm(d_model)
        self.ff1 = Linear(d_model, d_ff)
        self.gelu1 = GeLU()
        self.ff2 = Linear(d_ff, d_model)
        self.drop2 = Dropout(dropout)

    def forward(self, x, attention_mask):
        """Apply the attention and feed-forward sub-layers with residual connections.

        Args:
            x: Input tensor; assumed to be (batch, seq, d_model) -- TODO confirm.
            attention_mask: Passed unchanged as the fourth argument of the
                attention layer; its semantics are defined by ``MultiheadAttention``.

        Returns:
            The transformed tensor (result of the second residual addition).
        """
        # --- self-attention sub-layer ---
        residual = x
        x = self.ln1(x)
        x = self.attn(x, x, x, attention_mask)  # query = key = value (self-attention)
        x = self.drop1(x)
        x = x + residual

        # --- feed-forward sub-layer ---
        # The residual must be captured BEFORE ln2, mirroring the attention
        # sub-layer above; capturing it after ln2 would add the normalised
        # activations back instead of the residual stream.
        residual = x
        x = self.ln2(x)
        x = self.ff1(x)
        x = self.gelu1(x)
        x = self.ff2(x)
        x = self.drop2(x)
        x = x + residual
        return x

class GPT2(Model):
    """GPT-2 style language model: token + position embeddings, a stack of
    ``GPT2_Layer`` blocks, a final LayerNorm and a linear projection to the
    vocabulary followed by a softmax.

    Args:
        num_layers: Number of ``GPT2_Layer`` blocks.
        d_model: Embedding / hidden size.
        num_heads: Number of attention heads per block.
        d_ff: Hidden width of each block's feed-forward MLP.
        dropout: Dropout rate forwarded to every block.
        vocab_size: Number of rows in the token embedding and width of the output.
        max_len: Number of rows in the position embedding, i.e. the longest
            sequence ``forward`` accepts.
    """

    def __init__(self, num_layers, d_model, num_heads, d_ff, dropout, vocab_size, max_len=1024):
        super().__init__()
        self.max_len = max_len
        self.embedding = Embedding(vocab_size, d_model)
        self.pos_embedding = Embedding(max_len, d_model)
        # NOTE(review): the blocks live in a plain Python list -- confirm that
        # Model (eval(), parameter collection, ...) reaches sub-modules stored this way.
        self.layers = [GPT2_Layer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)]
        self.ln1 = LayerNorm(d_model)
        self.final_linear = Linear(d_model, vocab_size)
        self.softmax = Softmax(dim=2)

    def forward(self, input_ids, attention_mask):
        """Run the model on a batch of token ids.

        Args:
            input_ids: 2-D tensor of token ids, indexed as (batch, position).
            attention_mask: Forwarded unchanged to every block.
                NOTE(review): no causal mask is built here -- confirm that
                ``MultiheadAttention`` / the caller's mask provides causality.

        Returns:
            Softmax (over dim 2) of the vocabulary projection for every position.

        Raises:
            ValueError: If the sequence is longer than ``max_len``.
        """
        batch_size = input_ids.shape[0]
        seq_len = input_ids.shape[1]
        if seq_len > self.max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds the model's max_len {self.max_len}"
            )

        # Position ids 0..seq_len-1, repeated for every row so the two
        # embeddings have identical shapes and no broadcasting is required.
        positions = Tensor(np.tile(np.arange(seq_len), (batch_size, 1)))

        # Token identity + position; without the second term the model
        # receives no positional information.
        x = self.embedding(input_ids) + self.pos_embedding(positions)
        for layer in self.layers:
            x = layer(x, attention_mask)
        x = self.ln1(x)
        x = self.final_linear(x)
        x = self.softmax(x)
        return x
    
class TextGenerator:
    """Greedy (argmax) autoregressive text generation on top of a model and a tokenizer.

    Args:
        gpt: Callable as ``gpt(input_ids, attention_mask)``; its result must expose
            ``.data`` holding per-position scores whose last axis is the vocabulary.
        tokenizer: Object providing ``encode(str)`` and ``decode(ids)``.
    """

    def __init__(self, gpt, tokenizer):
        super().__init__()
        self.gpt = gpt
        self.tokenizer = tokenizer

    def generate_yield(self, input_str: str, max_len):
        """Generate up to ``max_len`` new tokens, yielding the full text after each one.

        Args:
            input_str: Prompt text.
            max_len: Number of tokens to generate (not the total sequence length).

        Yields:
            The decoded string for the prompt plus all tokens generated so far.
        """
        input_ids = self.tokenizer.encode(input_str)
        input_ids = Tensor(input_ids)
        input_ids = input_ids.reshape(1, -1)  # add a batch axis of size 1
        for _ in range(max_len):
            # Every position is attended to; the mask simply tracks the current length.
            attention_mask = Tensor(np.ones((1, input_ids.shape[1])))
            output = self.gpt(input_ids, attention_mask)
            # Greedy choice: most likely token at each position, keep the last one.
            next_id = np.argmax(output.data, axis=-1)[:, -1:]
            input_ids = Tensor(np.concatenate([input_ids.data, next_id], axis=1))
            token = input_ids.data.astype(np.int32).reshape(-1)
            yield self.tokenizer.decode(token)

    def generate(self, input_str: str, max_len) -> str:
        """Return the final generated text.

        If nothing is generated (e.g. ``max_len == 0``) the prompt is returned
        unchanged instead of failing on an empty result list.
        """
        final_str = input_str
        # Only the last yielded string is needed, so do not materialise the rest.
        for final_str in self.generate_yield(input_str, max_len):
            pass
        return final_str

    def generate_yield_word(self, input_str: str, max_len):
        """Yield only the text added by each newly generated token.

        Args:
            input_str: Prompt text.
            max_len: Number of tokens to generate.

        Yields:
            The suffix appended to the decoded text by each generation step.
        """
        # The strings produced by generate_yield start with the *decoded* prompt,
        # which may differ in length from the raw prompt if the tokenizer
        # normalises text on decode; measure against the decoded form.
        current_str = self.tokenizer.decode(self.tokenizer.encode(input_str))
        for new_str in self.generate_yield(input_str, max_len):
            word = new_str[len(current_str):]
            current_str = new_str
            yield word


In [41]:
from transformers import AutoTokenizer
from MyTorch import Tensor

# Demo: build a small 2-layer GPT2 and greedily generate a few tokens.
# No pretrained weights are loaded in this cell, so the printed text is
# not expected to be meaningful.
PROMPT = "Hello, I'm a single sentence!"
NUM_NEW_TOKENS = 10

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Tokenise the prompt as a batch of one and wrap every returned field in a Tensor.
token = tokenizer([PROMPT])
token = {field: Tensor(values) for field, values in token.items()}

gpt2 = GPT2(
    num_layers=2,
    d_model=768,
    num_heads=12,
    d_ff=1024,
    dropout=0.1,
    vocab_size=tokenizer.vocab_size,
)
gpt2.eval()

generator = TextGenerator(gpt2, tokenizer)

# Stream the output piece by piece as it is generated.
for i in generator.generate_yield_word(PROMPT, NUM_NEW_TOKENS):
    print(i, end="")

 Fulton held productivity consumeronian Lin HRopausal planners TM