In [1]:
from MyTorch import Model, Tensor
from MyTorch.activations import Softmax, GeLU
from MyTorch.layers import Linear, Dropout, MultiheadAttention, LayerNorm, Embedding
import numpy as np

class GPT2_Layer(Model):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.ln1 = LayerNorm(d_model)
        self.attn = MultiheadAttention(d_model, num_heads, dropout)
        self.drop1 = Dropout(dropout)
        self.ln2 = LayerNorm(d_model)
        self.ff1 = Linear(d_model, d_ff)
        self.gelu1 = GeLU()
        self.ff2 = Linear(d_ff, d_model)
        self.drop2 = Dropout(dropout)

    def forward(self, x, attention_mask):
        o_x = x
        x = self.ln1(x)
        x = self.attn(x, x, x, attention_mask)
        x = self.drop1(x)
        x = x + o_x
        x = self.ln2(x)
        o_x = x
        x = self.ff1(x)
        x = self.gelu1(x)
        x = self.ff2(x)
        x = self.drop2(x)
        x = x + o_x
        return x

class GPT2(Model):
    def __init__(self, num_layers, d_model, num_heads, d_ff, dropout, vocab_size, max_len=1024):
        super().__init__()
        self.embedding = Embedding(vocab_size, d_model)
        self.pos_embedding = Embedding(max_len, d_model)
        self.layers = [ GPT2_Layer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers) ]
        self.ln1 = LayerNorm(d_model)
        self.final_linear = Linear(d_model, vocab_size)
        self.softmax = Softmax(dim=2)
        
    def forward(self, input_ids, attention_mask):
        x = self.embedding(input_ids)
        pos_ids = Tensor(np.arange(x.shape[1]), requires_grad=False)
        pos_embedding = self.pos_embedding(pos_ids)
        x = x + pos_embedding
        for layer in self.layers:
            x = layer(x, attention_mask)
        x = self.ln1(x)
        x = self.final_linear(x)
        x = self.softmax(x)
        return x
    
class TextGenerator:
    def __init__(self, gpt, tokenizer):
        super().__init__()
        self.gpt = gpt
        self.tokenizer = tokenizer
        
    def generate_yield(self, input_str: str, max_len):
        input_ids = self.tokenizer.encode(input_str)
        input_ids = Tensor(input_ids)
        input_ids = input_ids.reshape(1, -1)
        attention_mask = Tensor(np.ones((1, input_ids.shape[1])))
        for _ in range(max_len):
            output = self.gpt(input_ids, attention_mask)
            output = np.argmax(output.data, axis=-1)
            output = output[:, -1:]
            input_ids = Tensor(np.concatenate([input_ids.data, output], axis=1))
            attention_mask = Tensor(np.ones((1, input_ids.shape[1])))
            token = input_ids.data.astype(np.int32).reshape(-1)
            new_str = self.tokenizer.decode(token)
            yield new_str
            
            

            
    def generate(self, input_str: str, max_len) -> str:
        return list(self.generate_yield(input_str, max_len))[-1]
    
    def generate_yield_word(self, input_str: str, max_len):
        current_str = input_str
        for new_str in self.generate_yield(input_str, max_len):
            word = new_str[len(current_str):]
            current_str = new_str
            yield word


In [2]:
def dataset_generator(dataset):
    for i, row in enumerate(dataset):
        if i > 2:
            break
        for j in range(0, len(row["input_ids"])):
            yield {"input_ids": row["input_ids"][:j+1], "attention_mask": row["attention_mask"][:j+1]}

def get_split_text_dataset(dataset):
    from tqdm.auto import tqdm
    from datasets import Dataset
    input_ids = []
    attention_masks = []
    for row in tqdm(dataset):
        for i in range(0, len(row["input_ids"])):
            input_ids.append(row["input_ids"][:i+1])
            attention_masks.append(row["attention_mask"][:i+1])
    
    dataset = Dataset.from_dict({"input_ids": input_ids, "attention_mask": attention_masks})  
    return dataset

In [3]:
from transformers import DataCollatorWithPadding, AutoTokenizer
import datasets
tokenizer = AutoTokenizer.from_pretrained("gpt2")
dataset = datasets.load_dataset("wikitext", "wikitext-103-v1")
dataset = dataset.filter(lambda x: len(x["text"]) < 1024 and len(x["text"]) > 20)
dataset = dataset.filter(lambda x: x["text"].encode("ascii", "ignore").decode() == x["text"])
dataset = dataset.map(lambda x: tokenizer(x["text"]))
train_dataset = datasets.Dataset.from_generator(lambda: dataset_generator(dataset["train"]))
val_dataset = datasets.Dataset.from_generator(lambda: dataset_generator(dataset["validation"]))


collate_fn = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest", return_tensors='np')


In [4]:
from transformers import DataCollatorWithPadding
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest", return_tensors='np')



In [5]:
# GPT2 pretrain
from transformers import AutoTokenizer
from MyTorch import Tensor
from MyTorch.optimizers import SGD
from MyTorch.losses import CrossEntropy

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
gpt2 = GPT2(2, 768, 12, 1024, 0.1, tokenizer.vocab_size)
optimizer = SGD(gpt2.get_parameters(), lr=0.001)
loss_fn = CrossEntropy()

epoches = 10
batch_size = 32

def train_step(gpt2: GPT2, tokenizer: AutoTokenizer, optimizer, loss_fn, input_ids, attention_masks):
    intput_ids_tensor = Tensor(input_ids, requires_grad=False)
    attention_masks_tensor = Tensor(attention_masks, requires_grad=False)
    
    output = gpt2(intput_ids_tensor, attention_masks_tensor)
    
    target = input_ids[:, 1:]
    # to one hot
    target = Tensor(np.eye(tokenizer.vocab_size)[target.astype(np.int32)], requires_grad= False)
    output = output[:, :-1]
    loss_value = loss_fn(output, target)
    # cross entropy.shape = (batch, seq_len, vocab_size)
    print(loss_value)
    loss_value.backward()
    optimizer.step()
    optimizer.zero_grad()
    



In [6]:
import gc
gc.collect()
train_dataset = train_dataset.shuffle()
for epoch in range(epoches):
    for i in range(0, len(train_dataset), batch_size):
        batch = train_dataset[i:i+batch_size]
        batch = collate_fn(batch)
        train_step(gpt2, tokenizer, optimizer, loss_fn, batch["input_ids"], batch["attention_mask"])
        gc.collect()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Tensor(1270.5062255859375, shape = ())
Tensor(1613.028564453125, shape = ())
Tensor(1736.511962890625, shape = ())
Tensor(1803.906982421875, shape = ())
Tensor(1555.4896240234375, shape = ())
Tensor(1669.2515869140625, shape = ())
Tensor(1638.6746826171875, shape = ())
Tensor(2996.677001953125, shape = ())
Tensor(1765.8671875, shape = ())
Tensor(1723.6947021484375, shape = ())
Tensor(1852.72119140625, shape = ())
Tensor(1903.543701171875, shape = ())
Tensor(1598.879150390625, shape = ())
Tensor(1677.8656005859375, shape = ())
Tensor(1631.4163818359375, shape = ())
Tensor(2953.852294921875, shape = ())
Tensor(1740.3922119140625, shape = ())
Tensor(1696.293701171875, shape = ())
Tensor(1879.939453125, shape = ())
Tensor(1943.898681640625, shape = ())
Tensor(1640.6026611328125, shape = ())
Tensor(1739.9024658203125, shape = ())
Tensor(1695.6492919921875, shape = ())
Tensor(3065.319580078125, shape = ())
Tensor(1813.2979736328125, shape = ())
Tensor(1762.5687255859375, shape = ())
Tensor(1