<a href="https://colab.research.google.com/github/eisbetterthanpi/pytorch/blob/main/tinyGPT_nextt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# @title data
# https://github.com/Sam-Armstrong/tinyGPT/blob/main/Training.py
# !pip install torchdata
# !pip install portalocker
import torch
import torch.nn as nn
from torch.utils.data import Dataset


class WikiDataset(Dataset): # https://github.com/karpathy/minGPT
    """Character-level next-token dataset over a sequence of symbols.

    The vocabulary is the sorted set of distinct items in ``data``. Sample
    ``idx`` is a window of ``block_size + 1`` consecutive items, returned as
    an input tensor (all but the last item) and a target tensor (all but the
    first item), i.e. the target is the input shifted by one position.

    Attributes:
        stoi: symbol -> integer id.
        itos: integer id -> symbol.
        block_size: number of tokens in each input/target tensor.
        vocab_size: number of distinct symbols in ``data``.
        data: the sequence passed to the constructor (stored as-is).
    """

    def __init__(self, data, block_size):
        alphabet = sorted(set(data))
        self.stoi = {}
        self.itos = {}
        for index, symbol in enumerate(alphabet):
            self.stoi[symbol] = index
            self.itos[index] = symbol
        self.block_size = block_size
        self.vocab_size = len(alphabet)
        self.data = data

    def __len__(self):
        # One sample per start offset that still leaves room for a full window.
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        stop = idx + self.block_size + 1
        encoded = [self.stoi[symbol] for symbol in self.data[idx:stop]]
        inputs = torch.tensor(encoded[:-1], dtype = torch.long)
        targets = torch.tensor(encoded[1:], dtype = torch.long)
        return inputs, targets


seq_len = 128
# Model hyperparameters. The model cell reads `embed_dim` and `n_heads` as
# module-level globals (Block's default argument, the SelfAttention call and
# the tinyGPT(...) construction) but nothing in this file defined them, which
# raised NameError. The values mirror SelfAttention's own defaults -- adjust
# them here if a different model size is wanted.
embed_dim = 512
n_heads = 4
# Code for using a text corpus contained within a .txt file
# data = open('input.txt', 'r').read()
# data = list(data)

from torchtext.datasets import WikiText2
text_iter = WikiText2(split = 'train')
train_data = []
for sequence in text_iter:
    # Each item is a string; extend() appends its characters one by one.
    train_data.extend(sequence)
# print(len(train_data)) # 10780437

# train_dataset = WikiDataset(train_data[:10000], seq_len) # small subset for quick experiments
train_dataset = WikiDataset(train_data, seq_len) # character-level windows of seq_len tokens
text_iter = WikiText2(split = 'test')
test_data = []
for sequence in text_iter:
    test_data.extend(sequence)
# print(len(test_data))

# The test split must be encoded with the *training* vocabulary: the model's
# embedding and output layers are indexed by train_dataset.stoi, so a
# vocabulary rebuilt from the test text would assign different ids to the same
# characters and make the test loss meaningless. Characters that never occur
# in the training text have no id and are dropped.
test_data = [ch for ch in test_data if ch in train_dataset.stoi]
# test_dataset = WikiDataset(test_data[:1000], seq_len) # small subset for quick experiments
test_dataset = WikiDataset(test_data, seq_len)
test_dataset.stoi = train_dataset.stoi
test_dataset.itos = train_dataset.itos
test_dataset.vocab_size = train_dataset.vocab_size

vocab_size = train_dataset.vocab_size
from torch.utils.data.dataloader import DataLoader
batch_size = 512 #500
train_loader = DataLoader(train_dataset, shuffle = True, pin_memory = True, batch_size = batch_size, num_workers = 2) # num_workers = 4
test_loader = DataLoader(test_dataset, shuffle = True, pin_memory = True, batch_size = batch_size, num_workers = 0)





In [9]:
# @title model
import torch
import torch.nn as nn
import math

class SelfAttention(nn.Module): # https://github.com/Sam-Armstrong/tinyGPT/blob/main/SelfAttention.py
    """Causally masked multi-head self-attention for use in the transformer model.

    Args:
        seq_len: side length of the lower-triangular mask, i.e. the longest
            sequence the layer can attend over. Defaults to the module-level
            ``seq_len`` at class-definition time.
        emb_dim: width of the input and output features.
        n_heads: number of heads; each head gets ``emb_dim // n_heads`` features.
    """
    def __init__(self, seq_len=seq_len, emb_dim = 512, n_heads = 4):
        super().__init__()
        self.emb_dim = emb_dim
        self.n_heads = n_heads
        self.head_dim = emb_dim // n_heads
        # Bias-free projections, created in the same order as before so that
        # seeded initialisation is unchanged.
        self.values = nn.Linear(emb_dim, emb_dim, bias = False)
        self.keys = nn.Linear(emb_dim, emb_dim, bias = False)
        self.queries = nn.Linear(emb_dim, emb_dim, bias = False)
        self.projection = nn.Linear(emb_dim, emb_dim, bias = False)
        self.softmax = nn.Softmax(dim = -1)
        # Lower-triangular ones: position q may only look at positions k <= q.
        causal = torch.tril(torch.ones(seq_len, seq_len))
        self.register_buffer("mask", causal.view(1, 1, seq_len, seq_len))

    def forward(self, x):
        """Attend over ``x`` of shape (batch, tokens, emb_dim); output has the same shape."""
        n_batch, n_tokens = x.shape[0], x.shape[1]
        per_head = (n_batch, n_tokens, self.n_heads, self.head_dim)
        v = self.values(x).reshape(per_head)
        k = self.keys(x).reshape(per_head)
        q = self.queries(x).reshape(per_head)
        # Query/key dot products, laid out as (batch, head, query, key).
        scores = torch.einsum('nqhd,nkhd->nhqk', [q, k])
        hidden = self.mask[:, :, :n_tokens, :n_tokens] == 0
        scores = scores.masked_fill(hidden, float('-inf'))
        # NOTE(review): scores are scaled by sqrt(emb_dim), not sqrt(head_dim)
        # as in standard scaled dot-product attention -- confirm this is intended.
        weights = self.softmax(scores / math.sqrt(self.emb_dim))
        # Weighted sum of values, then merge the heads back into emb_dim.
        mixed = torch.einsum('nhql,nlhd->nqhd', [weights, v])
        merged = mixed.reshape(n_batch, n_tokens, self.emb_dim)
        return self.projection(merged)

class Block(nn.Module): # https://github.com/Sam-Armstrong/tinyGPT/blob/main/TransformerBlock.py
    """A single pre-norm transformer decoder block.

    Two sub-layers, each wrapped in a residual (skip) connection:
    ``x + attention(ln1(x))`` followed by ``x + dropout(mlp(ln2(x)))``.

    Args:
        emb_dim: feature width of the block. Defaults to the module-level
            ``embed_dim`` at class-definition time.

    The attention layer is built from the module-level ``seq_len`` and
    ``n_heads``, which are looked up when the block is constructed.
    """
    def __init__(self, emb_dim=embed_dim):
        super().__init__()
        self.ln1 = nn.LayerNorm(emb_dim)
        self.ln2 = nn.LayerNorm(emb_dim)
        self.attention = SelfAttention(seq_len=seq_len, emb_dim=emb_dim, n_heads=n_heads)
        self.fc1 = nn.Linear(emb_dim, 4 * emb_dim, bias = False)
        self.fc2 = nn.Linear(4 * emb_dim, emb_dim, bias = False)
        self.silu = nn.SiLU()
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Apply the block to ``x``; the output has the same shape as the input."""
        # Attention sub-layer. Out-of-place addition keeps the caller's tensor
        # untouched without needing an explicit clone.
        x = x + self.attention(self.ln1(x))
        # MLP sub-layer. The original saved the residual here but never added
        # it back, so the skip connection around the MLP was silently missing.
        hidden = self.fc2(self.silu(self.fc1(self.ln2(x))))
        return x + self.dropout(hidden)

class tinyGPT(nn.Module): # https://github.com/Sam-Armstrong/tinyGPT/blob/main/Model.py
    """Small decoder-only language model built from two ``Block`` layers.

    Args:
        vocab_size: number of token ids; also the width of the output logits.
        embed_dim: feature width used throughout the model.
        seq_len: number of positions covered by the learned position embedding.
    """
    def __init__(self, vocab_size, embed_dim, seq_len):
        super().__init__()
        # Layers are created in the same order as before so that seeded
        # initialisation and parameter ordering are unchanged.
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.position_embedding = nn.Parameter(torch.zeros(1, seq_len, embed_dim))
        self.projection = nn.Linear(embed_dim * 2, embed_dim, bias = False)
        nn.init.normal_(self.projection.weight, mean = 0.0, std = 0.02)
        self.dropout = nn.Dropout(0.1)
        layers = [Block(embed_dim) for _ in range(2)] # n_layers = 2
        self.blocks = nn.Sequential(*layers)
        self.ln_out = nn.LayerNorm(embed_dim)
        self.fc_out = nn.Linear(embed_dim, vocab_size, bias = False)
        self.seq_len = seq_len

    def forward(self, idx):
        """Map token ids of shape (batch, tokens) to logits of shape (batch, tokens, vocab_size)."""
        n_batch, n_tokens = idx.shape[0], idx.shape[1]
        tokens = self.embedding(idx)
        # One copy of the first n_tokens position vectors per batch element.
        positions = self.position_embedding[:, :n_tokens, :].repeat(n_batch, 1, 1)
        # Token and position embeddings are concatenated along the feature
        # axis and projected back down to the embedding width.
        fused = self.projection(torch.concat((tokens, positions), dim = -1))
        hidden = self.blocks(self.dropout(fused))
        return self.fc_out(self.ln_out(hidden))


# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Build the model from the globals set up by the data cell and move it to `device`.
model = tinyGPT(vocab_size, embed_dim, seq_len)
model = model.to(device)


In [13]:
# @title train test
from torch.nn import functional as F

def test(model, x, max_steps = 512):
    seq_len = model.seq_len
    model.eval()
    print("test",x)
    for n in range(max_steps):
        if x.shape[1] <= seq_len:
            x_bar = x
        else:
            x_bar = x[:, -seq_len:]
        # print("test",x_bar)
        output = model(x_bar)
        # print("output",output)
        output = output[:, -1, :]
        output = F.softmax(output, dim = -1)
        ix = torch.multinomial(output, num_samples = 1)
        x = torch.cat((x, ix), dim = 1)
    return x

from tqdm import tqdm
def train(loader, model, loss_fn, optimizer, epoch_idx = None):
    """Run one training pass over ``loader``.

    Args:
        loader: iterable of ``(x, y)`` batches; must support ``len()``.
        model: module being trained; put into train mode.
        loss_fn: callable ``(logits, y) -> scalar loss tensor``.
        optimizer: optimizer stepping ``model``'s parameters.
        epoch_idx: zero-based epoch number shown in the progress bar. When
            omitted, the module-level ``epoch`` variable is used if it exists
            (the behaviour the original relied on), otherwise 0.

    Returns:
        Mean training loss over the batches as a float (NaN if ``loader``
        yields no batches).

    Reads the module-level ``device`` to place each batch.
    """
    if epoch_idx is None:
        # The original read the global `epoch` directly and raised NameError
        # when called outside the training loop that defines it.
        epoch_idx = globals().get("epoch", 0)
    model.train()
    losses = []
    pbar = tqdm(enumerate(loader), total = len(loader))
    for it, (x, y) in pbar:
        x = x.to(device)
        y = y.to(device)
        # Force autograd on even if the caller is inside a no_grad context.
        with torch.set_grad_enabled(True):
            logits = model(x)
            loss = loss_fn(logits, y)
        loss_value = loss.item()
        losses.append(loss_value)
        model.zero_grad()
        loss.backward()
        # Clip the global gradient norm to 1 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
        pbar.set_description(f"epoch {epoch_idx + 1} iter {it}: train loss {loss_value:.5f}.")
    return sum(losses) / len(losses) if losses else float("nan")

import numpy as np
def eval(loader, model, loss_fn):
    """Return the mean loss of ``model`` over every batch in ``loader``.

    The model is switched to eval mode and all forward passes run with
    autograd disabled. Each batch is moved to the module-level ``device``.

    Args:
        loader: iterable of ``(x, y)`` batches.
        model: module to evaluate.
        loss_fn: callable ``(logits, y) -> scalar loss tensor``.

    Returns:
        The arithmetic mean of the per-batch losses as a Python float.
    """
    model.eval()
    batch_losses = []
    with torch.set_grad_enabled(False):
        for x, y in loader:
            x = x.to(device)
            y = y.to(device)
            batch_loss = loss_fn(model(x), y)
            batch_losses.append(batch_loss.item())
    return float(np.mean(batch_losses))


In [21]:
# @title wwwwwwwwwwww

# AdamW with lr 1e-4 (3e-4 was also tried) and betas (0.9, 0.95).
optimizer = torch.optim.AdamW(model.parameters(), lr = 1e-4, betas = (0.9, 0.95))
criterion = nn.CrossEntropyLoss()

def loss_fn(logits, y):
    """Cross-entropy between per-position logits and target token ids.

    Both tensors are flattened over their leading dimensions first, e.g.
    logits [500, 128, 283] and y [500, 128].
    """
    flat_logits = logits.view(-1, logits.size(-1))
    flat_targets = y.view(-1)
    return criterion(flat_logits, flat_targets)

# The loop variable must stay named `epoch`: train() reads it as a global for
# its progress-bar text.
for epoch in range(1):
    train(train_loader, model, loss_fn, optimizer)
    test_loss = eval(test_loader, model, loss_fn)
    print('Test Loss:', test_loss)


  0%|          | 0/21561 [00:00<?, ?it/s]

torch.Size([500, 128, 283]) torch.Size([500, 128])


epoch 1 iter 0: train loss 1.17436.:   0%|          | 1/21561 [00:00<5:22:25,  1.11it/s]

torch.Size([500, 128, 283]) torch.Size([500, 128])


epoch 1 iter 1: train loss 1.38244.:   0%|          | 2/21561 [00:01<4:47:28,  1.25it/s]

torch.Size([500, 128, 283]) torch.Size([500, 128])


epoch 1 iter 2: train loss 1.29274.:   0%|          | 3/21561 [00:02<4:37:50,  1.29it/s]

torch.Size([500, 128, 283]) torch.Size([500, 128])


epoch 1 iter 3: train loss 1.27355.:   0%|          | 3/21561 [00:03<6:11:54,  1.04s/it]


KeyboardInterrupt: ignored

In [15]:
# @title inference
# Prompt to continue; every character must exist in the training vocabulary.
context = "This is what "
#context = 'There are many things about horses that have been discovered in recent'
print(list(map(train_dataset.stoi.__getitem__, context)))
# Encode the prompt and add a leading batch dimension of size 1.
x = torch.tensor([train_dataset.stoi[ch] for ch in context], dtype = torch.long).unsqueeze(0).to(device)
# Sample a continuation and keep the single sequence in the batch.
y = test(model, x)[0]
# Map the token ids back to characters.
completion = ''.join(train_dataset.itos[int(token)] for token in y)
print(completion)



[53, 72, 73, 83, 1, 73, 83, 1, 87, 72, 65, 84, 1]
test tensor([[53, 72, 73, 83,  1, 73, 83,  1, 87, 72, 65, 84,  1]], device='cuda:0')
This is what was driven out that they fiy have personed to ice a potential reference broadcast be ready gquarter for souphers . He was an also stignal field on the widely going atoms and features are often in the BC — the first fund will to send a throw skill that to 20 cm ( 130 ft ) , respectively ; the other : no — her culture week , though he theory <unk> mayor , enough threaten than owly armor . The budget his reasons when the other and performed every deal is a lengthy ait for most not completely paid that the poor


In [19]:
# Inline version of the sampling loop in test(), without its debug print.
context = "Where the "
x = torch.tensor([train_dataset.stoi[s] for s in context], dtype = torch.long)[None,...].to(device)
# print("test",x)

seq_len = model.seq_len
model.eval()
# Sampling never backpropagates; without no_grad each of the 512 forward
# passes would record an autograd graph and waste memory.
with torch.no_grad():
    for n in range(512):
        # At most the last seq_len tokens are fed to the model; the slice
        # returns the whole sequence while it is still shorter than that.
        x_bar = x[:, -seq_len:]
        # print("test",x_bar.shape) # [1, len(context)+]
        output = model(x_bar)
        # print("output",output.shape) # [1, len(context)+, vocab_size=283] float
        output = output[:, -1, :] # logits for the last character
        output = F.softmax(output, dim = -1) # logits -> probabilities over the vocabulary
        ix = torch.multinomial(output, num_samples = 1)
        x = torch.cat((x, ix), dim = 1)

y=x[0]
# print(y)
completion = ''.join([train_dataset.itos[int(i)] for i in y])
print(completion)


Where the Southern songs , choosing nigalley strictly launched an independent six of <unk> . Yaruz , but Walpole had been the hurricane of pink for <unk> on Solis Cricket Give Mission ruled the First Range . 
 
 = = = History of United = = = 
 
 
 = = Rew = = 
 
 
 = Direct responded standard internal studios existence on January 3 , 2002 , the Da <unk> Stals mhor to the practice of Snake , and used the remarks of video game 's The Gold <unk> . This reaction that Blu rest a large put the site afternoon his felt . <un


In [20]:

# path = "/content/drive/MyDrive/frame/tinyGPT.pth"
# torch.save(model.state_dict(), path)
# # model.load_state_dict(torch.load(path))
