<a href="https://colab.research.google.com/github/eisbetterthanpi/transformer/blob/main/charGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title setup
!pip install torchdata
!pip install portalocker

In [1]:
# @title data
# https://github.com/Sam-Armstrong/tinyGPT/blob/main/Training.py
# https://colab.research.google.com/github/karpathy/minGPT/blob/master/play_char.ipynb

import torch
import torch.nn as nn
from torch.utils.data import Dataset

class CharDataset(Dataset): # https://github.com/karpathy/minGPT
    """Character-level next-token dataset.

    The vocabulary is the sorted set of characters found in the corpus. Each
    item is a pair ``(x, y)`` of token-id tensors where ``y`` is ``x`` shifted
    one position to the right, i.e. the next-character targets.

    Args:
        raw_data: iterable of strings; the pieces are concatenated into one corpus.
        block_size: number of characters in every example.
    """

    def __init__(self, raw_data, block_size):
        data = ''.join(raw_data)
        chars = sorted(set(data))
        self.vocab_size = len(chars) # 283
        self.stoi = {ch: i for i, ch in enumerate(chars)}
        self.itos = {i: ch for i, ch in enumerate(chars)}
        self.data = self.data_process(data) # 1-D tensor of token ids
        self.block_size = block_size

    def data_process(self, data): # str 10780437
        """Encode a string into a 1-D int64 tensor of vocabulary ids.

        Raises:
            KeyError: if ``data`` contains a character that is not in the
                vocabulary (instead of the opaque torch error produced by
                feeding ``None`` into ``torch.tensor``).
        """
        # Explicit dtype so an empty corpus still yields an integer tensor.
        return torch.tensor([self.stoi[c] for c in data], dtype=torch.long)

    def __len__(self):
        # Every window needs block_size + 1 characters (inputs plus shifted targets).
        # Clamp at zero: a negative length would make len() raise ValueError.
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, idx):
        window = self.data[idx:idx + self.block_size + 1]
        x, y = window[:-1], window[1:]
        return x, y

# data = list(open('input.txt', 'r').read()) # for using a text corpus contained within a .txt file
from torchtext.datasets import WikiText2
train_iter, val_iter, test_iter = WikiText2() # line by line of wiki  = Valkyria Chronicles III =
seq_len = 128 # characters per training example
train_dataset = CharDataset(train_iter, seq_len)

# The model's embedding and output layers are indexed by the *training* vocabulary,
# so held-out text has to be encoded with train_dataset.stoi. A CharDataset built
# independently from test_iter derives its own char->id table, which makes the
# reported test loss meaningless. Characters never seen in training are dropped
# because the vocabulary has no <unk> slot for them.
test_text = ''.join(c for c in ''.join(test_iter) if c in train_dataset.stoi)
test_dataset = CharDataset(test_text, seq_len)
test_dataset.stoi, test_dataset.itos = train_dataset.stoi, train_dataset.itos
test_dataset.vocab_size = train_dataset.vocab_size
test_dataset.data = test_dataset.data_process(test_text) # re-encode with the shared table

from torch.utils.data.dataloader import DataLoader
batch_size = 512 #512
train_loader = DataLoader(train_dataset, shuffle = True, pin_memory = True, batch_size = batch_size, num_workers = 2) # num_workers = 4
test_loader = DataLoader(test_dataset, shuffle = True, pin_memory = True, batch_size = batch_size, num_workers = 0)

def encode(context):
    """Map each character of `context` to its training-vocabulary id; returns a tensor on `device`."""
    lookup = train_dataset.stoi.get
    return torch.tensor(list(map(lookup, context)), device=device)
def decode(x):
    """Turn an iterable of token ids back into text using the training vocabulary."""
    id_to_char = train_dataset.itos
    return ''.join(id_to_char[int(token)] for token in x)
# for x,y in train_loader:
#     break
# n=2
# print(decode(x[n]))
# print(decode(y[n]))


In [7]:
# @title model from scratch
import torch
import torch.nn as nn
import math
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class PositionalEncoder(nn.Module):
    """Fixed sinusoidal positional encoding followed by dropout.

    Args:
        d_model: embedding width (even or odd).
        dropout: dropout probability applied after adding the encoding.
        max_seq_length: number of positions precomputed; inputs longer than
            this cannot be encoded.
    """

    def __init__(self, d_model, dropout=0.1, max_seq_length=512):
        super().__init__()
        self.drop = nn.Dropout(dropout)
        pe = torch.zeros(max_seq_length, d_model)
        pos = torch.arange(0, max_seq_length).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        angles = pos * div_term # [max_seq_length, ceil(d_model / 2)]
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer cosine column than sine columns;
        # trim so the assignment does not fail on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles)[:, : d_model // 2]
        # Buffer (not a parameter): moves with .to()/.cuda() but is never trained.
        self.register_buffer('pe', pe.unsqueeze(0)) # [1, max_seq_length, d_model]

    def forward(self, x):
        """Add the encoding for the first ``x.size(1)`` positions to ``x`` ([batch, seq, d_model])."""
        return self.drop(x + self.pe[:, : x.size(1)])


class LearntPosEnc(nn.Module): # learnt positional embeddings
    """Learned positional embedding added to token embeddings, followed by dropout.

    Args:
        d_model: embedding width.
        dropout: dropout probability applied after adding the position embedding.
        max_length: number of distinct positions that can be embedded.
    """

    def __init__(self, d_model, dropout=0.1, max_length=512):
        super().__init__()
        self.pos_embedding = nn.Embedding(max_length, d_model)
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        """Add position embeddings to ``x`` of shape [batch, src_len, d_model]."""
        # x is the already-embedded sequence (3-D), so take the length from dim 1
        # instead of unpacking x.shape into two names, which fails for 3-D input.
        src_len = x.shape[1]
        # Build indices on x's own device rather than relying on a global `device`.
        pos = torch.arange(src_len, device=x.device) # [src_len]
        # [src_len, d_model] broadcasts across the batch dimension.
        return self.drop(x + self.pos_embedding(pos))


class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        d_model: model width; must be divisible by ``n_heads``.
        n_heads: number of attention heads.
        dropout: dropout probability applied to the attention weights.

    Raises:
        ValueError: if ``d_model`` is not a multiple of ``n_heads``.
    """

    def __init__(self, d_model, n_heads, dropout=0):
        super().__init__()
        if d_model % n_heads != 0:
            raise ValueError(f"d_model ({d_model}) must be divisible by n_heads ({n_heads})")
        self.d_model = d_model
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads
        self.q = nn.Linear(d_model, d_model, bias=False)
        self.k = nn.Linear(d_model, d_model, bias=False)
        self.v = nn.Linear(d_model, d_model, bias=False)
        self.lin = nn.Linear(d_model, d_model)
        self.drop = nn.Dropout(dropout)
        # Registered as a non-persistent buffer so it follows the module across
        # .to()/.cuda() without depending on a global `device`, and without adding
        # a key to state_dict (existing checkpoints still load).
        # It stays a float32 tensor on purpose: dividing by it keeps the scores in
        # float32 under autocast, so the -1e10 mask fill below stays representable.
        self.register_buffer(
            'scale',
            torch.sqrt(torch.tensor((self.head_dim,), dtype=torch.float)),
            persistent=False,
        )

    def forward(self, query, key, value, mask=None):
        """Attend ``query`` over ``key``/``value``.

        Args:
            query, key, value: tensors of shape [batch, seq, d_model].
            mask: optional tensor broadcastable to [batch, n_heads, q_len, k_len];
                positions where it equals 0 are excluded from attention.

        Returns:
            Tuple ``(output, attention)``: output is [batch, q_len, d_model] and
            attention holds the softmax weights [batch, n_heads, q_len, k_len].
        """
        batch_size = query.shape[0]
        # [batch, seq, d_model] -> [batch, n_heads, seq, head_dim]
        Q = self.q(query).view(batch_size, -1, self.n_heads, self.head_dim).transpose(1, 2)
        K = self.k(key).view(batch_size, -1, self.n_heads, self.head_dim).transpose(1, 2)
        V = self.v(value).view(batch_size, -1, self.n_heads, self.head_dim).transpose(1, 2)
        attn = Q @ K.transpose(2, 3) / self.scale
        if mask is not None:
            attn = attn.masked_fill(mask == 0, -1e10)
        attention = torch.softmax(attn, dim=-1)
        x = self.drop(attention) @ V
        # Merge heads back: [batch, n_heads, seq, head_dim] -> [batch, seq, d_model]
        x = x.transpose(1, 2).reshape(batch_size, -1, self.d_model)
        x = self.lin(x)
        return x, attention


class EncoderLayer(nn.Module):
    """One post-norm transformer layer: self-attention, then a feed-forward net.

    Each sub-layer output goes through dropout, is added to its input
    (residual connection) and is then layer-normalised.
    """

    def __init__(self, d_model, n_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.drop = nn.Dropout(dropout)
        self.self_attn = MultiHeadAttention(d_model, n_heads, dropout=0)
        feed_forward = [
            nn.Linear(d_model, ff_dim),
            nn.ReLU(), # ReLU GELU SiLU
            nn.Dropout(dropout),
            nn.Linear(ff_dim, d_model),
        ]
        self.ff = nn.Sequential(*feed_forward)

    def forward(self, src, src_mask):
        """Apply the layer to `src`; `src_mask` is forwarded to the attention module."""
        attended, _ = self.self_attn(src, src, src, src_mask) # attention weights unused
        hidden = self.norm1(src + self.drop(attended))
        return self.norm2(hidden + self.drop(self.ff(hidden)))

class Encoder(nn.Module):
    """A stack of `n_layers` identical `EncoderLayer` modules applied in sequence."""

    def __init__(self, d_model, n_layers, n_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.layers = nn.ModuleList(
            EncoderLayer(d_model, n_heads, ff_dim, dropout) for _ in range(n_layers)
        )

    def forward(self, src, src_mask):
        """Run `src` through every layer, passing the same `src_mask` to each."""
        hidden = src
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, src_mask)
        return hidden


class Gpt(nn.Module):
    """Token embedding + positional encoding + encoder stack + vocabulary projection.

    `forward` returns one logit vector of size `vocab_size` per input position.
    """

    def __init__(self, vocab_size, d_model=512, nhead=8, n_layers=3, ff_dim=512, dropout=0.1):
        super().__init__()
        self.encoder = Encoder(d_model, n_layers, nhead, ff_dim, dropout)
        self.pos_enc = PositionalEncoder(d_model, dropout=dropout)
        # self.pos_enc = LearntPosEnc(d_model, dropout=dropout)
        self.src_tok_emb = nn.Embedding(vocab_size, d_model)
        self.d_model = d_model
        self.lin = nn.Linear(d_model, vocab_size)
        # Xavier-initialise every parameter with more than one dimension;
        # 1-D parameters keep their default initialisation.
        for weight in filter(lambda p: p.dim() > 1, self.parameters()):
            nn.init.xavier_uniform_(weight)

    # def forward(self, src, trg, src_mask=None, trg_mask=None):
    def forward(self, src, src_mask=None):
        """Map token ids `src` to logits; `src_mask` is handed to the encoder unchanged."""
        embedded = self.src_tok_emb(src) * math.sqrt(self.d_model)
        hidden = self.encoder(self.pos_enc(embedded), src_mask)
        return self.lin(hidden)

def make_trg_mask(trg):
    """Build a causal (lower-triangular) attention mask for ``trg``.

    Args:
        trg: tensor whose second dimension is the sequence length, e.g. [batch, trg_len].

    Returns:
        Bool tensor [trg_len, trg_len] on ``trg``'s device; entry (i, j) is True
        when position i may attend to position j (j <= i).
    """
    # trg_pad_mask = (trg != PAD_IDX).unsqueeze(1).unsqueeze(2).to(device)
    trg_len = trg.shape[1]
    # Allocate on the input's device instead of the global `device`, so the mask
    # always matches the tensors it is applied to.
    trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device=trg.device)).bool()
    # trg_mask = trg_pad_mask & trg_sub_mask
    # return trg_mask
    return trg_sub_mask


# Size the embedding and output layers from the training vocabulary.
vocab_size = train_dataset.vocab_size
# model = Seq2Seq(in_dim, out_dim, d_model=512, nhead=8, enc_layers=3, dec_layers=3, ff_dim=512, dropout=0.1).to(device)
# Two encoder layers, four heads, 512-wide embeddings, 2048-wide feed-forward; moved to `device`.
model = Gpt(vocab_size, d_model=512, nhead=4, n_layers=2, ff_dim=2048, dropout=0.1).to(device)


In [None]:
# @title wandb
# https://docs.wandb.ai/quickstart
!pip install wandb
import wandb
# Authenticate interactively (prompt) or via the WANDB_API_KEY environment variable.
# SECURITY: an API key had been pasted into a comment on this line. Never commit keys;
# the exposed key should be revoked and rotated.
wandb.login()
# Start a run in the "tiny_gpt" project; the config entries are free-form labels for this experiment.
run = wandb.init(
    project="tiny_gpt",
    config={
        "model": "adam 1e-3",
        "optim": "adam",
        # "learning_rate": 5,
    })


In [4]:
# @title train test generate
import torch
# Permit TF32 in CUDA matmuls and cuDNN ops (trades a little precision for speed on GPUs that support it).
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
# Module-level gradient scaler; strain() reads this global for its mixed-precision backward/step.
scaler = torch.cuda.amp.GradScaler()

def strain(model, dataloader, optimizer, loss_fn, scheduler=None): # train function with automatic mixed precision
    """Train `model` for one pass over `dataloader` with automatic mixed precision.

    Uses the module-level `scaler`, `device` and `make_trg_mask`. `scheduler` is
    accepted but currently unused (the step call is disabled). Returns the mean
    of the per-batch losses.
    """
    model.train()
    running_loss = 0.
    for inputs, labels in dataloader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        with torch.cuda.amp.autocast(): # automatic mixed precision
            causal_mask = make_trg_mask(inputs)
            logits = model(inputs, causal_mask)
            # Flatten to [batch * seq, vocab] vs [batch * seq] for the loss.
            loss = loss_fn(logits.reshape(-1, logits.size(-1)), labels.flatten()) # [512, 128, 283], [512, 128]
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        scaler.step(optimizer)
        scaler.update()
        # if scheduler is not None: scheduler.step()
        batch_loss = loss.item()
        running_loss += batch_loss
        # Logging is best-effort: skipped when the wandb cell was not run.
        # NOTE(review): the logged value is the batch loss divided by the batch size;
        # with a mean-reduced loss_fn that is not a per-sample loss — confirm intent.
        try:
            wandb.log({"train loss": batch_loss / len(labels)})
        except NameError:
            pass
    return running_loss / len(dataloader)


from tqdm import tqdm
def train(loader, model, loss_fn, optimizer, epoch=None):
    """Train `model` for one pass over `loader` in full precision, with a progress bar.

    Args:
        loader: iterable of ``(x, y)`` batches that also supports ``len()``.
        model: called as ``model(x, mask)`` and expected to return logits.
        loss_fn: called with flattened logits ``[batch * seq, vocab]`` and targets ``[batch * seq]``.
        optimizer: optimizer stepping ``model``'s parameters.
        epoch: optional zero-based epoch index shown in the progress bar. When
            omitted, a module-level ``epoch`` variable is used if one exists
            (the previous behaviour); otherwise the epoch is left out of the
            description instead of raising NameError.

    Returns:
        Mean of the per-batch losses.
    """
    model.train()
    if epoch is None:
        epoch = globals().get('epoch') # notebook-level loop variable, if any
    epoch_label = '' if epoch is None else f"epoch {epoch + 1} "
    total_loss = 0
    pbar = tqdm(enumerate(loader), total = len(loader))
    for it, (x, y) in pbar:
        x, y = x.to(device), y.to(device)
        mask = make_trg_mask(x)
        logits = model(x, mask)
        loss = loss_fn(logits.reshape(-1, logits.size(-1)), y.flatten()) # [512, 128, 283], [512, 128]
        optimizer.zero_grad()
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
        loss_value = loss.item() # read once: every .item() call forces a device sync
        total_loss += loss_value
        # Logging is best-effort: skipped when the wandb cell was not run.
        try: wandb.log({"train loss": loss_value/len(y)})
        except NameError: pass
        pbar.set_description(f"{epoch_label}iter {it}: train loss {loss_value:.5f}.")
    return total_loss / len(loader)

def test(loader, model, loss_fn):
    """Evaluate `model` on `loader` and return the mean per-batch loss.

    The same causal mask as in training is applied. Without it every position
    can attend to later characters, so the model is evaluated under conditions
    it was never trained for and the reported loss is not comparable to the
    training loss.
    """
    model.eval()
    total_loss = 0
    with torch.no_grad(): # no autograd graph needed for the forward pass or the loss
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            logits = model(x, make_trg_mask(x))
            loss = loss_fn(logits.reshape(-1, logits.size(-1)), y.flatten())
            total_loss += loss.item()
    return total_loss / len(loader)


from torch.nn import functional as F
def generate(model, context, max_steps = 512):
    """Sample `max_steps` characters from `model`, starting from the string `context`.

    Args:
        model: called as ``model(ids, mask)`` and expected to return logits [1, len, vocab_size].
        context: non-empty prompt; every character must be in the training vocabulary.
        max_steps: number of characters to sample.

    Returns:
        The prompt followed by the sampled characters.

    Raises:
        ValueError: if `context` is empty or contains characters unknown to
            the training vocabulary.
    """
    if not context:
        raise ValueError("context must contain at least one character")
    stoi = train_dataset.stoi
    unknown = sorted(c for c in set(context) if c not in stoi)
    if unknown:
        raise ValueError(f"context contains characters outside the training vocabulary: {unknown!r}")
    x = torch.tensor([stoi[c] for c in context], device=device)
    model.eval()
    with torch.no_grad(): # sampling only; avoids building an autograd graph each step
        for _ in range(max_steps):
            # Feed at most the last seq_len tokens (the training window length).
            x_bar = x[-seq_len:].unsqueeze(0) # [1, len]
            # Use the same causal mask as in training so intermediate positions
            # cannot attend to later tokens.
            output = model(x_bar, make_trg_mask(x_bar)) # [1, len, vocab_size]
            probs = F.softmax(output[:, -1, :], dim = -1) # distribution over the next character
            ix = torch.multinomial(probs, num_samples = 1) # sample according to that distribution
            x = torch.cat((x, ix.flatten()))
    completion = ''.join([train_dataset.itos[int(i)] for i in x])
    return completion



In [None]:
# @title run
# Optimizer settings tried so far: AdamW 1e-4 / 1e-3, SGD 1e-3.
# AdamW 1e-4 1e-3
# sgd 1e-3
# Positional arguments: learning rate 1e-3, betas (0.9, 0.95).
optimizer = torch.optim.AdamW(model.parameters(), 1e-3, (0.9, 0.95)) # lr = 1e-4 #3e-4
# optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) # 5. , 0.001
criterion = nn.CrossEntropyLoss()
# print("lr: ", optimizer.param_groups[0]['lr'])
# optimizer.param_groups[0]['lr']=1e-3

# Single epoch: mixed-precision training pass, then evaluation on the test loader.
# `epoch` is also read as a global by train() for its progress-bar text.
for epoch in range(1):
    train_loss = strain(model, train_loader, optimizer, criterion, scheduler=None)
    # train_loss = train(train_loader, model, criterion, optimizer)
    test_loss = test(test_loader, model, criterion)
    # Only the test loss is printed; train_loss is computed but not reported here.
    print('Test Loss:', test_loss)


In [10]:
# @title inference
# Prompt for sampling; generate() looks each character up in the training vocabulary.
context = "This is what "
#context = 'There are many things about horses that have been discovered in recent'
# Samples with generate()'s default max_steps and returns the prompt plus the continuation.
completion = generate(model, context)
print(completion)


This is what they had with the pass from theis to State with the title audio ... The capturity is a too experience of <unk> defeates and their <unk> and marked Street Sports High School basses . 
 As a result , <unk> is experiences for more what pris , occurs thers , sounds room attracted casting , fully fourtebally new produced thereance sitcom for their laught advance @-@ production is finally complete . Together , which cared four erect based may ther enough septifiable between ther ther thant and two Scandinaviana l


In [11]:
# @title save
# Checkpoint location; presumably requires Google Drive to be mounted at /content/drive — verify before running.
path = "/content/drive/MyDrive/frame/tinyGPTscratchAdamW1e-3.pth"

# Save only the weights (state_dict), not the whole module object.
torch.save(model.state_dict(), path)

# To restore: rebuild the model with the same hyperparameters, then
# model.load_state_dict(torch.load(path, map_location=device))


In [53]:
# Release cached, currently unused GPU memory held by PyTorch's allocator.
torch.cuda.empty_cache()

### trash

In [None]:
# @title og tinyGPT
import torch
import torch.nn as nn
import math

class SelfAttention(nn.Module): # https://github.com/Sam-Armstrong/tinyGPT/blob/main/SelfAttention.py
    """Causal multi-head self-attention for the transformer model.

    A lower-triangular mask of size `seq_len` is stored as a buffer, so each
    position can only attend to itself and earlier positions.
    """

    def __init__(self, seq_len, emb_dim, n_heads):
        super().__init__()
        self.emb_dim = emb_dim
        self.n_heads = n_heads
        self.head_dim = emb_dim // n_heads
        self.values = nn.Linear(emb_dim, emb_dim, bias=False)
        self.keys = nn.Linear(emb_dim, emb_dim, bias=False)
        self.queries = nn.Linear(emb_dim, emb_dim, bias=False)
        self.projection = nn.Linear(emb_dim, emb_dim, bias=False)
        self.softmax = nn.Softmax(dim=-1)
        causal = torch.ones(seq_len, seq_len).tril()
        self.register_buffer("mask", causal.view(1, 1, seq_len, seq_len))

    def forward(self, x):
        """Attend over `x` ([batch, seq, emb_dim]) and return a tensor of the same shape."""
        n_batch, n_tokens = x.shape[0], x.shape[1]
        per_head = (n_batch, n_tokens, self.n_heads, self.head_dim)
        v = self.values(x).reshape(per_head)
        k = self.keys(x).reshape(per_head)
        q = self.queries(x).reshape(per_head)
        # Query/key dot products per head: [batch, heads, query, key].
        scores = torch.einsum('nqhd,nkhd->nhqk', [q, k])
        is_future = self.mask[:, :, :n_tokens, :n_tokens] == 0
        scores = scores.masked_fill(is_future, float('-inf'))
        # NOTE(review): scores are scaled by sqrt(emb_dim), not sqrt(head_dim);
        # kept exactly as in the referenced tinyGPT source — confirm this is intended.
        weights = self.softmax(scores / math.sqrt(self.emb_dim))
        mixed = torch.einsum('nhql,nlhd->nqhd', [weights, v])
        return self.projection(mixed.reshape(n_batch, n_tokens, self.emb_dim))

class Block(nn.Module): # https://github.com/Sam-Armstrong/tinyGPT/blob/main/TransformerBlock.py
    """One transformer decoder block: pre-norm attention and pre-norm MLP, each added residually."""

    def __init__(self, seq_len, emb_dim, n_heads):
        super().__init__()
        hidden_dim = 4 * emb_dim
        self.attn_blk = nn.Sequential(
            nn.LayerNorm(emb_dim),
            SelfAttention(seq_len, emb_dim, n_heads),
        )
        self.mlp_blk = nn.Sequential(
            nn.LayerNorm(emb_dim),
            nn.Linear(emb_dim, hidden_dim, bias=False),
            nn.SiLU(),
            nn.Linear(hidden_dim, emb_dim, bias=False),
            nn.Dropout(0.1),
        )

    def forward(self, x):
        """Return `x` after the attention and MLP residual updates."""
        after_attention = x + self.attn_blk(x) # Attention Block
        return after_attention + self.mlp_blk(after_attention) # MLP Block

class tinyGPT(nn.Module): # https://github.com/Sam-Armstrong/tinyGPT/blob/main/Model.py
    """Small GPT-style model: token and position embeddings are concatenated,
    projected back to `emb_dim`, passed through two `Block`s and mapped to
    vocabulary logits."""

    def __init__(self, vocab_size, emb_dim, seq_len, n_heads):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.position_embedding = nn.Parameter(torch.zeros(1, seq_len, emb_dim))
        self.projection = nn.Linear(emb_dim * 2, emb_dim, bias=False)
        nn.init.normal_(self.projection.weight, mean=0.0, std=0.02)
        self.dropout = nn.Dropout(0.1)
        self.blocks = nn.Sequential(*(Block(seq_len, emb_dim, n_heads) for _ in range(2))) # n_layers = 2
        self.ln_out = nn.LayerNorm(emb_dim)
        self.fc_out = nn.Linear(emb_dim, vocab_size, bias=False)
        # self.seq_len = seq_len

    def forward(self, idx):
        """Map token ids `idx` ([batch, seq]) to logits ([batch, seq, vocab_size])."""
        batch_size, seq_len = idx.shape # [4, 128]
        tokens = self.embedding(idx)
        # Share the learned positions across the batch (a view, no copy).
        positions = self.position_embedding[:, :seq_len, :].expand(batch_size, -1, -1)
        # Concatenate token and position embeddings, then project back to emb_dim.
        fused = self.projection(torch.cat((tokens, positions), dim=-1))
        hidden = self.blocks(self.dropout(fused))
        return self.fc_out(self.ln_out(hidden))

# Instantiate the reference tinyGPT. This rebinds the notebook-level `model` and `device`,
# replacing any model created by an earlier cell. `seq_len` comes from the data cell.
vocab_size = train_dataset.vocab_size
emb_dim = 512
n_heads = 4
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = tinyGPT(vocab_size, emb_dim, seq_len, n_heads).to(device)
# nn.Transformer defaults, for comparison:
# d_model=512, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048, dropout=0.1,

# Open question: should bias=False be used in the blocks and in fc_out?

