In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset


torch.manual_seed(42)

  cpu = _conversion_method_template(device=torch.device("cpu"))


<torch._C.Generator at 0x1751b803050>

In [12]:
# GPT parameters
from dataclasses import dataclass

@dataclass
class GPTConfig:
    blockSize: int = 512 # max text/sequence length
    batchSize: int = 12
    nLayer: int = 12
    headSize: int = 12
    nEmbD:int = 768 # embedding dim(ension)
    hiddenDim = nEmbD // headSize
    dropout : float = 0.1

    # gpt2 official tokenizer
    vocabSize: int = 50257

In [10]:
# GPT struct definition

import math

class SingleHeaderAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.key = nn.Linear(config.hiddenDim, config.headSize)
        self.value = nn.Linear(config.hiddenDim, config.headSize)
        self.query = nn.Linear(config.hiddenDim, config.headSize)

        self.register_buffer(
            "attention_mask",
            torch.ones((config.blockSize, config.blockSize))
        )
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        batchSize, sequenceLen, hiddenDim = x.size()
        k = self.key(x)
        q = self.query(x)
        v = self.value(x)

        weight = q @ k.transpose(-2, -1) # @ is torch.matmul
        weight = weight.masked_fill(
            self.attention_mask[:sequenceLen, :sequenceLen] == 0,
            float("-inf")
        )

        # weight need to divide by sqrt of hiddenDim
        weight  = F.softmax(weight ,  dim=-1) / math.sqrt(self.headSize)

        # dropout after weight
        weight = self.dropout(weight)
        output = weight @ v
        return output


class MultiHeadAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.heads = nn.ModuleList(
            [
                SingleHeaderAttention(config) for _ in range(config.headSize)
            ]
        )
        self.proj = nn.Linear(config.hiddenDim, config.hiddenDim)
        self.dropout = nn.Dropout(config.dropout)
    
    def forward(self, x):
        output = torch.cat([h(x) for h in self.heads], dim=-1)
        output = self.proj(output)
        output = self.dropout(output)
        return output

# DNN (MLP)
class FeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(config.hiddenDim, 4 * config.hiddenDim),
            nn.GELU(),
            nn.Linear(4 * config.hiddenDim, config.hiddenDim),
            nn.Dropout(config.dropout)
        )

    def forward(self, x):
        return self.net(x)

# Block
class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.attention = MultiHeadAttention(config) # multi head attention
        self.ffn  = FeedForward(config)
        # layer normal
        self.ln1 =nn.LayerNorm(config.hiddenDim)
        self.ln2 = nn.LayerNorm(config.hiddenDim)
    
    def forward(self, x):
        x = x + self.attention(self.ln1(x))
        x = x + self.ffn(self.ln2(x))
        return x


# GPT
class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        # embedding, position, norm, mlp , block
        

        self.tokenEmbeddingTable = nn.Embedding(config.vocabSize, config.nEmbD)
        self.positionEmbeddingTable = nn.Embedding(config.blockSize, config.nEmbD)
        self.blocks = nn.Sequential(
            *[Block(config) for _ in range(config.nLayer)]
        )
        self.lnFinal = nn.LayerNorm(config.nEmbD)
        self.lmHead = nn.Linear(config.nEmbD, config.vocabSize, bias=False)

        # tie weight
        self.tokenEmbeddingTable.weight = self.lmHead.weight

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        # idx  token ids
        # targets: next token ids
        # shape will be same 
        batch, sequenceLen = idx.size()
        tokenEmb = self.tokenEmbeddingTable(idx)
        positionEmb = self.positionEmbeddingTable(
            torch.arange(sequenceLen, device=idx.device)
        )

        x = tokenEmb + positionEmb
        x = self.blocks(x)
        x = self.lnFinal(x)
        logits = self.lmHead(x)
        if targets is None:
            loss = None
        else:
            batch, sequenceLen, vocabSize = logits.size()
            logits = logits.view(batch* sequenceLen, vocabSize)
            targets = targets.view(batch * sequenceLen)
            loss = F.cross_entropy(logits, targets)
        return logits, loss
    
    def generate(self, idx, maxNewTokens):
        #TODO 
        pass





In [7]:

import tiktoken

# GPT Dataset
class MyDataset(Dataset):
    def __init__(self, path, block_size= 512):
        self.enc = tiktoken.get_encoding("gpt2")
        self.block_size = block_size # pos max length


        # special token to divided different train text
        self.eos_token = self.enc.encode(
            "<|endoftext|>",
            allow_special = ("< |endoftext|>",)
        )[0]


        import json
        self.maxLines = 1000

        rawData = []
        with open(path, "r") as f:
            for l in enumerate(f):
                if i>= self.maxLines:
                    break
                try:
                    text = json.loads(l.strip())["text"]
                    rawData.append(text)
                except:
                    continue
        
        fullEncoding = []
        for text in rawData:
            encodedText = self.enc.encode(text)
            fullEncoding.extend(encodedText + [self.eos_token])
        
        for i in range(0, len(fullEncoding), self.block_size):
            chunk = fullEncoding[i: i + self.block_size+1] # 512 as default then 513
            if len(chunk) < self.block_size + 1:
                chunk = chunk + [self.eos_token] * (self.block_size + 1 - len(chunk))
            self.encodedData.append(chunk)


    def __len__(self):
        return len(self.encodedData)

    def __getitem__(self, index):
        chunk = self.encodedData[index]
        x = torch.tensor(chunk[:-1], dtype=torch.long)
        y = torch.tensor(chunk[1:], dtype=torch.long)
        return x,y
    
    def encode(self, text):
        return self.enc.encode(text)
    
    def decode(self, tokens):
        return self.enc.decode(tokens)

In [None]:
model = GPT(GPTConfig())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4)

# learning rate scheduler : cosine
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 1000, eta_min=1e-6)

39,779,136 total parameters.
