In [1]:
# Common imports 
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# --- Hyperparameters and run configuration (read as globals by the cells below) ---
BATCH_SIZE = 128  # default number of sequences per batch drawn by get_batch
BLOCK_SIZE = 256  # context length in tokens; also sizes the position embedding and the causal mask
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'  # use CUDA when torch reports it available
N_EMBED = 192  # embedding width used by the token/position embeddings and every Block
N_HEADS = 3  # attention heads per Block (Block derives head_size as n_embd // n_head)
DROPOUT = 0.2  # dropout probability passed to every nn.Dropout in the model
N_LAYER = 4  # number of Blocks stacked inside GPTLanguageModel
LEARNING_RATE = 3e-3  # learning rate handed to torch.optim.AdamW
MAX_ITERS = 10000  # optimisation steps performed by one run of a training loop
EVAL_INTERVAL = 500  # the training loops call estimate_loss every this many steps
EVAL_ITERS = 200  # number of batches estimate_loss averages per split

# Load the training corpus. This notebook was last run in a Kaggle environment,
# so replace './40k.txt' with the path to your own copy of the text file.
with open('./40k.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Character-level vocabulary: every distinct character in the corpus, sorted
# so the character <-> id mapping is the same on every run.
chars = sorted(set(text))
VOCAB_SIZE = len(chars)

# Lookup tables between characters and integer token ids
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Turn a string into a list of integer token ids (KeyError on unseen characters)."""
    return [stoi[c] for c in s]


def decode(l):
    """Turn a sequence of integer token ids back into a string."""
    return ''.join([itos[i] for i in l])


# Encode the whole corpus once; the first 90% is for training, the rest for validation
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train, val = data[:n], data[n:]

In [3]:
def get_batch(split, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE):
    """
    Sample a random batch of (input, target) sequences.

    `split` picks the corpus: 'train' reads from `train`, any other value
    reads from `val`. Returns two tensors moved to DEVICE, each stacking
    `batch_size` windows of `block_size` tokens; every target window is the
    matching input window shifted one token to the right.
    """
    source = train if split == 'train' else val

    # Random window start offsets; the exclusive upper bound leaves room for
    # the one-token shift of the target window.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + 1 + block_size] for s in starts])

    return inputs.to(DEVICE), targets.to(DEVICE)

In [8]:
@torch.no_grad()
def estimate_loss():
    """
    Estimate the mean loss of the global `model` on the 'train' and 'val' splits.

    For each split, EVAL_ITERS batches are drawn with `get_batch` and their
    losses averaged. The model is put in eval mode for the measurement and is
    afterwards returned to whichever mode (train or eval) it was in on entry,
    even if a batch raises.

    Returns:
        dict mapping 'train' and 'val' to a 0-d tensor holding the mean loss.
    """
    # Remember the caller's mode so evaluation does not silently flip a model
    # that was deliberately left in eval mode.
    was_training = model.training
    model.eval()
    out = {}
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(EVAL_ITERS)
            for k in range(EVAL_ITERS):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        model.train(was_training)
    return out

class Head(nn.Module):
    """ a single causal self-attention head """

    def __init__(self, head_size):
        super().__init__()
        # bias-free projections from the embedding width (N_EMBED) to head_size
        self.key = nn.Linear(N_EMBED, head_size, bias=False)
        self.query = nn.Linear(N_EMBED, head_size, bias=False)
        self.value = nn.Linear(N_EMBED, head_size, bias=False)
        # lower-triangular matrix of ones used as the causal mask; registered
        # as a buffer, so it is not one of the module's parameters
        self.register_buffer('tril', torch.tril(torch.ones(BLOCK_SIZE, BLOCK_SIZE)))

        self.dropout = nn.Dropout(DROPOUT)

    def forward(self, x):
        # x: (B, T, C)  ->  result: (B, T, head_size)
        _, seq_len, _ = x.shape

        queries = self.query(x)  # (B, T, hs)
        keys = self.key(x)       # (B, T, hs)
        values = self.value(x)   # (B, T, hs)

        # scaled dot-product affinities between every pair of positions
        scale = keys.shape[-1] ** -0.5
        scores = (queries @ keys.transpose(-2, -1)) * scale  # (B, T, T)

        # positions above the diagonal (the future) are set to -inf so the
        # softmax assigns them zero weight
        is_future = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(is_future, float('-inf'))

        # normalise over the last dimension, then apply dropout to the weights
        weights = self.dropout(F.softmax(scores, dim=-1))

        # weighted sum of the value vectors: (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return weights @ values

class MultiHeadAttention(nn.Module):
    """ several self-attention heads run side by side, then projected back to N_EMBED """

    def __init__(self, num_heads, head_size):
        super().__init__()
        heads = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(heads)
        # the concatenated head outputs have width head_size * num_heads
        concat_width = head_size * num_heads
        self.proj = nn.Linear(concat_width, N_EMBED)
        self.dropout = nn.Dropout(DROPOUT)

    def forward(self, x):
        # run every head on the same input and join the results feature-wise
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)

class FeedFoward(nn.Module):
    """ per-position MLP: widen to 4x, ReLU, project back, dropout """

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(DROPOUT),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        out = self.net(x)
        return out

class Block(nn.Module):
    """ Transformer block: self-attention then feed-forward, each on a layer-normed input with a residual add """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension; n_head: how many attention heads to use
        super().__init__()
        # each head gets an equal share of the embedding width
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        # communication: tokens exchange information through attention
        attended = self.sa(self.ln1(x))
        x = x + attended
        # computation: each position is transformed independently
        transformed = self.ffwd(self.ln2(x))
        return x + transformed


class GPTLanguageModel(nn.Module):
    """ decoder-only transformer language model over the integer token ids of the corpus """

    def __init__(self):
        super().__init__()
        # token id -> embedding vector
        self.token_embedding_table = nn.Embedding(VOCAB_SIZE, N_EMBED)
        # one learned vector per position inside the context window
        self.position_embedding_table = nn.Embedding(BLOCK_SIZE, N_EMBED)
        self.blocks = nn.Sequential(*[Block(N_EMBED, n_head=N_HEADS) for _ in range(N_LAYER)])
        self.ln_f = nn.LayerNorm(N_EMBED) # final layer norm
        self.lm_head = nn.Linear(N_EMBED, VOCAB_SIZE) # language model head

        # re-initialise every Linear / Embedding in the module tree
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Draw Linear/Embedding weights from N(0, 0.02^2) and zero any Linear bias."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """
        Compute next-token logits and, when `targets` is given, the loss.

        idx and targets are both (B, T) tensors of integer token ids.
        Returns (logits, loss): without targets, logits is (B, T, vocab_size)
        and loss is None; with targets, logits is flattened to
        (B*T, vocab_size) and loss is the cross-entropy against the flattened
        targets.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        # Build the position indices on the input's own device rather than on
        # the global DEVICE, so the model works wherever it and its input live.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions) # (T,C)
        x = tok_emb + pos_emb # (B,T,C), pos_emb broadcasts over the batch
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy wants (N, classes) logits and (N,) targets
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """
        Extend `idx` by sampling `max_new_tokens` tokens one at a time.

        idx is a (B, T) tensor of token ids used as the prompt; the returned
        tensor is (B, T + max_new_tokens). Runs without gradient tracking,
        since sampling never needs a backward pass.
        """
        for _ in range(max_new_tokens):
            # the position embedding only covers BLOCK_SIZE positions, so
            # condition on at most the last BLOCK_SIZE tokens
            idx_cond = idx[:, -BLOCK_SIZE:]
            logits, _ = self(idx_cond)
            # keep only the prediction for the final time step
            logits = logits[:, -1, :] # becomes (B, C)
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample the next token id from the predicted distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append it to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx
    


In [4]:
# Save the model checkpoint
def save_checkpoint(model, optimizer, filename="model.pth"):
    """Write the model's and the optimizer's state_dict to `filename` with torch.save."""
    checkpoint = {}
    checkpoint["model_state_dict"] = model.state_dict()
    checkpoint["optimizer_state_dict"] = optimizer.state_dict()
    torch.save(checkpoint, filename)

In [5]:
def load_checkpoint(model, optimizer, filename="model.pth", map_location=None):
    """
    Restore `model` and `optimizer` in place from a checkpoint file.

    The file is expected to hold the two keys "model_state_dict" and
    "optimizer_state_dict".

    Args:
        model: module whose state_dict is overwritten.
        optimizer: optimizer whose state_dict is overwritten.
        filename: path of the checkpoint file.
        map_location: forwarded to torch.load; pass e.g. 'cpu' to load a
            checkpoint that was saved on a GPU onto a CPU-only machine.
            The default (None) keeps torch.load's standard behaviour.

    Returns:
        The same (model, optimizer) pair, for convenience.
    """
    checkpoint = torch.load(filename, map_location=map_location)
    model.load_state_dict(checkpoint["model_state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    return model, optimizer

In [30]:
model = GPTLanguageModel()
m = model.to(DEVICE)  # nn.Module.to returns the module itself, so `m` is an alias of `model`
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr = LEARNING_RATE)

# The loop variable is named `step`, not `iter`, so the builtin iter() is not
# shadowed for the rest of the kernel session.
for step in range(MAX_ITERS):

    # every once in a while (and on the final step) evaluate the loss on train and val sets
    if step % EVAL_INTERVAL == 0 or step == MAX_ITERS - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # forward pass, then clear stale gradients, backpropagate and update
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

1.862878 M parameters
step 0: train loss 4.6108, val loss 4.6065
step 500: train loss 1.9488, val loss 1.9710
step 1000: train loss 1.6022, val loss 1.6476
step 1500: train loss 1.4443, val loss 1.4870
step 2000: train loss 1.3807, val loss 1.4308
step 2500: train loss 1.3388, val loss 1.3927
step 3000: train loss 1.3182, val loss 1.3747
step 3500: train loss 1.2949, val loss 1.3538
step 4000: train loss 1.2708, val loss 1.3366
step 4500: train loss 1.2538, val loss 1.3225
step 5000: train loss 1.2517, val loss 1.3243
step 5500: train loss 1.2416, val loss 1.3165
step 6000: train loss 1.2329, val loss 1.3083
step 6500: train loss 1.2270, val loss 1.3082
step 7000: train loss 1.2214, val loss 1.3010
step 7500: train loss 1.2114, val loss 1.2918
step 8000: train loss 1.2114, val loss 1.2924
step 8500: train loss 1.2044, val loss 1.2849
step 9000: train loss 1.2012, val loss 1.2838
step 9500: train loss 1.1992, val loss 1.2820
step 10000: train loss 1.2043, val loss 1.2909
step 10500: tra

In [11]:
# Save the model and optimizer state after the training loop.
# NOTE(review): the "50000" in the filename presumably records the cumulative number
# of training iterations behind these weights — confirm before relying on it.
save_checkpoint(model, optimizer, filename="model_40k_50000.pth")

In [7]:
# This cell reloads a saved checkpoint so that training can be continued.
# NOTE(review): the checkpoint path below is an absolute Kaggle input path;
# point it at your own checkpoint file when running elsewhere.

model = GPTLanguageModel()
checkpoint = torch.load("/kaggle/input/40k-weights/model_40k_40000.pth", map_location=DEVICE)
model.load_state_dict(checkpoint['model_state_dict'])
# Move the model to the desired device
model.to(DEVICE)

# Recreate the optimizer after moving the model to the desired device
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# Move optimizer state to the same device as model
# NOTE(review): the checkpoint was loaded with map_location=DEVICE and the optimizer
# was built from parameters already on DEVICE, so this loop is probably a no-op
# kept as a safeguard — confirm before removing.
for state in optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.to(DEVICE)

# Set the model to training mode (re-enables dropout); as the last expression of
# the cell, the returned module is also what the notebook displays.
model.train()  # Important for training

GPTLanguageModel(
  (token_embedding_table): Embedding(94, 192)
  (position_embedding_table): Embedding(256, 192)
  (blocks): Sequential(
    (0): Block(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0-2): 3 x Head(
            (key): Linear(in_features=192, out_features=64, bias=False)
            (query): Linear(in_features=192, out_features=64, bias=False)
            (value): Linear(in_features=192, out_features=64, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
        )
        (proj): Linear(in_features=192, out_features=192, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (ffwd): FeedFoward(
        (net): Sequential(
          (0): Linear(in_features=192, out_features=768, bias=True)
          (1): ReLU()
          (2): Linear(in_features=768, out_features=192, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
      (ln1): LayerNorm((192,), eps=1e-05, elementwise_affine=Tr

In [9]:
# Continue training the currently loaded `model` with the current `optimizer`.
# The loop variable is named `step`, not `iter`, so the builtin iter() is not
# shadowed for the rest of the kernel session.
for step in range(MAX_ITERS):

    # every EVAL_INTERVAL steps (and on the final step) report train/val loss
    if step % EVAL_INTERVAL == 0 or step == MAX_ITERS - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of training data
    xb, yb = get_batch('train')

    # forward pass, then clear stale gradients, backpropagate and update
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 1.1009, val loss 1.2202
step 500: train loss 1.1001, val loss 1.2207
step 1000: train loss 1.1011, val loss 1.2207
step 1500: train loss 1.1006, val loss 1.2219
step 2000: train loss 1.1009, val loss 1.2190
step 2500: train loss 1.1008, val loss 1.2229
step 3000: train loss 1.0975, val loss 1.2144
step 3500: train loss 1.1000, val loss 1.2228
step 4000: train loss 1.0995, val loss 1.2233
step 4500: train loss 1.0987, val loss 1.2244
step 5000: train loss 1.0977, val loss 1.2194
step 5500: train loss 1.0962, val loss 1.2179
step 6000: train loss 1.0984, val loss 1.2234
step 6500: train loss 1.0967, val loss 1.2198
step 7000: train loss 1.0996, val loss 1.2217
step 7500: train loss 1.0998, val loss 1.2233
step 8000: train loss 1.0956, val loss 1.2178
step 8500: train loss 1.0967, val loss 1.2201
step 9000: train loss 1.0989, val loss 1.2246
step 9500: train loss 1.0957, val loss 1.2192
step 9999: train loss 1.0936, val loss 1.2208


In [12]:
# Define constants
LEARNING_RATE = 3e-3
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Read the checkpoint onto the CPU first so that it also loads in CPU-only environments
checkpoint = torch.load("./model_40k_1000.pth", map_location=torch.device('cpu'))

# Rebuild the model, restore its weights and move it to the target device
# BEFORE the optimizer is created: Optimizer.load_state_dict places the loaded
# state on the device of the parameters it was built from, so doing this while
# the parameters are still on the CPU and moving the model afterwards would
# leave the optimizer state on the wrong device.
model = GPTLanguageModel()
model.load_state_dict(checkpoint['model_state_dict'])
model.to(DEVICE)

# Restore the optimizer as well, in case training is resumed from this cell
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# Set the model to eval mode (disables dropout); as the last expression of the
# cell, the returned module is also what the notebook displays.
model.eval()

GPTLanguageModel(
  (token_embedding_table): Embedding(94, 192)
  (position_embedding_table): Embedding(256, 192)
  (blocks): Sequential(
    (0): Block(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0-2): 3 x Head(
            (key): Linear(in_features=192, out_features=64, bias=False)
            (query): Linear(in_features=192, out_features=64, bias=False)
            (value): Linear(in_features=192, out_features=64, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
        )
        (proj): Linear(in_features=192, out_features=192, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (ffwd): FeedFoward(
        (net): Sequential(
          (0): Linear(in_features=192, out_features=768, bias=True)
          (1): ReLU()
          (2): Linear(in_features=768, out_features=192, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
      (ln1): LayerNorm((192,), eps=1e-05, elementwise_affine=Tr

In [13]:
# generate from the model
# The prompt is a single token with id 0, shaped (1, 1) as the model expects (batch, time).
context = torch.zeros((1, 1), dtype=torch.long, device = DEVICE)
# Sampling never needs a backward pass, so run it without gradient tracking.
with torch.no_grad():
    print(decode(model.generate(context, max_new_tokens=1500)[0].tolist()))
# To write a longer sample to disk instead of printing it:
# with open('more.txt', 'w') as out_file:
#     out_file.write(decode(model.generate(context, max_new_tokens=10000)[0].tolist()))


                      , mindirstanquive ammark: 1]

‘They’re going to suns. We got the smilen,’ said Dalia, following his own drewks.
  ‘They morn armour join
acceptions cook,’ Arukally, stotting the desourches, have grassion to the milerch blood his
unformations the scond that second a dead and trying as and collually the name age side
gastrical none to the surmeching its lost his anohing of the skign.
‘Somethin..’
Seture Mavitz. ‘You cruitor’s that will have of as a first in the bell psychang
out it by the shield had know has corge any open the tracken, the daman precept
Perecis deady Calthy, the pleas as moment.
This alied twith hae a face roddly and chip rolled by her the engistoned of our around god
the fing? She is with a smile woods overit against blue onwhilled
Old and with Twitcherions the Winderal and his spuning. It we’d neediater our
a directions.
 The was was his fight linefections not their unfore, with the glowing pursed
theses their liads of dalers and darks and not go