In [1]:
# Common imports 
import os
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Hyperparameters, grouped by what they control

# --- hardware ---
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'  # prefer the GPU when one is visible

# --- batching ---
BATCH_SIZE = 128  # sequences drawn per batch
BLOCK_SIZE = 256  # context length (tokens per input sequence)

# --- model architecture ---
N_EMBED = 256   # embedding width
N_HEADS = 4     # attention heads per transformer block
N_LAYER = 5     # number of stacked transformer blocks
DROPOUT = 0.2   # dropout probability used throughout the model

# --- optimisation and evaluation schedule ---
LEARNING_RATE = 3e-3
MAX_ITERS = 10000     # optimizer steps per training run
EVAL_INTERVAL = 500   # steps between loss estimates
EVAL_ITERS = 200      # batches averaged for each loss estimate

#### Dataset loading, character encoder/decoder creation, and train/validation split

In [3]:
# Read the whole corpus into memory (point the path at your local copy of the dataset)
with open('./40k.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Character-level vocabulary: every distinct character of the corpus, sorted
chars = sorted(set(text))
VOCAB_SIZE = len(chars)

# Lookup tables between characters and their integer ids
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}


def encode(s):
    """Map a string to the list of integer ids of its characters."""
    return [stoi[c] for c in s]


def decode(l):
    """Map a sequence of integer ids back to the string they spell."""
    return ''.join(itos[i] for i in l)


# Encode the full corpus once, then keep the first 90% for training
# and the remaining 10% for validation
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train, val = data[:n], data[n:]

In [4]:
def get_batch(split, n_pred_tokens=4):
    """
    Draw a random batch of input windows and their multi-token targets.

    Parameters:
    split (str): 'train' selects the training split; any other value selects
        the validation split.
    n_pred_tokens (int): The number of future tokens to predict. Default is 4.

    Returns:
    tuple: (x, y), both moved to DEVICE. x has shape (BATCH_SIZE, BLOCK_SIZE).
        y has shape (BATCH_SIZE, BLOCK_SIZE + n_pred_tokens - 1): it starts one
        token after x and runs n_pred_tokens - 1 tokens past the end of x.
    """
    source = val if split != 'train' else train
    # Tokens needed from a start offset: the input window plus the look-ahead
    span = BLOCK_SIZE + n_pred_tokens
    starts = torch.randint(len(source) - span + 1, (BATCH_SIZE,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + BLOCK_SIZE])
        targets.append(source[start + 1:start + span])

    x = torch.stack(inputs).to(DEVICE)
    y = torch.stack(targets).to(DEVICE)
    return x, y

In [5]:
@torch.no_grad()
def estimate_loss():
    """
    Average the model loss over EVAL_ITERS random batches of each split.

    The global `model` is put in evaluation mode while the batches are scored
    and is switched back to training mode before returning. Gradients are
    disabled for the whole call, so no parameters are updated.

    Returns:
    dict: Maps 'train' and 'val' to the mean loss (a 0-dim tensor) of that split.
    """
    model.eval()
    out = {}
    for split in ('train', 'val'):
        batch_losses = []
        for _ in range(EVAL_ITERS):
            inputs, targets = get_batch(split)
            _, loss = model(inputs, targets)
            batch_losses.append(loss.item())
        out[split] = torch.tensor(batch_losses).mean()
    model.train()  # hand the model back in training mode
    return out

#### Attention block

In [6]:
class Head(nn.Module):
    """A single head of causal (masked) self-attention."""

    def __init__(self, head_size):
        super().__init__()
        # Bias-free projections from the embedding width down to head_size
        self.key = nn.Linear(N_EMBED, head_size, bias=False)
        self.query = nn.Linear(N_EMBED, head_size, bias=False)
        self.value = nn.Linear(N_EMBED, head_size, bias=False)
        # Lower-triangular matrix used as the causal mask; registered as a
        # buffer so it follows the module across devices without being trained
        causal_mask = torch.tril(torch.ones(BLOCK_SIZE, BLOCK_SIZE))
        self.register_buffer('tril', causal_mask)

        self.dropout = nn.Dropout(DROPOUT)

    def forward(self, x):
        """Attend over x of shape (B, T, C) and return a (B, T, head_size) tensor."""
        _, seq_len, _ = x.shape
        keys = self.key(x)        # (B, T, hs)
        queries = self.query(x)   # (B, T, hs)
        values = self.value(x)    # (B, T, hs)

        # Scaled dot-product affinities: (B, T, hs) @ (B, hs, T) -> (B, T, T)
        scores = queries @ keys.transpose(-2, -1) * keys.shape[-1]**-0.5
        # A position may only attend to itself and to earlier positions
        is_future = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(is_future, float('-inf'))
        # Normalise each row into attention weights, then regularise them
        weights = self.dropout(F.softmax(scores, dim=-1))

        # Weighted sum of the values: (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return weights @ values

class MultiHeadAttention(nn.Module):
    """Several self-attention heads run side by side, then merged by a linear projection."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        parallel_heads = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(parallel_heads)
        # Maps the concatenated head outputs back to the embedding width
        self.proj = nn.Linear(head_size * num_heads, N_EMBED)
        self.dropout = nn.Dropout(DROPOUT)

    def forward(self, x):
        """Run every head on x and return the projected concatenation of their outputs."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)  # concatenate along the feature axis
        return self.dropout(self.proj(merged))

class FeedFoward(nn.Module):
    """Position-wise MLP: expand to 4x the width, ReLU, project back, dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(DROPOUT),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to x; the output has the same shape as the input."""
        return self.net(x)

class Block(nn.Module):
    """Pre-norm transformer block: self-attention, then feed-forward, each with a residual connection."""

    def __init__(self, n_embd, n_head):
        """
        Parameters:
        n_embd (int): Embedding dimension.
        n_head (int): Number of attention heads; each head gets n_embd // n_head features.
        """
        super().__init__()
        per_head = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, per_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return x after the attention and feed-forward residual updates."""
        attended = x + self.sa(self.ln1(x))      # communication between positions
        return attended + self.ffwd(self.ln2(attended))  # per-position computation

#### Transformer language model with multi-token prediction heads

In [7]:
class GPTLanguageModel(nn.Module):
    """Character-level transformer with one output head per predicted future token.

    A shared trunk (token + position embeddings, N_LAYER transformer blocks and
    a final LayerNorm) feeds `num_pred_tokens` independent linear heads. During
    training, head i is scored against `targets[:, i:i + T]`.
    """

    def __init__(self, num_pred_tokens=4):
        """
        Parameters:
        num_pred_tokens (int): Number of output heads, i.e. how many future
            tokens are predicted from every position. Default is 4.
        """
        super().__init__()
        self.num_pred_tokens = num_pred_tokens
        self.token_embedding_table = nn.Embedding(VOCAB_SIZE, N_EMBED)
        self.position_embedding_table = nn.Embedding(BLOCK_SIZE, N_EMBED)
        self.blocks = nn.Sequential(*[Block(N_EMBED, n_head=N_HEADS) for _ in range(N_LAYER)])
        self.ln_f = nn.LayerNorm(N_EMBED)
        self.lm_heads = nn.ModuleList([nn.Linear(N_EMBED, VOCAB_SIZE) for _ in range(num_pred_tokens)])
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero any Linear bias."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """
        Compute per-head logits and, when targets are given, the training loss.

        Parameters:
        idx (Tensor): (B, T) integer token ids, with T at most BLOCK_SIZE.
        targets (Tensor, optional): (B, T + num_pred_tokens - 1) token ids laid
            out as produced by get_batch. When None, no loss is computed.

        Returns:
        tuple: (logits, loss). logits has shape (B, T, num_pred_tokens, vocab_size).
            loss is the mean of the per-head cross-entropies, or None.
        """
        B, T = idx.shape
        # Token and position embeddings
        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        # Build the positions on the input's own device instead of the global
        # DEVICE, so the model also works when it has not been moved to DEVICE.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions)  # (T, C)
        x = tok_emb + pos_emb  # (B, T, C)
        x = self.blocks(x)  # (B, T, C)
        x = self.ln_f(x)  # (B, T, C)

        # Apply each head and stack over a new num_pred_tokens dimension
        logits = [head(x) for head in self.lm_heads]
        logits = torch.stack(logits, dim=2)  # (B, T, num_pred_tokens, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, N, C = logits.shape
            losses = []
            for i in range(N):
                head_logits = logits[:, :, i, :].reshape(B * T, C)
                # Head i is scored against the target window shifted by i positions
                head_targets = targets[:, i:i + T].reshape(B * T)
                losses.append(F.cross_entropy(head_logits, head_targets))
            loss = torch.mean(torch.stack(losses))
        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; avoids building a graph per step
    def generate(self, idx, max_new_tokens):
        """
        Autoregressively sample `max_new_tokens` tokens after the context `idx`.

        Only the first head (index 0) is used for sampling. The module's
        train/eval mode is not changed here; call `model.eval()` beforehand to
        disable dropout.

        Parameters:
        idx (Tensor): (B, T) integer token ids forming the current context.
        max_new_tokens (int): Number of tokens to append.

        Returns:
        Tensor: (B, T + max_new_tokens) token ids, context included.
        """
        for _ in range(max_new_tokens):
            # crop idx to the last BLOCK_SIZE tokens (the position table has no more entries)
            idx_cond = idx[:, -BLOCK_SIZE:]

            # logits: (B, T, num_pred_tokens, vocab_size)
            logits, _ = self(idx_cond)

            # last position, first head -> (B, vocab_size)
            logits_head = logits[:, -1, 0, :]

            # Turn logits into a distribution and sample one token per sequence
            probs = F.softmax(logits_head, dim=-1)  # (B, vocab_size)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)

            # Append the sampled token to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T + 1)

        return idx

#### Functions to save and reload the trained model

In [8]:
def save_checkpoint(model, optimizer, filename="model.pth"):
    """
    Write the model and optimizer state dicts to `filename` with torch.save.

    The file holds a dict with the keys "model_state_dict" and
    "optimizer_state_dict".
    """
    torch.save(
        dict(
            model_state_dict=model.state_dict(),
            optimizer_state_dict=optimizer.state_dict(),
        ),
        filename,
    )
    
    
def load_checkpoint(model, optimizer, filename="model.pth", map_location=None):
    """
    Restore model (and optionally optimizer) state from a checkpoint file.

    Parameters:
    model (nn.Module): Module whose parameters are overwritten in place.
    optimizer (Optimizer or None): Optimizer whose state is overwritten in
        place. Pass None for inference-only loading; the optimizer entry of
        the checkpoint is then ignored.
    filename (str): Path of a file written by save_checkpoint.
    map_location: Forwarded to torch.load. Use e.g. 'cpu' to open a checkpoint
        that was saved from a GPU on a machine without one. The default None
        keeps torch.load's own behaviour.

    Returns:
    tuple: The same (model, optimizer) objects that were passed in.
    """
    checkpoint = torch.load(filename, map_location=map_location)
    model.load_state_dict(checkpoint["model_state_dict"])
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    return model, optimizer

#### Training loop

In [93]:
model = GPTLanguageModel()
m = model.to(DEVICE)  # `m` is the moved model, used for the parameter count below

# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr = LEARNING_RATE)

# The loop variable is named `step`, not `iter`: at notebook top level `iter`
# would shadow the builtin for every cell executed afterwards.
for step in range(MAX_ITERS):

    # every once in a while evaluate the loss on train and val sets
    # (also on the last iteration, so the run always ends with a report)
    if step % EVAL_INTERVAL == 0 or step == MAX_ITERS - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # forward pass, then one optimizer update
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)  # clear gradients left from the previous step
    loss.backward()
    optimizer.step()

4.131704 M parameters
step 0: train loss 4.5979, val loss 4.5989
step 500: train loss 2.6752, val loss 2.7004
step 1000: train loss 2.3533, val loss 2.3999
step 1500: train loss 2.1757, val loss 2.2367
step 2000: train loss 2.0959, val loss 2.1594
step 2500: train loss 2.0519, val loss 2.1288
step 3000: train loss 2.0194, val loss 2.0969
step 3500: train loss 1.9993, val loss 2.0843
step 4000: train loss 1.9808, val loss 2.0644
step 4500: train loss 1.9678, val loss 2.0588
step 5000: train loss 1.9527, val loss 2.0475
step 5500: train loss 1.9457, val loss 2.0369
step 6000: train loss 1.9378, val loss 2.0340
step 6500: train loss 1.9282, val loss 2.0293
step 7000: train loss 1.9215, val loss 2.0268
step 7500: train loss 1.9168, val loss 2.0227
step 8000: train loss 1.9083, val loss 2.0175
step 8500: train loss 1.9085, val loss 2.0212
step 9000: train loss 1.8992, val loss 2.0119
step 9500: train loss 1.8955, val loss 2.0107
step 10000: train loss 1.8915, val loss 2.0072
step 10500: tra

We got a pretty decent loss. This model was trained for 50000 iterations. We could have trained it for longer; however, the training loss was decreasing more and more slowly, so I decided to stop training and check out the results.

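When I trained the vanilla model for the same 50000 iterations, it reached a loss of approximately 1.20. That is a smaller loss; however, it does not necessarily mean a better result: the loss reported here is averaged over all four prediction heads, so the two numbers are not directly comparable. We are going to confirm this later.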

In [None]:
# Save the model after the training loop.
# Both the model and the optimizer state dicts are written, so a later
# session can reload this file and continue training from it.
save_checkpoint(model, optimizer, filename="multitoken_40k_40000.pth")

#### Load the model for further training

As mentioned above, the model was trained for 50000 iterations, but the training was not done all at once, because the model was trained on Kaggle.

In [42]:
# Rebuild the model from a previously saved checkpoint so training can continue

model = GPTLanguageModel()
checkpoint = torch.load(
    "/kaggle/input/multitoken_30000/pytorch/default/1/multitoken_40k_30000.pth",
    map_location=DEVICE,
)
model.load_state_dict(checkpoint['model_state_dict'])
# Put the parameters on the target device first ...
model.to(DEVICE)

# ... and only then build the optimizer over them and restore its state
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# Ensure every tensor held in the optimizer state sits on the same device as the model
for param_state in optimizer.state.values():
    for entry_name, entry in param_state.items():
        if isinstance(entry, torch.Tensor):
            param_state[entry_name] = entry.to(DEVICE)

# Back to training mode (the bare call also displays the module summary)
model.train()

GPTLanguageModel(
  (token_embedding_table): Embedding(94, 256)
  (position_embedding_table): Embedding(256, 256)
  (blocks): Sequential(
    (0): Block(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0-3): 4 x Head(
            (key): Linear(in_features=256, out_features=64, bias=False)
            (query): Linear(in_features=256, out_features=64, bias=False)
            (value): Linear(in_features=256, out_features=64, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
        )
        (proj): Linear(in_features=256, out_features=256, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (ffwd): FeedFoward(
        (net): Sequential(
          (0): Linear(in_features=256, out_features=1024, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1024, out_features=256, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
      (ln1): LayerNorm((256,), eps=1e-05, elementwise_affine=

#### Separate training loop

In [None]:
# Continue training the reloaded model for another MAX_ITERS steps.
# The loop variable is named `step`, not `iter`: at notebook top level `iter`
# would shadow the builtin for every cell executed afterwards.
for step in range(MAX_ITERS):

    # Report train/val loss every EVAL_INTERVAL steps and on the last iteration
    if step % EVAL_INTERVAL == 0 or step == MAX_ITERS - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # forward pass, then one optimizer update
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)  # clear gradients left from the previous step
    loss.backward()
    optimizer.step()

In [None]:
# Save the model.
# Writes the model and optimizer state dicts; the inference cell below loads this file name.
save_checkpoint(model, optimizer, filename="multitoken_40k_50000.pth")

#### Inference code

Inference was run on a CPU because I do not have a GPU; that is what the parameter **map_location=torch.device('cpu')** is for. If you have a GPU, delete this parameter.

In [9]:
# Load the model for inference
model = GPTLanguageModel()

# Read the checkpoint onto the CPU, so this also works on a machine without a GPU
checkpoint = torch.load("./multitoken_40k_50000.pth", map_location=torch.device('cpu'))
model.load_state_dict(checkpoint['model_state_dict'])

# Move the model to the desired device *before* building the optimizer, as the
# resume-training cell does. Restoring the optimizer first would leave its
# state tensors on the CPU while the parameters move to DEVICE.
model.to(DEVICE)

# The optimizer is not needed to generate text; it is restored only so that
# `optimizer` stays consistent with `model` if training is resumed from here.
optimizer = torch.optim.AdamW(model.parameters(), lr = LEARNING_RATE)
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# Set the model to eval mode (turns dropout off for generation)
model.eval()

  checkpoint = torch.load("./multitoken_40k_50000.pth", map_location=torch.device('cpu'))


GPTLanguageModel(
  (token_embedding_table): Embedding(94, 256)
  (position_embedding_table): Embedding(256, 256)
  (blocks): Sequential(
    (0): Block(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0-3): 4 x Head(
            (key): Linear(in_features=256, out_features=64, bias=False)
            (query): Linear(in_features=256, out_features=64, bias=False)
            (value): Linear(in_features=256, out_features=64, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
        )
        (proj): Linear(in_features=256, out_features=256, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (ffwd): FeedFoward(
        (net): Sequential(
          (0): Linear(in_features=256, out_features=1024, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1024, out_features=256, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
      (ln1): LayerNorm((256,), eps=1e-05, elementwise_affine=

In [10]:
# Generate from the model.
# The context is one sequence holding the single token id 0; generate() appends
# 500 sampled ids, and the first (only) sequence is decoded back to text.
context = torch.zeros((1, 1), dtype=torch.long, device = DEVICE)
print(decode(model.generate(context, max_new_tokens=500)[0].tolist()))
# To write a longer sample to a file instead, uncomment:
#open('more.txt', 'w').write(decode(model.generate(context, max_new_tokens=10000)[0].tolist()))


He fell forward – Kircher – the multi-crosses—delighted, blurring
across the far side of the line, gracking him in the sterm-seeded hill. He
tossed back the floor into his helm, put the others with gone more than his
rising fumes. Y’s cramping hidden pieces of drop-pods, defanding. Though
that had once been an embers of a nearspace chamber, Octavia’s skull devasivated, he
required a perfectly tasked acting screen and trying to jump once more, for
the first time he had left to advance. A kilometr
