In [1]:

import torch
import torch.nn as nn
import nltk
import torch.nn.functional as F
from transformers import AutoTokenizer

from gpt.decoder import DecoderOnlyTransformer
from gpt.position_encoder import PositionalEncoding

# Fetch the NLTK 'punkt_tab' resource; the recorded output shows it was already up to date.
# NOTE(review): no visible cell in this notebook uses nltk afterwards — confirm it is still needed.
nltk.download('punkt_tab')

# Notebook-level hyperparameters. They are used as the default arguments of
# NeuralNetwork below, and max_len is also read by generate_text when padding.
embed_dim = 150  # embedding width; also the LayerNorm / output-projection input size
max_len = 75  # length passed to PositionalEncoding and used for the padded inference input
num_transformers = 6  # number of stacked DecoderOnlyTransformer layers
num_heads = 5  # forwarded to each DecoderOnlyTransformer
dense_dim = 256  # forwarded to each DecoderOnlyTransformer (feed-forward hidden size per the printed model)
PAD_TOKEN_ID = 0  # token id used as the embedding padding_idx and for the key-padding mask

class NeuralNetwork(nn.Module):
    """Decoder-only transformer language model mapping token ids to vocabulary logits.

    The pipeline is: token embedding -> positional encoding -> a stack of
    ``DecoderOnlyTransformer`` layers -> final layer norm -> linear projection
    onto the vocabulary.

    Args:
        vocab_size: Number of rows in the embedding table and size of the
            output logit dimension.
        embed_dim: Embedding width used throughout the stack.
        num_transformers: Number of ``DecoderOnlyTransformer`` layers.
        num_heads: Forwarded to every ``DecoderOnlyTransformer``.
        dense_dim: Forwarded to every ``DecoderOnlyTransformer``.
        pad_token_id: Token id treated as padding, both as the embedding's
            ``padding_idx`` and when building the key-padding mask in ``forward``.
        max_len: Maximum length handed to ``PositionalEncoding``. Defaults to
            the notebook-level ``max_len`` so existing callers are unaffected.
    """

    def __init__(
        self,
        vocab_size,
        embed_dim = embed_dim,
        num_transformers = num_transformers,
        num_heads = num_heads,
        dense_dim = dense_dim,
        pad_token_id = PAD_TOKEN_ID,
        max_len = max_len,
    ):
        super().__init__()
        # Remember the pad id so forward() masks the same token the embedding
        # treats as padding, rather than whatever the global constant holds.
        # This is a plain int attribute, so the state_dict layout is unchanged.
        self.pad_token_id = pad_token_id
        self.token_embed = nn.Embedding(
            num_embeddings = vocab_size,
            embedding_dim = embed_dim,
            padding_idx = pad_token_id,
        )
        self.position_encoding = PositionalEncoding(
            embed_dim = embed_dim,
            max_len = max_len,
        )
        self.transformer_stack = nn.ModuleList([
            DecoderOnlyTransformer(
                embed_dim = embed_dim,
                num_heads = num_heads,
                dense_dim = dense_dim,
            ) for _ in range(num_transformers)
        ])
        self.layer_norm = nn.LayerNorm(embed_dim)
        self.linear = nn.Linear(
            in_features = embed_dim,
            out_features = vocab_size
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Compute vocabulary logits for every position of ``x``.

        Args:
            x: Integer tensor of token ids (assumed (batch, seq) — this is the
                shape ``generate_text`` passes in; TODO confirm for training).

        Returns:
            Raw logits whose last dimension has size ``vocab_size``; no softmax
            is applied so the loss can be computed from logits for stability.
        """
        # True wherever the input holds the padding token; each layer receives
        # this mask as its ``key_padding_mask`` argument.
        key_padding_mask = (x == self.pad_token_id)
        x = self.token_embed(x)
        x = self.position_encoding(x)
        for transformer in self.transformer_stack:
            x = transformer(x, key_padding_mask = key_padding_mask)
        x = self.layer_norm(x)
        x = self.linear(x)
        return x #loss will be computed from logits for stability

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/danieljoo/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# Inference phase of gpt trained on cluster
The meat of this project, such as the architecture, is in the other notebook. This notebook is solely the inference I get from scaling the model, which in this case simply means running the training on an HPC compute node (which has a relatively high-end GPU) as opposed to my Mac. The exact hyperparameters were not fine-tuned; they were just guesstimated to make the most of the compute node's VRAM and to make training last about an hour.

In [10]:
def generate_text(model, tokenizer, prompt, max_length, temperature, pad_token_id, context_len=None):
    """Autoregressively generate a continuation of ``prompt`` with ``model``.

    At every step the tokens generated so far are right-padded to a fixed
    window, fed through the model, and the next token is drawn from the
    logits at the last real position.

    Args:
        model: Module that maps a ``(1, context_len)`` tensor of token ids to
            logits indexed as ``[batch, position, vocab]``.
        tokenizer: Object providing ``encode``, ``decode``, ``eos_token_id``
            and ``sep_token_id`` (e.g. a Hugging Face tokenizer).
        prompt: Text to condition on.
        max_length: Upper bound on the total number of tokens (prompt plus
            generated).
        temperature: Divisor applied to the logits before sampling. ``0``
            selects the most likely token deterministically (greedy decoding).
        pad_token_id: Token id used to fill the unused tail of the window.
        context_len: Size of the fixed input window. Defaults to the
            notebook-level ``max_len``.

    Returns:
        The decoded text (prompt included) with special tokens skipped.

    Raises:
        ValueError: If ``temperature`` is negative or the prompt encodes to
            no tokens.
    """
    if temperature < 0:
        raise ValueError("temperature must be non-negative")
    # Only touch the notebook-level default when no explicit window is given.
    if context_len is None:
        context_len = max_len

    model.eval()
    # The prompt must leave room for at least one new token and must also fit
    # inside the model's fixed window.
    # NOTE(review): encode() may append special tokens (e.g. a trailing
    # separator) to the prompt — confirm this matches how training data was built.
    prompt_budget = min(max_length - 1, context_len)
    input_ids_list = tokenizer.encode(prompt, truncation=True, max_length=prompt_budget)
    if not input_ids_list:
        # Without a real token there is no "last position" to read logits from.
        raise ValueError("prompt produced no tokens")

    # Stop token: prefer EOS, fall back to SEP. Compare against None explicitly
    # so that a legitimate token id of 0 is not mistaken for "missing".
    eos_token_id = tokenizer.eos_token_id
    if eos_token_id is None:
        eos_token_id = tokenizer.sep_token_id

    device = next(model.parameters()).device
    generated_ids = torch.tensor([input_ids_list], device=device, dtype=torch.long)

    with torch.no_grad():
        for _ in range(max_length - len(input_ids_list)):
            current_len = generated_ids.size(1)
            if current_len > context_len:
                # The sequence no longer fits in the fixed window.
                break
            # The model was trained with fixed-size input, so pad up to the window.
            padded_input = torch.full((1, context_len), pad_token_id, device=device, dtype=torch.long)
            padded_input[:, :current_len] = generated_ids

            logits = model(padded_input)

            # Logits at the last real (non-padded) position predict the next token.
            next_token_logits = logits[:, current_len - 1, :]

            if temperature == 0:
                # Greedy decoding avoids the division by zero.
                next_token_id = next_token_logits.argmax(dim=-1, keepdim=True)
            else:
                probabilities = F.softmax(next_token_logits / temperature, dim=-1)
                next_token_id = torch.multinomial(probabilities, num_samples=1)

            generated_ids = torch.cat([generated_ids, next_token_id], dim=1)
            if eos_token_id is not None and next_token_id.item() == eos_token_id:
                break

    return tokenizer.decode(generated_ids[0].tolist(), skip_special_tokens=True)

# Inference runs on CPU; defining the device first lets the checkpoint be
# mapped straight onto it.
device = "cpu"

# These hyperparameters must match the ones the checkpoint was trained with,
# otherwise load_state_dict reports shape mismatches.
model = NeuralNetwork(
    vocab_size=30522,
    embed_dim=150,
    num_heads=5,
    num_transformers=6,
    dense_dim=256,
    pad_token_id=0
)
# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot execute arbitrary code while loading.
state_dict = torch.load(
    "my_trained_gpt_model/model_state_dict.pth",
    map_location=torch.device(device),
    weights_only=True,
)
model.load_state_dict(state_dict)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Left as the last expression so the cell displays the module summary.
model.to(device)

NeuralNetwork(
  (token_embed): Embedding(30522, 150, padding_idx=0)
  (position_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_stack): ModuleList(
    (0-5): 6 x DecoderOnlyTransformer(
      (mha): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=150, out_features=150, bias=True)
      )
      (layer_norm1): LayerNorm((150,), eps=1e-05, elementwise_affine=True)
      (layer_norm2): LayerNorm((150,), eps=1e-05, elementwise_affine=True)
      (feed_forward): Sequential(
        (0): Linear(in_features=150, out_features=256, bias=True)
        (1): ReLU()
        (2): Dropout(p=0.1, inplace=False)
        (3): Linear(in_features=256, out_features=150, bias=True)
      )
    )
  )
  (layer_norm): LayerNorm((150,), eps=1e-05, elementwise_affine=True)
  (linear): Linear(in_features=150, out_features=30522, bias=True)
)

In [11]:
# Sampling is stochastic; seed the RNG so Restart & Run All reproduces the
# same generated text.
torch.manual_seed(0)

prompt = "In 2001,"
print("\n--- Starting Generation ---")
print(f"Prompt: '{prompt}'")

# max_length counts prompt and generated tokens together; temperature=1
# samples from the model's unscaled distribution.
generated_text = generate_text(
    model=model,
    tokenizer=tokenizer,
    prompt=prompt,
    max_length=50,
    temperature=1,
    pad_token_id=0
)

print(f"Generated text: '{generated_text}'")


--- Starting Generation ---
Prompt: 'In 2001,'
Generated text: 'in 2001, summoned monttry - but condemned ganga throughout the show.'


## Thoughts/Guesses on scaling

If I were to make a hypothesis about how to adjust hyperparameters and use more resources, I would have to increase and manage:
1. Dataset size. I restricted my training data to 50,000 articles, which yields around a million sentences. That is still only a small fraction of the complete dataset. Of course, iterating until I hit the cap would probably have increased my dataset by 100x, but also my training time by 100x, which is impossible without parallelization or reducing epochs, since the max job length is 2 days. It's not related to scaling, but I might also try a different dataset that isn't as formal. For instance, Wikipedia is pretty much entirely third person, which is not a good representation of the kind of language that people bring to chatbots.
2. Parallelized distributed training: The compute node comes with up to 4 GPUs. That's potentially 4x more data processed. This does require care, though, because doing something that is mathematically equivalent to increasing the batch size could change how training converges.
3. Learning rate and epochs: If I implemented the above, I could probably get away with a higher learning rate, since a larger batch leads to a less noisy gradient. That could reduce the number of epochs (I'm finding that my current configuration plateaus around 20–30 epochs anyway).
4. Number of attention heads, transformer layers, dense dimension: I think with more data, we can probably increase them but the scale of that increase is not something I have the experience to estimate

So overall, my best guess for how to go about a second attempt would be: parallelize (decrease training time) -> bigger batches (decrease training time) -> faster learning rate (decrease training time) -> fewer epochs (decrease training time) -> more complex model hyperparameters (increase training time) -> more data until I reach the cap (increase training time).

Eh, but that's a lesson for next time. Right now, I'm satisfied with these mediocre outputs for my pedagogical bootcamp, and I'll save the experimental testing tools for next time, when the problem is simpler and more well defined (image classification as opposed to word prediction).