## GPT-3 Paper Info

### Model Architectures and Hyper-Parameters

| Model Name              | nparams | nlayers | dmodel | nheads | dhead | Batch Size | Learning Rate  |
|-------------------------|---------|---------|--------|--------|-------|------------|----------------|
| GPT-3 Small             | 125M    | 12      | 768    | 12     | 64    | 0.5M       | 6.0 × 10⁻⁴     |
| GPT-3 Medium            | 350M    | 24      | 1024   | 16     | 64    | 0.5M       | 3.0 × 10⁻⁴     |
| GPT-3 Large             | 760M    | 24      | 1536   | 16     | 96    | 0.5M       | 2.5 × 10⁻⁴     |
| GPT-3 XL                | 1.3B    | 24      | 2048   | 24     | 128   | 1M         | 2.0 × 10⁻⁴     |
| GPT-3 2.7B              | 2.7B    | 32      | 2560   | 32     | 80    | 1M         | 1.6 × 10⁻⁴     |
| GPT-3 6.7B              | 6.7B    | 32      | 4096   | 32     | 128   | 2M         | 1.2 × 10⁻⁴     |
| GPT-3 13B               | 13.0B   | 40      | 5140   | 40     | 128   | 2M         | 1.0 × 10⁻⁴     |
| GPT-3 175B or “GPT-3”   | 175.0B  | 96      | 12288  | 96     | 128   | 3.2M       | 0.6 × 10⁻⁴     |

**Table 2.1:** Sizes, architectures, and learning hyper-parameters (batch size in tokens and learning rate) of the models
which we trained. All models were trained for a total of 300 billion tokens.


**Table 2.1** shows the sizes and architectures of our 8 models. Here nparams is the total number of trainable parameters,
nlayers is the total number of layers, dmodel is the number of units in each bottleneck layer (we always have the
feedforward layer four times the size of the bottleneck layer, dff = 4 ∗ dmodel), and dhead is the dimension of each
attention head. All models use a context window of nctx = 2048 tokens. We partition the model across GPUs along
both the depth and width dimension in order to minimize data-transfer between nodes. The precise architectural
parameters for each model are chosen based on computational efficiency and load-balancing in the layout of models
across GPUs. Previous work [KMH+20] suggests that validation loss is not strongly sensitive to these parameters
within a reasonably broad range.

#### B Details of Model Training

To train all versions of GPT-3, we use **Adam** with **β1 = 0.9**, **β2 = 0.95**, and **ε = 10⁻⁸**, clip the global norm of the gradient at **1.0**, and apply **cosine decay** for the learning rate, reducing it to **10%** of its value over **260 billion tokens** (after which training continues at 10% of the original rate). There is a **linear learning rate warmup** over the first **375 million tokens**, and the batch size is gradually increased from **32k tokens** to the full value over the first **4–12 billion tokens** of training, depending on model size. Data are sampled without replacement until an epoch boundary is reached to minimize overfitting, and all models use a **weight decay of 0.1** for regularization. During training, we always use sequences of the full **2048-token context window**, packing multiple documents into a single sequence when documents are shorter than 2048, with a special **end of text token** delimiting documents to efficiently indicate that separated contexts are unrelated.


In [None]:
import getpass, os, torch, glob, re, time

from transformers import GPT2LMHeadModel, get_scheduler
from torch.utils.data import IterableDataset, DataLoader
from huggingface_hub import hf_hub_download, HfApi, create_repo, Repository
from huggingface_hub.utils import HfHubHTTPError

class PTIterableDataset(IterableDataset):
    """Stream individual samples out of a list of pre-tokenized ``.pt`` files.

    Each file is loaded with ``torch.load`` and must contain a dict with
    ``"input_ids"`` and ``"attention_mask"`` entries that can be indexed along
    their first dimension, plus an optional ``"labels"`` entry.

    Every yielded sample is a dict with ``"input_ids"``, ``"attention_mask"``,
    ``"files"`` (the base name of the file the row came from) and, when the
    file provides them, ``"labels"``.
    """

    def __init__(self, pt_files):
        """Store the ordered list of ``.pt`` file paths to iterate over."""
        self.pt_files = pt_files

    def __iter__(self):
        # When the DataLoader runs worker processes, each worker gets its own
        # copy of this dataset. Without sharding, every worker would replay
        # every file and each sample would be seen ``num_workers`` times.
        worker = torch.utils.data.get_worker_info()
        if worker is None:
            files = self.pt_files
        else:
            files = self.pt_files[worker.id::worker.num_workers]

        for file_path in files:
            data = torch.load(file_path)
            # basename handles whatever path separator the platform uses.
            file_name = os.path.basename(file_path)
            input_ids = data["input_ids"]
            attention_mask = data["attention_mask"]
            labels = data.get("labels")  # looked up once per file, not per row

            for i in range(input_ids.size(0)):
                sample = {
                    "input_ids": input_ids[i],
                    "attention_mask": attention_mask[i],
                    "files": file_name,
                }
                if labels is not None:
                    sample["labels"] = labels[i]
                yield sample

def load_checkpoint(repo_name, token, device, file_name="training_state.pt"):
    """Fetch a saved training state from the Hugging Face Hub and load it.

    Args:
        repo_name: Hub repository id that holds the checkpoint file.
        token: Hugging Face access token used for the download.
        device: Device spec passed to ``torch.device`` and used as the
            ``map_location`` when loading.
        file_name: Name of the checkpoint file inside the repository.

    Returns:
        Whatever object ``torch.load`` reads from the downloaded file.
    """
    local_path = hf_hub_download(repo_id=repo_name, filename=file_name, token=token)
    target_device = torch.device(device)
    return torch.load(local_path, map_location=target_device)

def get_grouped_params(
    model,
    weight_decay,
    no_decay=("bias", "LayerNorm.weight", "ln_1.weight", "ln_2.weight", "ln_f.weight"),
):
    """Split model parameters into weight-decayed and non-decayed groups.

    Adapted from the Hugging Face book. A parameter is excluded from weight
    decay when any pattern in ``no_decay`` occurs as a substring of its name.

    The default patterns cover the BERT-style ``LayerNorm.weight`` name as
    well as ``ln_1`` / ``ln_2`` / ``ln_f``. With only ``"LayerNorm.weight"``,
    parameters named ``ln_*.weight`` would never match and would be decayed.

    Args:
        model: Module exposing ``named_parameters()``.
        weight_decay: Weight-decay value for the decayed group.
        no_decay: Iterable of name substrings that mark a parameter as
            exempt from weight decay. A tuple is used as the default so the
            default cannot be mutated between calls.

    Returns:
        A two-element list of optimizer parameter groups: the first uses
        ``weight_decay``, the second uses ``0.0``.
    """
    patterns = tuple(no_decay)
    params_with_wd, params_without_wd = [], []
    for name, param in model.named_parameters():
        if any(pattern in name for pattern in patterns):
            params_without_wd.append(param)
        else:
            params_with_wd.append(param)
    return [
        {'params': params_with_wd, 'weight_decay': weight_decay},
        {'params': params_without_wd, 'weight_decay': 0.0},
    ]


def load_base_model(model_name, device):
    """Load a pretrained GPT-2 LM head model, compile it and move it to ``device``.

    Args:
        model_name: Identifier passed to ``GPT2LMHeadModel.from_pretrained``.
        device: Target passed to ``.to(...)`` on the compiled model.

    Returns:
        The result of calling ``.to(device)`` on the ``torch.compile`` output.
    """
    pretrained = GPT2LMHeadModel.from_pretrained(model_name)
    compiled = torch.compile(pretrained)
    return compiled.to(device)

def initialize_optimizer(model_params, base_lr, betas=(0.9, 0.999), eps=1e-8):
    """Build the Adam optimizer used for training.

    Args:
        model_params: Parameters or parameter groups, e.g. the output of
            ``get_grouped_params``. Per-group ``weight_decay`` values are
            forwarded to Adam unchanged.
        base_lr: Peak learning rate.
        betas: Adam ``(beta1, beta2)`` coefficients. The default matches
            ``torch.optim.Adam``; pass ``(0.9, 0.95)`` to follow the GPT-3
            paper settings quoted at the top of this notebook.
        eps: Adam epsilon term.

    Returns:
        A ``torch.optim.Adam`` instance.
    """
    # NOTE(review): torch.optim.Adam applies weight decay as an L2 term on the
    # gradient rather than decoupled decay; consider AdamW if decoupled decay
    # is intended — confirm before changing, as it alters training dynamics.
    return torch.optim.Adam(
        params=model_params,
        lr=base_lr,
        betas=betas,
        eps=eps,
    )

def initialize_scheduler(optimizer, n_warmup_steps, n_training_steps):
    """Create the ``"cosine"`` learning-rate scheduler via ``get_scheduler``.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        n_warmup_steps: Forwarded as ``num_warmup_steps``.
        n_training_steps: Forwarded as ``num_training_steps``.

    Returns:
        The scheduler object returned by ``get_scheduler``.
    """
    return get_scheduler(
        name="cosine",
        optimizer=optimizer,
        num_warmup_steps=n_warmup_steps,
        num_training_steps=n_training_steps,
    )
    
def initialize_scaler(device):
    return torch.amp.GradScaler("cuda") if device == 'cuda' else None

def extract_file_numbers(filename):
    """Return the first run of digits found in ``filename`` as an int.

    Returns 0 when the string contains no digits. Used as a sort key for the
    batch files.
    """
    found = re.search(r'(\d+)', filename)
    if found is None:
        return 0
    return int(found.group(1))

def save_checkpoint(model, optimizer, lr_scheduler, global_step, loss_history, last_file,
                    scaler=None, file_name="training_state.pt"):
    """Serialize the full training state to a local file.

    Args:
        model: Model whose ``state_dict()`` is stored under ``'model'``.
        optimizer: Optimizer whose ``state_dict()`` is stored under ``'optimizer'``.
        lr_scheduler: Scheduler whose ``state_dict()`` is stored under ``'lr_scheduler'``.
        global_step: Stored under ``'global_step'``.
        loss_history: Stored under ``'losses'``.
        last_file: Stored under ``'batch_file'``.
        scaler: Optional gradient scaler; when given, its ``state_dict()`` is
            stored under ``'scaler'``.
        file_name: Destination path. Defaults to ``"training_state.pt"``.
    """
    checkpoint = {
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'lr_scheduler': lr_scheduler.state_dict(),
        'global_step': global_step,
        'losses': loss_history,
        'batch_file': last_file,
    }

    # scaler is for GPU only since doing fp16 on GPU
    if scaler is not None:
        checkpoint['scaler'] = scaler.state_dict()

    # Write to a temporary file first and then rename, so an interrupted save
    # never leaves a truncated checkpoint at ``file_name``.
    tmp_path = f"{file_name}.tmp"
    torch.save(checkpoint, tmp_path)
    os.replace(tmp_path, file_name)


def create_repo_if_not_exists(repo_name, token):
    """Make sure the Hub repository ``repo_name`` exists, creating it on a 404.

    Args:
        repo_name: Repository id to look up or create.
        token: Hugging Face access token used for both calls.

    Raises:
        HfHubHTTPError: Re-raised when the lookup fails with any status other
            than 404.
    """
    hub = HfApi(token=token)
    try:
        hub.repo_info(repo_id=repo_name)
    except HfHubHTTPError as e:
        # Only a missing repository is handled here.
        if e.response.status_code != 404:
            raise
        print(f"Repository '{repo_name}' not found. Creating repository...")
        create_repo(repo_id=repo_name, token=token)
        print(f"Repository '{repo_name}' created successfully.")
    else:
        print(f"Repository '{repo_name}' already exists.")


def push_to_hub(repo_name, token, step, max_retries=3, retry_delay=10,
                file_name="training_state.pt"):
    """Upload the local training-state file to the Hub, retrying on failure.

    Args:
        repo_name: Destination repository id.
        token: Hugging Face access token.
        step: Training step, used only in the commit message.
        max_retries: Maximum number of upload attempts.
        retry_delay: Seconds to sleep between failed attempts.
        file_name: Local path of the checkpoint to upload. Its base name is
            used as the path inside the repository. Defaults to
            ``"training_state.pt"``.

    Raises:
        Exception: The last upload error is re-raised once ``max_retries``
            attempts have failed.
    """
    api = HfApi(token=token)
    path_in_repo = os.path.basename(file_name)
    for attempt in range(1, max_retries + 1):
        try:
            # Upload the training state file.
            api.upload_file(
                path_or_fileobj=file_name,
                path_in_repo=path_in_repo,
                repo_id=repo_name,
                commit_message=f"Training state at step {step}"
            )
            print("Training state pushed successfully.")
            break
        except Exception as e:
            # Broad catch is deliberate: any failure is retried, and the final
            # one is re-raised with its original traceback.
            print(f"Attempt {attempt} failed: {e}")
            if attempt == max_retries:
                print("Max attempts reached. Exiting.")
                raise
            time.sleep(retry_delay)


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
class GPT2Config:
    """Training configuration, with derived token/step counts.

    All values are class attributes computed once, when the class body runs.
    Overriding an input (e.g. ``gradient_accumulation_steps``) on an instance
    does NOT recompute the derived values (``tokens_per_batch``,
    ``n_training_steps``, ``n_warmup_steps``); set those explicitly as well.
    The class body also prints a short summary of the derived values.
    """
    device: str = 'cpu'
    from_checkpoint: bool = False
    base_model: str = None  # Hub id of the pretrained model; set after instantiation
    data_loader_batch_size: int = 4
    warm_up_ratio: float = 0.01

    n_files: int = 221713
    rows_per_file: int = 16
    tokens_per_row: int = 1024
    n_tokens_per_file: int = rows_per_file * tokens_per_row  # (file_batch_size x max_token_len)
    total_tokens: int = n_files * n_tokens_per_file
    gradient_accumulation_steps: int = 32
    # Tokens consumed per optimizer step: rows per micro-batch x tokens per
    # row x micro-batches per step. The previous expression
    # (n_tokens_per_file / batch_size) * accumulation only gave this value
    # because rows_per_file happens to equal batch_size ** 2.
    tokens_per_batch: int = data_loader_batch_size * tokens_per_row * gradient_accumulation_steps
    print(f"Effective size with grad accumulation: {data_loader_batch_size*gradient_accumulation_steps}")
    print(f"Tokens per batch (paper has roughly .5M): {tokens_per_batch}")

    base_lr: float = 1e-4 # LR for should be 6e-4 to 2.5e-4 for gpt3 small-large
    # Whole optimizer steps only; a trailing partial accumulation window
    # never triggers an optimizer step.
    n_training_steps: int = total_tokens // tokens_per_batch
    n_warmup_steps: int = int(round(n_training_steps * warm_up_ratio))
    print(f"Total Training steps {n_training_steps}")
    print(f"N warmup steps (could be {warm_up_ratio*100:.2f}% of {n_training_steps} training_steps) => {n_warmup_steps} steps")

    # beta1, beta2 = 0.9, 0.95 # these may need to be changed to fit our training assumptions
    max_grad_norm: float = 1.0 # paper uses 1
    weight_decay: float = .10 # i believe this still makes sense
    num_epochs: int = 1

    checkpoint_repo: str = None
    save_file_name: str = "training_state.pt"
    hf_token: str = None
    start_file: str = None
    save_steps: int = 100

# Instantiate the config and apply per-run overrides as instance attributes.
# These assignments do not recompute values derived in the GPT2Config class body.
config = GPT2Config()
config.from_checkpoint = True  # resume from the training state stored in checkpoint_repo
config.checkpoint_repo = "cwestnedge/gpt2-test"
config.base_model = "openai-community/gpt2-large"
# Prompt interactively so the token is never written into the notebook.
config.hf_token = getpass.getpass("Enter your Hugging Face token: ")
config.device = "cuda" if torch.cuda.is_available() else "cpu"
config.base_lr = 2.5e-4
config.weight_decay = 0.0  # overrides the class default of 0.10

# for testing
# config.num_epochs = 40
# config.save_steps=10
# config.base_lr=3e-4
# config.n_warmup_steps= (40*.1)
# config.gradient_accumulation_steps=1
# config.weight_decay= 0.0
# config.data_loader_batch_size = 2
# config.n_training_steps=40

Effective size with grad accumulation: 128
Tokens per batch (paper has roughly .5M): 131072.0
Total Training steps 27714.125
N warmup steps (could be 3.00% of 27714.125 training_steps) => 831 steps


In [None]:
# -------- Initialize model, optimizer and lr_scheduler --------
model = load_base_model(model_name=config.base_model, device=config.device)
model_grouped_params = get_grouped_params(model, weight_decay=config.weight_decay)
optimizer = initialize_optimizer(model_grouped_params, base_lr=config.base_lr)
lr_scheduler = initialize_scheduler(
    n_warmup_steps=config.n_warmup_steps,
    n_training_steps=config.n_training_steps,
    optimizer=optimizer
)
scaler = initialize_scaler(config.device)

# -------- discover training files (sorted by the number in the file name) --------
train_dir = "../processed_batches/train"
train_files_full = sorted(glob.glob(f"{train_dir}/*.pt"), key=extract_file_numbers)
if not train_files_full:
    raise FileNotFoundError(f"No .pt files found in {train_dir}")

# -------- load from checkpoint or start fresh --------
if config.from_checkpoint:
    checkpoint = load_checkpoint(
        repo_name=config.checkpoint_repo,
        token=config.hf_token,
        device=config.device,
        file_name=config.save_file_name
    )

    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
    # A checkpoint written by the CPU loop has no 'scaler' entry; in that case
    # keep the freshly initialised scaler instead of failing with a KeyError.
    if scaler is not None and 'scaler' in checkpoint:
        scaler.load_state_dict(checkpoint['scaler'])

    global_step = checkpoint['global_step']
    loss_history = checkpoint['losses']
    last_file = checkpoint['batch_file']
    # 'batch_file' may be a plain file name or a collection of file names.
    # For a collection, resume after the highest-numbered file rather than
    # concatenating names in arbitrary order.
    if not isinstance(last_file, str):
        last_file = max(last_file, key=extract_file_numbers)

    start_file_path = f'{train_dir}/{last_file}'
    if start_file_path not in train_files_full:
        raise FileNotFoundError(
            f"Checkpoint refers to {start_file_path}, which is not among the files in {train_dir}"
        )
    start_idx = train_files_full.index(start_file_path)
    train_files_ = train_files_full[start_idx+1:]  # resume with the file after the last processed one
    if not train_files_:
        raise RuntimeError(f"Nothing left to train on: {last_file} is the last file in {train_dir}")
    print()
    print(f'Last processed file {last_file}. Resuming run from {train_files_[0]}')
    print(f"{(len(train_files_)/len(train_files_full))*100:0.3f}% remaining...")

else:
    global_step, loss_history = 0, []
    train_files_ = train_files_full
    print()
    print(f'training run from {train_files_[0]}')


train_ds = PTIterableDataset(train_files_)
train_loader = DataLoader(train_ds, batch_size=config.data_loader_batch_size, num_workers=0, drop_last=True)
# Sanity-check one batch; the training loop builds its own iterator afterwards.
print(next(iter(train_loader)))
print()

create_repo_if_not_exists(config.checkpoint_repo, config.hf_token)


training run from ../processed_batches/train/batch_0000.pt
{'input_ids': tensor([[ 1925,  5889,   286,  ...,   274,  2983,  3421],
        [  739,  4096,  3403,  ...,   349,   415, 39422]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'files': ['batch_0000.pt', 'batch_0000.pt']}

Repository 'cwestnedge/gpt2-test-cpu' already exists.


### CPU Training Loop 

In [None]:
# CPU training loop: full-precision forward/backward with gradient
# accumulation, gradient clipping, periodic checkpointing and Hub upload.
model.train();
for epoch in range(config.num_epochs):
    running_loss = 0
    for step, batch in enumerate(train_loader, start=1):
        input_ids = batch['input_ids'].to(config.device)
        attention_mask = batch['attention_mask'].to(config.device)
        # Store a single file name (as the GPU loop does) so the checkpoint's
        # 'batch_file' entry is a plain string.
        current_file = batch['files'][0]

        # forward pass (no autocast for CPU)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        raw_loss = outputs.loss

        running_loss += raw_loss.item()
        # Scale so the accumulated gradient is the mean over the micro-batches.
        loss = raw_loss / config.gradient_accumulation_steps
        loss.backward()

        if step % config.gradient_accumulation_steps == 0:
            # Clip the global gradient norm, matching the GPU loop and config.max_grad_norm.
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
            optimizer.step()
            optimizer.zero_grad()
            lr_scheduler.step()
            global_step += 1

            current_lr = optimizer.param_groups[0]['lr']
            effective_loss = running_loss / config.gradient_accumulation_steps
            loss_history.append(effective_loss)
            print(f"Global step {global_step}, LR: {current_lr:.8f}, Loss: {effective_loss:.4f}")
            running_loss = 0

            if global_step % config.save_steps == 0:
                save_checkpoint(
                    model=model,
                    optimizer=optimizer,
                    lr_scheduler=lr_scheduler,
                    global_step=global_step,
                    loss_history=loss_history,
                    last_file=current_file,
                )

                print('saved checkpoint')
                push_to_hub(
                    repo_name=config.checkpoint_repo,
                    token=config.hf_token,
                    step=global_step,
                    max_retries=3,
                    retry_delay=10
                )
                print('hub push completed')

# print('final model push...')
# model.push_to_hub(config.checkpoint_repo, commit_message=f"trained model at pass {epoch}")

### GPU Training Loop (FP16)

In [None]:
# GPU training loop: autocast forward pass, scaled backward pass, gradient
# accumulation, gradient clipping, periodic checkpointing and Hub upload.
model.train();
for epoch in range(config.num_epochs):
    running_loss = 0
    for step, batch in enumerate(train_loader, start=1):
        input_ids = batch['input_ids'].to(config.device)
        attention_mask = batch['attention_mask'].to(config.device)
        current_file = batch['files'][0]  # single filename for consistency

        # Forward pass under autocast; labels=input_ids makes the model return the LM loss.
        with torch.autocast(device_type="cuda"):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
            raw_loss = outputs.loss

        running_loss += raw_loss.item()
        # Divide so the accumulated gradient is the mean over the micro-batches.
        loss = raw_loss / config.gradient_accumulation_steps
        # Backward on the scaled loss; gradients stay scaled until unscale_ below.
        scaler.scale(loss).backward()

        # One optimizer step per gradient_accumulation_steps micro-batches.
        if step % config.gradient_accumulation_steps == 0:
            # unscale, clip, step, update scaler & scheduler
            # Gradients must be unscaled before clipping so max_grad_norm
            # applies to their true magnitude.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()

            optimizer.zero_grad()
            # NOTE(review): the scheduler advances on every accumulation
            # boundary, including any where scaler.step() skipped the
            # optimizer update because of inf/NaN gradients — confirm this
            # is acceptable.
            lr_scheduler.step()
            global_step += 1

            current_lr = optimizer.param_groups[0]['lr']
            # Mean unscaled loss over the micro-batches of this step.
            effective_loss = running_loss / config.gradient_accumulation_steps
            loss_history.append(effective_loss)
            print(f"Global step {global_step}, LR: {current_lr:.8f}, Loss: {effective_loss:.4f}")
            running_loss = 0

            # Every save_steps optimizer steps: write the state locally, then upload it.
            if global_step % config.save_steps == 0:
                save_checkpoint(
                    model=model,
                    optimizer=optimizer,
                    lr_scheduler=lr_scheduler,
                    global_step=global_step,
                    loss_history=loss_history,
                    last_file=current_file,
                    scaler=scaler
                )
                print('saved checkpoint')
                push_to_hub(
                    repo_name=config.checkpoint_repo,
                    token=config.hf_token,
                    step=global_step
                )
                print('hub push completed')

# print('final model push...')
# model.push_to_hub(config.checkpoint_repo, commit_message="Final trained model")