In [1]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/1.2 MB[0m [31m14.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0


In [None]:
import os
import time
import math
import torch
import torch.nn as nn
from torch.nn import functional as F
import tiktoken
from train_get2_1 import GPT, GPTConfig  # Import the model architecture

# Enhanced hyperparameters
BATCH_SIZE = 24  # Adjusted for memory constraints
BLOCK_SIZE = 128  # Keep smaller block size for efficiency
MAX_ITERS = 25000  # Increased iterations to compensate for smaller batch size
EVAL_INTERVAL = 100
LEARNING_RATE = 1e-4  # Reduced learning rate for smaller batch size
WARMUP_ITERS = 2000  # Extended warmup period
MIN_LR = 1e-5
WEIGHT_DECAY = 0.01  # Reduced weight decay for smaller batch
GRAD_CLIP = 0.5  # Reduced gradient clipping
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
n_embd = 768
n_head = 12
n_layer = 12
dropout = 0.1  # Reduced dropout for smaller batch

# Load and preprocess the Shakespeare text
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Initialize tokenizer
enc = tiktoken.get_encoding("gpt2")
vocab_size = enc.n_vocab

# Encode the entire text
data = torch.tensor(enc.encode(text), dtype=torch.long)

def get_batch():
    # Add data augmentation: randomly offset sequences
    ix = torch.randint(len(data) - BLOCK_SIZE - 1, (BATCH_SIZE,))
    offset = torch.randint(0, 2, (BATCH_SIZE,))  # Random offset of 0 or 1
    ix = ix + offset
    x = torch.stack([data[i:i+BLOCK_SIZE] for i in ix])
    y = torch.stack([data[i+1:i+BLOCK_SIZE+1] for i in ix])
    x, y = x.to(DEVICE), y.to(DEVICE)
    return x, y

def get_lr(iter):
    # Implement learning rate scheduling
    if iter < WARMUP_ITERS:
        return LEARNING_RATE * iter / WARMUP_ITERS
    decay_ratio = (iter - WARMUP_ITERS) / (MAX_ITERS - WARMUP_ITERS)
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # cosine decay
    return MIN_LR + coeff * (LEARNING_RATE - MIN_LR)

# Model init
model_config = GPTConfig(
    block_size=BLOCK_SIZE,
    vocab_size=vocab_size,
    n_layer=n_layer,
    n_head=n_head,
    n_embd=n_embd,
)
model = GPT(model_config)
model = model.to(DEVICE)

# Print model parameters
print("Model Parameters (name and shape):")
total_params = sum(p.numel() for p in model.parameters())
print(f"Total Parameters: {total_params/1e6:.2f}M")

# Optimizer with weight decay
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    betas=(0.9, 0.999)  # More conservative beta values
)

# Training loop
best_loss = float('inf')
running_loss = 0.0
beta = 0.98  # Increased smoothing for loss calculation
for iter in range(MAX_ITERS):
    # Learning rate scheduling
    lr = get_lr(iter)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # Sample a batch of data
    xb, yb = get_batch()

    # Forward pass
    logits, loss = model(xb, yb)

    # Calculate running loss
    running_loss = beta * running_loss + (1 - beta) * loss.item()

    # Print progress and save model if loss improves
    if iter % EVAL_INTERVAL == 0:
        print(f"step {iter}: loss {running_loss:.4f}, lr {lr:.6f}")
        if running_loss < best_loss:
            best_loss = running_loss
            print(f"New best loss: {best_loss:.4f} - Saving model...")
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'iter': iter,
                'best_loss': best_loss,
            }, 'shakespeare_model.pt')

    # Backward pass with gradient clipping
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
    optimizer.step()

# Generate sample text after training
model.eval()
context = torch.zeros((1, 1), dtype=torch.long, device=DEVICE)
print(enc.decode(model.generate(context, max_new_tokens=500)[0].tolist()))

loading weights from pretrained gpt: gpt2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


didn't crash yet!
> Hello, I'm a language model, not a program.

So this morning I started studying for the interview in the lab. This was not
> Hello, I'm a language model, and one of the main things that bothers me when they create languages is how easy it becomes to create something that
> Hello, I'm a language model, and I wrote it off on the grounds that a language model would make me more fluent. But I'm not
> Hello, I'm a language model, I really like languages. I like languages because like, they're good. And the way we talk about languages
> Hello, I'm a language model, a language model I'm using for data modelling. All I did was test the results and then I wrote some
Model Parameters (name and shape):
Total Parameters: 162.35M
step 0: loss 0.2199, lr 0.000000
New best loss: 0.2199 - Saving model...
step 100: loss 8.6011, lr 0.000005
step 200: loss 8.3040, lr 0.000010
step 300: loss 7.2572, lr 0.000015
step 400: loss 6.3143, lr 0.000020
step 500: loss 5.6861, lr 0.000025
step 