In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tiktoken
from tqdm.notebook import tqdm
import time
from modeling_gpt2 import GPT2
from utils import create_dataloader_v1
import requests

# Configuration for GPT-2 model
GPT_CONFIG = {
    "vocab_size": 50257,
    "context_len": 256,
    "emb_dim": 768,
    "num_heads": 12,
    "num_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False,
}

# Training Hyperparameters
EPOCHS = 20
LEARNING_RATE = 3e-4
BATCH_SIZE = 16
EVAL_INTERVAL = 100  # Evaluate every 100 steps
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
START_CONTEXT = "All:"
# Seed for reproducibility
torch.manual_seed(123)

# Load the GPT-2 model
model = GPT2(GPT_CONFIG).to(DEVICE)
tokenizer = tiktoken.get_encoding("gpt2")
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")

# Function to load text data
def load_text_file(path_to_text=None, url=None):
    if path_to_text:
        with open(path_to_text, 'r', encoding="utf-8") as f:
            return f.read()
    elif url:
        response = requests.get(url)
        return response.text
    else:
        return None

# Load dataset
text = load_text_file(path_to_text="/teamspace/studios/this_studio/GPT2/model/shakespeare.txt")
split_idx = int(0.90 * len(text))
train_data, val_data = text[:split_idx], text[split_idx:]

# Create DataLoader for training and validation
train_loader = create_dataloader_v1(
    txt=train_data,
    batch_size=BATCH_SIZE,
    max_length=GPT_CONFIG['context_len'],
    stride=GPT_CONFIG['context_len'],
    shuffle=False,
    drop_last=False,
    num_workers=0
)

val_loader = create_dataloader_v1(
    txt=val_data,
    batch_size=BATCH_SIZE,
    max_length=GPT_CONFIG['context_len'],
    stride=GPT_CONFIG['context_len'],
    shuffle=False,
    drop_last=False,
    num_workers=0
)

# Initialize optimizer and learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)

# Helper functions for encoding/decoding text
def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    return torch.tensor(encoded).unsqueeze(0)  # Add batch dimension

def token_ids_to_text(token_ids, tokenizer):
    return tokenizer.decode(token_ids.squeeze(0).tolist())  # Remove batch dimension

# Functions to calculate loss and evaluate the model
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    return F.cross_entropy(logits.flatten(0, 1), target_batch.flatten())

def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.
    num_batches = min(num_batches or len(data_loader), 100)  # Limit to 100 batches for faster evaluation
    
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            total_loss += calc_loss_batch(input_batch, target_batch, model, device).item()
        else:
            break
            
    return total_loss / num_batches

def generate(model, device, idx, max_new_tokens, context_len):
    idx = idx.to(device)  

    for _ in range(max_new_tokens):
        idx_context = idx[:, -context_len:]

        with torch.no_grad():
            idx_context = idx_context.to(device)
            logits = model(idx_context)
            logits = logits[:, -1, :]
            probs = torch.softmax(logits, dim=-1)
            idx_next = torch.argmax(probs, dim=-1, keepdim=True)
            idx = torch.cat((idx, idx_next), dim=-1)
    
    return idx


def evaluate_model(model, train_loader, val_loader, device, eval_iter=None):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss

# Lists to track metrics
train_losses = []
val_losses = []
grad_norms = []
step_times = []
learning_rates = []

# Training loop
for epoch in range(EPOCHS):
    model.train()
    
    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch+1}/{EPOCHS}")
    
    for step, (x, y) in progress_bar:
        start_time = time.time()
        
        x, y = x.to(DEVICE), y.to(DEVICE)
        optimizer.zero_grad()
        
        loss = calc_loss_batch(x, y, model, DEVICE)
        loss.backward()
        
        norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        
        # Track metrics
        train_losses.append(loss.item())
        grad_norms.append(norm)
        step_times.append(time.time() - start_time)
        learning_rates.append(scheduler.get_last_lr()[0])
        
        # Update progress bar
        progress_bar.set_postfix({
            'loss': f"{loss.item():.4f}",
            'grad_norm': f"{norm:.4f}",
            'step_time': f"{step_times[-1]:.4f}s",
            'lr': f"{learning_rates[-1]:.6f}"
        })
        
        if (step + 1) % EVAL_INTERVAL == 0:
            train_loss, val_loss = evaluate_model(model, train_loader, val_loader, DEVICE)
            print(f"\nStep {step+1} - Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")
            
        
    scheduler.step()
    
    # End of epoch evaluation
    train_loss, val_loss = evaluate_model(model, train_loader, val_loader, DEVICE)
    print(f"\nEpoch {epoch+1} - Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")
        # Generate sample text
    token_ids = generate(
                model=model,
                device=DEVICE,
                idx=text_to_token_ids(START_CONTEXT, tokenizer),
                max_new_tokens=20,
                context_len=GPT_CONFIG["context_len"],
    )
    print("Sample text:", token_ids_to_text(token_ids, tokenizer))
    

# Final evaluation
final_train_loss, final_val_loss = evaluate_model(model, train_loader, val_loader, DEVICE)
print(f"\nFinal - Train Loss: {final_train_loss:.4f}, Validation Loss: {final_val_loss:.4f}")

Total number of parameters: 134,117,376


Epoch 1/20:   0%|          | 0/74 [00:00<?, ?it/s]


Epoch 1 - Train Loss: 5.7158, Validation Loss: 5.9255
Sample text: All:






















Epoch 2/20:   0%|          | 0/74 [00:00<?, ?it/s]


Epoch 2 - Train Loss: 5.0909, Validation Loss: 5.4978
Sample text: All:
I'll you,






I'll:

I'll you


Epoch 3/20:   0%|          | 0/74 [00:00<?, ?it/s]


Epoch 3 - Train Loss: 4.7138, Validation Loss: 5.2847
Sample text: All:
I'll not a man,
I am a man,
I am a man,



Epoch 4/20:   0%|          | 0/74 [00:00<?, ?it/s]


Epoch 4 - Train Loss: 4.4329, Validation Loss: 5.1599
Sample text: All:
I am not,
I am not,



I'll be a little of


Epoch 5/20:   0%|          | 0/74 [00:00<?, ?it/s]


Epoch 5 - Train Loss: 4.0885, Validation Loss: 5.0594
Sample text: All:
I am not, and let me,
And I am a gentleman,
And,



Epoch 6/20:   0%|          | 0/74 [00:00<?, ?it/s]


Epoch 6 - Train Loss: 3.7864, Validation Loss: 5.0295
Sample text: All:
I do not my lord,
And take my lord,
And I am not my lord


Epoch 7/20:   0%|          | 0/74 [00:00<?, ?it/s]


Epoch 7 - Train Loss: 3.4899, Validation Loss: 5.0701
Sample text: All:
I do not my lord,
And therefore,
That I'll be a better-morrow


Epoch 8/20:   0%|          | 0/74 [00:00<?, ?it/s]


Epoch 8 - Train Loss: 3.1726, Validation Loss: 5.1430
Sample text: All:
I do not my lord:
I'll do me;
I'll do not a my


Epoch 9/20:   0%|          | 0/74 [00:00<?, ?it/s]


Epoch 9 - Train Loss: 2.7689, Validation Loss: 5.1795
Sample text: All:
O:
O:
O, sir!

DUKE VINCENTIO


Epoch 10/20:   0%|          | 0/74 [00:00<?, ?it/s]

In [None]:
torch.save(model.state_dict(), "model.pth")

In [None]:
txt = """
First Citizen:
Very well; and could be
"""

In [None]:
token_ids = generate(
            model=model,
            device=DEVICE,
            idx=text_to_token_ids(txt, tokenizer),
            max_new_tokens=100,
            context_len=GPT_CONFIG["context_len"]
            )
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

In [None]:
from huggingface_hub import HfApi
api = HfApi()
api.upload_file(
    path_or_fileobj="model.pth",
    path_in_repo="gpt_model.pt",
    repo_id="damerajee/smallgpt",
    repo_type="model",
)