In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tiktoken
from tqdm.auto import tqdm
import time
from modeling_gpt2 import GPT2
import requests
from generate import text_to_token_ids ,  token_ids_to_text , generate 
from utils import create_dataloader_v1 , calc_loss_batch , calc_loss_loader , evaluate_model

# Hyperparameters handed to the GPT2 constructor (see modeling_gpt2).
GPT_CONFIG = dict(
    vocab_size=50257,
    context_len=256,   # also used as the dataloader window/stride and the generation context
    emb_dim=768,
    num_heads=12,
    num_layers=12,
    drop_rate=0.1,
    qkv_bias=False,
)

# Training hyperparameters
EPOCHS = 20
LEARNING_RATE = 3e-4
BATCH_SIZE = 16
# Run a train/val evaluation every 74 steps; the step counter restarts each epoch.
EVAL_INTERVAL = 74
# Prompt used for the sample generations.
START_CONTEXT = "First Person:"

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Fix the global PyTorch RNG seed so runs are repeatable.
torch.manual_seed(123)

# Tokenizer: tiktoken's "gpt2" encoding.
tokenizer = tiktoken.get_encoding("gpt2")

# Build the model from GPT_CONFIG and place it on the selected device.
model = GPT2(GPT_CONFIG)
model = model.to(DEVICE)

# Report the model size (element count summed over every parameter tensor).
total_params = sum([weights.numel() for weights in model.parameters()])
print(f"Total number of parameters: {total_params:,}")

# Function to load text data
def load_text_file(path_to_text=None, url=None):
    """Return the contents of a text source as a single string.

    Args:
        path_to_text: Path of a local UTF-8 text file. Takes precedence over
            ``url`` when both are given.
        url: HTTP(S) address to download the text from.

    Returns:
        The text as ``str``, or ``None`` when neither source is supplied.

    Raises:
        OSError: if the local file cannot be opened.
        requests.RequestException: if the download times out or the server
            answers with an error status.
    """
    if path_to_text:
        with open(path_to_text, 'r', encoding="utf-8") as f:
            return f.read()

    if url:
        # A timeout keeps a stalled server from hanging the notebook forever.
        response = requests.get(url, timeout=30)
        # Fail loudly on 4xx/5xx instead of returning an error page as "data".
        response.raise_for_status()
        return response.text

    return None

# Read the whole corpus into memory as one string.
text = load_text_file(
    path_to_text="/teamspace/studios/this_studio/GPT2/model/shakespeare.txt"
)

# Character-level split: the first 90% is used for training, the tail for validation.
split_idx = int(len(text) * 0.90)
train_data = text[:split_idx]
val_data = text[split_idx:]

# Sizes are reported in characters, not tokens.
print("Train data:", len(train_data))
print("Val data:", len(val_data))

# Build the training and validation DataLoaders with identical settings; only
# the source text differs. The train loader is created first, then the
# validation loader.
# stride == max_length, so windows presumably do not overlap - confirm in utils.
# NOTE(review): the training loader is not shuffled - confirm this is intentional.
train_loader, val_loader = (
    create_dataloader_v1(
        txt=split_text,
        batch_size=BATCH_SIZE,
        max_length=GPT_CONFIG['context_len'],
        stride=GPT_CONFIG['context_len'],
        shuffle=False,
        drop_last=False,
        num_workers=2,
    )
    for split_text in (train_data, val_data)
)

# AdamW optimiser with a cosine learning-rate schedule spanning EPOCHS
# scheduler steps (the training loop steps the scheduler once per epoch).
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)


# Metric histories filled in while training; five independent empty lists.
train_losses, val_losses, grad_norms, step_times, learning_rates = (
    [] for _ in range(5)
)

# JIT-compile the model into optimised kernels via torch.compile; the rest of
# the notebook keeps using the returned wrapper under the same name.
model = torch.compile(model)
# Allow float32 matmuls to use faster reduced-precision kernels where supported.
torch.set_float32_matmul_precision('high')

# Training loop
print("STARTING TO TRAIN")

# float16 autocast needs dynamic loss scaling, otherwise small gradients
# underflow to zero in half precision. The scaler is disabled (pure
# pass-through) when not running on CUDA.
scaler = torch.amp.GradScaler(DEVICE, enabled=(DEVICE == "cuda"))

for epoch in range(EPOCHS):
    model.train()

    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch+1}/{EPOCHS}")

    for step, (x, y) in progress_bar:
        start_time = time.time()

        x, y = x.to(DEVICE), y.to(DEVICE)
        optimizer.zero_grad()

        # Forward pass under mixed precision.
        with torch.autocast(device_type=DEVICE, dtype=torch.float16):
            loss = calc_loss_batch(x, y, model, DEVICE)

        # Backward on the scaled loss, then unscale before clipping so the
        # 1.0 threshold applies to the true gradient magnitudes.
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        norm = nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # step() skips the update if the gradients contain inf/NaN;
        # update() then adjusts the scale factor for the next iteration.
        scaler.step(optimizer)
        scaler.update()

        # Track metrics as plain Python floats so the histories do not keep
        # device tensors alive and can be plotted directly.
        loss_value = loss.item()
        grad_norm = float(norm)
        train_losses.append(loss_value)
        grad_norms.append(grad_norm)
        end_time = time.time()
        step_times.append((end_time - start_time) * 1000)  # Convert to milliseconds
        learning_rates.append(scheduler.get_last_lr()[0])

        # Update progress bar
        progress_bar.set_postfix({
            'loss': f"{loss_value:.4f}",
            'grad_norm': f"{grad_norm:.4f}",
            'step_time': f"{step_times[-1]:.4f}ms",  # Display time in milliseconds
            'lr': f"{learning_rates[-1]:.6f}"
        })

        if (step + 1) % EVAL_INTERVAL == 0:
            train_loss, val_loss = evaluate_model(model, train_loader, val_loader, DEVICE)

            # Only sample text when it is about to be printed; generating on
            # every step added a full decoding pass per iteration whose result
            # was discarded. Sample with dropout off and without autograd.
            model.eval()
            with torch.no_grad():
                token_ids = generate(
                    model=model,
                    device=DEVICE,
                    idx=text_to_token_ids(START_CONTEXT, tokenizer),
                    max_new_tokens=10,
                    context_len=GPT_CONFIG["context_len"],
                )
            # Make sure training resumes with dropout enabled.
            model.train()

            print(f"\nStep {step+1} - Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")
            print("Sample text:", token_ids_to_text(token_ids, tokenizer))

    # The cosine schedule advances once per epoch (T_max is given in epochs).
    scheduler.step()

    # End of epoch evaluation
    train_loss, val_loss = evaluate_model(model, train_loader, val_loader, DEVICE)
    # Record one validation loss per epoch; this history was never filled before.
    val_losses.append(val_loss)
    print(f"\nEpoch {epoch+1} - Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")



In [None]:
# `model` is the torch.compile wrapper, whose state_dict keys carry an
# "_orig_mod." prefix that a plain GPT2 instance cannot load. Save the wrapped
# module's weights instead; the getattr fallback keeps this working when the
# model was not compiled.
torch.save(getattr(model, "_orig_mod", model).state_dict(), "model_2.pth")

In [None]:
# Log this notebook session in to the Hugging Face Hub.
# NOTE(review): presumably required so the later upload_file call is authorised - confirm.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
from huggingface_hub import HfApi

# Publish the trained weights to the Hub model repo as "gpt_model.pt".
api = HfApi()
api.upload_file(
    # Upload the checkpoint this notebook writes ("model_2.pth"). The previous
    # value "model.pth" is not produced anywhere in this notebook, so a fresh
    # run would fail or push a stale file.
    path_or_fileobj="model_2.pth",
    path_in_repo="gpt_model.pt",
    repo_id="damerajee/smallgpt",
    repo_type="model",
)

In [None]:
# Prompt snippet (leading newline, speaker line, unfinished sentence).
# NOTE(review): this rebinds `text`, which earlier held the full training
# corpus, and the generation cell prompts with START_CONTEXT rather than this.
text = (
    "\n"
    "First Citizen:\n"
    "I say unto you, what \n"
)

In [None]:
# Sample a short continuation of START_CONTEXT from the trained model.
# Switch to eval mode so dropout (drop_rate=0.1 in GPT_CONFIG) is disabled, and
# skip autograd bookkeeping since no gradients are needed for inference.
model.eval()
with torch.no_grad():
    token_ids = generate(
        model=model,
        device=DEVICE,
        idx=text_to_token_ids(START_CONTEXT, tokenizer),
        max_new_tokens=10,
        context_len=GPT_CONFIG["context_len"],
    )
        

In [None]:
# Decode the generated token ids back to text and show them.
print("Sample text:", token_ids_to_text(token_ids, tokenizer))
