In [25]:
from datasets import load_dataset
from transformers import AutoTokenizer
from utils import TrainingConfig, Config
import torch

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # or your custom one
tokenizer.pad_token = tokenizer.eos_token

training_config = TrainingConfig()
config = Config(vocab_size=tokenizer.vocab_size,
d_model=768, num_heads=12, ffn_dim=3072,
num_layers=12, max_seq_len=tokenizer.model_max_length )

if False:
    # Create a config dictionary for the model
    config_dict = {k: getattr(config, k) for k in vars(config) 
                if not k.startswith('_') and not callable(getattr(config, k))}

    # Save the config as JSON
    import os
    import json

    os.makedirs(savepath, exist_ok=True)
    with open(os.path.join(savepath, "config.json"), "w") as f:
        json.dump(config_dict, f, indent=2)

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

savepath = "/home/chrisobrien/dev/transformer-examples/models/roformer-base"

Using device: cuda


In [26]:
# 1. Load the raw text
ds = load_dataset("openwebtext", split="train", trust_remote_code=True)

if True:
    ds = ds.select(range(1000))

Loading dataset shards:   0%|          | 0/80 [00:00<?, ?it/s]

In [33]:

if True:
    def tokenize(batch):
        return tokenizer(
            batch["text"],
            truncation=True,
            max_length=config.max_seq_len,
            padding=True,
            return_tensors="pt",
        ).to(device)

    tokenized = ds.map(tokenize, batched=True, remove_columns=["text"])
    # tokenized = tokenized.remove_columns(["attention_mask"])

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [28]:
# Let's check a few examples for padding tokens and EOS tokens
for i in range(100):  # Check first 5 examples
    # Convert to tensor first, then do the comparison
    input_ids = torch.tensor(tokenized[i]["input_ids"])
    pad_mask = (input_ids == tokenizer.pad_token_id)
    pad_count = pad_mask.sum().item()
    
    # Check for EOS tokens
    eos_mask = (input_ids == tokenizer.eos_token_id)
    eos_count = eos_mask.sum().item()
    
    print(f"\nExample {i}:")
    print(f"Sequence length: {len(input_ids)}")
    
    # Check padding tokens
    if pad_count > 0:
        print(f"Pad tokens found: {pad_count}")
        # Show where the padding starts
        first_pad = (input_ids == tokenizer.pad_token_id).nonzero()[0].item()
        print(f"First padding token at position: {first_pad}")
    else:
        print("No padding tokens found")
    
    # Check EOS tokens
    if eos_count > 0:
        print(f"EOS tokens found: {eos_count}")
        # Show where the EOS tokens are
        eos_positions = (input_ids == tokenizer.eos_token_id).nonzero().flatten().tolist()
        print(f"EOS token positions: {eos_positions}")
    else:
        print("No EOS tokens found")


Example 0:
Sequence length: 1024
No padding tokens found
No EOS tokens found

Example 1:
Sequence length: 1024
No padding tokens found
No EOS tokens found

Example 2:
Sequence length: 1024
No padding tokens found
No EOS tokens found

Example 3:
Sequence length: 1024
No padding tokens found
No EOS tokens found

Example 4:
Sequence length: 1024
Pad tokens found: 353
First padding token at position: 671
EOS tokens found: 353
EOS token positions: [671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 7

In [38]:
len(tokenized[4]['attention_mask']), tokenized[4]['input_ids'][-1], tokenized[4]['attention_mask'][-1]

(1024, 50256, 0)

In [None]:
if False:
    # Load the tokenizer
    tokenizer.pad_token = tokenizer.eos_token

    # Load the tokenized dataset from disk
    from datasets import load_from_disk

    dataset_name = "gpt2_tokenized_openwebtext"
    try:
        print("Loading tokenized dataset from disk...")
        tokenized = load_from_disk(dataset_name)
        print(f"Successfully loaded tokenized dataset with {len(tokenized)} examples")
    except FileNotFoundError:
        print(f"Dataset not found at {dataset_name}. Please make sure you've saved the tokenized dataset first.")


Loading tokenized dataset from disk...


Loading dataset from disk:   0%|          | 0/66 [00:00<?, ?it/s]

Successfully loaded tokenized dataset with 8013769 examples


In [6]:
if False:
    # Save the tokenized dataset to disk
    dataset_name = "gpt2_tokenized_openwebtext"
    tokenized.save_to_disk(dataset_name)

In [7]:
if False:
    # Upload to Hugging Face Hub
    # You'll need to be logged in to Hugging Face
    from huggingface_hub import login

    # Login to Hugging Face (you'll need to run this once and enter your token)
    # Uncomment the line below when you're ready to login
    login("hf_xxxx")

In [8]:
if False:
    # Save the tokenized dataset to disk
    dataset_name = "gpt2_tokenized_openwebtext"
    username = "chrisjob1021"

    # Upload to Hugging Face Hub
    # You'll need to be logged in to Hugging Face
    from huggingface_hub import HfApi

    # Initialize the Hugging Face API
    api = HfApi()

    # Upload the dataset to the Hub
    # Replace "your-username/tokenized-openwebtext" with your desired repository name
    try:
        api.create_repo(
            repo_id=username + "/" + dataset_name,
            repo_type="dataset",
            exist_ok=True
        )
        
        api.upload_folder(
            folder_path=dataset_name,
            repo_id=username + "/" + dataset_name,
            repo_type="dataset"
        )
        
        print("Dataset successfully uploaded to Hugging Face Hub!")
    except Exception as e:
        print(f"Error uploading dataset: {e}")
        print("You may need to login first with `login()` or check your permissions.")


In [9]:
# Initialize the model with random weights
if False:
    model_base = RoFormerEncoder(config)
    model = RoFormerForCausalLM(model_base, config)

In [10]:
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments
from models import RoFormerForCausalLM
import torch

model = RoFormerForCausalLM.from_pretrained(savepath)
model = model.to(device)

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

import os
# Get the absolute path for logs
log_dir = os.path.join(os.path.dirname(savepath), "logs")
# Create the logging directory if it doesn't exist
os.makedirs(log_dir, exist_ok=True)

args = TrainingArguments(
    output_dir=savepath,
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=12,
    gradient_accumulation_steps=8, # Accumulate gradients over N steps
    #With gradient accumulation (gradient_accumulation_steps=8):
        # You split what would have been one batch into 8 smaller micro-batches
        # For each micro-batch, you:
        # Load 1/8th of the data into memory
        # Do a forward pass (storing 1/8th of the activations)
        # Do a backward pass (computing 1/8th of the gradients)
        # ACCUMULATE the gradients (don't update weights yet)
        # Clear the activations (but keep gradients)
    
    warmup_steps=100,
    logging_dir=log_dir,
    logging_steps=10,
    save_steps=10,
    save_total_limit=50,
    save_strategy="steps",
    save_safetensors=False,
    # report_to="tensorboard",
    gradient_checkpointing=False,

    #With Gradient Checkpointing:
        # During the forward pass, only store activations at certain "checkpoints"
        # During backpropagation, RECOMPUTE the intermediate activations as needed
        # This means doing some forward computations twice, but using much less memory
    # Without checkpointing, you need to store activations for all 12 layers. With checkpointing, you might only store activations every few layers and recompute the rest during backprop.
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized,
    data_collator=data_collator,
)


Loading checkpoint: checkpoint-8630


In [None]:
if False:
    trainer.train(resume_from_checkpoint=savepath + "/checkpoint-210")


In [12]:
sample_batch = next(iter(trainer.get_train_dataloader()))
sample_batch['input_ids'].shape

torch.Size([12, 1024])

In [13]:
if False:
    # Clear CUDA cache to free up GPU memory
    torch.cuda.empty_cache()
    
    # Print memory stats before and after clearing cache
    print(f"GPU memory allocated before clearing cache: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
    torch.cuda.empty_cache()
    print(f"GPU memory allocated after clearing cache: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
    
    # Optional: force garbage collection as well
    import gc
    gc.collect()

In [19]:
if True:
    # # After creating the model but before training
    # print(f"Embedding weight mean: {model.backbone.embeddings.weight.mean().item():.6f}")
    # print(f"Embedding weight std: {model.backbone.embeddings.weight.std().item():.6f}")

    # Sample a small batch
    sample_batch = next(iter(trainer.get_train_dataloader()))
    sample_input_ids = sample_batch['input_ids'].to(device)
    sample_labels = sample_batch['labels'].to(device)
    
    # Print input_ids
    # print(f"Input IDs sample: {sample_input_ids[0, :10]}")  # Print first 10 input IDs of first batch
    print(f"Decoded input: {tokenizer.decode(sample_input_ids[0, :10])}")  # Decode the first 10 tokens
    
    # Forward pass
    outputs = model(sample_input_ids, labels=sample_labels)
    # print(f"\nSample batch statistics:")
    # print(f"Input shape: {sample_input_ids.shape}")
    # print(f"Labels shape: {sample_labels.shape}")
    # print(f"Labels sample: {sample_labels[0, :10]}")  # Print first 10 labels of first batch
    # print(f"Decoded labels: {tokenizer.decode([l.item() for l in sample_labels[0, :10] if l.item() != -100])}")  # Decode the first 10 labels, skipping masked tokens

    print(f"Loss: {outputs['loss'].item():.6f}")

    # print(f"\nLogits shape: {outputs['logits'].shape}")
    # # Print shapes of intermediate outputs
    sequence_length = sample_input_ids.size(1)
    # vocab_size = outputs['logits'].shape[-1]
    # print(f"Flattened logits shape: {outputs['logits'].view(batch_size * sequence_length, vocab_size).shape}")
    # print(f"Flattened labels shape: {sample_labels.view(batch_size * sequence_length).shape}")

    # print(f"Logits mean: {outputs['logits'].mean().item():.6f}")
    # print(f"Logits std: {outputs['logits'].std().item():.6f}")
    # print(f"Logits sample: {outputs['logits'][0, 0, :5]}")  # Print first 5 logits of first token
    
    # print(f"\nTop 3 predicted tokens sample: {topk_indices[0, :10]}")  # Print first 10 sets of predictions
    
    # Add top-p (nucleus) sampling
    logits = outputs['logits'][0, :10]  # First batch, first 10 positions
    top_k = 50
    topk_values, topk_indices = torch.topk(logits, k=top_k, dim=-1)
    softmax_logits = torch.nn.functional.softmax(logits, dim=-1)
    
    print(f"\nComparison - Top-k and Top-p Predictions vs Labels:")
    for i in range(min(5, sequence_length)):
        # Top-k results
        top_k_tokens = []
        for k in range(5):
            token = tokenizer.decode(topk_indices[i, k].item())
            top_k_tokens.append(f"{k+1}.'{token}'")
        top_k_str = " ".join(top_k_tokens)
        
        # Top-p (nucleus) sampling
        sorted_probs, sorted_indices = torch.sort(softmax_logits[i], descending=True)
        cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
        nucleus_indices = sorted_indices[cumulative_probs <= 0.9]
        nucleus_size = len(nucleus_indices)
        
        # Sample from the nucleus
        if nucleus_size > 0:
            nucleus_probs = sorted_probs[:nucleus_size]
            nucleus_probs = nucleus_probs / nucleus_probs.sum()  # Renormalize probabilities
            nucleus_sample_idx = torch.multinomial(nucleus_probs, 1).item()
            nucleus_token_idx = nucleus_indices[nucleus_sample_idx].item()
            nucleus_token = tokenizer.decode(nucleus_token_idx)
        else:
            nucleus_token = tokenizer.decode(sorted_indices[0].item())
        # Get actual label
        label_token = tokenizer.decode(sample_labels[0, i+1].item()) if sample_labels[0, i+1].item() != -100 else "[MASKED]"
        # Print results
        print(f"Position {i}:")
        # print(f"  Top-k: 1.'{top1_token}' 2.'{top2_token}' 3.'{top3_token}'")
        print(f"  Top-k: {top_k_str}")
        print(f"  Top-p: nucleus size={nucleus_size} (p=0.9), sampled='{nucleus_token}'")
        print(f"  Label: '{label_token}'")

Decoded input: The biggest opportunities for bitcoin and the blockchain lie in
Loss: 7.188998

Comparison - Top-k and Top-p Predictions vs Labels:
Position 0:
  Top-k: 1.' and' 2.' government' 3.'�' 4.',' 5.'.'
  Top-p: nucleus size=9503 (p=0.9), sampled='atives'
  Label: ' biggest'
Position 1:
  Top-k: 1.'.' 2.' and' 3.',' 4.')' 5.'�'
  Top-p: nucleus size=9294 (p=0.9), sampled=' life'
  Label: ' opportunities'
Position 2:
  Top-k: 1.' to' 2.'.' 3.',' 4.' and' 5.' that'
  Top-p: nucleus size=48 (p=0.9), sampled=' that'
  Label: ' for'
Position 3:
  Top-k: 1.' a' 2.' the' 3.' $' 4.' some' 5.' his'
  Top-p: nucleus size=5273 (p=0.9), sampled=' possible'
  Label: ' bitcoin'
Position 4:
  Top-k: 1.',' 2.'.' 3.' and' 4.' is' 5.'�'
  Top-p: nucleus size=3828 (p=0.9), sampled=' and'
  Label: ' and'


In [15]:
# Add this to your current debugging cell in roformer_training.ipynb
if False:
    # Existing initialization checks
    print("Checking model initialization:")
    print(f"Embedding weight mean: {model.backbone.embeddings.weight.mean().item():.6f}")
    print(f"Embedding weight std: {model.backbone.embeddings.weight.std().item():.6f}")

    # Sample a small batch
    sample_batch = next(iter(trainer.get_train_dataloader()))
    sample_input_ids = sample_batch['input_ids'].to(device)
    sample_labels = sample_batch['labels'].to(device)
    
    # Track intermediate values through the model
    with torch.no_grad():
        # 1. Check embeddings output
        print("\n=== Embeddings Layer ===")
        embedded = model.backbone.embeddings(sample_input_ids)
        print(f"Embeddings output mean: {embedded.mean().item():.6f}")
        print(f"Embeddings output std: {embedded.std().item():.6f}")
        
        # 2. Track through each transformer layer
        x = embedded
        for i, layer in enumerate(model.backbone.layers):
            print(f"\n=== Transformer Layer {i} ===")
            
            # 2.1 Self-attention
            # Store original input for residual
            layer_input = x
            
            # Get attention outputs
            attn_output = layer.self_attn(
                q=x.view(x.size(0), x.size(1), layer.config.num_heads, layer.config.per_head_dim).transpose(1, 2),
                k=x.view(x.size(0), x.size(1), layer.config.num_heads, layer.config.per_head_dim).transpose(1, 2),
                v=x.view(x.size(0), x.size(1), layer.config.num_heads, layer.config.per_head_dim).transpose(1, 2)
            )
            
            print(f"Attention scores stats:")
            print(f"  Mean: {attn_output.mean().item():.6f}")
            print(f"  Std: {attn_output.std().item():.6f}")
            
            # 2.2 First residual + layer norm
            x = layer_input + layer.dropout1(attn_output)
            x = layer.ln1(x)
            print(f"After first layer norm:")
            print(f"  Mean: {x.mean().item():.6f}")
            print(f"  Std: {x.std().item():.6f}")
            
            # 2.3 FFN
            ffn_output = layer.ffn(x)
            print(f"FFN output stats:")
            print(f"  Mean: {ffn_output.mean().item():.6f}")
            print(f"  Std: {ffn_output.std().item():.6f}")
            
            # 2.4 Second residual + layer norm
            x = x + layer.dropout2(ffn_output)
            x = layer.ln2(x)
            print(f"Layer {i} final output:")
            print(f"  Mean: {x.mean().item():.6f}")
            print(f"  Std: {x.std().item():.6f}")
            
            # Check if output is close to input
            similarity = torch.cosine_similarity(layer_input.view(-1), x.view(-1), dim=0)
            print(f"  Cosine similarity with layer input: {similarity.item():.6f}")
        
        # 3. Final LM head
        print("\n=== LM Head Layer ===")
        logits = model.lm_head(x)
        print(f"Final logits stats:")
        print(f"  Mean: {logits.mean().item():.6f}")
        print(f"  Std: {logits.std().item():.6f}")
        
        # 4. Check weight tying
        print("\n=== Weight Tying Check ===")
        print(f"Embeddings weight sum: {model.backbone.embeddings.weight.sum().item():.6f}")
        print(f"LM head weight sum: {model.lm_head.weight.sum().item():.6f}")
        print(f"Are weights identical? {torch.allclose(model.backbone.embeddings.weight, model.lm_head.weight)}")
        
        # 5. Compare predictions with input
        predictions = torch.argmax(logits, dim=-1)
        print("\n=== Input vs Predictions ===")
        print("First 5 tokens:")
        for i in range(5):
            input_token = tokenizer.decode(sample_input_ids[0, i].item())
            pred_token = tokenizer.decode(predictions[0, i].item())
            print(f"Position {i}:")
            print(f"  Input: '{input_token}'")
            print(f"  Predicted: '{pred_token}'")
            print(f"  Token IDs - Input: {sample_input_ids[0, i].item()}, Predicted: {predictions[0, i].item()}")