In [1]:
if True:
    import debugpy
    
    # Try to listen on the port, catch exception if already listening
    try:
        debugpy.listen(("localhost", 5678))
        print("Debugpy is listening on localhost:5678")
    except RuntimeError as e:
        print(f"Debugpy is already listening: {e}")

Debugpy is listening on localhost:5678


In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer
from utils import TrainingConfig, Config
import torch

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # or your custom one
tokenizer.pad_token = tokenizer.eos_token

training_config = TrainingConfig()
config = Config(vocab_size=tokenizer.vocab_size,
d_model=768, num_heads=12, ffn_dim=3072,
num_layers=12, max_seq_len=tokenizer.model_max_length, enable_rope=True )

savepath = "/home/chrisobrien/roformer-base"

if False:
    # Create a config dictionary for the model
    config_dict = {k: getattr(config, k) for k in vars(config) 
                if not k.startswith('_') and not callable(getattr(config, k))}

    # Save the config as JSON
    import os
    import json

    os.makedirs(savepath, exist_ok=True)
    with open(os.path.join(savepath, "config.json"), "w") as f:
        json.dump(config_dict, f, indent=2)

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

dataset_name = "gpt2_tokenized_concatenated_openwebtext"

Using device: cuda


In [3]:
if False:
    # 1. Load the raw text
    ds = load_dataset("openwebtext", split="train", trust_remote_code=True)

    if False:
        ds = ds.select(range(1000))

In [4]:
if False:
    from datasets import load_from_disk

    tokenized = load_from_disk("gpt2_tokenized_openwebtext")
    tokenized

In [5]:
if False:
    ds = tokenized
    def tokenize(batch):
        return tokenizer(
            batch["text"],
            truncation=False,
            # max_length=config.max_seq_len,
            padding=False,
            # return_tensors="pt",
        ).to(device)

    tokenized = ds.map(tokenize, batched=True, remove_columns=["text"], num_proc=24)
    # tokenized = tokenized.remove_columns(["attention_mask"])

In [6]:
if False:
    def group_texts(batch):
        concatenated = sum(batch["input_ids"], [])
        total_length = len(concatenated)
        total_length = (total_length // config.max_seq_len) * config.max_seq_len
        result = {
            "input_ids": [concatenated[i:i+config.max_seq_len] for i in range(0, total_length, config.max_seq_len)]
        }
        return result
    
    lm_dataset = tokenized.remove_columns(["attention_mask"])
    if False: lm_dataset = lm_dataset.select(range(1000))
    lm_dataset = lm_dataset.map(group_texts, batched=True, num_proc=24)
    lm_dataset

In [7]:
if False:
    # Save the tokenized dataset to disk
    lm_dataset.save_to_disk(dataset_name)
    

In [8]:
if True:
    from datasets import load_from_disk

    lm_dataset = load_from_disk(dataset_name)
    lm_dataset

Loading dataset from disk:   0%|          | 0/73 [00:00<?, ?it/s]

In [9]:
if False: lm_dataset = lm_dataset.select(range(1000))
# Split the dataset into training and evaluation sets
train_test_split = lm_dataset.train_test_split(test_size=0.01, seed=42)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

print(f"Training dataset size: {len(train_dataset)}")
print(f"Evaluation dataset size: {len(eval_dataset)}")

# Display the datasets
train_dataset, eval_dataset

Training dataset size: 8728153
Evaluation dataset size: 88164


(Dataset({
     features: ['input_ids'],
     num_rows: 8728153
 }),
 Dataset({
     features: ['input_ids'],
     num_rows: 88164
 }))

In [10]:
if False:
    ds = lm_dataset
    # Let's check a few examples for padding tokens and EOS tokens
    for i in range(100):  # Check first 5 examples
        # Convert to tensor first, then do the comparison
        input_ids = torch.tensor(ds[i]["input_ids"])
        pad_mask = (input_ids == tokenizer.pad_token_id)
        pad_count = pad_mask.sum().item()
        
        # Check for EOS tokens
        eos_mask = (input_ids == tokenizer.eos_token_id)
        eos_count = eos_mask.sum().item()
        
        print(f"\nExample {i}:")
        print(f"Sequence length: {len(input_ids)}")
        
        # Check padding tokens
        if pad_count > 0:
            print(f"Pad tokens found: {pad_count}")
            break
        else:
            print("No padding tokens found")
        
        # # Check EOS tokens
        # if eos_count > 0:
        #     print(f"EOS tokens found: {eos_count}")
        #     # Show where the EOS tokens are
        #     eos_positions = (input_ids == tokenizer.eos_token_id).nonzero().flatten().tolist()
        #     print(f"EOS token positions: {eos_positions}")
        # else:
        #     print("No EOS tokens found")

In [11]:
if False:
    # Load the tokenizer
    tokenizer.pad_token = tokenizer.eos_token

    # Load the tokenized dataset from disk
    from datasets import load_from_disk

    dataset_name = "gpt2_tokenized_openwebtext"
    try:
        print("Loading tokenized dataset from disk...")
        tokenized = load_from_disk(dataset_name)
        print(f"Successfully loaded tokenized dataset with {len(tokenized)} examples")
    except FileNotFoundError:
        print(f"Dataset not found at {dataset_name}. Please make sure you've saved the tokenized dataset first.")


In [12]:
if False:
    # Save the tokenized dataset to disk
    dataset_name = "gpt2_tokenized_openwebtext"
    tokenized.save_to_disk(dataset_name)

In [13]:
if False:
    # Upload to Hugging Face Hub
    # You'll need to be logged in to Hugging Face
    from huggingface_hub import login

    # Login to Hugging Face (you'll need to run this once and enter your token)
    # Uncomment the line below when you're ready to login
    login("hf_xxxx")

In [14]:
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments, EarlyStoppingCallback
from models import RoFormerForCausalLM, RoFormerDecoder
import torch
import os
from datetime import datetime

model_base = RoFormerDecoder(config)
model = RoFormerForCausalLM(model_base, config)
model = model.to(device)

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Get the absolute path for logs
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_dir = os.path.join(os.path.dirname(savepath), "logs", f"run_{timestamp}")
# Create the logging directory if it doesn't exist
os.makedirs(log_dir, exist_ok=True)

args = TrainingArguments(
    output_dir=savepath,

    learning_rate=6e-4,
    # lr_scheduler_type="cosine_with_restarts",
    # lr_scheduler_type="cosine",
    warmup_ratio=0.1,  # 10% of total training steps for warmup
    # warmup_steps=2_000,
    # Specify AdamW optimizer
    optim="adamw_torch",
    weight_decay=0.01,
    max_grad_norm=0.5,

    max_steps=2_000,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=16, # Accumulate gradients over N steps
    #With gradient accumulation (gradient_accumulation_steps=8):
        # You split what would have been one batch into 8 smaller micro-batches
        # For each micro-batch, you:
        # Load 1/8th of the data into memory
        # Do a forward pass (storing 1/8th of the activations)
        # Do a backward pass (computing 1/8th of the gradients)
        # ACCUMULATE the gradients (don't update weights yet)
        # Clear the activations (but keep gradients)

    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,

    eval_steps=100,
    eval_strategy="steps",
    eval_accumulation_steps=16,
    per_device_eval_batch_size=4,

    logging_dir=log_dir,
    logging_steps=50,

    save_steps=100,
    save_total_limit=10,
    save_strategy="steps",
    save_safetensors=False,

    gradient_checkpointing=False,
    # Must be supported by the model
    #With Gradient Checkpointing:
        # During the forward pass, only store activations at certain "checkpoints"
        # During backpropagation, RECOMPUTE the intermediate activations as needed
        # This means doing some forward computations twice, but using much less memory
    # Without checkpointing, you need to store activations for all 12 layers. With checkpointing, you might only store activations every few layers and recompute the rest during backprop.
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.01)
    ]
)


In [15]:
if False:
    trainer.train(resume_from_checkpoint=savepath + "/checkpoint-210")
if True:
    trainer.train()


Step,Training Loss,Validation Loss


In [16]:
sample_batch = next(iter(trainer.get_train_dataloader()))
sample_batch

{'input_ids': tensor([[ 4602,   284,   307,  ...,    11,   447,   251],
        [   25,  3363,    11,  ...,   355,   340, 14051],
        [  407,  2245,  6095,  ...,   198,  2394,   475],
        [  262,   640,   262,  ...,   532,   290,  6825]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'), 'labels': tensor([[ 4602,   284,   307,  ...,    11,   447,   251],
        [   25,  3363,    11,  ...,   355,   340, 14051],
        [  407,  2245,  6095,  ...,   198,  2394,   475],
        [  262,   640,   262,  ...,   532,   290,  6825]], device='cuda:0')}

In [17]:
model(sample_batch['input_ids'], sample_batch['attention_mask'], sample_batch['labels'])

{'loss': tensor([436.6937], device='cuda:0', grad_fn=<UnsqueezeBackward0>),
 'logits': tensor([[[ 34.2782,   3.1532,  26.2072,  ...,  24.9944,  29.4306,  -6.2552],
          [ 10.0444,  20.4900,  38.4594,  ...,  15.2377,  44.5526, -25.0688],
          [  2.3788,  12.9716,  11.8669,  ...,  18.8346,  49.7358, -42.2385],
          ...,
          [-25.4352,   0.2880,  -3.5974,  ..., -24.5217,  13.2999,  -3.8325],
          [-15.6319, -16.8085,  -8.0010,  ...,   9.2895,   0.0699, -58.9580],
          [-19.8348,  42.4760, -35.9720,  ...,   2.0393,  22.5927, -20.3634]],
 
         [[  9.3869, -21.8179,  -5.9259,  ...,  18.8216,   1.7853,  -6.8613],
          [  3.3824, -14.6326,   2.3107,  ...,  13.4163,  -2.5109,  10.4513],
          [ -3.8203,  -4.2022,  -5.0340,  ..., -17.9625,  -9.3158,   0.6999],
          ...,
          [ 14.9311,  -8.3173, -24.1865,  ...,  43.7960, -42.2001, -38.4019],
          [-17.2776,  39.2271,  13.2699,  ..., -20.1246,  29.3764,  -0.3817],
          [ 14.7649,  3

In [18]:
if False:
    # Clear CUDA cache to free up GPU memory
    torch.cuda.empty_cache()
    
    # Print memory stats before and after clearing cache
    print(f"GPU memory allocated before clearing cache: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
    torch.cuda.empty_cache()
    print(f"GPU memory allocated after clearing cache: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
    
    # Optional: force garbage collection as well
    import gc
    gc.collect()

In [19]:
if True:
    # # After creating the model but before training
    # print(f"Embedding weight mean: {model.backbone.embeddings.weight.mean().item():.6f}")
    # print(f"Embedding weight std: {model.backbone.embeddings.weight.std().item():.6f}")

    # Sample a small batch
    sample_batch = next(iter(trainer.get_train_dataloader()))
    sample_input_ids = sample_batch['input_ids'].to(device)
    sample_labels = sample_batch['labels'].to(device)
    
    # Print input_ids
    # print(f"Input IDs sample: {sample_input_ids[0, :10]}")  # Print first 10 input IDs of first batch
    print(f"Decoded input: {tokenizer.decode(sample_input_ids[0, :10])}")  # Decode the first 10 tokens
    
    # Forward pass
    outputs = model(sample_input_ids, labels=sample_labels)
    # print(f"\nSample batch statistics:")
    # print(f"Input shape: {sample_input_ids.shape}")
    # print(f"Labels shape: {sample_labels.shape}")
    # print(f"Labels sample: {sample_labels[0, :10]}")  # Print first 10 labels of first batch
    # print(f"Decoded labels: {tokenizer.decode([l.item() for l in sample_labels[0, :10] if l.item() != -100])}")  # Decode the first 10 labels, skipping masked tokens

    print(f"Loss: {outputs['loss'].item():.6f}")

    # print(f"\nLogits shape: {outputs['logits'].shape}")
    # # Print shapes of intermediate outputs
    sequence_length = sample_input_ids.size(1)
    # vocab_size = outputs['logits'].shape[-1]
    # print(f"Flattened logits shape: {outputs['logits'].view(batch_size * sequence_length, vocab_size).shape}")
    # print(f"Flattened labels shape: {sample_labels.view(batch_size * sequence_length).shape}")

    # print(f"Logits mean: {outputs['logits'].mean().item():.6f}")
    # print(f"Logits std: {outputs['logits'].std().item():.6f}")
    # print(f"Logits sample: {outputs['logits'][0, 0, :5]}")  # Print first 5 logits of first token
    
    # print(f"\nTop 3 predicted tokens sample: {topk_indices[0, :10]}")  # Print first 10 sets of predictions
    
    # Add top-p (nucleus) sampling
    logits = outputs['logits'][0, :10]  # First batch, first 10 positions
    top_k = 50
    topk_values, topk_indices = torch.topk(logits, k=top_k, dim=-1)
    softmax_logits = torch.nn.functional.softmax(logits, dim=-1)
    
    print(f"\nComparison - Top-k and Top-p Predictions vs Labels:")
    for i in range(min(5, sequence_length)):
        # Top-k results
        top_k_tokens = []
        for k in range(5):
            token = tokenizer.decode(topk_indices[i, k].item())
            top_k_tokens.append(f"{k+1}.'{token}'")
        top_k_str = " ".join(top_k_tokens)
        
        # Top-p (nucleus) sampling
        sorted_probs, sorted_indices = torch.sort(softmax_logits[i], descending=True)
        cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
        nucleus_indices = sorted_indices[cumulative_probs <= 0.9]
        nucleus_size = len(nucleus_indices)
        
        # Sample from the nucleus
        if nucleus_size > 0:
            nucleus_probs = sorted_probs[:nucleus_size]
            nucleus_probs = nucleus_probs / nucleus_probs.sum()  # Renormalize probabilities
            nucleus_sample_idx = torch.multinomial(nucleus_probs, 1).item()
            nucleus_token_idx = nucleus_indices[nucleus_sample_idx].item()
            nucleus_token = tokenizer.decode(nucleus_token_idx)
        else:
            nucleus_token = tokenizer.decode(sorted_indices[0].item())
        # Get actual label
        label_token = tokenizer.decode(sample_labels[0, i+1].item()) if sample_labels[0, i+1].item() != -100 else "[MASKED]"
        # Print results
        print(f"Position {i}:")
        # print(f"  Top-k: 1.'{top1_token}' 2.'{top2_token}' 3.'{top3_token}'")
        print(f"  Top-k: {top_k_str}")
        print(f"  Top-p: nucleus size={nucleus_size} (p=0.9), sampled='{nucleus_token}'")
        print(f"  Label: '{label_token}'")

Decoded input:  revealed to be the “largest climatic and
Loss: 436.598267

Comparison - Top-k and Top-p Predictions vs Labels:
Position 0:
  Top-k: 1.' revealed' 2.' pub' 3.'Seg' 4.' patrol' 5.' blatant'
  Top-p: nucleus size=0 (p=0.9), sampled=' revealed'
  Label: ' to'
Position 1:
  Top-k: 1.' to' 2.' reps' 3.' Miche' 4.' Ferry' 5.' Bowl'
  Top-p: nucleus size=0 (p=0.9), sampled=' to'
  Label: ' be'
Position 2:
  Top-k: 1.' be' 2.' Merc' 3.' landscapes' 4.' onlook' 5.' Brother'
  Top-p: nucleus size=0 (p=0.9), sampled=' be'
  Label: ' the'
Position 3:
  Top-k: 1.' the' 2.' evil' 3.'groupon' 4.'Tap' 5.''re'
  Top-p: nucleus size=0 (p=0.9), sampled=' the'
  Label: ' �'
Position 4:
  Top-k: 1.' �' 2.' Mack' 3.' kickoff' 4.' Jag' 5.' negativity'
  Top-p: nucleus size=0 (p=0.9), sampled=' �'
  Label: '�'


In [20]:
# Add this to your current debugging cell in roformer_training.ipynb
if False:
    # Existing initialization checks
    print("Checking model initialization:")
    print(f"Embedding weight mean: {model.backbone.embeddings.weight.mean().item():.6f}")
    print(f"Embedding weight std: {model.backbone.embeddings.weight.std().item():.6f}")

    # Sample a small batch
    sample_batch = next(iter(trainer.get_train_dataloader()))
    sample_input_ids = sample_batch['input_ids'].to(device)
    sample_labels = sample_batch['labels'].to(device)
    
    # Track intermediate values through the model
    with torch.no_grad():
        # 1. Check embeddings output
        print("\n=== Embeddings Layer ===")
        embedded = model.backbone.embeddings(sample_input_ids)
        print(f"Embeddings output mean: {embedded.mean().item():.6f}")
        print(f"Embeddings output std: {embedded.std().item():.6f}")
        
        # 2. Track through each transformer layer
        x = embedded
        for i, layer in enumerate(model.backbone.layers):
            print(f"\n=== Transformer Layer {i} ===")
            
            # 2.1 Self-attention
            # Store original input for residual
            layer_input = x
            
            # Get attention outputs
            attn_output = layer.self_attn(
                q=x.view(x.size(0), x.size(1), layer.config.num_heads, layer.config.per_head_dim).transpose(1, 2),
                k=x.view(x.size(0), x.size(1), layer.config.num_heads, layer.config.per_head_dim).transpose(1, 2),
                v=x.view(x.size(0), x.size(1), layer.config.num_heads, layer.config.per_head_dim).transpose(1, 2)
            )
            
            print(f"Attention scores stats:")
            print(f"  Mean: {attn_output.mean().item():.6f}")
            print(f"  Std: {attn_output.std().item():.6f}")
            
            # 2.2 First residual + layer norm
            x = layer_input + layer.dropout1(attn_output)
            x = layer.ln1(x)
            print(f"After first layer norm:")
            print(f"  Mean: {x.mean().item():.6f}")
            print(f"  Std: {x.std().item():.6f}")
            
            # 2.3 FFN
            ffn_output = layer.ffn(x)
            print(f"FFN output stats:")
            print(f"  Mean: {ffn_output.mean().item():.6f}")
            print(f"  Std: {ffn_output.std().item():.6f}")
            
            # 2.4 Second residual + layer norm
            x = x + layer.dropout2(ffn_output)
            x = layer.ln2(x)
            print(f"Layer {i} final output:")
            print(f"  Mean: {x.mean().item():.6f}")
            print(f"  Std: {x.std().item():.6f}")
            
            # Check if output is close to input
            similarity = torch.cosine_similarity(layer_input.view(-1), x.view(-1), dim=0)
            print(f"  Cosine similarity with layer input: {similarity.item():.6f}")
        
        # 3. Final LM head
        print("\n=== LM Head Layer ===")
        logits = model.lm_head(x)
        print(f"Final logits stats:")
        print(f"  Mean: {logits.mean().item():.6f}")
        print(f"  Std: {logits.std().item():.6f}")
        
        # 4. Check weight tying
        print("\n=== Weight Tying Check ===")
        print(f"Embeddings weight sum: {model.backbone.embeddings.weight.sum().item():.6f}")
        print(f"LM head weight sum: {model.lm_head.weight.sum().item():.6f}")
        print(f"Are weights identical? {torch.allclose(model.backbone.embeddings.weight, model.lm_head.weight)}")
        
        # 5. Compare predictions with input
        predictions = torch.argmax(logits, dim=-1)
        print("\n=== Input vs Predictions ===")
        print("First 5 tokens:")
        for i in range(5):
            input_token = tokenizer.decode(sample_input_ids[0, i].item())
            pred_token = tokenizer.decode(predictions[0, i].item())
            print(f"Position {i}:")
            print(f"  Input: '{input_token}'")
            print(f"  Predicted: '{pred_token}'")
            print(f"  Token IDs - Input: {sample_input_ids[0, i].item()}, Predicted: {predictions[0, i].item()}")