In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
from utils import TrainingConfig, Config
import torch

# Tokenizer: the stock GPT-2 vocabulary (swap in a custom tokenizer here if needed).
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Training settings use the defaults from utils; the model dimensions are spelled
# out here, with vocabulary size and maximum length taken from the tokenizer.
training_config = TrainingConfig()
config = Config(
    vocab_size=tokenizer.vocab_size,
    d_model=768,
    num_heads=12,
    ffn_dim=3072,
    num_layers=12,
    max_len=tokenizer.model_max_length,
)

# Use the GPU when one is visible, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

# 1. Raw text corpus: the "train" split of OpenWebText.
# NOTE: trust_remote_code=True lets the dataset's own loading script run locally.
ds = load_dataset("openwebtext", split="train", trust_remote_code=True)

Using device: cuda


Loading dataset shards:   0%|          | 0/80 [00:00<?, ?it/s]

In [2]:
# Flip to True to keep only the first 100,000 documents (quick smoke runs).
USE_SUBSET = False
if USE_SUBSET:
    ds = ds.select(range(100_000))

In [3]:
# Reuse the end-of-sequence token as the padding token, so padding requests
# made on this tokenizer later on fill with EOS ids.
tokenizer.pad_token = tokenizer.eos_token

def tokenize(batch):
    """Tokenize one batch of raw documents for use with ``Dataset.map``.

    Args:
        batch: Mapping whose "text" entry is the list of document strings
            handed over by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the batch, with every document truncated to
        at most ``training_config.max_len`` tokens and left as plain Python
        lists of differing lengths.

    Padding, tensor conversion and device placement are deliberately not done
    here. ``Dataset.map`` writes its results back into the on-disk dataset, so
    GPU tensors would only be copied straight back to the host, and padding
    each map batch to its longest document would store mostly filler tokens.
    Padding is left to the data collator, which pads each training batch to
    that batch's own longest sequence.
    """
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=training_config.max_len,
    )

# Tokenize the whole corpus in batches, dropping the raw "text" column and then
# the "attention_mask" column so only the tokenizer's remaining outputs are kept.
tokenized = ds.map(
    tokenize,
    batched=True,
    remove_columns=["text"],
).remove_columns(["attention_mask"])

# Show the resulting dataset summary as the cell output.
tokenized

Map:   0%|          | 0/8013769 [00:00<?, ? examples/s]

In [7]:
# def group(batch):
#     # Flattens the per-document input_ids into one long list
#     flat_ids = sum(batch["input_ids"], [])

#     num_of_complete_blocks = len(flat_ids) // config.max_seq_len
#     total = num_of_complete_blocks * config.max_seq_len
#     flat_ids = flat_ids[:total+1]

#     return {
#         "input_ids": [flat_ids[i:i+config.max_seq_len] for i in range(0, total, config.max_seq_len)],
#         "labels": [flat_ids[i+1:i+config.max_seq_len+1] for i in range(0, total, config.max_seq_len)]
#     }


# # lm_ds = tokenized.map(group, batched=True, batch_size=10000)
# # lm_ds

In [4]:
# Train directly on the per-document tokenized dataset; the fixed-length
# block-grouping step (`group`) is currently commented out.
lm_ds = tokenized

In [5]:
from models import RoFormerEncoder, RoFormerForCausalLM
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments
import torch

# Build the network: the encoder backbone wrapped by the causal-LM model,
# then placed on the selected device.
model_base = RoFormerEncoder(config)
model = RoFormerForCausalLM(model_base, config).to(device)

# Causal-LM collation (mlm=False): no masked-language-model corruption.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Notes on the two memory-related switches used below.
#
# Gradient accumulation (gradient_accumulation_steps=8):
#   * what would have been one large batch is processed as 8 micro-batches
#   * each micro-batch gets its own forward and backward pass, so only one
#     micro-batch of activations is held in memory at a time
#   * gradients are ACCUMULATED across the micro-batches and the weights are
#     updated once, after the last one
#
# Gradient checkpointing (gradient_checkpointing=False, i.e. disabled here):
#   * when enabled, only some activations are stored during the forward pass
#   * the rest are RECOMPUTED during backpropagation as needed
#   * this trades extra forward computation for much lower memory; without it,
#     activations for all 12 layers are kept
args = TrainingArguments(
    # Output locations and reporting
    output_dir="roformer-base",
    logging_dir="logs",
    report_to="tensorboard",
    logging_steps=10,
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=10,
    # Batch sizing and memory
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    gradient_checkpointing=False,
    # Checkpointing
    save_strategy="steps",
    save_steps=10,
    save_total_limit=5,
    save_safetensors=False,
)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=lm_ds,
)

In [6]:
# Guard flag: set to False to skip training when re-running the notebook.
RUN_TRAINING = True
if RUN_TRAINING:
    trainer.train()


Step,Training Loss
10,425.1401
20,141.4481


KeyboardInterrupt: 

In [12]:
# Pull the first batch from the trainer's dataloader and show its shape.
train_loader = trainer.get_train_dataloader()
sample_batch = next(iter(train_loader))
sample_batch["input_ids"].shape

torch.Size([1, 374])

In [11]:
if True:
    # One-batch sanity check: embedding weight statistics, a forward pass with
    # loss, and a side-by-side look at next-token predictions versus labels.
    print(f"Embedding weight mean: {model.backbone.embeddings.weight.mean().item():.6f}")
    print(f"Embedding weight std: {model.backbone.embeddings.weight.std().item():.6f}")

    # Sample a small batch
    sample_batch = next(iter(trainer.get_train_dataloader()))
    sample_input_ids = sample_batch['input_ids'].to(device)
    sample_labels = sample_batch['labels'].to(device)

    # Print input_ids
    print(f"Input IDs sample: {sample_input_ids[0, :10]}")  # Print first 10 input IDs of first batch
    print(f"Decoded input: {tokenizer.decode(sample_input_ids[0, :10])}")  # Decode the first 10 tokens

    # Forward pass. Nothing below backpropagates, so skip building the
    # autograd graph instead of keeping every activation alive.
    with torch.no_grad():
        outputs = model(sample_input_ids, labels=sample_labels)
    # print(f"\nSample batch statistics:")
    # print(f"Input shape: {sample_input_ids.shape}")
    # print(f"Labels shape: {sample_labels.shape}")
    print(f"Labels sample: {sample_labels[0, :10]}")  # Print first 10 labels of first batch
    print(f"Decoded labels: {tokenizer.decode([l.item() for l in sample_labels[0, :10] if l.item() != -100])}")  # Decode the first 10 labels, skipping masked tokens

    print(f"\nLoss: {outputs['loss'].item():.6f}")

    # print(f"\nLogits shape: {outputs['logits'].shape}")
    # # Print shapes of intermediate outputs
    sequence_length = sample_input_ids.size(1)
    # vocab_size = outputs['logits'].shape[-1]
    # print(f"Flattened logits shape: {outputs['logits'].view(batch_size * sequence_length, vocab_size).shape}")
    # print(f"Flattened labels shape: {sample_labels.view(batch_size * sequence_length).shape}")

    # print(f"Logits mean: {outputs['logits'].mean().item():.6f}")
    # print(f"Logits std: {outputs['logits'].std().item():.6f}")
    # print(f"Logits sample: {outputs['logits'][0, 0, :5]}")  # Print first 5 logits of first token

    # Get predicted classes from logits
    predictions = torch.argmax(outputs['logits'], dim=-1)
    print(f"\nPredicted classes sample: {predictions[0, :10]}")  # Print first 10 predicted tokens
    print(f"Comparison - Predictions vs Labels:")
    # The prediction at position i is compared with the label at position i + 1
    # (next-token view), so the last usable i is sequence_length - 2. Bounding
    # the loop this way avoids indexing past the end on very short batches.
    for i in range(min(5, sequence_length - 1)):
        pred_token = tokenizer.decode(predictions[0, i].item())
        next_label_id = sample_labels[0, i + 1].item()
        label_token = tokenizer.decode(next_label_id) if next_label_id != -100 else "[MASKED]"
        print(f"Position {i}: Predicted '{pred_token}' | Label '{label_token}'")

Embedding weight mean: 0.000124
Embedding weight std: 1.000025
Input IDs sample: tensor([  40,  447,  247,   76,  407, 3375,  546,  406, 4720,   82],
       device='cuda:0')
Decoded input: I’m not talking about LEOs
Labels sample: tensor([  40,  447,  247,   76,  407, 3375,  546,  406, 4720,   82],
       device='cuda:0')
Decoded labels: I’m not talking about LEOs

Loss: 483.990692

Predicted classes sample: tensor([  40,  447,  247,   76,  407, 3375,  546,  406, 4720,   82],
       device='cuda:0')
Comparison - Predictions vs Labels:
Position 0: Predicted 'I' | Label '�'
Position 1: Predicted '�' | Label '�'
Position 2: Predicted '�' | Label 'm'
Position 3: Predicted 'm' | Label ' not'
Position 4: Predicted ' not' | Label ' talking'


In [9]:
# Layer-by-layer trace of one batch through the model: prints mean/std of the
# activations after the embeddings, after each sub-step of every transformer
# layer, and of the final logits, then checks weight tying and compares the
# arg-max predictions with the input tokens.
if True:
    # Existing initialization checks
    print("Checking model initialization:")
    print(f"Embedding weight mean: {model.backbone.embeddings.weight.mean().item():.6f}")
    print(f"Embedding weight std: {model.backbone.embeddings.weight.std().item():.6f}")

    # Sample a small batch
    sample_batch = next(iter(trainer.get_train_dataloader()))
    sample_input_ids = sample_batch['input_ids'].to(device)
    sample_labels = sample_batch['labels'].to(device)

    def split_heads(hidden, layer_config):
        """Reshape (batch, seq, num_heads * per_head_dim) to (batch, num_heads, seq, per_head_dim)."""
        return hidden.view(
            hidden.size(0), hidden.size(1), layer_config.num_heads, layer_config.per_head_dim
        ).transpose(1, 2)

    # Track intermediate values through the model
    with torch.no_grad():
        # 1. Check embeddings output
        print("\n=== Embeddings Layer ===")
        embedded = model.backbone.embeddings(sample_input_ids)
        print(f"Embeddings output mean: {embedded.mean().item():.6f}")
        print(f"Embeddings output std: {embedded.std().item():.6f}")

        # 2. Track through each transformer layer
        x = embedded
        for i, layer in enumerate(model.backbone.layers):
            print(f"\n=== Transformer Layer {i} ===")

            # 2.1 Self-attention
            # Store original input for residual
            layer_input = x

            # Get attention outputs.
            # NOTE(review): q, k and v are all the raw hidden states split into
            # heads; whether self_attn applies its own projections is not
            # visible from this notebook -- confirm against models.py.
            attn_output = layer.self_attn(
                q=split_heads(x, layer.config),
                k=split_heads(x, layer.config),
                v=split_heads(x, layer.config),
            )

            # The label says "scores", but the tensor is the attention output
            # that feeds the residual below; the text is kept so the printout
            # matches earlier runs.
            print(f"Attention scores stats:")
            print(f"  Mean: {attn_output.mean().item():.6f}")
            print(f"  Std: {attn_output.std().item():.6f}")

            # 2.2 First residual + layer norm
            x = layer_input + layer.dropout1(attn_output)
            x = layer.ln1(x)
            print(f"After first layer norm:")
            print(f"  Mean: {x.mean().item():.6f}")
            print(f"  Std: {x.std().item():.6f}")

            # 2.3 FFN
            ffn_output = layer.ffn(x)
            print(f"FFN output stats:")
            print(f"  Mean: {ffn_output.mean().item():.6f}")
            print(f"  Std: {ffn_output.std().item():.6f}")

            # 2.4 Second residual + layer norm
            x = x + layer.dropout2(ffn_output)
            x = layer.ln2(x)
            print(f"Layer {i} final output:")
            print(f"  Mean: {x.mean().item():.6f}")
            print(f"  Std: {x.std().item():.6f}")

            # Check if output is close to input
            similarity = torch.cosine_similarity(layer_input.view(-1), x.view(-1), dim=0)
            print(f"  Cosine similarity with layer input: {similarity.item():.6f}")

        # 3. Final LM head
        print("\n=== LM Head Layer ===")
        logits = model.lm_head(x)
        print(f"Final logits stats:")
        print(f"  Mean: {logits.mean().item():.6f}")
        print(f"  Std: {logits.std().item():.6f}")

        # 4. Check weight tying
        print("\n=== Weight Tying Check ===")
        print(f"Embeddings weight sum: {model.backbone.embeddings.weight.sum().item():.6f}")
        print(f"LM head weight sum: {model.lm_head.weight.sum().item():.6f}")
        print(f"Are weights identical? {torch.allclose(model.backbone.embeddings.weight, model.lm_head.weight)}")

        # 5. Compare predictions with input
        predictions = torch.argmax(logits, dim=-1)
        print("\n=== Input vs Predictions ===")
        print("First 5 tokens:")
        # Bound the loop by the actual sequence length so a batch shorter than
        # five tokens does not index past the end.
        for i in range(min(5, sample_input_ids.size(1))):
            input_token = tokenizer.decode(sample_input_ids[0, i].item())
            pred_token = tokenizer.decode(predictions[0, i].item())
            print(f"Position {i}:")
            print(f"  Input: '{input_token}'")
            print(f"  Predicted: '{pred_token}'")
            print(f"  Token IDs - Input: {sample_input_ids[0, i].item()}, Predicted: {predictions[0, i].item()}")

Checking model initialization:
Embedding weight mean: -0.000035
Embedding weight std: 0.999916

=== Embeddings Layer ===
Embeddings output mean: -0.001582
Embeddings output std: 0.998326

=== Transformer Layer 0 ===
Attention scores stats:
  Mean: -0.003008
  Std: 0.066774
After first layer norm:
  Mean: 0.000000
  Std: 0.999995
FFN output stats:
  Mean: -0.012456
  Std: 0.233869
Layer 0 final output:
  Mean: 0.000000
  Std: 0.999995


  Cosine similarity with layer input: 0.967763

=== Transformer Layer 1 ===
Attention scores stats:
  Mean: -0.000862
  Std: 0.080982
After first layer norm:
  Mean: -0.000000
  Std: 0.999995
FFN output stats:
  Mean: 0.012353
  Std: 0.236669
Layer 1 final output:
  Mean: -0.000000
  Std: 0.999995
  Cosine similarity with layer input: 0.967129

=== Transformer Layer 2 ===
Attention scores stats:
  Mean: 0.000418
  Std: 0.097013
After first layer norm:
  Mean: -0.000000
  Std: 0.999995
FFN output stats:
  Mean: -0.001413
  Std: 0.234197
Layer 2 final output:
  Mean: 0.000000
  Std: 0.999995
  Cosine similarity with layer input: 0.965975

=== Transformer Layer 3 ===
Attention scores stats:
  Mean: -0.002001
  Std: 0.106930
After first layer norm:
  Mean: 0.000000
  Std: 0.999995
FFN output stats:
  Mean: 0.001594
  Std: 0.236986
Layer 3 final output:
  Mean: -0.000000
  Std: 0.999995
  Cosine similarity with layer input: 0.965236

=== Transformer Layer 4 ===
Attention scores stats:
  Mea