In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
from utils import TrainingConfig, Config

tokenizer = AutoTokenizer.from_pretrained("gpt2")   # or your custom one

training_config = TrainingConfig()
config = Config(vocab_size=tokenizer.vocab_size,
    d_model=768, num_heads=12, ffn_dim=3072,
    num_layers=12, )

# 1. Load the raw text
ds = load_dataset("openwebtext", split="train", trust_remote_code=True)

Loading dataset shards:   0%|          | 0/80 [00:00<?, ?it/s]

In [2]:
if True:
    ds = ds.select(range(1000))

In [3]:
tokenizer.pad_token = tokenizer.eos_token

def tokenize(batch):
    return tokenizer(
        batch["text"],
        # truncation=False,
        # max_length=training_config.max_len,
        # padding=False,
        # return_tensors="pt",
    )

tokenized = ds.map(tokenize, batched=True, remove_columns=["text"])
tokenized = tokenized.remove_columns(["attention_mask"])
tokenized

Dataset({
    features: ['input_ids'],
    num_rows: 1000
})

In [4]:
def group(batch):
    # Flattens the input_ids and attention_mask into single lists
    flat_ids = sum(batch["input_ids"], [])

    num_of_complete_blocks = len(flat_ids) // config.max_seq_len
    total = num_of_complete_blocks * config.max_seq_len
    flat_ids = flat_ids[:total+1]

    return {
        "input_ids": [flat_ids[i:i+config.max_seq_len] for i in range(0, total, config.max_seq_len)],
        "labels": [flat_ids[i+1:i+config.max_seq_len+1] for i in range(0, total, config.max_seq_len)]
    }


lm_ds = tokenized.map(group, batched=True, batch_size=10000)
lm_ds

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 1099
})

In [5]:
from models import RoFormerEncoder, RoFormerForCausalLM
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments
import torch

model_base = RoFormerEncoder(config)
model = RoFormerForCausalLM(model_base, config)

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move model to device
model = model.to(device)

# data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

args = TrainingArguments(
    output_dir="roformer-base",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    warmup_steps=10,
    logging_dir="logs",
    logging_steps=10,
    save_steps=10,
    save_total_limit=5,
    save_strategy="steps",
    save_safetensors=False,
    report_to="tensorboard",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=lm_ds,
    # data_collator=data_collator,
)

Using device: cuda


In [6]:
print(next(model.parameters()).device)  # Should print 'cuda:0' or similar

cuda:0


In [7]:
if False:
    trainer.train()


In [8]:
if True:
    # After creating the model but before training
    print("Checking model initialization:")
    print(f"Embedding weight mean: {model.backbone.embeddings.weight.mean().item():.6f}")
    print(f"Embedding weight std: {model.backbone.embeddings.weight.std().item():.6f}")

    # Sample a small batch
    sample_batch = next(iter(trainer.get_train_dataloader()))
    sample_input_ids = sample_batch['input_ids'].to(device)
    sample_labels = sample_batch['labels'].to(device)
    
    # Print input_ids
    print(f"\nInput IDs sample: {sample_input_ids[0, :10]}")  # Print first 10 input IDs of first batch
    print(f"Decoded input: {tokenizer.decode(sample_input_ids[0, :10])}")  # Decode the first 10 tokens
    
    # Forward pass
    outputs = model(sample_input_ids, labels=sample_labels)
    # print(f"\nSample batch statistics:")
    # print(f"Input shape: {sample_input_ids.shape}")
    # print(f"Labels shape: {sample_labels.shape}")
    print(f"\nLabels sample: {sample_labels[0, :10]}")  # Print first 10 labels of first batch
    print(f"Decoded labels: {tokenizer.decode([l.item() for l in sample_labels[0, :10] if l.item() != -100])}")  # Decode the first 10 labels, skipping masked tokens

    print(f"\nLoss: {outputs['loss'].item():.6f}")

    print(f"\nLogits shape: {outputs['logits'].shape}")
    
    # Print shapes of intermediate outputs
    batch_size, sequence_length = sample_input_ids.shape
    vocab_size = outputs['logits'].shape[-1]
    print(f"Flattened logits shape: {outputs['logits'].view(batch_size * sequence_length, vocab_size).shape}")
    print(f"Flattened labels shape: {sample_labels.view(batch_size * sequence_length).shape}")

    print(f"Logits mean: {outputs['logits'].mean().item():.6f}")
    print(f"Logits std: {outputs['logits'].std().item():.6f}")
    print(f"Logits sample: {outputs['logits'][0, 0, :5]}")  # Print first 5 logits of first token
    
    # Get predicted classes from logits
    predictions = torch.argmax(outputs['logits'], dim=-1)
    print(f"\nPredicted classes sample: {predictions[0, :10]}")  # Print first 10 predicted tokens
    print(f"Comparison - Predictions vs Labels:")
    for i in range(min(5, sequence_length)):
        pred_token = tokenizer.decode(predictions[0, i].item())
        label_token = tokenizer.decode(sample_labels[0, i].item()) if sample_labels[0, i].item() != -100 else "[MASKED]"
        print(f"Position {i}: Predicted '{pred_token}' | Label '{label_token}'")

Checking model initialization:
Embedding weight mean: 0.000213
Embedding weight std: 1.000068

Input IDs sample: tensor([  262,  1398,   286,  8233, 11663,  4497,   329,   262,  5876,   287],
       device='cuda:0')
Decoded input:  the class of academic managers responsible for the trouble in

Labels sample: tensor([ 1398,   286,  8233, 11663,  4497,   329,   262,  5876,   287,   262],
       device='cuda:0')
Decoded labels:  class of academic managers responsible for the trouble in the

Loss: 497.880768

Logits shape: torch.Size([4, 1024, 50257])
Flattened logits shape: torch.Size([4096, 50257])
Flattened labels shape: torch.Size([4096])
Logits mean: 0.115224
Logits std: 27.812685
Logits sample: tensor([-13.6797, -12.5401,  -1.9143,  16.7642,  44.0470], device='cuda:0',
       grad_fn=<SliceBackward0>)

Predicted classes sample: tensor([  262,  1398,   286,  8233, 11663,  4497,   329,   262,  5876,   287],
       device='cuda:0')
Comparison - Predictions vs Labels:
Position 0: Predict

In [9]:
# Add this to your current debugging cell in roformer_training.ipynb
if True:
    # Existing initialization checks
    print("Checking model initialization:")
    print(f"Embedding weight mean: {model.backbone.embeddings.weight.mean().item():.6f}")
    print(f"Embedding weight std: {model.backbone.embeddings.weight.std().item():.6f}")

    # Sample a small batch
    sample_batch = next(iter(trainer.get_train_dataloader()))
    sample_input_ids = sample_batch['input_ids'].to(device)
    sample_labels = sample_batch['labels'].to(device)
    
    # Track intermediate values through the model
    with torch.no_grad():
        # 1. Check embeddings output
        print("\n=== Embeddings Layer ===")
        embedded = model.backbone.embeddings(sample_input_ids)
        print(f"Embeddings output mean: {embedded.mean().item():.6f}")
        print(f"Embeddings output std: {embedded.std().item():.6f}")
        
        # 2. Track through each transformer layer
        x = embedded
        for i, layer in enumerate(model.backbone.layers):
            print(f"\n=== Transformer Layer {i} ===")
            
            # 2.1 Self-attention
            # Store original input for residual
            layer_input = x
            
            # Get attention outputs
            attn_output = layer.self_attn(
                q=x.view(x.size(0), x.size(1), layer.config.num_heads, layer.config.per_head_dim).transpose(1, 2),
                k=x.view(x.size(0), x.size(1), layer.config.num_heads, layer.config.per_head_dim).transpose(1, 2),
                v=x.view(x.size(0), x.size(1), layer.config.num_heads, layer.config.per_head_dim).transpose(1, 2)
            )
            
            print(f"Attention scores stats:")
            print(f"  Mean: {attn_output.mean().item():.6f}")
            print(f"  Std: {attn_output.std().item():.6f}")
            
            # 2.2 First residual + layer norm
            x = layer_input + layer.dropout1(attn_output)
            x = layer.ln1(x)
            print(f"After first layer norm:")
            print(f"  Mean: {x.mean().item():.6f}")
            print(f"  Std: {x.std().item():.6f}")
            
            # 2.3 FFN
            ffn_output = layer.ffn(x)
            print(f"FFN output stats:")
            print(f"  Mean: {ffn_output.mean().item():.6f}")
            print(f"  Std: {ffn_output.std().item():.6f}")
            
            # 2.4 Second residual + layer norm
            x = x + layer.dropout2(ffn_output)
            x = layer.ln2(x)
            print(f"Layer {i} final output:")
            print(f"  Mean: {x.mean().item():.6f}")
            print(f"  Std: {x.std().item():.6f}")
            
            # Check if output is close to input
            similarity = torch.cosine_similarity(layer_input.view(-1), x.view(-1), dim=0)
            print(f"  Cosine similarity with layer input: {similarity.item():.6f}")
        
        # 3. Final LM head
        print("\n=== LM Head Layer ===")
        logits = model.lm_head(x)
        print(f"Final logits stats:")
        print(f"  Mean: {logits.mean().item():.6f}")
        print(f"  Std: {logits.std().item():.6f}")
        
        # 4. Check weight tying
        print("\n=== Weight Tying Check ===")
        print(f"Embeddings weight sum: {model.backbone.embeddings.weight.sum().item():.6f}")
        print(f"LM head weight sum: {model.lm_head.weight.sum().item():.6f}")
        print(f"Are weights identical? {torch.allclose(model.backbone.embeddings.weight, model.lm_head.weight)}")
        
        # 5. Compare predictions with input
        predictions = torch.argmax(logits, dim=-1)
        print("\n=== Input vs Predictions ===")
        print("First 5 tokens:")
        for i in range(5):
            input_token = tokenizer.decode(sample_input_ids[0, i].item())
            pred_token = tokenizer.decode(predictions[0, i].item())
            print(f"Position {i}:")
            print(f"  Input: '{input_token}'")
            print(f"  Predicted: '{pred_token}'")
            print(f"  Token IDs - Input: {sample_input_ids[0, i].item()}, Predicted: {predictions[0, i].item()}")

Checking model initialization:
Embedding weight mean: 0.000213
Embedding weight std: 1.000068

=== Embeddings Layer ===
Embeddings output mean: 0.004081
Embeddings output std: 0.996310

=== Transformer Layer 0 ===
Attention scores stats:
  Mean: -0.000157
  Std: 0.024678
After first layer norm:
  Mean: -0.000000
  Std: 0.999995
FFN output stats:
  Mean: 0.004629
  Std: 0.231079
Layer 0 final output:
  Mean: -0.000000
  Std: 0.999995
  Cosine similarity with layer input: 0.970191

=== Transformer Layer 1 ===
Attention scores stats:
  Mean: 0.001262
  Std: 0.027172
After first layer norm:
  Mean: 0.000000
  Std: 0.999995
FFN output stats:
  Mean: 0.007538
  Std: 0.238840
Layer 1 final output:
  Mean: -0.000000
  Std: 0.999995
  Cosine similarity with layer input: 0.969486

=== Transformer Layer 2 ===
Attention scores stats:
  Mean: -0.000579
  Std: 0.029659
After first layer norm:
  Mean: -0.000000
  Std: 0.999995
FFN output stats:
  Mean: -0.001685
  Std: 0.234047
Layer 2 final output:
