In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
from utils import TrainingConfig, Config
import torch

# Directory that holds the model config, weights and Trainer checkpoints.
# It is defined first because the (disabled) config-export branch below reads
# it; previously it was assigned at the end of the cell, so enabling that
# branch on a fresh kernel raised NameError.
savepath = "/home/chrisobrien/dev/transformer-examples/models/roformer-base"

tokenizer = AutoTokenizer.from_pretrained("gpt2")   # or your custom one

training_config = TrainingConfig()

# Model hyperparameters; vocabulary size and maximum sequence length are taken
# from the tokenizer so the two always agree.
config = Config(
    vocab_size=tokenizer.vocab_size,
    d_model=768,
    num_heads=12,
    ffn_dim=3072,
    num_layers=12,
    max_seq_len=tokenizer.model_max_length,
)

# One-off export of the model config to <savepath>/config.json.
# Flip to True to (re)write the file.
if False:
    import json
    import os

    # Keep only public, non-callable attributes of the config object.
    config_dict = {k: getattr(config, k) for k in vars(config)
                   if not k.startswith('_') and not callable(getattr(config, k))}

    os.makedirs(savepath, exist_ok=True)
    with open(os.path.join(savepath, "config.json"), "w") as f:
        json.dump(config_dict, f, indent=2)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [2]:
# Step 1: fetch the raw OpenWebText training split. In the visible cells `ds`
# is only consumed by the (disabled) tokenization branch.
ds = load_dataset(
    path="openwebtext",
    split="train",
    trust_remote_code=True,
)

# Flip to True to work on the first 100k documents only.
if False:
    ds = ds.select(range(100_000))

Loading dataset shards:   0%|          | 0/80 [00:00<?, ?it/s]

In [3]:
# One-off preprocessing: tokenize the raw text. Flip to True to regenerate
# `tokenized` from `ds` instead of loading the saved copy from disk.
if False:
    # Reuse the end-of-sequence token as the padding token.
    tokenizer.pad_token = tokenizer.eos_token

    def tokenize(batch):
        """Tokenize one batch of documents for `Dataset.map`.

        Args:
            batch: mapping with a "text" entry holding a list of strings.

        Returns:
            The tokenizer output for the batch, truncated to
            `config.max_seq_len` and padded to the longest sequence in the
            batch.
        """
        # Return plain Python lists: `Dataset.map` writes the result to Arrow
        # storage, so building torch tensors and moving them to `device` only
        # adds a host -> device -> host round trip (and stops the function
        # from working in CPU worker processes).
        return tokenizer(
            batch["text"],
            truncation=True,
            max_length=config.max_seq_len,
            padding=True,
        )

    tokenized = ds.map(tokenize, batched=True, remove_columns=["text"])
    # Drop the attention mask so only `input_ids` are kept.
    tokenized = tokenized.remove_columns(["attention_mask"])

In [4]:
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Load the tokenized dataset from disk
from datasets import load_from_disk

dataset_name = "gpt2_tokenized_openwebtext"
try:
    print("Loading tokenized dataset from disk...")
    tokenized = load_from_disk(dataset_name)
except FileNotFoundError:
    print(f"Dataset not found at {dataset_name}. Please make sure you've saved the tokenized dataset first.")
    # Stop here: without `tokenized` every later cell would fail with an
    # unrelated NameError, hiding the real cause.
    raise
else:
    print(f"Successfully loaded tokenized dataset with {len(tokenized)} examples")


Loading tokenized dataset from disk...


Loading dataset from disk:   0%|          | 0/66 [00:00<?, ?it/s]

Successfully loaded tokenized dataset with 8013769 examples


In [5]:
# Sanity check: decode the first stored example back into text.
tokenizer.decode(tokenized[0]["input_ids"])

'Port-au-Prince, Haiti (CNN) -- Earthquake victims, writhing in pain and grasping at life, watched doctors and nurses walk away from a field hospital Friday night after a Belgian medical team evacuated the area, saying it was concerned about security.\n\nThe decision left CNN Chief Medical Correspondent Sanjay Gupta as the only doctor at the hospital to get the patients through the night.\n\nCNN initially reported, based on conversations with some of the doctors, that the United Nations ordered the Belgian First Aid and Support Team to evacuate. However, Belgian Chief Coordinator Geert Gijs, a doctor who was at the hospital with 60 Belgian medical personnel, said it was his decision to pull the team out for the night. Gijs said he requested U.N. security personnel to staff the hospital overnight, but was told that peacekeepers would only be able to evacuate the team.\n\nHe said it was a "tough decision" but that he accepted the U.N. offer to evacuate after a Canadian medical team, also

In [6]:
# Flip to True to write the tokenized dataset to a local directory.
if False:
    dataset_name = "gpt2_tokenized_openwebtext"
    tokenized.save_to_disk(dataset_path=dataset_name)

In [7]:
# Flip to True to authenticate with the Hugging Face Hub before uploading.
if False:
    import os

    from huggingface_hub import login

    # Never paste an access token into the notebook: it would be saved with
    # the file. Read it from the HF_TOKEN environment variable instead; when
    # the variable is unset, `login` receives None and asks for the token
    # interactively.
    login(token=os.environ.get("HF_TOKEN"))

In [8]:
# Flip to True to publish the locally saved tokenized dataset on the
# Hugging Face Hub (requires a prior login).
if False:
    from huggingface_hub import HfApi

    dataset_name = "gpt2_tokenized_openwebtext"
    username = "chrisjob1021"
    # Target dataset repository: "<username>/<dataset_name>".
    repo_id = f"{username}/{dataset_name}"

    api = HfApi()

    try:
        # Create the dataset repository; an existing one is reused.
        api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)

        # Push the contents of the local dataset folder.
        api.upload_folder(
            repo_id=repo_id,
            repo_type="dataset",
            folder_path=dataset_name,
        )
    except Exception as e:
        print(f"Error uploading dataset: {e}")
        print("You may need to login first with `login()` or check your permissions.")
    else:
        print("Dataset successfully uploaded to Hugging Face Hub!")


In [9]:
# Build a new model from `config` instead of loading saved weights
# (disabled; the training cell loads the model from `savepath`).
# NOTE(review): `RoFormerEncoder` is not imported anywhere in the visible
# notebook, and `RoFormerForCausalLM` is only imported in the later training
# cell, so enabling this branch on a fresh kernel raises NameError until both
# are imported - confirm which module provides `RoFormerEncoder`.
if False:
    model_base = RoFormerEncoder(config)
    model = RoFormerForCausalLM(model_base, config)

In [10]:
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments
from models import RoFormerForCausalLM
import torch

# Restore the model from the save directory and place it on the chosen device.
model = RoFormerForCausalLM.from_pretrained(savepath).to(device)

# mlm=False: collate for causal language modelling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

args = TrainingArguments(
    # --- Output and logging ---
    output_dir=savepath,
    logging_dir="logs",
    logging_steps=10,
    report_to="tensorboard",

    # --- Optimisation schedule ---
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    warmup_steps=10,

    # --- Batch size and memory ---
    per_device_train_batch_size=16,
    # Gradient accumulation: gradients from 8 consecutive micro-batches of 16
    # sequences are summed before each weight update, i.e. 16 * 8 = 128
    # sequences per optimizer step per device, while only one micro-batch of
    # activations is held in memory at a time.
    gradient_accumulation_steps=8,
    # Gradient checkpointing (off here) would keep activations only at
    # selected points during the forward pass and recompute the rest during
    # backpropagation: part of the forward work is done twice in exchange for
    # a much smaller activation footprint across the 12 layers.
    gradient_checkpointing=False,

    # --- Checkpointing ---
    save_strategy="steps",
    save_steps=10,
    save_total_limit=100,
    save_safetensors=False,
)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized,
)

Loading checkpoint: checkpoint-100


In [None]:
# Continue training from the step-100 checkpoint stored under `savepath`.
resume_checkpoint = f"{savepath}/checkpoint-100"
trainer.train(resume_from_checkpoint=resume_checkpoint)


In [12]:
# Pull a single batch from the training dataloader and show its shape.
train_loader = trainer.get_train_dataloader()
sample_batch = next(iter(train_loader))
sample_batch["input_ids"].shape

torch.Size([16, 1024])

In [13]:
if True:
    # Sanity-check the loaded model on one training batch: embedding
    # statistics, the loss, and the first few predictions against the labels.
    embedding_weight = model.backbone.embeddings.weight
    print(f"Embedding weight mean: {embedding_weight.mean().item():.6f}")
    print(f"Embedding weight std: {embedding_weight.std().item():.6f}")

    # Sample a small batch
    sample_batch = next(iter(trainer.get_train_dataloader()))
    sample_input_ids = sample_batch['input_ids'].to(device)
    sample_labels = sample_batch['labels'].to(device)

    # First 10 token ids of the first sequence, raw and decoded.
    print(f"Input IDs sample: {sample_input_ids[0, :10]}")
    print(f"Decoded input: {tokenizer.decode(sample_input_ids[0, :10])}")

    # Forward pass. This cell only inspects values, so skip autograd
    # bookkeeping; otherwise the whole computation graph for the batch stays
    # alive in memory for as long as `outputs` exists.
    with torch.no_grad():
        outputs = model(sample_input_ids, labels=sample_labels)

    # First 10 labels of the first sequence; -100 entries are skipped when
    # decoding because they are not real token ids.
    print(f"Labels sample: {sample_labels[0, :10]}")
    visible_labels = [l.item() for l in sample_labels[0, :10] if l.item() != -100]
    print(f"Decoded labels: {tokenizer.decode(visible_labels)}")

    print(f"\nLoss: {outputs['loss'].item():.6f}")

    sequence_length = sample_input_ids.size(1)

    # Most likely token at every position.
    predictions = torch.argmax(outputs['logits'], dim=-1)
    print(f"\nPredicted classes sample: {predictions[0, :10]}")
    print(f"Comparison - Predictions vs Labels:")
    # The prediction at position i is compared with the label at position
    # i + 1, so the loop must stop one short of the sequence end; iterating
    # up to `sequence_length` raised IndexError for sequences of 5 tokens or
    # fewer.
    for i in range(min(5, sequence_length - 1)):
        pred_token = tokenizer.decode(predictions[0, i].item())
        next_label = sample_labels[0, i + 1].item()
        label_token = tokenizer.decode(next_label) if next_label != -100 else "[MASKED]"
        print(f"Position {i}: Predicted '{pred_token}' | Label '{label_token}'")

Embedding weight mean: 0.000126
Embedding weight std: 0.999865
Input IDs sample: tensor([  464,  4094,  6443,   329,  8550,   290,   262, 11779,  6486,   287],
       device='cuda:0')
Decoded input: The biggest opportunities for bitcoin and the blockchain lie in
Labels sample: tensor([  464,  4094,  6443,   329,  8550,   290,   262, 11779,  6486,   287],
       device='cuda:0')
Decoded labels: The biggest opportunities for bitcoin and the blockchain lie in

Loss: 47.813210

Predicted classes sample: tensor([ 9587,   284, 43589, 19746, 33877,   379,  4606, 49515, 27199,   262],
       device='cuda:0')
Comparison - Predictions vs Labels:
Position 0: Predicted 'erk' | Label ' biggest'
Position 1: Predicted ' to' | Label ' opportunities'
Position 2: Predicted 'aganda' | Label ' for'
Position 3: Predicted 'Content' | Label ' bitcoin'
Position 4: Predicted ' Latinos' | Label ' and'


In [14]:
# Layer-by-layer trace of one batch through the model, printing mean/std of
# the intermediate tensors (disabled; flip to True to run).
# NOTE(review): this re-implements the forward pass by hand from the layer
# attributes (self_attn, dropout1, ln1, ffn, dropout2, ln2); whether it matches
# the model's real forward pass cannot be verified from this notebook - confirm
# against the model code before trusting the numbers.
if False:
    # Existing initialization checks
    print("Checking model initialization:")
    print(f"Embedding weight mean: {model.backbone.embeddings.weight.mean().item():.6f}")
    print(f"Embedding weight std: {model.backbone.embeddings.weight.std().item():.6f}")

    # Sample a small batch
    sample_batch = next(iter(trainer.get_train_dataloader()))
    sample_input_ids = sample_batch['input_ids'].to(device)
    sample_labels = sample_batch['labels'].to(device)
    
    # Track intermediate values through the model (inspection only, so no
    # gradients are recorded)
    with torch.no_grad():
        # 1. Check embeddings output
        print("\n=== Embeddings Layer ===")
        embedded = model.backbone.embeddings(sample_input_ids)
        print(f"Embeddings output mean: {embedded.mean().item():.6f}")
        print(f"Embeddings output std: {embedded.std().item():.6f}")
        
        # 2. Track through each transformer layer
        x = embedded
        for i, layer in enumerate(model.backbone.layers):
            print(f"\n=== Transformer Layer {i} ===")
            
            # 2.1 Self-attention
            # Store original input for residual
            layer_input = x
            
            # Get attention outputs. The hidden states are reshaped to
            # (dim0, dim1, num_heads, per_head_dim) and dims 1 and 2 swapped,
            # and the same tensor is passed as q, k and v.
            # NOTE(review): no query/key/value projection is applied here, and
            # the shape self_attn returns is not visible - confirm that it can
            # be added to `layer_input` in step 2.2 below.
            attn_output = layer.self_attn(
                q=x.view(x.size(0), x.size(1), layer.config.num_heads, layer.config.per_head_dim).transpose(1, 2),
                k=x.view(x.size(0), x.size(1), layer.config.num_heads, layer.config.per_head_dim).transpose(1, 2),
                v=x.view(x.size(0), x.size(1), layer.config.num_heads, layer.config.per_head_dim).transpose(1, 2)
            )
            
            # The label says "scores", but the statistics printed are those of
            # `attn_output`, i.e. whatever self_attn returned.
            print(f"Attention scores stats:")
            print(f"  Mean: {attn_output.mean().item():.6f}")
            print(f"  Std: {attn_output.std().item():.6f}")
            
            # 2.2 First residual + layer norm
            x = layer_input + layer.dropout1(attn_output)
            x = layer.ln1(x)
            print(f"After first layer norm:")
            print(f"  Mean: {x.mean().item():.6f}")
            print(f"  Std: {x.std().item():.6f}")
            
            # 2.3 FFN
            ffn_output = layer.ffn(x)
            print(f"FFN output stats:")
            print(f"  Mean: {ffn_output.mean().item():.6f}")
            print(f"  Std: {ffn_output.std().item():.6f}")
            
            # 2.4 Second residual + layer norm
            x = x + layer.dropout2(ffn_output)
            x = layer.ln2(x)
            print(f"Layer {i} final output:")
            print(f"  Mean: {x.mean().item():.6f}")
            print(f"  Std: {x.std().item():.6f}")
            
            # Check if output is close to input: cosine similarity between the
            # flattened layer input and the flattened layer output
            similarity = torch.cosine_similarity(layer_input.view(-1), x.view(-1), dim=0)
            print(f"  Cosine similarity with layer input: {similarity.item():.6f}")
        
        # 3. Final LM head, applied to the output of the last layer
        print("\n=== LM Head Layer ===")
        logits = model.lm_head(x)
        print(f"Final logits stats:")
        print(f"  Mean: {logits.mean().item():.6f}")
        print(f"  Std: {logits.std().item():.6f}")
        
        # 4. Check weight tying: compare the embedding matrix with the LM head
        # weight, first by their sums and then element-wise.
        # NOTE(review): assumes both weights have the same shape - confirm.
        print("\n=== Weight Tying Check ===")
        print(f"Embeddings weight sum: {model.backbone.embeddings.weight.sum().item():.6f}")
        print(f"LM head weight sum: {model.lm_head.weight.sum().item():.6f}")
        print(f"Are weights identical? {torch.allclose(model.backbone.embeddings.weight, model.lm_head.weight)}")
        
        # 5. Compare predictions with input (same position, no shift)
        predictions = torch.argmax(logits, dim=-1)
        print("\n=== Input vs Predictions ===")
        print("First 5 tokens:")
        # NOTE(review): range(5) assumes the sequence has at least 5 positions.
        for i in range(5):
            input_token = tokenizer.decode(sample_input_ids[0, i].item())
            pred_token = tokenizer.decode(predictions[0, i].item())
            print(f"Position {i}:")
            print(f"  Input: '{input_token}'")
            print(f"  Predicted: '{pred_token}'")
            print(f"  Token IDs - Input: {sample_input_ids[0, i].item()}, Predicted: {predictions[0, i].item()}")