In [None]:
import json
import math
import os
from pathlib import Path

import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import (
    GPT2Config,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
)

#### ============= CREATE SAMPLE DATA FILES (REPLACE WITH YOUR REAL DATA) =============

In [None]:
def create_sample_data_files():
    """Create small, randomly generated pre-tokenized data files for testing.

    Writes four files under ``data/`` (created if missing), relative to the
    current working directory:

    - ``vocab.json``: token -> id mapping with 100 entries (ids 0-99).
    - ``train_ids.txt`` (100 sequences), ``valid_ids.txt`` (20) and
      ``test_ids.txt`` (20): one sequence per line as space-separated integer
      ids, each of the form ``<bos> random-tokens <eos>``.

    The sequences are drawn from a private generator seeded with 42, so the
    files are identical on every run and the global NumPy random state is not
    modified by calling this function.

    Returns:
        int: Number of entries in the generated vocabulary.
    """
    os.makedirs("data", exist_ok=True)

    # Sample vocabulary (token -> id mapping); ids 0-2 are the special tokens.
    vocab = {
        "<pad>": 0,
        "<bos>": 1,
        "<eos>": 2,
        "the": 3, "a": 4, "to": 5, "of": 6, "and": 7,
        "in": 8, "is": 9, "it": 10, "for": 11, "on": 12,
        "with": 13, "was": 14, "as": 15, "at": 16, "by": 17,
        "from": 18, "up": 19, "out": 20, "had": 21, "but": 22,
    }

    # Pad the vocabulary with placeholder tokens up to 100 entries.
    for i in range(23, 100):
        vocab[f"token_{i}"] = i

    with open("data/vocab.json", "w", encoding="utf-8") as f:
        json.dump(vocab, f, indent=2)

    bos_id, eos_id = vocab["<bos>"], vocab["<eos>"]

    # A local legacy generator yields the same stream as np.random.seed(42)
    # followed by np.random.randint(...), without reseeding the global RNG.
    rng = np.random.RandomState(42)

    def generate_sequences(num_sequences, min_len=5, max_len=20):
        """Return ``num_sequences`` random sequences wrapped in BOS/EOS.

        The number of random tokens per sequence is drawn from
        ``[min_len, max_len)`` -- ``max_len`` itself is excluded.
        """
        sequences = []
        for _ in range(num_sequences):
            seq_len = rng.randint(min_len, max_len)
            # Random token ids in 3-99, i.e. never one of the special tokens.
            tokens = rng.randint(3, 100, size=seq_len).tolist()
            sequences.append([bos_id] + tokens + [eos_id])
        return sequences

    def save_sequences(sequences, filepath):
        """Write one sequence per line as space-separated integers."""
        with open(filepath, "w", encoding="utf-8") as f:
            for seq in sequences:
                f.write(" ".join(map(str, seq)) + "\n")

    # Generation order (train, valid, test) determines which random numbers
    # each split receives, so it must not be changed.
    train_sequences = generate_sequences(100)
    valid_sequences = generate_sequences(20)
    test_sequences = generate_sequences(20)

    save_sequences(train_sequences, "data/train_ids.txt")
    save_sequences(valid_sequences, "data/valid_ids.txt")
    save_sequences(test_sequences, "data/test_ids.txt")

    # Report the real counts instead of hard-coded numbers so the summary
    # cannot drift from what was actually written.
    print("✅ Created sample data files:")
    print(f"  - data/vocab.json ({len(vocab)} tokens)")
    print(f"  - data/train_ids.txt ({len(train_sequences)} sequences)")
    print(f"  - data/valid_ids.txt ({len(valid_sequences)} sequences)")
    print(f"  - data/test_ids.txt ({len(test_sequences)} sequences)")

    return len(vocab)

# Write the sample files under data/ and keep the returned vocabulary size
# (the number of entries in the generated vocab mapping).
VOCAB_SIZE = create_sample_data_files()

#### ============= LOAD DATA FROM FILES =============

In [None]:
def load_sequences_from_file(filepath):
    """Load pre-tokenized sequences from a text file.

    Each non-blank line is parsed as whitespace-separated integer token ids.
    Blank or whitespace-only lines (for example extra newlines at the end of
    the file) are skipped instead of being returned as empty sequences.

    Args:
        filepath: Path of the file to read.

    Returns:
        list[list[int]]: One list of token ids per non-blank line, in file order.

    Raises:
        ValueError: If a field on any line is not a valid integer.
        OSError: If the file cannot be opened.
    """
    sequences = []
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.split()  # split() already ignores surrounding whitespace
            if not fields:
                continue
            sequences.append([int(field) for field in fields])
    return sequences

# Load vocabulary info (token -> id mapping)
with open('data/vocab.json', 'r', encoding='utf-8') as f:
    vocab = json.load(f)

# VOCAB_SIZE later sizes the model's embedding table, which is indexed by token
# id and therefore needs max_id + 1 rows. len(vocab) equals that only when the
# ids are dense (0..N-1); taking the larger of the two keeps the result the same
# for a dense vocabulary and stays safe when the ids have gaps.
_max_vocab_id = max((i for i in vocab.values() if isinstance(i, int)), default=-1)
VOCAB_SIZE = max(len(vocab), _max_vocab_id + 1)
BOS_TOKEN_ID = vocab.get('<bos>', 1)
EOS_TOKEN_ID = vocab.get('<eos>', 2)
PAD_TOKEN_ID = vocab.get('<pad>', 0)

print(f"\n📊 Vocabulary size: {VOCAB_SIZE}")

print(f"Special tokens - BOS: {BOS_TOKEN_ID}, EOS: {EOS_TOKEN_ID}, PAD: {PAD_TOKEN_ID}")

# Load pre-tokenized sequences
train_sequences = load_sequences_from_file('data/train_ids.txt')
valid_sequences = load_sequences_from_file('data/valid_ids.txt')
test_sequences = load_sequences_from_file('data/test_ids.txt')

# Fail early, with a readable message, if the data references an id the
# vocabulary does not cover; otherwise the error would only surface later as an
# out-of-range embedding lookup during training.
_max_data_id = max(
    (
        token_id
        for split in (train_sequences, valid_sequences, test_sequences)
        for seq in split
        for token_id in seq
    ),
    default=-1,
)
if _max_data_id >= VOCAB_SIZE:
    raise ValueError(
        f"Found token id {_max_data_id} in the data, but the vocabulary only covers ids below {VOCAB_SIZE}"
    )

print("\n📚 Loaded sequences:")
print(f"  Training: {len(train_sequences)} sequences")
print(f"  Validation: {len(valid_sequences)} sequences")
print(f"  Test: {len(test_sequences)} sequences")

# Show example sequence (skipped when the training file is empty, which would
# otherwise raise IndexError on train_sequences[0])
if train_sequences:
    print("\nExample sequence (first training sequence):")
    print(f"  Token IDs: {train_sequences[0][:10]}..." if len(train_sequences[0]) > 10 else f"  Token IDs: {train_sequences[0]}")

#### ============= DATASET CLASS =============

In [None]:
class PostTokenizedDataset(Dataset):
    """Fixed-length language-modeling dataset built from pre-tokenized sequences.

    All input sequences are concatenated into a single token stream, which is
    then cut into consecutive, non-overlapping blocks of ``block_size`` tokens.
    Tokens left over after the last full block are dropped, so a stream shorter
    than ``block_size`` yields an empty dataset.
    """

    def __init__(self, sequences, block_size=128):
        """
        Args:
            sequences: List of lists, each inner list is a sequence of token IDs
                (sequence boundary tokens such as EOS are expected to be
                included already; none are added here).
            block_size: Number of tokens in every example. Must be positive.

        Raises:
            ValueError: If ``block_size`` is not positive. Without this check a
                negative value would silently produce an empty dataset.
        """
        if block_size <= 0:
            raise ValueError(f"block_size must be a positive integer, got {block_size}")

        self.block_size = block_size

        # Concatenate all sequences into one stream
        all_tokens = []
        for seq in sequences:
            all_tokens.extend(seq)

        # Keep only full blocks; the slice bounds guarantee each chunk has
        # exactly block_size tokens, so no per-chunk length check is needed.
        num_blocks = len(all_tokens) // block_size
        self.examples = [
            all_tokens[start:start + block_size]
            for start in range(0, num_blocks * block_size, block_size)
        ]

        print(f"  Created {len(self.examples)} chunks of size {block_size} from {len(sequences)} sequences")

    def __len__(self):
        """Return the number of full blocks."""
        return len(self.examples)

    def __getitem__(self, idx):
        """
        Return block ``idx`` as both model input and label.

        For language modeling, the same token block is used for ``input_ids``
        and ``labels``. HuggingFace's GPT2LMHeadModel shifts the labels
        internally so that each position is trained to predict the next token:

        - Input: [A, B, C, D]
        - Predicts: [B, C, D, E]
        - Loss computed on positions 1 to n (ignores position 0 of labels)

        Returns:
            dict: ``"input_ids"`` and ``"labels"``, two independent
            ``torch.long`` tensors holding the same ``block_size`` token ids.
        """
        return {
            "input_ids": torch.tensor(self.examples[idx], dtype=torch.long),
            "labels": torch.tensor(self.examples[idx], dtype=torch.long)
        }

#### ============= DATA COLLATOR =============

In [None]:

class SimpleDataCollator:
    """Turn a list of dataset examples into one batch by stacking tensors."""

    # Illustration (values shortened for readability):
    #
    #   the dataset yields individual examples, each a 1-D tensor
    #     [1, 23, 45, 67, 89, 12, 34, 56]
    #     [1, 34, 56, 78, 90, 23, 45, 67]
    #     [1, 67, 89, 12, 34, 56, 78, 90]
    #     [1, 45, 67, 89, 12, 34, 56, 78]
    #
    #   the collator stacks them along a new leading (batch) dimension
    #     [[1, 23, 45, 67, 89, 12, 34, 56],
    #      [1, 34, 56, 78, 90, 23, 45, 67],
    #      [1, 67, 89, 12, 34, 56, 78, 90],
    #      [1, 45, 67, 89, 12, 34, 56, 78]]
    #
    # torch.stack requires every example in a batch to have the same length.

    def __call__(self, examples):
        """Stack the ``input_ids`` and ``labels`` tensors of ``examples``.

        Args:
            examples: List of dicts, each with ``"input_ids"`` and ``"labels"``
                tensors of identical shape.

        Returns:
            dict: ``"input_ids"`` and ``"labels"``, each with a new leading
            batch dimension of size ``len(examples)``.
        """
        batch = {}
        for field in ("input_ids", "labels"):
            batch[field] = torch.stack([example[field] for example in examples])
        return batch

#### ============= CONFIGURATION =============

In [None]:
# Maximum sequence length; used for the model's position embeddings here and
# reused as the data block size when the datasets are built.
MAX_LENGTH = 128  # Maximum sequence length for both model and data

print("\n🤖 Configuring model...")

# Small GPT-2 config for testing. vocab_size and the special-token ids come from
# the vocabulary loaded earlier in the notebook.
config = GPT2Config(
    vocab_size=VOCAB_SIZE,
    n_positions=MAX_LENGTH,  # Maximum position embeddings
    # NOTE(review): recent transformers releases appear to no longer use n_ctx
    # for GPT-2 (n_positions alone sets the context length) -- confirm against
    # the installed version and drop this argument if it is unused.
    n_ctx=MAX_LENGTH,        # Context size (same as n_positions)
    n_embd=128,              # Hidden size (very small for testing)
    n_layer=2,               # Number of layers (very small for testing)
    n_head=2,                # Number of attention heads
    bos_token_id=BOS_TOKEN_ID,
    eos_token_id=EOS_TOKEN_ID,
    pad_token_id=PAD_TOKEN_ID,
)

# Initialize model from scratch (built from the config only; no pretrained
# checkpoint is loaded here)
model = GPT2LMHeadModel(config)
# Count every element of every parameter tensor and report it, also in millions.
total_params = sum(p.numel() for p in model.parameters())
print(f"Model parameters: {total_params:,} ({total_params/1e6:.2f}M)")
print(f"Max sequence length: {MAX_LENGTH} tokens")

#### ============= PREPARE DATASETS =============

In [None]:

# Data blocks are exactly as long as the model's maximum sequence length.
BLOCK_SIZE = MAX_LENGTH

print(f"\n📦 Preparing datasets with block size {BLOCK_SIZE}...")

# Build the three splits in order (train, validation, test) with one shared
# block size; each construction prints its own chunk summary.
train_dataset, valid_dataset, test_dataset = (
    PostTokenizedDataset(split_sequences, block_size=BLOCK_SIZE)
    for split_sequences in (train_sequences, valid_sequences, test_sequences)
)

#### ============= TRAINING ARGUMENTS =============

In [None]:
# Training configuration handed to the Trainer in the next cell.
training_args = TrainingArguments(
    output_dir="./gpt2-pretokenized",   # Checkpoints are written here
    overwrite_output_dir=True,
    
    # Training hyperparameters
    num_train_epochs=20,              # Number of passes over the training set
    per_device_train_batch_size=4,   # Small batch for testing
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,   # 1 = no gradient accumulation
    
    # Learning rate schedule
    learning_rate=5e-4,              # Peak learning rate
    warmup_steps=10,                 # Warmup steps before the peak rate
    weight_decay=0.01,               
    
    # Evaluation and saving -- both run once per epoch.
    # NOTE(review): the original author targeted transformers v4.55 argument
    # names (`eval_strategy`); older releases may expect a different name -- confirm.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,              # Keep at most 2 checkpoints on disk
    
    # Logging
    logging_dir="./logs",
    logging_steps=10,
    
    # Performance
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
    dataloader_num_workers=0,        # 0 = load data in the main process
    
    # Disable unwanted features
    push_to_hub=False,
    report_to="none",                # No external experiment-tracking integration
)

#### ============= TRAIN MODEL =============

In [None]:
# Wire the model, arguments, datasets and collator together. The validation
# split is the default evaluation set; the test split is evaluated explicitly
# in a later cell.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=SimpleDataCollator(),  # stacks fixed-length examples into batches
)

# Run the training loop configured by training_args.
trainer.train()

In [None]:
def _perplexity(loss, max_loss=20):
    """Convert a mean cross-entropy loss into perplexity.

    Args:
        loss: Evaluation loss (natural-log cross-entropy).
        max_loss: Losses at or above this value are reported as infinity
            instead of being exponentiated.

    Returns:
        float: ``exp(loss)`` when ``loss < max_loss``, otherwise ``inf``.
    """
    return math.exp(loss) if loss < max_loss else float("inf")


# ============= EVALUATE ON VALIDATION =============
print("\n Evaluating on validation set...")
eval_results = trainer.evaluate()  # uses the Trainer's default eval_dataset
print(f"Validation loss: {eval_results['eval_loss']:.4f}")

# Calculate perplexity
perplexity = _perplexity(eval_results['eval_loss'])
print(f"Validation perplexity: {perplexity:.2f}")

# ============= EVALUATE ON TEST =============
print("\n Evaluating on test set...")
test_results = trainer.evaluate(eval_dataset=test_dataset)
print(f"Test loss: {test_results['eval_loss']:.4f}")

test_perplexity = _perplexity(test_results['eval_loss'])
print(f"Test perplexity: {test_perplexity:.2f}")

In [None]:


# ============= SAVE MODEL =============
trainer.save_model("./final_model")
print("\n💾 Model saved to ./final_model")


# ============= SIMPLE GENERATION TEST =============
# Build every tensor directly on the device that holds the model weights.
device = next(model.parameters()).device
print(f"Model is on device: {device}")

max_length = 15  # sample at most 15 tokens after BOS
top_k = 5        # sample only among the 5 highest-scoring candidates

model.eval()
with torch.no_grad():
    # The prompt is a single BOS token, shaped (batch=1, length=1).
    input_ids = torch.tensor([[BOS_TOKEN_ID]], dtype=torch.long, device=device)

    for _ in range(max_length):
        outputs = model(input_ids)
        # Logits for the token that follows the last position of the sequence.
        next_token_logits = outputs.logits[0, -1]

        # Top-k sampling: renormalise over the k best logits, draw one of them,
        # then map the drawn position back to its vocabulary id.
        top_k_logits, top_k_indices = next_token_logits.topk(top_k)
        probs = top_k_logits.softmax(dim=-1)
        next_token = top_k_indices[probs.multinomial(1)].item()

        # EOS ends generation and is not appended to the output.
        if next_token == EOS_TOKEN_ID:
            break

        new_token = torch.tensor([[next_token]], dtype=torch.long, device=device)
        input_ids = torch.cat((input_ids, new_token), dim=1)

    generated = input_ids[0].cpu().tolist()
    print(f"Generated token IDs: {generated}")
    print(f"Length: {len(generated)} tokens")