# Data Preprocessing for Language Model Training

This notebook handles all data preparation steps:
1. Download the BookCorpusOpen dataset (`lucadiliello/bookcorpusopen`)
2. Clean and preprocess text
3. Tokenize with GPT-2 tokenizer
4. Create train/validation splits
5. Save preprocessed data for training

**Run this notebook once before training to prepare the data.**

## 1. Setup and Configuration

In [None]:
import os
import sys
from pathlib import Path
from datasets import load_dataset
from transformers import GPT2TokenizerFast
import numpy as np
from tqdm import tqdm
# Set TOKENIZERS_PARALLELISM to "false" for this session. The dataset .map
# calls in this notebook pass num_proc; presumably this avoids mixing
# tokenizer-internal parallelism with worker processes (TODO confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# ---- Run configuration ----
DATASET_NAME = "lucadiliello/bookcorpusopen"  # Dataset identifier passed to load_dataset
MAX_SEQ_LENGTH = 512                          # Tokens per sequence / packed block
VALIDATION_SPLIT = 0.05                       # Fraction of samples held out for validation
MAX_SAMPLES = None                            # Int caps the sample count (e.g. 10000 for testing); None = no cap
PACK_TO_MAX_LENGTH = True                     # True packs tokens into fixed-length blocks instead of padding

# ---- Where the preprocessed output is written (created if missing) ----
DATA_DIR = Path("data") / "preprocessed" / "v1"
DATA_DIR.mkdir(parents=True, exist_ok=True)

# ---- Download cache: $HF_HOME when set, else the per-user default path ----
_default_cache_dir = os.path.expanduser('~/.cache/huggingface')
CACHE_DIR = os.getenv('HF_HOME', _default_cache_dir)

print(f"Cache directory: {CACHE_DIR}")
print(f"Output directory: {DATA_DIR}")


## 2. Load BookCorpus Dataset


In [None]:
# Fetch the "train" split of DATASET_NAME, using CACHE_DIR as the cache location.
print("Loading BookCorpus dataset...")
print("This may take several minutes on first run (downloading ~7GB)...")

dataset = load_dataset(DATASET_NAME, split="train", cache_dir=CACHE_DIR)

# Report size, schema and a short preview of the first sample's text.
num_loaded = len(dataset)
print(f"\nDataset loaded: {num_loaded:,} samples")
print(f"Features: {dataset.features}")
print(f"\nSample text (first 200 chars):")
first_text = dataset[0]['text']
print(first_text[:200])


## 3. Limit Samples (Optional - for testing)


In [None]:
# Keep only the first MAX_SAMPLES rows when a cap is configured and it is
# smaller than the dataset; otherwise leave the dataset untouched.
available_samples = len(dataset)
if MAX_SAMPLES is None or MAX_SAMPLES >= available_samples:
    print(f"Using full dataset: {available_samples:,} samples")
else:
    print(f"Limiting to {MAX_SAMPLES:,} samples for testing...")
    dataset = dataset.select(range(MAX_SAMPLES))
    print(f"Dataset size after limiting: {len(dataset):,} samples")


## 4. Create Train/Validation Split


In [None]:
# Positional split: rows [0, split_idx) become training data and rows
# [split_idx, end) become validation data.
# NOTE(review): no shuffling happens here, so the validation set is simply the
# tail of the dataset in its stored order.
total_samples = len(dataset)
split_idx = int(total_samples * (1 - VALIDATION_SPLIT))

train_data = dataset.select(range(0, split_idx))
val_data = dataset.select(range(split_idx, total_samples))

num_train, num_val = len(train_data), len(val_data)
print(f"Train samples: {num_train:,}")
print(f"Validation samples: {num_val:,}")
print(f"Validation split: {VALIDATION_SPLIT*100:.1f}%")


## 5. Load Tokenizer


In [None]:
print("Loading GPT-2 tokenizer...")
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2', cache_dir=CACHE_DIR)

# GPT-2 doesn't have a pad token, so the end-of-text token is reused for padding.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

print(f"Tokenizer vocab size: {tokenizer.vocab_size:,}")
print(f"Max model length: {tokenizer.model_max_length}")

# Sanity-check tokenization on one short sentence.
# Plain Python lists are requested (no return_tensors='pt') so this check does
# not require PyTorch, which nothing else in this notebook uses. The printed
# output is the same list of token ids as before.
test_text = "Hello, this is a test sentence."
test_tokens = tokenizer(test_text)
test_ids = test_tokens['input_ids']
print(f"\nTest tokenization:")
print(f"  Text: {test_text}")
print(f"  Token IDs: {test_ids}")
print(f"  Decoded: {tokenizer.decode(test_ids)}")


## 6. Tokenize Dataset


In [None]:
def tokenize_function(examples):
    """Tokenize a batch of raw text samples.

    Args:
        examples: Batch mapping as passed by ``Dataset.map(batched=True)``;
            only ``examples['text']`` is read.

    Returns:
        When ``PACK_TO_MAX_LENGTH`` is true: a dict with a single
        ``input_ids`` entry holding one untruncated, unpadded list of token
        ids per sample, each terminated by ``tokenizer.eos_token_id``.
        Otherwise: the tokenizer output truncated/padded to
        ``MAX_SEQ_LENGTH`` tokens per sample.
    """
    if not PACK_TO_MAX_LENGTH:
        # Standard tokenization with padding/truncation to a fixed length
        return tokenizer(
            examples['text'],
            truncation=True,
            padding='max_length',
            max_length=MAX_SEQ_LENGTH,
            return_tensors=None,
        )

    # Don't pad or truncate - the sequences are packed into blocks later
    encoded = tokenizer(
        examples['text'],
        truncation=False,
        padding=False,
        return_attention_mask=False,
        return_tensors=None,
    )

    # The packing step concatenates consecutive samples into one token stream.
    # Terminate every sample with the end-of-text token so that document
    # boundaries remain visible inside the packed blocks; without it, the last
    # token of one document is immediately followed by the first token of the
    # next one.
    eos_id = tokenizer.eos_token_id
    return {'input_ids': [ids + [eos_id] for ids in encoded['input_ids']]}

# Worker-process count for both tokenization passes:
# the CPU count (4 when it cannot be determined), capped at 8.
tokenize_workers = min(os.cpu_count() or 4, 8)

# Training split: tokenize in batches and drop the original columns.
print("Tokenizing training data...")
train_tokenized = train_data.map(
    tokenize_function,
    batched=True,
    remove_columns=train_data.column_names,
    desc="Tokenizing train",
    num_proc=tokenize_workers,
)

# Validation split: same settings.
print("\nTokenizing validation data...")
val_tokenized = val_data.map(
    tokenize_function,
    batched=True,
    remove_columns=val_data.column_names,
    desc="Tokenizing validation",
    num_proc=tokenize_workers,
)

num_train_tokenized = len(train_tokenized)
num_val_tokenized = len(val_tokenized)
print(f"\nTrain tokenized: {num_train_tokenized:,} samples")
print(f"Val tokenized: {num_val_tokenized:,} samples")


## 7. Pack Tokens into Fixed-Length Blocks (Optional but Recommended)


In [None]:
if not PACK_TO_MAX_LENGTH:
    print("Skipping token packing (using standard padding/truncation)")
else:
    block_size = MAX_SEQ_LENGTH

    def group_texts(examples):
        """Concatenate a batch of token lists and re-cut it into fixed-size blocks.

        Every column in ``examples`` is flattened into one long list and then
        sliced into consecutive chunks of ``block_size`` items. Items at the
        end of the batch that do not fill a whole block are dropped.

        Returns:
            A dict containing the chunked columns plus ``attention_mask``
            (all ones, one entry per block) and ``labels`` (a copy of each
            ``input_ids`` block).
        """
        import itertools

        flattened = {
            column: list(itertools.chain.from_iterable(examples[column]))
            for column in examples.keys()
        }

        # Largest multiple of block_size that fits in the flattened batch
        usable_length = (len(flattened['input_ids']) // block_size) * block_size
        block_starts = range(0, usable_length, block_size)

        result = {
            column: [tokens[start:start + block_size] for start in block_starts]
            for column, tokens in flattened.items()
        }

        # Blocks contain no padding, so every position is attended to
        num_blocks = len(result['input_ids'])
        result['attention_mask'] = [[1] * block_size for _ in range(num_blocks)]
        result['labels'] = [list(block) for block in result['input_ids']]
        return result

    # Half of the capped CPU count (CPU count or 4, capped at 8), at least 1
    pack_workers = max(1, min(os.cpu_count() or 4, 8) // 2)

    print("Packing training data into fixed-length blocks...")
    train_tokenized = train_tokenized.map(
        group_texts,
        batched=True,
        desc="Grouping train into blocks",
        num_proc=pack_workers,
    )

    print("\nPacking validation data into fixed-length blocks...")
    val_tokenized = val_tokenized.map(
        group_texts,
        batched=True,
        desc="Grouping val into blocks",
        num_proc=pack_workers,
    )

    print(f"\nAfter packing:")
    print(f"  Train blocks: {len(train_tokenized):,}")
    print(f"  Val blocks: {len(val_tokenized):,}")


## 8. Save Preprocessed Data


In [None]:
import json

print("Saving preprocessed data...")

# Write both splits under DATA_DIR in the HuggingFace on-disk dataset format
for split_name, split_dataset in (("train", train_tokenized), ("val", val_tokenized)):
    split_dataset.save_to_disk(str(DATA_DIR / split_name))

print(f"\n✓ Training data saved to: {DATA_DIR / 'train'}")
print(f"✓ Validation data saved to: {DATA_DIR / 'val'}")

# Store the tokenizer files next to the data
tokenizer_dir = DATA_DIR / "tokenizer"
tokenizer.save_pretrained(str(tokenizer_dir))
print(f"✓ Tokenizer saved to: {DATA_DIR / 'tokenizer'}")

# Record the settings and resulting sizes of this preprocessing run
metadata = dict(
    dataset_name=DATASET_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    validation_split=VALIDATION_SPLIT,
    vocab_size=tokenizer.vocab_size,
    train_samples=len(train_tokenized),
    val_samples=len(val_tokenized),
    pack_to_max_length=PACK_TO_MAX_LENGTH,
    tokenizer_type="gpt2",
    block_size=MAX_SEQ_LENGTH,
)

with open(DATA_DIR / "metadata.json", "w") as metadata_file:
    metadata_file.write(json.dumps(metadata, indent=2))

print(f"✓ Metadata saved to: {DATA_DIR / 'metadata.json'}")

# Closing banner with the follow-up command
banner_rule = "="*60
print("\n" + banner_rule)
print("Data preprocessing complete!")
print(banner_rule)
print(f"\nYou can now run training with:")
print(f"  sbatch scripts/train_zaratan.sh")


## 9. Verify Saved Data


In [None]:
from datasets import load_from_disk

# Read both splits back from DATA_DIR to confirm they were written correctly
train_dir = str(DATA_DIR / "train")
val_dir = str(DATA_DIR / "val")
train_loaded = load_from_disk(train_dir)
val_loaded = load_from_disk(val_dir)

print(f"Train samples loaded: {len(train_loaded):,}")
print(f"Val samples loaded: {len(val_loaded):,}")

# Inspect the first training item: its columns, lengths and leading tokens
print(f"\nSample train item:")
sample = train_loaded[0]
print(f"  Keys: {sample.keys()}")
print(f"  Input IDs length: {len(sample['input_ids'])}")
print(f"  Attention mask length: {len(sample['attention_mask'])}")

leading_ids = sample['input_ids'][:20]
print(f"\nFirst 20 tokens: {leading_ids}")
print(f"Decoded: {tokenizer.decode(leading_ids)}")
