# Hamletmachine LLM Training on Google Colab

This notebook trains the hamletmachine language model using Google Colab's free GPU.

## Setup Instructions

1. **Enable GPU**: Runtime → Change runtime type → Hardware accelerator → GPU (T4)
2. **Mount Google Drive** (optional, for saving checkpoints): Run the mount cell below
3. **Upload your project**: Either clone from GitHub or upload the project folder
4. **Run all cells** in order

## Notes
- Free tier: ~9-12 hour sessions, may disconnect
- Checkpoints are saved to Google Drive (if mounted) or Colab storage
- Training progress is logged to TensorBoard


## 1. Setup Environment

In [None]:
# Report whether PyTorch can see a CUDA device and, if so, which one.
import torch

cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")
if not cuda_ok:
    # Nothing to describe; point the user at the Colab runtime setting instead.
    print("‚ö†Ô∏è  No GPU detected! Please enable GPU: Runtime ‚Üí Change runtime type ‚Üí GPU")
else:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; shown here in decimal gigabytes.
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU Memory: {total_bytes / 1e9:.2f} GB")

In [None]:
# Mount Google Drive (optional) and create the folder checkpoints are written to.
import os

from google.colab import drive

drive.mount('/content/drive')

# Checkpoint location on the mounted Drive; change the path as needed.
CHECKPOINT_DIR = '/content/drive/MyDrive/hamletmachine/checkpoints'
os.makedirs(CHECKPOINT_DIR, exist_ok=True)  # no error if it already exists
print(f"Checkpoints will be saved to: {CHECKPOINT_DIR}")

In [None]:
# Get the project onto the Colab VM. Everything below is commented out except
# the final `%cd /content`; uncomment ONE option to match your setup.
# NOTE: later cells hard-code the project location as /content/hamletmachine
# (sys.path entry, data directory, config path), so the project folder should
# end up at that path whichever option is used.

# Option 1: Clone from GitHub (if your repo is on GitHub)
# !git clone https://github.com/yourusername/hamletmachine.git
# %cd hamletmachine

# Option 2: Upload project folder manually
# 1. Click folder icon on left sidebar
# 2. Upload your project folder
# 3. Uncomment and adjust path below:

# %cd /content/hamletmachine  # Adjust path if needed

# For now, we'll work in /content directory
# (this runs last, so it overrides any `%cd` uncommented above)
%cd /content

## 2. Install Dependencies

In [None]:
# Install project dependencies
!pip install -q transformers>=4.35.0 datasets>=2.14.0 accelerate>=0.24.0 tokenizers>=0.15.0
!pip install -q torch>=2.1.0 numpy>=1.24.0 pandas>=2.0.0 pyyaml>=6.0
!pip install -q tensorboard>=2.15.0 tqdm>=4.66.0

# Optional: Install WandB for experiment tracking
# !pip install -q wandb

print("‚úÖ Dependencies installed!")

In [None]:
# Install the project package (if using GitHub clone)
# !pip install -e .

# Or add to Python path if uploaded manually
import sys

PROJECT_ROOT = '/content/hamletmachine'
# Guarded so that re-running this cell does not stack duplicate entries on
# sys.path; the entry is placed first so the project wins import lookups.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

print("‚úÖ Project added to Python path!")

## 3. Upload Data

In [None]:
# Option 1: Upload processed dataset files
# Use the file browser on the left to upload:
# - data/processed/train.jsonl
# - data/processed/validation.jsonl
# - data/processed/test.jsonl

# Option 2: Process data on Colab (if you uploaded raw training materials)
# from hamletmachine.data.pipeline import DataPipeline
# pipeline = DataPipeline(config_path='configs/data_config.yaml')
# pipeline.run()

# Sanity check: list whatever is in the processed-data directory, or tell the
# user to upload it when the directory is missing.
import os

data_dir = '/content/hamletmachine/data/processed'
if not os.path.exists(data_dir):
    print(f"‚ö†Ô∏è  Data directory not found: {data_dir}")
    print("Please upload your processed dataset files.")
else:
    files = os.listdir(data_dir)
    print(f"Data files found: {files}")

## 4. Configure Training

In [None]:
# Build the training configuration: reuse the project's YAML file when it is
# present, otherwise write a Colab-oriented default to that same path.
import yaml
from pathlib import Path

config_path = '/content/hamletmachine/configs/train_config.yaml'

if os.path.exists(config_path):
    # An existing project config takes precedence over the defaults below.
    with open(config_path, 'r') as f:
        config = yaml.safe_load(f)
    print(f"‚úÖ Loaded config from {config_path}")
else:
    # Assemble the default config section by section.
    config = {}
    config['model'] = {
        'architecture': 'gpt2',  # small model to start with
    }
    config['training'] = {
        # Use the Drive checkpoint folder when the mount cell was run,
        # otherwise fall back to local Colab storage.
        'output_dir': globals().get('CHECKPOINT_DIR', '/content/models/checkpoints'),
        'num_train_epochs': 3,
        'per_device_train_batch_size': 4,  # adjust to fit GPU memory
        'per_device_eval_batch_size': 4,
        'gradient_accumulation_steps': 4,
        'learning_rate': 5.0e-5,
        'warmup_steps': 100,
        'logging_steps': 10,
        'save_steps': 500,
        'eval_steps': 500,
        'save_total_limit': 3,
        'fp16': True,  # intended for the T4 GPU
    }
    config['data'] = {
        'train_file': '/content/hamletmachine/data/processed/train.jsonl',
        'validation_file': '/content/hamletmachine/data/processed/validation.jsonl',
        'max_seq_length': 1024,
    }
    config['tokenizer'] = {
        'tokenizer_name': 'gpt2',
    }
    config['logging'] = {
        'logger': 'tensorboard',
        'logging_dir': '/content/logs',
    }

    # Persist the defaults so later runs take the "load" branch instead.
    Path(config_path).parent.mkdir(parents=True, exist_ok=True)
    with open(config_path, 'w') as f:
        yaml.dump(config, f, default_flow_style=False)
    print(f"‚úÖ Created default config at {config_path}")

print("\nTraining Configuration:")
print(yaml.dump(config, default_flow_style=False))

## 5. Train Model

In [None]:
# Import the Hugging Face building blocks used by the training cells below.
# NOTE(review): the original comment called this a "template" pending a
# project training module; nothing imported here comes from the project
# package, only from `transformers`, `datasets` and `torch`.

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset
import torch

print("‚úÖ Training modules imported!")

In [None]:
# Instantiate the tokenizer named in the config.
tokenizer_name = config['tokenizer']['tokenizer_name']
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

# Padding needs a pad token; when the tokenizer defines none, reuse the
# end-of-sequence token for that role.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(
    f"‚úÖ Tokenizer loaded: {tokenizer_name}",
    f"Vocabulary size: {len(tokenizer)}",
    sep="\n",
)

In [None]:
# Load the causal-LM named in the config.
model_name = config['model']['architecture']

# The weights are deliberately kept in the default float32 even when
# config['training']['fp16'] is True. The Trainer's fp16 mode is automatic
# mixed precision, which needs float32 master weights; loading the checkpoint
# directly in float16 makes the gradient scaler fail with
# "Attempting to unscale FP16 gradients" on the first optimizer step.
model = AutoModelForCausalLM.from_pretrained(model_name)

# Move to GPU when one is available, otherwise stay on CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

print(f"‚úÖ Model loaded: {model_name}")
print(f"Model parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.2f}M")
print(f"Device: {device}")

In [None]:
# Load datasets
def tokenize_function(examples):
    """Tokenize the ``'text'`` field of a batch of examples.

    Uses the module-level ``tokenizer`` and ``config``. Sequences are
    truncated and padded to ``config['data']['max_seq_length']``.

    Args:
        examples: Mapping with a ``'text'`` entry holding the raw text.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    # NOTE(review): padding every example to the full max length is costly
    # when texts are short; consider padding per batch in the collator instead.
    texts = examples['text']
    max_len = config['data']['max_seq_length']
    return tokenizer(texts, truncation=True, max_length=max_len, padding='max_length')

def _tokenize_split(split):
    """Tokenize one dataset split in batches, dropping its original columns."""
    return split.map(
        tokenize_function,
        batched=True,
        remove_columns=split.column_names,
    )


# Read the JSONL files. A single data file is exposed by `load_dataset` under
# the 'train' split name, which is why the validation file also asks for it.
train_dataset = load_dataset('json', data_files=config['data']['train_file'], split='train')
val_dataset = load_dataset('json', data_files=config['data']['validation_file'], split='train')

# Replace the raw text columns with the tokenizer's output.
train_dataset = _tokenize_split(train_dataset)
val_dataset = _tokenize_split(val_dataset)

print("‚úÖ Datasets loaded!")
print(f"Training examples: {len(train_dataset)}")
print(f"Validation examples: {len(val_dataset)}")

In [None]:
# Build the data collator, the training arguments and the Trainer from `config`.
import inspect

# Setup data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Causal LM, not masked LM
)

train_cfg = config['training']

# fp16 mixed precision needs a CUDA device; requesting it on a CPU-only runtime
# makes TrainingArguments raise, so fall back to full precision instead.
use_fp16 = bool(train_cfg['fp16']) and torch.cuda.is_available()
if train_cfg['fp16'] and not use_fp16:
    print("fp16 requested but no CUDA device is available - training in full precision.")

# The evaluation-schedule argument was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. Use whichever name the
# installed version accepts so the cell works on both old and new versions.
eval_strategy_kwarg = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments).parameters
    else 'evaluation_strategy'
)

# Setup training arguments
training_args = TrainingArguments(
    output_dir=train_cfg['output_dir'],
    num_train_epochs=train_cfg['num_train_epochs'],
    per_device_train_batch_size=train_cfg['per_device_train_batch_size'],
    per_device_eval_batch_size=train_cfg['per_device_eval_batch_size'],
    gradient_accumulation_steps=train_cfg['gradient_accumulation_steps'],
    learning_rate=train_cfg['learning_rate'],
    warmup_steps=train_cfg['warmup_steps'],
    logging_steps=train_cfg['logging_steps'],
    save_steps=train_cfg['save_steps'],
    eval_steps=train_cfg['eval_steps'],
    save_total_limit=train_cfg['save_total_limit'],
    fp16=use_fp16,
    logging_dir=config['logging']['logging_dir'],
    save_strategy='steps',
    # Requires the save and eval schedules to line up (both are step-based here).
    load_best_model_at_end=True,
    # The string 'none' disables reporting; Python None does not.
    report_to='tensorboard' if config['logging']['logger'] == 'tensorboard' else 'none',
    **{eval_strategy_kwarg: 'steps'},
)

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

print("‚úÖ Trainer setup complete!")

In [None]:
# Announce where artifacts will go, then run the training loop.
print(
    "üöÄ Starting training...",
    f"Checkpoints will be saved to: {config['training']['output_dir']}",
    f"TensorBoard logs: {config['logging']['logging_dir']}",
    sep="\n",
)

trainer.train()

print("\n‚úÖ Training complete!")

In [None]:
# Write the trained model and its tokenizer side by side in a `final_model`
# folder under the checkpoint directory.
output_root = config['training']['output_dir']
final_model_dir = os.path.join(output_root, 'final_model')

trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

print(f"‚úÖ Final model saved to: {final_model_dir}")

## 6. Monitor Training (TensorBoard)

In [None]:
# Load TensorBoard extension
%load_ext tensorboard

# Start TensorBoard on the log directory from the training config.
# NOTE(review): relies on IPython expanding the `{...}` expression before the
# magic runs - confirm it resolves to config['logging']['logging_dir'].
# NOTE(review): this cell sits after the training cell; to watch metrics while
# training is still running it presumably needs to be executed beforehand.
%tensorboard --logdir {config['logging']['logging_dir']} --port 6006

## 7. Download Checkpoints (Optional)

In [None]:
# If checkpoints are saved to Colab storage (not Drive), bundle them into a
# zip file that can be downloaded from the Files sidebar.

import shutil

checkpoint_dir = config['training']['output_dir']
if checkpoint_dir.startswith('/content/drive'):
    # Already persisted on the mounted Drive; nothing to package.
    print("‚úÖ Checkpoints are saved to Google Drive - no download needed!")
elif os.path.isdir(checkpoint_dir):
    zip_path = '/content/hamletmachine_checkpoints.zip'
    # make_archive appends the '.zip' suffix itself, so pass the path without
    # its extension. splitext strips only the trailing suffix, unlike
    # str.replace, which would remove every '.zip' occurrence in the path.
    shutil.make_archive(
        os.path.splitext(zip_path)[0],
        'zip',
        checkpoint_dir
    )
    print(f"‚úÖ Checkpoints zipped: {zip_path}")
    print("Download from: Files ‚Üí hamletmachine_checkpoints.zip")
else:
    # Previously this case fell through to the "saved to Google Drive" message,
    # which was wrong: the local directory simply does not exist.
    print(f"Checkpoint directory not found: {checkpoint_dir} - nothing to download.")