# 🐉 Chimera Medium (350M) - Colab Training

**IMPORTANT: Run cells in order! Don't skip any.**

1. Runtime → Change runtime type → **T4 GPU**
2. Run ALL cells from top to bottom

In [None]:
#@title 1. Setup - Check GPU & Install Dependencies { display-mode: "form" }

import subprocess
import sys

# Check GPU
print("=" * 50)
print("CHECKING GPU...")
print("=" * 50)
!nvidia-smi --query-gpu=name,memory.total --format=csv

import torch
if not torch.cuda.is_available():
    print("\n‚ùå ERROR: No GPU detected!")
    print("Go to Runtime ‚Üí Change runtime type ‚Üí T4 GPU")
    raise SystemExit("No GPU")
else:
    print(f"\n‚úÖ GPU: {torch.cuda.get_device_name()}")
    print(f"‚úÖ VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# Install dependencies
print("\n" + "=" * 50)
print("INSTALLING DEPENDENCIES...")
print("=" * 50)
!pip install -q transformers datasets sentencepiece
print("‚úÖ Dependencies installed!")

In [None]:
#@title 2. Upload Code - Upload chimera_code.zip { display-mode: "form" }

from google.colab import files
import zipfile
import os

# Create directories
os.makedirs('/content/chimera/data', exist_ok=True)
os.makedirs('/content/chimera/checkpoints', exist_ok=True)

print("=" * 50)
print("UPLOAD chimera_code.zip (44KB)")
print("=" * 50)

uploaded = files.upload()

if not uploaded:
    print("‚ùå No file uploaded!")
    raise SystemExit("Upload failed")

for filename in uploaded.keys():
    print(f"Extracting {filename}...")
    with zipfile.ZipFile(filename, 'r') as z:
        z.extractall('/content/chimera')

%cd /content/chimera
print("\n‚úÖ Code uploaded! Files:")
!ls -la *.py

In [None]:
#@title 3. Download ALL Training Data { display-mode: "form" }

import os
os.chdir('/content/chimera')

print("=" * 50)
print("DOWNLOADING TINYSTORIES (this takes a few minutes)")
print("=" * 50)

from datasets import load_dataset

# Download TinyStories
print("Downloading from HuggingFace...")
ds = load_dataset("roneneldan/TinyStories", split="train")
print(f"Loaded {len(ds):,} stories")

# Save to text file
print("\nSaving to data/tinystories.txt...")
with open('data/tinystories.txt', 'w', encoding='utf-8') as f:
    for i, item in enumerate(ds):
        f.write(item['text'].strip() + '\n\n')
        if (i + 1) % 500000 == 0:
            print(f"  Written {i+1:,} stories...")

print("\n‚úÖ TinyStories downloaded!")
!ls -lh data/tinystories.txt

# Download OASST for fine-tuning later
print("\n" + "=" * 50)
print("DOWNLOADING OASST CONVERSATIONS")
print("=" * 50)

!python -u download_oasst.py --output data/oasst_data.jsonl --max-examples 10000

# Verify all data exists
print("\n" + "=" * 50)
print("VERIFYING DATA")
print("=" * 50)

if os.path.exists('data/tinystories.txt'):
    size = os.path.getsize('data/tinystories.txt') / (1024*1024)
    print(f"‚úÖ tinystories.txt: {size:.1f} MB")
else:
    print("‚ùå tinystories.txt MISSING!")
    raise SystemExit("Data missing")

if os.path.exists('data/oasst_data.jsonl'):
    size = os.path.getsize('data/oasst_data.jsonl') / (1024*1024)
    print(f"‚úÖ oasst_data.jsonl: {size:.1f} MB")
else:
    print("‚ö†Ô∏è oasst_data.jsonl missing (fine-tuning won't work)")

print("\n‚úÖ ALL DATA READY!")

In [None]:
#@title 4. Verify Model Can Load { display-mode: "form" }
# Builds a Chimera model from the medium config (with vocab_size taken from
# the tokenizer), prints its size and main dimensions, then frees it again.

import os
os.chdir('/content/chimera')

print("=" * 50)
print("TESTING MODEL")
print("=" * 50)

from model import Chimera, chimera_medium
from tokenizer import ChimeraTokenizer
import gc
import torch

config = chimera_medium()
tokenizer = ChimeraTokenizer()
# Keep the model's vocabulary size in sync with the tokenizer's.
config.vocab_size = tokenizer.vocab_size

model = Chimera(config)
params = model.get_num_params()

print("\n✅ Chimera Medium")
print(f"   Parameters: {params:,} ({params/1e6:.0f}M)")
print(f"   d_model: {config.d_model}")
print(f"   layers: {config.n_layers}")
print(f"   vocab: {config.vocab_size:,}")

# Clean up: drop the test model so its memory is released before training.
del model
gc.collect()
torch.cuda.empty_cache()

print("\n✅ MODEL READY!")

In [None]:
#@title 5. üöÄ START PRETRAINING (45-60 min on A100) { display-mode: "form" }

import os
os.chdir('/content/chimera')

# Final check before training
if not os.path.exists('data/tinystories.txt'):
    print("‚ùå ERROR: Training data not found!")
    print("Run cell 3 first to download data.")
    raise SystemExit("No data")

print("=" * 50)
print("STARTING PRETRAINING")
print("Model: Chimera Medium (350M params)")
print("Target: Loss < 2.0, PPL < 7")
print("Time: ~45-60 min on A100")
print("=" * 50)
print("\nTip: Save checkpoints to Drive periodically!\n")

!python -u train_packed.py \
    --model-config medium \
    --max-steps 3000 \
    --micro-batch-size 16 \
    --gradient-accumulation-steps 2 \
    --lr 3e-4 \
    --save-every 500 \
    --compile

In [None]:
#@title 6. Extract Model Weights { display-mode: "form" }

import os
os.chdir('/content/chimera')
import torch

print("=" * 50)
print("EXTRACTING MODEL WEIGHTS")
print("=" * 50)

# Find latest checkpoint
if os.path.exists('checkpoints/latest.pt'):
    ckpt_path = 'checkpoints/latest.pt'
else:
    # Find any checkpoint
    ckpts = [f for f in os.listdir('checkpoints') if f.startswith('step_')]
    if ckpts:
        ckpt_path = f'checkpoints/{sorted(ckpts)[-1]}'
    else:
        print("‚ùå No checkpoints found! Run pretraining first.")
        raise SystemExit("No checkpoint")

print(f"Loading {ckpt_path}...")
ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=False)

print(f"Step: {ckpt.get('step', '?')}")
print(f"Loss: {ckpt.get('loss', '?')}")

# Save just model weights
torch.save(ckpt['model'], 'checkpoints/medium_pretrained.pt')
print("\n‚úÖ Saved to checkpoints/medium_pretrained.pt")
!ls -lh checkpoints/medium_pretrained.pt

In [None]:
#@title 7. Test Generation { display-mode: "form" }

import os
os.chdir('/content/chimera')

print("=" * 50)
print("TESTING GENERATION")
print("=" * 50)

!python generate.py \
    --checkpoint checkpoints/medium_pretrained.pt \
    --model-config medium \
    --prompt "Once upon a time, there was a little rabbit" \
    --max-tokens 150

In [None]:
#@title 8. Fine-tune for Chat (15-20 min on A100) { display-mode: "form" }

import os
os.chdir('/content/chimera')

print("=" * 50)
print("GENERATING CONVERSATION DATASET")
print("=" * 50)

!python -u create_instruct_data.py \
    --input data/tinystories.txt \
    --output data/instruct_data.jsonl \
    --max-stories 5000 \
    --external data/oasst_data.jsonl \
    --external-format oasst \
    --max-external 10000

print("\n" + "=" * 50)
print("FINE-TUNING FOR CONVERSATION")
print("=" * 50)

!python -u train_instruct.py \
    --model-path checkpoints/medium_pretrained.pt \
    --model-config medium \
    --data-path data/instruct_data.jsonl \
    --batch-size 8 \
    --gradient-accumulation-steps 2 \
    --epochs 2 \
    --compile

In [None]:
#@title 9. Test Chat { display-mode: "form" }

import os
os.chdir('/content/chimera')

print("Testing conversational responses...\n")

!python generate.py \
    --checkpoint checkpoints/instruct_final.pt \
    --model-config medium \
    --prompt "Hello! What is your name?" \
    --max-tokens 100

print("\n" + "-"*40 + "\n")

!python generate.py \
    --checkpoint checkpoints/instruct_final.pt \
    --model-config medium \
    --prompt "Tell me a short story about a brave dog." \
    --max-tokens 150

In [None]:
#@title 10. Save to Google Drive { display-mode: "form" }

from google.colab import drive
import shutil
import os

os.chdir('/content/chimera')

print("=" * 50)
print("SAVING TO GOOGLE DRIVE")
print("=" * 50)

drive.mount('/content/drive')

# Create chimera folder in Drive
drive_path = '/content/drive/MyDrive/chimera_models'
os.makedirs(drive_path, exist_ok=True)

# Copy models
files_to_save = [
    'checkpoints/medium_pretrained.pt',
    'checkpoints/instruct_final.pt',
    'checkpoints/instruct_best.pt',
    'checkpoints/latest.pt'
]

for f in files_to_save:
    if os.path.exists(f):
        print(f"Copying {f}...")
        shutil.copy(f, drive_path)
        print(f"  ‚úÖ Saved!")
    else:
        print(f"  ‚ö†Ô∏è {f} not found, skipping")

print("\n‚úÖ MODELS SAVED TO GOOGLE DRIVE!")
print(f"Location: {drive_path}")
!ls -lh {drive_path}

In [None]:
#@title 11. Direct Download (Alternative) { display-mode: "form" }
# Offers the pretrained and fine-tuned weight files as browser downloads,
# silently skipping any file that does not exist.

from google.colab import files
import os
os.chdir('/content/chimera')

print("Downloading models directly to your computer...\n"
      "(This may be slow for large files)\n")

# Same order as before: pretrained weights first, then the chat model.
for model_file in ('medium_pretrained.pt', 'instruct_final.pt'):
    model_path = f'checkpoints/{model_file}'
    if not os.path.exists(model_path):
        continue
    print(f"Downloading {model_file}...")
    files.download(model_path)

---

## ⚠️ Resume Training (if disconnected)

If Colab disconnects:
1. Re-run cells 1-4
2. Run the resume cell below

In [None]:
#@title Resume from Google Drive Checkpoint { display-mode: "form" }

import os
os.chdir('/content/chimera')

from google.colab import drive
drive.mount('/content/drive')

# Copy checkpoint from Drive
drive_ckpt = '/content/drive/MyDrive/chimera_models/latest.pt'
if os.path.exists(drive_ckpt):
    print("Copying checkpoint from Drive...")
    !cp "{drive_ckpt}" checkpoints/latest.pt
    print("‚úÖ Checkpoint restored!")
else:
    print("‚ùå No checkpoint found in Drive")
    print("Starting fresh...")

# Resume training (A100 settings)
if os.path.exists('checkpoints/latest.pt'):
    print("\nResuming training...")
    !python -u train_packed.py \
        --model-config medium \
        --resume checkpoints/latest.pt \
        --max-steps 3000 \
        --micro-batch-size 16 \
        --gradient-accumulation-steps 2 \
        --save-every 500 \
        --compile
else:
    print("No checkpoint to resume from. Run cell 5 to start fresh.")