In [None]:
# Cell 1: Setup & Upload Files
!pip install tokenizers -q

from google.colab import files
import os

print("üì§ Upload these files from your local machine:")
print("  1. data/igala_corpus.txt")
print("  2. outputs/tokenizer/igala_tokenizer.json")
print("  3. scripts/model.py")
print("\nClick 'Choose Files' below:")

uploaded = files.upload()

# Organize files
os.makedirs('data', exist_ok=True)
os.makedirs('outputs/tokenizer', exist_ok=True)
os.makedirs('scripts', exist_ok=True)

for filename in uploaded.keys():
    if 'corpus' in filename:
        !mv {filename} data/igala_corpus.txt
    elif 'tokenizer' in filename:
        !mv {filename} outputs/tokenizer/igala_tokenizer.json
    elif 'model.py' in filename:
        !mv {filename} scripts/model.py

print("‚úÖ Files uploaded!")

# Cell 2: Training Script
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tokenizers import Tokenizer
import json
from tqdm import tqdm
import sys
sys.path.append('scripts')
from model import IgalaGPT, GPTConfig

# Restore the tokenizer from the JSON file placed by the upload cell.
tokenizer = Tokenizer.from_file("outputs/tokenizer/igala_tokenizer.json")
print(f"üìñ Tokenizer loaded: {tokenizer.get_vocab_size()} tokens")

# Read the whole corpus into memory as a single UTF-8 string.
with open('data/igala_corpus.txt', encoding='utf-8') as corpus_file:
    corpus = corpus_file.read()

print(f"üìö Corpus: {len(corpus)} characters, {len(corpus.split())} words")

# Encode the full corpus in one call and keep only the integer token ids.
corpus_encoding = tokenizer.encode(corpus)
tokens = corpus_encoding.ids
print(f"üî¢ Total tokens: {len(tokens)}")

# Dataset
class IgalaDataset(Dataset):
    """Sliding-window next-token dataset over a flat sequence of token ids.

    Item ``i`` is the pair ``(x, y)`` where ``x`` holds
    ``tokens[i : i + block_size]`` and ``y`` holds the same window shifted one
    position to the right. Both are ``torch.long`` tensors of length
    ``block_size``.
    """

    def __init__(self, tokens, block_size):
        """Store the token sequence and the window length.

        Args:
            tokens: Sliceable sequence of integer token ids.
            block_size: Number of tokens in each input window.
        """
        self.tokens = tokens
        self.block_size = block_size

    def __len__(self):
        """Return the number of complete windows (0 if the sequence is too short)."""
        # Clamp at zero: a negative result would make len() raise ValueError
        # when the sequence has fewer than block_size tokens.
        return max(0, len(self.tokens) - self.block_size)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair for window ``idx``.

        Negative indices count from the end, as with built-in sequences.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
                Raising here also lets plain ``for x, y in dataset`` iteration
                terminate, since that protocol stops on IndexError.
        """
        size = len(self)
        if idx < 0:
            idx += size
        if not 0 <= idx < size:
            raise IndexError(f"index out of range for dataset of length {size}")
        # One extra token so the target can be the input shifted by one.
        chunk = self.tokens[idx:idx + self.block_size + 1]
        x = torch.tensor(chunk[:-1], dtype=torch.long)
        y = torch.tensor(chunk[1:], dtype=torch.long)
        return x, y

# Config: start from the GPTConfig defaults and size the vocabulary to match
# the loaded tokenizer.
config = GPTConfig()
config.vocab_size = tokenizer.get_vocab_size()

# Create model on the GPU when one is available, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"üñ•Ô∏è  Device: {device}")

model = IgalaGPT(config).to(device)
print(f"üß† Model: {sum(p.numel() for p in model.parameters())/1e6:.1f}M parameters")

# Training setup
# `epochs` is defined before the scheduler so the cosine schedule's horizon is
# derived from it instead of repeating the literal 10 in two places.
epochs = 10

dataset = IgalaDataset(tokens, config.block_size)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
# The scheduler is stepped once per batch by the training loop, so T_max is
# the total number of optimizer steps across all epochs.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, T_max=len(dataloader) * epochs
)

# Per-epoch records of the form {'epoch': int, 'loss': float}, filled in by
# the training loop.
training_logs = []

print("\nüöÄ Starting training...")

for epoch in range(epochs):
    model.train()
    total_loss = 0
    
    pbar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")
    for batch_idx, (x, y) in enumerate(pbar):
        x, y = x.to(device), y.to(device)
        
        optimizer.zero_grad()
        logits, loss = model(x, y)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        
        total_loss += loss.item()
        pbar.set_postfix({'loss': f'{loss.item():.4f}'})
    
    avg_loss = total_loss / len(dataloader)
    training_logs.append({'epoch': epoch+1, 'loss': avg_loss})
    print(f"‚úÖ Epoch {epoch+1} | Avg Loss: {avg_loss:.4f}")
    
    # Generate sample
    if (epoch + 1) % 3 == 0:
        model.eval()
        prompt = "<BOS>"
        prompt_tokens = torch.tensor([tokenizer.encode(prompt).ids], dtype=torch.long).to(device)
        generated = model.generate(prompt_tokens, max_new_tokens=30, temperature=0.8, top_k=40)
        generated_text = tokenizer.decode(generated[0].tolist())
        print(f"üéØ Sample generation: {generated_text}\n")

print("‚úÖ Training complete!")

# Cell 3: Save Model
# Writes the final checkpoint and the per-epoch loss log under outputs/.
os.makedirs('outputs/model_checkpoints', exist_ok=True)

# Bundle the weights, the config object and the loss history into one file.
# NOTE(review): 'config' is stored as the GPTConfig object itself, so loading
# this checkpoint presumably needs that class importable -- confirm in the
# loading code.
checkpoint = {
    'model_state_dict': model.state_dict(),
    'config': config,
    'training_logs': training_logs,
}
torch.save(checkpoint, 'outputs/model_checkpoints/igala_gpt_final.pt')

# Also keep the loss history as standalone, human-readable JSON.
with open('outputs/training_logs.json', 'w') as log_file:
    log_file.write(json.dumps(training_logs, indent=2))

print("‚úÖ Model saved!")

# Cell 4: Download Trained Model
from google.colab import files

# Hand each training artifact to the browser as a download, checkpoint first.
for artifact_path in (
    'outputs/model_checkpoints/igala_gpt_final.pt',
    'outputs/training_logs.json',
):
    files.download(artifact_path)

print("‚úÖ Download complete! Transfer these to your local outputs/ folder")
