# Plan 9 Programming - Multi-Stage Fine-tuning

Three-stage training pipeline for teaching LLMs Plan 9 programming:

| Stage | Dataset | Purpose |
|-------|---------|--------|
| 1. Knowledge | `knowledge.jsonl` | Inject Plan 9 source code patterns |
| 2. SFT | `conversations.jsonl` | Learn multi-turn tool use |
| 3. GRPO | Remote QEMU API | Optimize with execution rewards |

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/garutyunov/plan9-dataset/blob/main/notebooks/plan9_sft_colab.ipynb)

## Requirements
- Google Colab with T4 GPU (free tier)
- For GRPO (Stage 3, disabled by default): a remote server running `plan9-dataset serve-qemu`

## 1. Install Dependencies

In [None]:
%%capture
!pip install unsloth
!pip install --no-deps trl peft accelerate bitsandbytes
!pip install datasets huggingface-hub requests

In [None]:
import torch

# Fail fast with an actionable message when no GPU runtime is attached;
# otherwise the device queries below raise a much less helpful CUDA error.
if not torch.cuda.is_available():
    raise RuntimeError(
        "No CUDA GPU detected. In Colab, select Runtime > Change runtime type "
        "and choose a GPU (e.g. T4) before running this notebook."
    )

# Report the name and total memory of device 0.
_gpu_props = torch.cuda.get_device_properties(0)
print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"Memory: {_gpu_props.total_memory / 1e9:.1f} GB")

## 2. Configuration

In [None]:
# ---------------------------------------------------------------------------
# Data source: repository name passed to `load_dataset` in the loading cell.
# ---------------------------------------------------------------------------
DATASET_REPO = "garutyunov/plan9-sft"

# ---------------------------------------------------------------------------
# Stage toggles: each flag gates one training cell further down.
# ---------------------------------------------------------------------------
RUN_KNOWLEDGE_STAGE = True   # Stage 1: continued pretraining on source code
RUN_SFT_STAGE = True         # Stage 2: SFT on multi-turn conversations
RUN_GRPO_STAGE = False       # Stage 3: GRPO with execution rewards (needs the remote server)

# ---------------------------------------------------------------------------
# Base model: checkpoint name, sequence length and 4-bit loading switch.
# ---------------------------------------------------------------------------
MODEL_NAME = "unsloth/gemma-3-1b-it-bnb-4bit"
MAX_SEQ_LENGTH = 2048
LOAD_IN_4BIT = True

# ---------------------------------------------------------------------------
# LoRA adapter hyperparameters (passed as `r` and `lora_alpha`).
# ---------------------------------------------------------------------------
LORA_R = 8
LORA_ALPHA = 16

# Echo the stage selection so the run log records what was enabled.
print(
    "Configuration:",
    f"  Knowledge stage: {RUN_KNOWLEDGE_STAGE}",
    f"  SFT stage: {RUN_SFT_STAGE}",
    f"  GRPO stage: {RUN_GRPO_STAGE}",
    sep="\n",
)

## 3. Load Model

In [None]:
from unsloth import FastLanguageModel

# Arguments for the base checkpoint, all taken from the configuration cell.
# `dtype=None` is forwarded unchanged to `from_pretrained`.
_base_model_kwargs = dict(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=None,
    load_in_4bit=LOAD_IN_4BIT,
)
model, tokenizer = FastLanguageModel.from_pretrained(**_base_model_kwargs)

# Projection layers that receive LoRA adapters.
_lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# Wrap the base model with LoRA adapters; `model` is rebound to the PEFT model.
model = FastLanguageModel.get_peft_model(
    model,
    r=LORA_R,
    target_modules=_lora_target_modules,
    lora_alpha=LORA_ALPHA,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
)

# Print how many parameters are trainable after adding the adapters.
model.print_trainable_parameters()

## 4. Load Datasets

In [None]:
from datasets import load_dataset


def _load_optional_split(data_file, label, unit):
    """Load the "train" split of one JSONL file from DATASET_REPO, if possible.

    Args:
        data_file: File name inside the dataset repository.
        label: Human-readable dataset name used in the status line.
        unit: Noun printed after the row count (e.g. "files", "examples").

    Returns:
        The loaded split, or None when loading failed for any reason.
    """
    try:
        split = load_dataset(DATASET_REPO, data_files=data_file)["train"]
    except Exception as err:
        # Catch Exception rather than using a bare `except:` so Ctrl-C still
        # interrupts the cell, and show the real cause: a network or auth
        # failure is not the same thing as a missing file.
        print(f"✗ {label} dataset not found ({type(err).__name__}: {err})")
        return None
    print(f"✓ {label}: {len(split)} {unit}")
    return split


# Load all datasets; each one is optional and is None when unavailable.
print("Loading datasets...")

# Knowledge dataset (raw source code for continued pretraining)
knowledge_ds = _load_optional_split("knowledge.jsonl", "Knowledge", "files")

# Conversations dataset (multi-turn SFT)
conversations_ds = _load_optional_split("conversations.jsonl", "Conversations", "examples")

# Simple SFT dataset (fallback when conversations are unavailable)
simple_ds = _load_optional_split("dataset.jsonl", "Simple SFT", "examples")

---
# Stage 1: Knowledge Injection (Continued Pretraining)

Train on raw Plan 9 source code to inject programming patterns.

In [None]:
# Stage 1 runs only when it is enabled AND the knowledge dataset was loaded.
if not RUN_KNOWLEDGE_STAGE or knowledge_ds is None:
    print("Skipping Stage 1 (Knowledge)")
else:
    from trl import SFTTrainer
    from transformers import TrainingArguments

    _rule = "=" * 60
    print(_rule)
    print("Stage 1: Knowledge Injection")
    print(_rule)

    def format_knowledge(example):
        """Return the raw file text prefixed with a comment naming its type and source.

        No chat template is applied at this stage.
        """
        file_type, origin = example["file_type"], example["source"]
        banner = f"/* Plan 9 {file_type} - {origin} */\n"
        return {"text": banner + example["text"]}

    knowledge_formatted = knowledge_ds.map(format_knowledge)

    # Query the bf16 capability once; fp16 is used exactly when bf16 is not.
    _use_bf16 = torch.cuda.is_bf16_supported()

    # Training arguments for continued pretraining.
    knowledge_args = TrainingArguments(
        output_dir="./plan9-knowledge",
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        warmup_steps=50,
        num_train_epochs=1,
        learning_rate=5e-5,  # lower than the 2e-4 used for the SFT stage
        fp16=not _use_bf16,
        bf16=_use_bf16,
        logging_steps=25,
        save_strategy="epoch",
        optim="adamw_8bit",
        report_to="none",
    )

    knowledge_trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=knowledge_formatted,
        dataset_text_field="text",
        max_seq_length=MAX_SEQ_LENGTH,
        packing=True,  # pack short examples together for efficiency
        args=knowledge_args,
    )

    print(f"Training on {len(knowledge_formatted)} source files...")
    knowledge_trainer.train()
    print("✓ Stage 1 complete")

---
# Stage 2: Supervised Fine-Tuning (SFT)

Train on multi-turn conversations with tool calls.

In [None]:
if RUN_SFT_STAGE:
    # Imported once here; previously `import json` ran inside the innermost
    # tool-call loop for every tool call of every example.
    import json

    from trl import SFTTrainer
    from transformers import TrainingArguments

    print("=" * 60)
    print("Stage 2: Supervised Fine-Tuning")
    print("=" * 60)

    # Use conversations if available, else fall back to simple format
    if conversations_ds is not None:
        print("Using multi-turn conversations dataset")

        def format_conversation(example):
            """Render one multi-turn example as a single training string.

            Each turn is wrapped in <start_of_turn>/<end_of_turn> markers:
              * "user" and "tool" turns are both emitted as user turns;
              * "model" turns contain, in order, an optional <think> block,
                one <start_function_call> line per tool call, and the
                visible content.
            Turns with any other role are skipped.

            Returns:
                {"text": <turns joined by newlines>}
            """
            text_parts = []

            for turn in example["turns"]:
                role = turn["role"]
                # A missing or None content becomes "" so it can never be
                # rendered as the literal text "None" in the training data.
                content = turn.get("content") or ""

                if role == "model":
                    segments = []
                    if turn.get("thinking"):
                        segments.append(f"<think>\n{turn['thinking']}\n</think>\n")
                    for tc in turn.get("tool_calls") or []:
                        # NOTE(review): if the loader fills absent params with
                        # None, those keys are serialized as null — confirm
                        # against the dataset schema.
                        params = json.dumps(tc["params"])
                        segments.append(
                            f"<start_function_call>call:{tc['name']}{params}<end_function_call>\n"
                        )
                    segments.append(content)
                    model_text = "".join(segments).strip()
                    text_parts.append(f"<start_of_turn>model\n{model_text}<end_of_turn>")
                elif role in ("user", "tool"):
                    # Tool results are fed back to the model as user turns.
                    text_parts.append(f"<start_of_turn>user\n{content}<end_of_turn>")

            return {"text": "\n".join(text_parts)}

        sft_dataset = conversations_ds.map(format_conversation)

    elif simple_ds is not None:
        print("Using simple instruction-response dataset")

        def format_simple(example):
            """Render an instruction/response pair as one user turn and one model turn."""
            return {
                "text": f"<start_of_turn>user\n{example['instruction']}<end_of_turn>\n<start_of_turn>model\n{example['response']}<end_of_turn>"
            }

        sft_dataset = simple_ds.map(format_simple)
    else:
        raise ValueError("No SFT dataset available!")

    # Query the bf16 capability once; fp16 is used exactly when bf16 is not.
    _use_bf16 = torch.cuda.is_bf16_supported()

    # Training args for SFT
    sft_args = TrainingArguments(
        output_dir="./plan9-sft",
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        warmup_steps=10,
        num_train_epochs=3,
        learning_rate=2e-4,
        fp16=not _use_bf16,
        bf16=_use_bf16,
        logging_steps=10,
        save_strategy="epoch",
        optim="adamw_8bit",
        report_to="none",
    )

    sft_trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=sft_dataset,
        dataset_text_field="text",
        max_seq_length=MAX_SEQ_LENGTH,
        packing=False,
        args=sft_args,
    )

    print(f"Training on {len(sft_dataset)} examples...")
    sft_trainer.train()
    print("✓ Stage 2 complete")
else:
    print("Skipping Stage 2 (SFT)")

---
# Stage 3: GRPO with Remote Execution Rewards

Optimize using real Plan 9 execution feedback from a remote QEMU server.

## Setup Instructions

1. **On your server with QEMU:**
   ```bash
   pip install 'plan9-dataset[server]'
   plan9-dataset serve-qemu --generate-token
   plan9-dataset serve-qemu --token YOUR_TOKEN --port 8080
   ```

2. **In Colab Secrets** (key icon in sidebar):
   - `QEMU_SERVER_URL`: e.g., `https://your-server.com:8080`
   - `QEMU_TOKEN`: Token from step 1

In [None]:
# QEMU API Client
import requests
import json

class QEMUClient:
    """Thin HTTP wrapper around the remote Plan 9 QEMU API.

    Every request goes through one `requests.Session` that carries the
    bearer token and a JSON content-type header. Each call raises on a
    non-success HTTP status and returns the decoded JSON body.
    """

    def __init__(self, server_url: str, token: str):
        # Drop trailing slashes so endpoint paths can be appended with "/".
        self.server_url = server_url.rstrip("/")
        self.session = requests.Session()
        auth_headers = {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json",
        }
        self.session.headers.update(auth_headers)

    def _post(self, endpoint: str, timeout: int, payload: dict = None) -> dict:
        """POST to `endpoint`, sending `payload` as JSON when one is given."""
        options = {"timeout": timeout}
        if payload is not None:
            options["json"] = payload
        response = self.session.post(f"{self.server_url}/{endpoint}", **options)
        response.raise_for_status()
        return response.json()

    def health(self) -> dict:
        """Return the server's health report (GET /health, 10 s timeout)."""
        response = self.session.get(f"{self.server_url}/health", timeout=10)
        response.raise_for_status()
        return response.json()

    def execute(self, tool: str, params: dict) -> dict:
        """Invoke `tool` with `params` on the remote VM (POST /execute, 60 s timeout)."""
        return self._post("execute", 60, {"tool": tool, "params": params})

    def write_file(self, path: str, content: str) -> dict:
        """Write `content` to `path` on the remote VM via the write_file tool."""
        return self.execute("write_file", {"path": path, "content": content})

    def read_file(self, path: str) -> dict:
        """Read `path` from the remote VM via the read_file tool."""
        return self.execute("read_file", {"path": path})

    def run_command(self, command: str) -> dict:
        """Run `command` on the remote VM via the run_command tool."""
        return self.execute("run_command", {"command": command})

    def reset(self) -> dict:
        """Reset VM state (POST /reset with no body, 30 s timeout)."""
        return self._post("reset", 30)

    def compute_reward(self, model_output: str, expected_output: str = None) -> dict:
        """Ask the server to score `model_output` (POST /reward, 120 s timeout).

        `expected_output` is forwarded as-is, including when it is None.
        """
        body = {"model_output": model_output, "expected_output": expected_output}
        return self._post("reward", 120, body)

print("QEMUClient defined")

In [None]:
# Decide whether Stage 3 can run. GRPO_READY becomes True only when the stage
# is enabled, both Colab secrets are set, and the server's health endpoint
# answers without raising.
GRPO_READY = False

if not RUN_GRPO_STAGE:
    print("GRPO stage disabled")
else:
    try:
        from google.colab import userdata

        QEMU_SERVER_URL = userdata.get('QEMU_SERVER_URL')
        QEMU_TOKEN = userdata.get('QEMU_TOKEN')

        if not (QEMU_SERVER_URL and QEMU_TOKEN):
            print("✗ QEMU secrets not configured")
            print("  Add QEMU_SERVER_URL and QEMU_TOKEN to Colab secrets")
        else:
            # `client` is reused by the GRPO training cell for reward queries.
            client = QEMUClient(QEMU_SERVER_URL, QEMU_TOKEN)
            health = client.health()
            print(f"✓ Connected to QEMU server")
            print(f"  URL: {QEMU_SERVER_URL}")
            print(f"  VM running: {health.get('vm_running')}")
            GRPO_READY = True
    except Exception as err:
        # Any failure (import, secret lookup, HTTP) leaves GRPO_READY False.
        print(f"✗ GRPO setup failed: {err}")

In [None]:
if GRPO_READY:
    from datasets import Dataset
    from trl import GRPOConfig, GRPOTrainer

    print("=" * 60)
    print("Stage 3: GRPO with Execution Rewards")
    print("=" * 60)

    # Prompts the policy is optimized on.
    grpo_prompts = [
        "Write a Plan 9 C program that prints 'Hello, Plan 9!'",
        "Write a Plan 9 C program that reads /dev/user and prints the username",
        "Write an rc script that counts .c files in the current directory",
        "Write a Plan 9 C program that calculates factorial of 5",
        "Write a Plan 9 C program using channels to send integers between threads",
    ]

    def reward_function(prompts, completions, **kwargs):
        """Score each completion by executing it on the remote QEMU server.

        TRL calls reward functions with keyword arguments `prompts` and
        `completions` (plus extra dataset columns in **kwargs) and expects
        one float per completion, in order.

        Args:
            prompts: Prompts that produced the completions (unused here).
            completions: Generated texts to score.
            **kwargs: Additional columns forwarded by the trainer (unused).

        Returns:
            A list of floats the same length as `completions`; a completion
            whose scoring request fails gets 0.0.
        """
        rewards = []
        for i, completion in enumerate(completions):
            try:
                result = client.compute_reward(completion)
                reward = float(result.get("total", 0.0))
                print(f"  Sample {i}: reward={reward:.2f}")
            except Exception as e:
                print(f"  Sample {i}: error - {e}")
                reward = 0.0
            # Exactly one reward per completion. Previously a failing reset()
            # after a successful score appended a second value, leaving the
            # list misaligned with the completions.
            rewards.append(reward)

            # Best-effort reset so the next sample starts from a clean VM,
            # including after a failed or timed-out scoring request.
            try:
                client.reset()
            except Exception as e:
                print(f"  Sample {i}: VM reset failed - {e}")
        return rewards

    # Wrap every prompt in the same turn markers used for SFT.
    grpo_dataset = Dataset.from_dict({
        "prompt": [f"<start_of_turn>user\n{p}<end_of_turn>\n<start_of_turn>model\n" for p in grpo_prompts]
    })

    print(f"GRPO dataset: {len(grpo_dataset)} prompts")

    # NOTE(review): some TRL versions require the effective batch size to be
    # divisible by num_generations — confirm against the installed version.
    grpo_config = GRPOConfig(
        output_dir="./plan9-grpo",
        per_device_train_batch_size=1,
        gradient_accumulation_steps=2,
        num_train_epochs=2,
        learning_rate=1e-5,
        logging_steps=1,
        num_generations=2,
        temperature=0.8,
        max_completion_length=512,  # GRPOConfig's name for the generation length cap
        report_to="none",
    )

    # GRPOTrainer takes the config as `args` and the tokenizer as
    # `processing_class`; it has no `config` or `tokenizer` parameters.
    grpo_trainer = GRPOTrainer(
        model=model,
        reward_funcs=[reward_function],
        args=grpo_config,
        train_dataset=grpo_dataset,
        processing_class=tokenizer,
    )

    print("Starting GRPO training...")
    grpo_trainer.train()
    print("✓ Stage 3 complete")
else:
    print("Skipping Stage 3 (GRPO)")

---
# Test Inference

In [None]:
# Enable inference mode
FastLanguageModel.for_inference(model)

test_prompts = [
    "Write a Plan 9 C program that prints 'Hello, Plan 9!'",
    "Write an rc script that lists all .c files",
    "How do I read a file using Bio in Plan 9 C?",
]

for prompt in test_prompts:
    print(f"\n{'='*60}")
    print(f"Prompt: {prompt}")
    print(f"{'='*60}")

    # Build the prompt with the tokenizer's chat template, ending with the
    # generation prompt so the model answers as the next turn.
    messages = [{"role": "user", "content": prompt}]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    outputs = model.generate(
        input_ids=inputs,
        max_new_tokens=512,
        temperature=0.7,
        do_sample=True,
    )

    # The generated sequence starts with the prompt tokens, so slice them off
    # and decode only the new ones. Splitting the decoded text on the word
    # "model" cut off any answer that itself contained that word.
    prompt_length = inputs.shape[-1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

    # Cap the printed answer at 1000 characters to keep the cell output short.
    print(response[:1000])

---
# Save Model

In [None]:
# Write the model and the tokenizer side by side into the same local
# directory, model first.
for _artifact in (model, tokenizer):
    _artifact.save_pretrained("plan9-gemma-lora")
print("✓ Saved to plan9-gemma-lora/")

In [None]:
# Optional upload to the Hugging Face Hub. Off by default: set PUSH_TO_HUB to
# True and replace the placeholder in HUB_REPO before running this cell.
PUSH_TO_HUB = False
HUB_REPO = "YOUR_USERNAME/plan9-gemma-lora"

if PUSH_TO_HUB:
    from huggingface_hub import login

    # Authenticate first, then upload the model followed by the tokenizer.
    login()
    for _artifact in (model, tokenizer):
        _artifact.push_to_hub(HUB_REPO)
    print(f"✓ Pushed to {HUB_REPO}")

---
# Resources

- [Plan 9 Dataset](https://huggingface.co/datasets/garutyunov/plan9-sft)
- [9ml Project](https://github.com/garutyunov/9ml)
- [Unsloth](https://github.com/unslothai/unsloth)
- [TRL Documentation](https://huggingface.co/docs/trl)