# MLang V6.1 Training - Google Colab

This notebook trains a model on MLang V6.1 data (lookahead-based prediction).

## Prerequisites

1. Generate training data locally using:
   ```bash
   python encode_mlang_v6_full.py --input-dir ./data --output-txt mlang_v6/train.txt --output-meta mlang_v6/meta.json --seq-len 12 --lookahead 4 --feature-level top1
   ```

2. Convert to ChatML format:
   ```bash
   python convert_mlang_v6_to_chatml.py mlang_v6/train.txt mlang_v6/train.jsonl
   python convert_mlang_v6_to_chatml.py mlang_v6/val.txt mlang_v6/val.jsonl
   ```

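3. Upload `train.jsonl` and `val.jsonl` to the Google Drive folder the notebook reads from (by default `/content/drive/Shareddrives/D1/Data/`; adjust `train_file` / `val_file` in the verification cell if you store them elsewhere)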

## V6.1 Format
- Input: `seq_len` context candles (without OUTCOME)
- Output: `lookahead` OUTCOME tokens (predictions for next N candles)

## 1. Install Dependencies and Mount Google Drive

In [None]:
!pip install -q torch transformers datasets accelerate peft trl bitsandbytes
!pip install -q sentencepiece protobuf
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never paste the access token into the notebook itself: notebooks get shared
# and committed, and a literal token would leak with them.
# The token is taken from the HF_TOKEN environment variable when it is set;
# otherwise it is requested through a hidden prompt.
hf_token = os.environ.get("HF_TOKEN") or getpass("HuggingFace access token: ")

# An empty answer falls back to huggingface_hub's own interactive login flow.
login(token=hf_token or None)

## 2. HuggingFace Authentication

Required for gated models like Llama 3.2. The login cell directly above performs this step.

Get your token at: https://huggingface.co/settings/tokens

You also need to accept the Llama 3.2 license at: https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct

## 2. Upload Training Data

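Place `train.jsonl` and `val.jsonl` in the mounted Google Drive folder referenced by `train_file` and `val_file` in the cell below. If you upload them with the file browser on the left instead, update those two paths accordingly.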

In [None]:
# Confirm that both dataset files are reachable before any training work starts
import os

# Locations of the ChatML JSONL files on the mounted Drive; main() reads these names.
train_file = "/content/drive/Shareddrives/D1/Data/train.jsonl"
val_file = "/content/drive/Shareddrives/D1/Data/val.jsonl"


def _count_lines(path):
    """Return the number of lines in the file at `path` (one line per sequence)."""
    with open(path) as handle:
        return sum(1 for _ in handle)


# Training split
if not os.path.exists(train_file):
    print("✗ Train file not found! Please upload train.jsonl")
else:
    train_lines = _count_lines(train_file)
    print(f"✓ Train file: {train_lines} sequences")

# Validation split
if not os.path.exists(val_file):
    print("✗ Val file not found! Please upload val.jsonl")
else:
    val_lines = _count_lines(val_file)
    print(f"✓ Val file: {val_lines} sequences")

## 3. Configuration

In [None]:
# ---- Model ----
# Alternative for faster training: "google/gemma-3-270m-it"
BASE_MODEL = "meta-llama/Llama-3.2-3B-Instruct"

# ---- Optimisation ----
NUM_EPOCHS = 3
BATCH_SIZE = 1
GRADIENT_ACCUMULATION_STEPS = 8
LEARNING_RATE = 1e-4
# Lower this value when running out of GPU memory
MAX_SEQ_LENGTH = 2048

# ---- LoRA adapter ----
LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.05

# ---- Where checkpoints and the final model are written ----
OUTPUT_DIR = "/content/mlang_v6_model"

# Echo the settings so every run's output records what it was trained with.
_effective_batch = BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
_summary = [
    "Configuration:",
    f"  Base model: {BASE_MODEL}",
    f"  Epochs: {NUM_EPOCHS}",
    f"  Batch size: {BATCH_SIZE} × {GRADIENT_ACCUMULATION_STEPS} = {_effective_batch} effective",
    f"  Learning rate: {LEARNING_RATE}",
    f"  Max seq length: {MAX_SEQ_LENGTH}",
    f"  LoRA r: {LORA_R}, alpha: {LORA_ALPHA}",
]
for _line in _summary:
    print(_line)

## 4. Training Script

In [None]:
import os
import json
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType
from transformers import TrainerCallback


class PredictionCallback(TrainerCallback):
    """Print greedy sample predictions for a few validation examples during training.

    Every `every_n_steps` optimizer steps the callback takes the first
    `num_samples` items of `val_dataset`, cuts each ChatML text at the assistant
    tag, lets the model generate the assistant turn, and prints the expected
    text, the generated text and a positional whitespace-token match percentage.

    Args:
        tokenizer: Tokenizer used to encode prompts and decode generations.
        val_dataset: Indexable collection whose items are dicts with a 'text'
            key holding the full ChatML conversation. Items without that key
            are skipped with a message instead of aborting training.
        num_samples: Maximum number of examples to show per trigger.
        max_new_tokens: Generation budget for each example.
        every_n_steps: Interval, in global steps, between two reports.
        max_seq_length: Total token budget (prompt + generation), used when the
            training arguments do not carry a `max_seq_length` attribute.
    """

    ASSISTANT_TAG = '<|im_start|>assistant'
    END_TAG = '<|im_end|>'

    def __init__(self, tokenizer, val_dataset, num_samples=2, max_new_tokens=50,
                 every_n_steps=100, max_seq_length=2048):
        self.tokenizer = tokenizer
        self.val_dataset = val_dataset
        self.num_samples = num_samples
        self.max_new_tokens = max_new_tokens
        self.every_n_steps = every_n_steps
        self.max_seq_length = max_seq_length

    def on_step_end(self, args, state, control, **kwargs):
        """Generate and print sample predictions on every `every_n_steps`-th step."""
        if state.global_step <= 0 or state.global_step % self.every_n_steps != 0:
            return

        model = kwargs.get('model')
        if model is None:
            return

        # transformers.TrainingArguments has no `max_seq_length` field, so reading
        # it unconditionally raises AttributeError; fall back to the value
        # configured on the callback.
        seq_limit = getattr(args, 'max_seq_length', None) or self.max_seq_length
        # Keep at least one prompt token even if the generation budget exceeds the limit.
        prompt_budget = max(1, seq_limit - self.max_new_tokens)

        print(f"\n{'='*60}")
        print(f"Step {state.global_step} - Sample Predictions:")

        # Generate in eval mode (dropout off) and restore the previous mode afterwards
        # so the training step that follows is unaffected.
        was_training = model.training
        model.eval()
        try:
            for i in range(min(self.num_samples, len(self.val_dataset))):
                sample = self.val_dataset[i]
                if 'text' not in sample:
                    print(f"  Sample {i+1}: skipped (no 'text' field in validation example)")
                    continue
                text = sample['text']

                # Prompt = everything up to and including the assistant header
                before_assistant = text.split(self.ASSISTANT_TAG)[0]
                prompt = before_assistant + self.ASSISTANT_TAG + '\n'

                # Expected output = assistant turn up to its end tag
                if self.ASSISTANT_TAG in text:
                    expected = text.split(self.ASSISTANT_TAG)[1].split(self.END_TAG)[0].strip()
                else:
                    expected = "N/A"

                inputs = self.tokenizer.encode(prompt, return_tensors='pt')
                if inputs.shape[1] > prompt_budget:
                    # Keep the most recent context when the prompt is too long
                    inputs = inputs[:, -prompt_budget:]

                inputs = inputs.to(model.device)

                with torch.no_grad():
                    outputs = model.generate(
                        input_ids=inputs,
                        # Single unpadded sequence: every position is a real token.
                        attention_mask=torch.ones_like(inputs),
                        max_new_tokens=self.max_new_tokens,
                        do_sample=False,
                        # Re-enable the KV cache for this call only; generation
                        # without it recomputes the whole prefix for every new token.
                        use_cache=True,
                        pad_token_id=self.tokenizer.pad_token_id,
                        eos_token_id=self.tokenizer.eos_token_id
                    )

                predicted = self.tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
                # Cut the prediction at the end tag exactly as done for `expected`,
                # so both sides are compared over the same span.
                predicted = predicted.split(self.END_TAG)[0].strip()

                # Positional match over whitespace-separated tokens
                expected_tokens = expected.split()
                predicted_tokens = predicted.split()
                matches = sum(1 for e, p in zip(expected_tokens, predicted_tokens) if e == p)
                match_pct = (matches / len(expected_tokens) * 100) if expected_tokens else 0

                print(f"  Sample {i+1}:")
                print(f"    Expected: {expected[:100]}...")
                print(f"    Predict: {predicted[:100]}...")
                print(f"    Match: {match_pct:.1f}%")
        finally:
            if was_training:
                model.train()
        print(f"{'='*60}\n")


def load_jsonl(file_path):
    """Load a JSON Lines file into a list of parsed records.

    The file is read as UTF-8. Lines that are empty or contain only whitespace
    (for example a trailing newline at the end of the file) are skipped.

    Args:
        file_path: Path of the .jsonl file to read.

    Returns:
        List with one parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON; the message
            names the file and the 1-based line number.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError as exc:
                # Same exception type as before, with the location added for debugging.
                raise json.JSONDecodeError(
                    f"{file_path}:{line_no}: {exc.msg}", exc.doc, exc.pos
                ) from exc
    return data


def main():
    """Fine-tune BASE_MODEL with LoRA on the ChatML dataset and save the result.

    Steps: load the train/val JSONL files, load tokenizer and model, wrap the
    model with LoRA adapters, tokenize with labels masked to the assistant
    response, train with the HuggingFace Trainer, then write the model and
    tokenizer to OUTPUT_DIR.

    Reads the notebook-level settings `train_file`, `val_file`, `BASE_MODEL`,
    `MAX_SEQ_LENGTH`, `NUM_EPOCHS`, `BATCH_SIZE`, `GRADIENT_ACCUMULATION_STEPS`,
    `LEARNING_RATE`, `LORA_R`, `LORA_ALPHA`, `LORA_DROPOUT` and `OUTPUT_DIR`.

    Returns:
        The Trainer instance after training has finished.
    """
    # Imported here because the notebook's import cell does not bring in this collator.
    from transformers import DataCollatorForSeq2Seq

    assistant_tag = '<|im_start|>assistant'

    # Load data
    print("Loading training data...")
    train_data = load_jsonl(train_file)
    val_data = load_jsonl(val_file)
    print(f"  Train: {len(train_data)} sequences")
    print(f"  Val: {len(val_data)} sequences")

    # Create datasets (these keep the raw 'text' column)
    train_dataset = Dataset.from_list(train_data)
    val_dataset = Dataset.from_list(val_data)

    # Load tokenizer
    print(f"\nLoading tokenizer: {BASE_MODEL}")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = 'right'

    # Load model
    print(f"Loading model: {BASE_MODEL}")
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )
    model.config.use_cache = False

    # Configure LoRA
    print("\nConfiguring LoRA...")
    lora_config = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False
    )
    model = get_peft_model(model, lora_config)
    print(f"  Trainable params: {model.get_nb_trainable_parameters()}")

    def tokenize_function(examples):
        """Tokenize a batch and build labels that only score the assistant response."""
        full_tokens = tokenizer(
            examples['text'],
            max_length=MAX_SEQ_LENGTH,
            truncation=True,
            padding=False,
            return_tensors=None
        )

        labels_list = []
        for i, text in enumerate(examples['text']):
            input_ids = full_tokens['input_ids'][i]

            if assistant_tag in text:
                # Instruction part = everything up to and including the assistant header
                before_assistant = text.split(assistant_tag)[0] + assistant_tag + '\n'

                # Tokenize the instruction with the SAME settings as the full text,
                # so any leading special token (e.g. BOS) is counted as well.
                instruction_ids = tokenizer(
                    before_assistant,
                    max_length=MAX_SEQ_LENGTH,
                    truncation=True,
                    padding=False
                )['input_ids']

                # Mask exactly the tokens both tokenizations share at the start.
                # Stopping at the first difference also copes with a trailing
                # special token or a token merged across the boundary.
                instruction_length = 0
                for prompt_id, full_id in zip(instruction_ids, input_ids):
                    if prompt_id != full_id:
                        break
                    instruction_length += 1

                # -100 is ignored by the loss; the response keeps its token ids
                example_labels = [-100] * instruction_length + list(input_ids[instruction_length:])
            else:
                # If no assistant tag found (shouldn't happen), mask everything
                print(f"Warning: No assistant tag found in example {i}")
                example_labels = [-100] * len(input_ids)

            labels_list.append(example_labels)

        full_tokens['labels'] = labels_list
        return full_tokens

    # Tokenize into separate variables: the raw validation set (with 'text')
    # is still needed by the prediction callback below.
    print("\nTokenizing datasets...")
    train_tokenized = train_dataset.map(tokenize_function, batched=True, remove_columns=['text'])
    val_tokenized = val_dataset.map(tokenize_function, batched=True, remove_columns=['text'])

    # DataCollatorForLanguageModeling(mlm=False) rebuilds `labels` from `input_ids`,
    # which would throw away the instruction masking created above. The seq2seq
    # collator pads input_ids/attention_mask and pads the supplied labels with -100.
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        padding=True,
        pad_to_multiple_of=8,
        label_pad_token_id=-100
    )

    # Training arguments (removed evaluation_strategy for compatibility)
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        learning_rate=LEARNING_RATE,
        fp16=True,
        logging_steps=10,
        save_steps=100,
        save_total_limit=2,
        report_to="none",
        save_strategy="steps",
        max_grad_norm=1.0
    )

    # The callback reads sample['text'], so it gets the raw validation set
    prediction_callback = PredictionCallback(tokenizer, val_dataset, num_samples=2)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized,
        # No evaluation is scheduled; registering the set only makes a manual
        # trainer.evaluate() possible afterwards.
        eval_dataset=val_tokenized,
        data_collator=data_collator,
        callbacks=[prediction_callback]
    )

    # Train
    print("\n" + "="*60)
    print("Starting training...")
    print("="*60 + "\n")

    trainer.train()

    # Save final model
    print("\nSaving final model...")
    trainer.save_model(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    print(f"Model saved to: {OUTPUT_DIR}")

    return trainer

# Run the whole training pipeline and keep the returned Trainer in a notebook-level variable
trainer = main()

## 5. Download Model

After training, download the model files from the output folder.

In [None]:
# Show every file under the output directory together with its size
import os

for dirpath, _subdirs, filenames in os.walk(OUTPUT_DIR):
    for name in filenames:
        # Bytes -> mebibytes
        size_mb = os.path.getsize(os.path.join(dirpath, name)) / 1024 / 1024
        print(f"{name}: {size_mb:.1f} MB")

In [None]:
# Create a zip file for easy download
!zip -r /content/mlang_v6_model.zip /content/mlang_v6_model
print("\nDownload mlang_v6_model.zip from the file browser")