In [6]:
import torch
from unsloth import FastLanguageModel

In [14]:
from datasets import Dataset

In [7]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-Instruct",
    max_seq_length = 2048,
    load_in_4bit = True,
)
print("Loaded")

==((====))==  Unsloth 2026.1.4: Fast Llama patching. Transformers: 4.57.1. vLLM: 0.11.2.
   \\   /|    NVIDIA GeForce RTX 5080 Laptop GPU. Num GPUs = 1. Max memory: 15.92 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 12.0. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Loaded


In [10]:
# LoRA time
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    lora_alpha = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout = 0,
    bias = "none",
)
print("LoRA adapters added")

Not an error, but Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Unsloth 2026.1.4 patched 16 layers with 16 QKV layers, 16 O layers and 0 MLP layers.


LoRA adapters added


In [11]:
def count_parameters(model): #simple data, like trainable parameters, total params, %training
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    return trainable, total

In [12]:
trainable, total = count_parameters(model)
print(f"Trainable: {trainable:,}")
print(f"Total: {total:,}")
print(f"Percentage: {100 * trainable / total:.2f}%")

Trainable: 3,407,872
Total: 777,848,832
Percentage: 0.44%


In [15]:
import json

In [20]:
#figure out the file structure
import os
print("Workspace contents:")                                                                                                            
print(os.listdir("/workspace"))

Workspace contents:
['.cache', '.gitignore', '.ipynb_checkpoints', '.Trash-0', 'basic_search_examples.json', 'data', 'discovery.ipynb', 'error_recovery_examples.json', 'keyword_retry_examples.json', 'llmquery_examples.json', 'multistep_examples.json', 'outputs', 'supervised-fine-tuning.ipynb', 'unsloth_compiled_cache']


In [23]:
data_path = "/workspace/data/rlm_training_full.json"
with open(data_path, "r") as f:
    training_data = json.load(f)
dataset = Dataset.from_list(training_data)
print(f"loaded {len(dataset)} training examples")
print(f"First example preview:")
for msg in dataset[0]['messages'][:3]:
    role = msg['role']
    content = msg['content'][:80] + "..." if len(msg['content']) > 80 else msg['content']
    print(f" [{role}]: {content}")

loaded 56 training examples
First example preview:
 [user]: You are a SEARCH assistant with a Python REPL. You search documents - nothing el...
 [assistant]: ```python
idx = context.find('speed limit')
print(f"Found at: {idx}")
```
 [user]: Output:
Found at: 2847


In [30]:
  # Step 1: Pre-format all examples                                                                                                  
def format_chat(example):
  return tokenizer.apply_chat_template(                                                                                        
      example["messages"],
      tokenize=False,
  )

# Apply formatting to create "text" column
formatted_dataset = dataset.map(
  lambda x: {"text": format_chat(x)},
  remove_columns=["messages"]  # Remove old column, keep "text"
)

print("Formatted dataset columns:", formatted_dataset.column_names)
print("\nFirst example preview (first 300 chars):")
print(formatted_dataset[0]["text"][:300])


Map:   0%|          | 0/56 [00:00<?, ? examples/s]

Formatted dataset columns: ['text']

First example preview (first 300 chars):
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 01 Feb 2026

<|eot_id|><|start_header_id|>user<|end_header_id|>

You are a SEARCH assistant with a Python REPL. You search documents - nothing else.

OUTPUT FORMAT: Your response must START


In [34]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(                                                                                                                  
    model=model,
    tokenizer=tokenizer,                                                                                                         
    train_dataset=formatted_dataset,
    max_seq_length=2048,
    dataset_text_field = "text",
    args=TrainingArguments(
        output_dir="./rlm_lora_output",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        num_train_epochs=3,
        learning_rate=2e-4,
        logging_steps=1,
        save_strategy="epoch",
        bf16=True,
        report_to="none",
    ),
)
print("done")

Unsloth: Tokenizing ["text"] (num_proc=28):   0%|          | 0/56 [00:00<?, ? examples/s]

done


In [35]:
trainer.train()
print("done")

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 56 | Num Epochs = 3 | Total steps = 21
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 3,407,872 of 1,239,222,272 (0.28% trained)


Step,Training Loss
1,3.2091
2,3.1626
3,3.0696
4,2.9258
5,2.8074
6,2.657
7,2.5517
8,2.5497
9,2.3858
10,2.3047




done


In [38]:
model.eval()

test_messages = [
    {"role": "user", "content": """You are a SEARCH assistant with a Python REPL. You search documents - nothing else.

  OUTPUT FORMAT: Your response must START with a Python code block using ```python

  AVAILABLE:
  - context: the document text (string) - already loaded, do NOT redefine it
  - print(): to output results

  RULES:
  - Search FIRST, answer NEVER until you have evidence
  - Use context.find() or context[start:end] to explore
  - Only give FINAL(answer) when you have proof

  ---

  The context variable already contains: "Acme Corp was established in 1987 by John Smith in Seattle. The company grew rapidly..." 

  Question: What year was the company founded?"""}
]

inputs = tokenizer.apply_chat_template(
    test_messages,
    return_tensors="pt",
    add_generation_prompt = True
).to("cuda")

outputs = model.generate(
    inputs,
    max_new_tokens = 150,
    temperature = .3, # the lower, the more deterministic
    do_sample = True,
)

response = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens = True)

print("Done. Response: ")
print(response)

Done. Response: 
```python
from contextlib import contextmanager

@contextmanager
def search_context():
    """Search context variable for search results"""
    try:
        yield
    finally:
        context = yield
        print(context)

# Search context
with search_context() as context:
    print(context)  # "Acme Corp was established in 1987 by John Smith in Seattle. The company grew rapidly..."

# Search for specific text
print(context.find("The company grew rapidly..."))  # Output: "1987"

# Search for a range of text
print(context[start:end])  # Output: "The company grew rapidly... 1987-"

# Search for a specific text in a range
print(context[start:end])


In [40]:
# save progress
model.save_pretrained("./rlm_lora_adapter")
tokenizer.save_pretrained("./rlm_lora_adapter")

print("saved to ./rlm/lora_adapter")

saved to ./rlm/lora_adapter
