### Unsloth

In [None]:
from unsloth import FastLanguageModel
import torch

# Reference list of pre-quantized 4-bit checkpoints published under the `unsloth`
# organisation. Nothing in this cell reads the list; the checkpoint that is
# actually loaded is chosen by `model_name` below.
fourbit_models = [
    "unsloth/Qwen3-1.7B-unsloth-bnb-4bit", # Qwen3 family, 1.7B up to 32B
    "unsloth/Qwen3-4B-unsloth-bnb-4bit",
    "unsloth/Qwen3-8B-unsloth-bnb-4bit",
    "unsloth/Qwen3-14B-unsloth-bnb-4bit",
    "unsloth/Qwen3-32B-unsloth-bnb-4bit",

    # 4bit dynamic quants for superior accuracy and low memory use
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    "unsloth/Phi-4",
    "unsloth/Llama-3.1-8B",
    "unsloth/Llama-3.2-3B",
    "unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit" # [NEW] We support TTS models!
] # More models at https://huggingface.co/unsloth
max_seq_length = 2048 # Can increase for longer reasoning traces
lora_rank = 64 # Larger rank = smarter, but slower
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-14B",
    max_seq_length = max_seq_length, # Context length - can be longer, but uses more memory
    load_in_4bit = True, # 4bit uses much less memory
    load_in_8bit = False,    # A bit more accurate, uses 2x memory
    full_finetuning = False, # We have full finetuning now!
    # fast_inference = True, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.85, # Reduce if out of memory
    # SECURITY: never paste a literal access token into the notebook (the one that
    # used to be here must be treated as leaked and revoked). Read it from the
    # environment instead when a gated model needs it:
    # token = os.environ["HF_TOKEN"],
)
# Attach LoRA adapters to the loaded base model; rank and alpha both follow `lora_rank`.
model = FastLanguageModel.get_peft_model(
    model,
    # --- adapter size ---
    r = lora_rank,            # Any value > 0 works; 8, 16, 32, 64, 128 are the usual picks
    lora_alpha = lora_rank,   # Rule of thumb: alpha = rank or 2 * rank
    # --- which projections receive adapters (attention + MLP) ---
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    # --- settings on the optimized path ---
    lora_dropout = 0,         # Any value is supported, 0 is the optimized one
    bias = "none",            # Any value is supported, "none" is the optimized one
    # "unsloth" checkpointing: ~30% less VRAM, fits 2x larger batches;
    # True or "unsloth" are the options for very long context
    use_gradient_checkpointing = "unsloth",
    # --- optional variants, both off ---
    use_rslora = False,       # Rank-stabilized LoRA
    loftq_config = None,      # LoftQ
    random_state = 3407,
)

The cell above adds LoRA adapters, so we only need to update 1 to 10% of all parameters! Next, we load the IELTS dataset and turn each sample into a prompt for training.

In [None]:
import json, re
def read_jsonfile(file_path="/home/ducanh/data/nemoWS/datasets/hey2/ielts-marking/raw_input4unsloth.json"):
    """Read a JSON Lines file and return its records as a list.

    Despite the ``.json`` extension of the default path, the file is parsed
    one line at a time: every non-empty line must hold one complete JSON value.

    Args:
        file_path: Path of the file to read (decoded as UTF-8).

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    all_samples = []
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            # Blank lines (for example a trailing newline at the end of the
            # file) are not records; json.loads("") would raise on them.
            if not line:
                continue
            all_samples.append(json.loads(line))
    return all_samples
def extract_hash_answer(text:str):
    """Pull the reference answer out of a raw dataset output string.

    Resolution order:
      1. If the text contains a LaTeX-style ``boxed{...}`` wrapper preceded by a
         backslash, return the content of the first such wrapper on a line
         (greedy: up to the last closing brace on that line).
      2. Otherwise, or when "boxed{" is present but the wrapper pattern does
         not match, return every digit character of the text concatenated.

    NOTE(review): the digit fallback discards "." and joins all digits, so
    "6.5" becomes "65" and "Part 3, band 6" becomes "36". Confirm the dataset
    only stores a single whole-number score in that case.

    Args:
        text: Raw output text of one sample.

    Returns:
        The extracted answer as a string, or None when nothing usable was
        found (the text is printed with an "Error-->" prefix in that case).
    """
    if "boxed{" in text:
        matches = re.findall(r"\\boxed{(.+)}", text)
        if matches:
            return matches[0]
        # "boxed{" occurred without a matching \boxed{...} (missing backslash,
        # newline inside the braces, ...). Indexing matches[0] here used to
        # raise IndexError; log the sample and use the digit fallback instead.
        print(text)
    digits = "".join(char for char in text if char.isdigit())
    if not digits:
        print("Error-->", text)
        return None
    return digits

import re
# Tag pairs the model must wrap its output in: first the free-form comment
# (reasoning), then the final band score (solution).
reasoning_start, reasoning_end = "<think>", "</think>"
solution_start, solution_end = "<CONCLUSION>", "</CONCLUSION>"

# System message sent with every prompt; it spells out the tags the reward
# functions look for.
system_prompt = "\n".join([
    "You are a IELTS Speaking Examiner.",
    "Analysis question and corresponding student response then provide your comment.",
    f"Place it between {reasoning_start} and {reasoning_end}.",
    f"Then, provide the score from 0-9 between {solution_start}{solution_end}",
])

def generate_conversation(examples):
    """Convert raw samples into prompt records for training.

    Each sample's "input" becomes the user message (after the shared system
    prompt) and its "output" is reduced to a reference answer with
    extract_hash_answer(). Samples for which no answer can be extracted are
    skipped.

    Args:
        examples: Iterable of dicts that each provide string values under the
            "input" and "output" keys.

    Returns:
        A list of dicts with the keys "prompt" (system + user chat messages),
        "answer" (the extracted reference answer) and "question" (the stripped
        input text).

    Raises:
        KeyError: If a sample lacks "input" or "output".
    """
    conversations = []
    for example in examples:
        problem = example["input"].strip()
        point = extract_hash_answer(example["output"].strip())
        if point is None:
            # No usable reference answer: the sample cannot be scored, drop it.
            continue
        conversations.append({
            "prompt" : [
                {"role": "system", "content": system_prompt},
                {"role": "user",   "content": problem},
            ],
            "answer": point, 'question': problem
        })
    return conversations

# Load the raw samples from read_jsonfile()'s default path.
ielts_datsets = read_jsonfile()
# The prompts are kept as lists of chat messages here; an earlier variant of
# this cell rendered them to text with tokenizer.apply_chat_template(...,
# tokenize = False), which is no longer done at this point.
my_reasoning_conversations = generate_conversation(ielts_datsets)
# Sanity check: show the first converted sample (raises IndexError if every
# sample was skipped by generate_conversation()).
print(my_reasoning_conversations[0])

<a name="Train"></a>
### Train the model
Now let's use Hugging Face TRL's `GRPOTrainer`! More docs here: [TRL GRPO docs](https://huggingface.co/docs/trl/grpo_trainer). We first define the reward functions that score each generated completion, then train for `max_steps = 2000`; for a full pass over the data, set `num_train_epochs = 1` and remove `max_steps`.

In [None]:
# Pattern for a well-formed completion, built from the tag constants:
#   ^[\s]{0,}                                   optional leading whitespace
#   {reasoning_start}.+?{reasoning_end}.*?      non-empty reasoning block, then anything
#   {solution_start}(.+?){solution_end}         group 1 = text between the solution tags
#   [\s]{0,}$                                   optional trailing whitespace
# DOTALL lets "." cross newlines; MULTILINE lets ^ / $ anchor at any line
# boundary, not only at the very start / end of the text.
# The tags are interpolated as-is (no re.escape), so they must not contain
# regex metacharacters.
match_format = re.compile(
    rf"^[\s]{{0,}}"\
    rf"{reasoning_start}.+?{reasoning_end}.*?"\
    rf"{solution_start}(.+?){solution_end}"\
    rf"[\s]{{0,}}$",
    flags = re.MULTILINE | re.DOTALL
)

# Negative example: this text uses <start_working_out>/<SOLUTION> tags, which
# are not the tags match_format was built from (the reasoning/solution tag
# constants defined earlier in this notebook), so the search yields None and
# the cell shows no output.
match_format.search(
    "<start_working_out>Let me think!<end_working_out>"\
    "<SOLUTION>2</SOLUTION>",
)

In [None]:
# Positive example: built from the configured tag constants so it cannot go
# stale. The previous literal used <SOLUTION> tags, which the pattern never
# matches. Trailing whitespace after the solution block is tolerated, and
# group(1) keeps the spaces around the score ("  2  ").
match_format.search(
    f"{reasoning_start}Let me think!{reasoning_end}"
    f"{solution_start}  2  {solution_end}\n\n",
)

In [None]:
def match_format_exactly(completions, **kwargs):
    """Reward completions that follow the expected output layout exactly.

    Args:
        completions: One entry per completion; the text is read from
            ``completion[0]["content"]``.
        **kwargs: Ignored.

    Returns:
        One score per completion: 3.0 when `match_format` finds a match in the
        text, otherwise 0.
    """
    return [
        3.0 if match_format.search(completion[0]["content"]) is not None else 0
        for completion in completions
    ]

def match_format_approximately(completions, **kwargs):
    """Give partial credit for each expected tag that appears exactly once.

    For each of the four tags (reasoning start/end, solution start/end) a
    completion earns +0.5 when the tag occurs exactly once and -1.0 when it is
    missing or repeated, so the per-completion score ranges from -4.0 to 2.0.

    Args:
        completions: One entry per completion; the text is read from
            ``completion[0]["content"]``.
        **kwargs: Ignored.

    Returns:
        One summed score per completion.
    """
    expected_tags = (reasoning_start, reasoning_end, solution_start, solution_end)
    scores = []
    for completion in completions:
        text = completion[0]["content"]
        # Exactly one occurrence is rewarded; zero or several are penalised.
        scores.append(
            sum(0.5 if text.count(tag) == 1 else -1.0 for tag in expected_tags)
        )
    return scores

def check_answer(prompts, completions, answer, **kwargs):
    """Score how close the answer in each completion is to the reference.

    The answer is whatever `match_format` captures between the solution tags.
    Per completion:
      * 0     - the completion does not match the expected format;
      * 3.0   - the captured text equals the reference exactly;
      * 1.5   - equal only after stripping surrounding whitespace;
      * 1.0   - numerically within 10% of the reference (ratio 0.9 to 1.1);
      * 0.5   - numerically within 20% of the reference (ratio 0.8 to 1.2);
      * -1.5  - anything else, including non-numeric text.

    Args:
        prompts: Unused; accepted because the trainer passes it.
        completions: One entry per completion; the text is read from
            ``completion[0]["content"]``.
        answer: Reference answers (strings), aligned with `completions`.
        **kwargs: Ignored.

    Returns:
        One score per (completion, reference) pair.
    """
    scores = []
    for completion, true_answer in zip(completions, answer):
        found = match_format.search(completion[0]["content"])
        if found is None:
            scores.append(0)
            continue
        guess = found.group(1)

        if guess == true_answer:
            score = 3.0
        elif guess.strip() == true_answer.strip():
            score = 1.5
        else:
            try:
                guess_value = float(guess)
                true_value = float(true_answer)
            except (TypeError, ValueError):
                # Non-numeric text. Only conversion errors are caught, so
                # KeyboardInterrupt and real bugs still propagate.
                score = -1.5
            else:
                if true_value == 0:
                    # The ratio is undefined for a reference of 0; a guess that
                    # is numerically 0 (e.g. "0.0") counts as within range
                    # instead of being penalised by a ZeroDivisionError.
                    score = 1.0 if guess_value == 0 else -1.5
                else:
                    ratio = guess_value / true_value
                    if   ratio >= 0.9 and ratio <= 1.1: score = 1.0
                    elif ratio >= 0.8 and ratio <= 1.2: score = 0.5
                    else: score = -1.5 # Penalize wrong answers
        scores.append(score)
    return scores

In [None]:
import re
# Debug-print throttle shared with check_numbers(): a counter of calls so far
# and how often (in calls) one sample is printed.
PRINTED_TIMES = 0
PRINT_EVERY_STEPS = 5

# After the opening solution tag, lazily skip ahead to the first run of
# digits, dots or commas and capture it as group 1.
match_numbers = re.compile(
    rf"{solution_start}.*?([\d\.\,]{{1,}})",
    flags = re.MULTILINE | re.DOTALL
)
def check_numbers(prompts, completions, answer, **kwargs):
    """Score the first number that follows the opening solution tag.

    Unlike a full-format check, this only needs `match_numbers` to find a run
    of digits, dots or commas after the opening solution tag. Per completion:
      * 1.5   - the number equals the reference numerically;
      * -0.5  - a number was parsed but differs from the reference;
      * 0     - no number found, or guess / reference cannot be parsed.

    Side effect: on every PRINT_EVERY_STEPS-th call (tracked with the
    module-level PRINTED_TIMES counter) the first question, reference,
    response and extracted guess are printed for inspection.

    Args:
        prompts: Chat prompts; the last message of the first prompt is printed
            as the question.
        completions: One entry per completion; the text is read from
            ``completion[0]["content"]``.
        answer: Reference answers (strings), aligned with `completions`.
        **kwargs: Ignored.

    Returns:
        One score per (completion, reference) pair.
    """
    global PRINTED_TIMES  # only the counter is rebound; the cadence is read-only

    question = prompts[0][-1]["content"]
    responses = [completion[0]["content"] for completion in completions]
    extracted_responses = [
        found.group(1) if (found := match_numbers.search(response)) is not None else None
        for response in responses
    ]

    # Print only every few steps
    if PRINTED_TIMES % PRINT_EVERY_STEPS == 0:
        print('*'*20, f"Question:\n{question}", f"\nAnswer:\n{answer[0]}", f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted_responses[0]}")
    PRINTED_TIMES += 1

    scores = []
    for guess, true_answer in zip(extracted_responses, answer):
        if guess is None:
            scores.append(0)
            continue
        try:
            true_value = float(true_answer.strip())
            # Remove commas like in 123,456
            guess_value = float(guess.strip().replace(",", ""))
        except (AttributeError, TypeError, ValueError):
            # Unparsable guess (e.g. a lone ".") or reference: neutral score.
            # Only conversion errors are caught, so KeyboardInterrupt and
            # real bugs are no longer swallowed.
            scores.append(0)
            continue
        scores.append(1.5 if guess_value == true_value else -0.5)
    return scores

In [15]:
# Token budget: the prompt gets max_prompt_length tokens and the completion
# gets whatever is left of max_seq_length (2048 - 401 = 1647).
# NOTE(review): the IELTS prompts shown in this notebook's outputs look long;
# confirm they fit into 401 tokens, otherwise they are presumably truncated.
max_prompt_length = 400 + 1 # + 1 just in case!
# Re-declared with the same value as in the model-loading cell so that this
# cell does not depend on that variable still being defined.
max_seq_length=2048
from trl import GRPOConfig, GRPOTrainer
training_args = GRPOConfig(
    learning_rate = 5e-6,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "adamw_torch_fused",
    logging_steps = 1,
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4, # Already 4; lower to save time, raise for smoother training
    num_generations = 8, # Decrease if out of memory
    max_prompt_length = max_prompt_length,
    max_completion_length = max_seq_length - max_prompt_length,
    # num_train_epochs = 1, # Set to 1 for a full training run
    max_steps = 2000,
    save_steps = 250,
    max_grad_norm = 0.1,
    report_to = "none", # Can use Weights & Biases
    # NOTE(review): machine-specific absolute path; adjust before running elsewhere.
    output_dir = "/home/ducanh/nvidia-llm-pipeline/unslot/ft_qwen/qrpo/outputs",
)
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    # The four reward functions defined in the cells above; each appears as its
    # own "rewards / <name>" column in the training log.
    reward_funcs = [
        match_format_exactly,
        match_format_approximately,
        check_answer,
        check_numbers,
    ],
    args = training_args,
    # Plain Python list of {"prompt", "answer", "question"} dicts produced by
    # generate_conversation().
    train_dataset = my_reasoning_conversations,
)
trainer.train()

******************** Question:
Evaluate the following answer(s) for IELTS speaking test for the rubric "Grammatical Range and Accuracy".

Items to be evaluated:
IELTS speaking test part: Part 3
Topic: Culinary Traditions and Identity
Questions:
How do culinary traditions contribute to cultural identity?
What role does food play in bringing people together during celebrations?
Can culinary traditions evolve while still preserving their essence?
How has globalization affected local culinary traditions around the world?

Answers:
Culinary traditions contribute to cultural identity by reflecting the unique ingredients, cooking methods, and flavors that are passed down through generations, helping people feel connected to their heritage.
Food plays a central role in bringing people together during celebrations as sharing meals fosters a sense of community, strengthens relationships, and creates joyful experiences.
Yes, culinary traditions can evolve by incorporating new influences or adapti

Step,Training Loss,reward,reward_std,completion_length,kl,rewards / match_format_exactly,rewards / match_format_approximately,rewards / check_answer,rewards / check_numbers
1,0.0,3.125,0.727011,766.8125,0.0,2.25,1.25,-0.40625,0.03125
2,-0.0,2.140625,0.275649,733.90625,0.0,2.25,1.25,-1.0,-0.359375
3,0.0,3.34375,0.273453,732.4375,0.000446,2.25,1.25,-0.296875,0.140625
4,0.0,2.765625,0.52017,773.65625,0.000494,2.25,1.25,-0.375,-0.359375
5,0.0,4.484375,1.257046,660.5625,0.000427,3.0,1.90625,-0.3125,-0.109375
6,0.0,4.34375,0.653566,536.25,0.000492,3.0,2.0,-0.3125,-0.34375
7,0.0,4.296875,0.787711,563.8125,0.000492,3.0,2.0,-0.296875,-0.40625
8,0.0,3.453125,0.858363,524.21875,0.000504,3.0,2.0,-1.171875,-0.375
9,0.0,4.140625,0.954251,521.625,0.00054,3.0,2.0,-0.625,-0.234375
10,0.0,3.734375,1.028268,556.40625,0.000484,3.0,2.0,-1.078125,-0.1875


******************** Question:
Evaluate the following answer(s) for IELTS speaking test for the rubric "Grammatical Range and Accuracy".

Items to be evaluated:
IELTS speaking test part: Part 3
Topic: Aging Population
Questions:
What are some challenges faced by an aging population?
How can societies better support their elderly citizens?
Do you think the government should invest more in programs for the elderly? Why or why not?
What role do families play in caring for older relatives?

Answers:
Old people have many problem like health and no work.
Society can help old with hospital and food and home.
Government should give money for old people because they need help.
Family is important for old people. They take care and love. 
Answer:
1 
Response:
<think>
Okay, let me evaluate the student's answers for the "Grammatical Range and Accuracy" rubric in the IELTS Speaking test. The topic is the aging population, and the student has answered four questions. 

First, I'll look at each answe

Traceback (most recent call last):
  File "/home/ducanh/data/miniconda3/envs/unsloth/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/tmp/ipykernel_26156/3917720072.py", line 35, in <module>
    trainer.train()
  File "/home/ducanh/data/miniconda3/envs/unsloth/lib/python3.10/site-packages/transformers/trainer.py", line 2245, in train
    return inner_training_loop(
  File "<string>", line 314, in _fast_inner_training_loop
  File "<string>", line 25, in _unsloth_training_step
  File "/home/ducanh/nvidia-llm-pipeline/unslot/ft_qwen/qrpo/unsloth_compiled_cache/UnslothGRPOTrainer.py", line 972, in _prepare_inputs
    prompt_completion_ids = unwrapped_model.generate(
  File "/home/ducanh/data/miniconda3/envs/unsloth/lib/python3.10/site-packages/unsloth/models/rl.py", line 69, in generate_with_clone
    out = original_generate(*args, **kwargs)
  File "/home/ducanh/data/miniconda3/envs/uns

<a name="Inference"></a>
### Inference
Let's run the model via Unsloth native inference! According to the `Qwen-3` team, the recommended settings for reasoning inference are `temperature = 0.6, top_p = 0.95, top_k = 20`

For normal chat based inference, `temperature = 0.7, top_p = 0.8, top_k = 20`

In [None]:
# Smoke test in non-thinking mode with a generic math prompt.
# NOTE(review): no system message is sent here, unlike the training prompts
# built by generate_conversation(); this cell does not exercise the IELTS
# scoring format.
messages = [
    {"role" : "user", "content" : "Solve (x + 2)^2 = 0."}
]
# Render the chat into one prompt string (tokenize = False returns text, not ids).
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True, # Must add for generation
    enable_thinking = False, # Disable thinking
)

from transformers import TextStreamer
# The streamer prints tokens as they are generated (prompt skipped); the return
# value is assigned to `_` so the cell does not also display it.
# The inputs are moved to "cuda", so a GPU is required.
_ = model.generate(
    **tokenizer(text, return_tensors = "pt").to("cuda"),
    max_new_tokens = 256, # Increase for longer outputs!
    temperature = 0.7, top_p = 0.8, top_k = 20, # For non thinking
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)

In [None]:
messages = [
    # Send the same system prompt that every training prompt carries (see
    # generate_conversation()); without it the model is never told which tags
    # to wrap its comment and its 0-9 score in.
    {"role" : "system", "content" : system_prompt},
    {"role" : "user", "content" : """
Evaluate the following answer(s) for IELTS speaking test for the rubric "Grammatical Range and Accuracy". Items to be evaluated: IELTS speaking test part: Part 3 Topic: Cultural Identity Questions: What things make a person's cultural identity? You mention some of the factors that can determine the cultural identity of a person. Which of these do you think the most important? Why does some culture not survive? Some people say some cultures will die because of globalization. What do you say? Topic: Citizenship Questions: What things make a good citizen these days? Is being a good citizen the same in every country? Why? Why might some people wish to be citizens of other country? ``` Answers for evaluating Grammatical Range and Accuracy: I suppose the country where he or she is from because it's different culture he was born and raised in different cultures different literature different music different language and different traditions that's why I suppose all the people are not the same because they have different point of views at different problems and different issues because they are born and raised in different conditions. Traditions, traditions, language, maybe in any languages you even cannot find some words which are, those are in your own language. That's why they, the people who speak, speak those language, that language, they can even don't know about the problems and words like that. Because they weren't adapted to the conditions which were changed. I suppose, I don't know, maybe they just simply assimilated into other cultures and it was easier to assimilate than to support their own culture. I suppose, yes. I can give the example that my grandmother, she's Komi, but she and her husband, who is also Komi, didn't teach their children their language and their traditions because they don't, they didn't think it's useful. 
Good citizen, I don't know, you can, uh, you have to, uh, just, um, I forgot the word, uh, you have to, um, not to bother, I don't remember, not to bother other people, uh, you have to, um, behave yourself like you, uh, want other people behave yourself with you, that's the only rule, I suppose, not to, um, make any harm or... Is being a good citizen the same in every country? I suppose yes. Because the only... to respect laws, to respect other people, respect... I remember the word... is the only rule to live... I don't know... to be the good citizen, to be the good man or woman or... Maybe they like, they like other countries' rules more or other countries' conditions, I don't know, people in other countries. Because many people in some countries don't realize that they should behave themselves well     """}
]
# Render the chat into one prompt string (tokenize = False returns text, not ids).
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True, # Must add for generation
    enable_thinking = True, # Enable thinking (reasoning mode)
)

from transformers import TextStreamer
# Stream the generated tokens (prompt skipped); the return value is assigned
# to `_` so the cell does not also display it. The inputs are moved to "cuda",
# so a GPU is required.
_ = model.generate(
    **tokenizer(text, return_tensors = "pt").to("cuda"),
    max_new_tokens = 1024, # Increase for longer outputs!
    temperature = 0.6, top_p = 0.95, top_k = 20, # For thinking
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [16]:
# Single place that decides where the LoRA adapters and the tokenizer files
# are written, so the two save calls cannot drift apart.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
LORA_SAVE_DIR = "/home/ducanh/nvidia-llm-pipeline/unslot/saves/lora_Qwen14b_qrpo"
model.save_pretrained(LORA_SAVE_DIR)  # Local saving
tokenizer.save_pretrained(LORA_SAVE_DIR)
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

('/home/ducanh/nvidia-llm-pipeline/unslot/saves/lora_Qwen14b_qrpo/tokenizer_config.json',
 '/home/ducanh/nvidia-llm-pipeline/unslot/saves/lora_Qwen14b_qrpo/special_tokens_map.json',
 '/home/ducanh/nvidia-llm-pipeline/unslot/saves/lora_Qwen14b_qrpo/vocab.json',
 '/home/ducanh/nvidia-llm-pipeline/unslot/saves/lora_Qwen14b_qrpo/merges.txt',
 '/home/ducanh/nvidia-llm-pipeline/unslot/saves/lora_Qwen14b_qrpo/added_tokens.json',
 '/home/ducanh/nvidia-llm-pipeline/unslot/saves/lora_Qwen14b_qrpo/tokenizer.json')

Now if you want to load the LoRA adapters we just saved for inference, change `if False` to `if True` in the cell below:

In [None]:
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        # Directory the save_pretrained() calls in the saving cell wrote to.
        # The template placeholder "lora_model" pointed at a folder this
        # notebook never creates.
        model_name = "/home/ducanh/nvidia-llm-pipeline/unslot/saves/lora_Qwen14b_qrpo",
        max_seq_length = 2048,
        load_in_4bit = True,
    )

### Saving to float16 for VLLM

We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.