### Unsloth

In [1]:
from unsloth import FastLanguageModel
import torch
# Catalogue of Unsloth Hugging Face repo names, kept for reference only:
# nothing in the visible cells reads `fourbit_models`.
fourbit_models = [
    "unsloth/Qwen3-1.7B-unsloth-bnb-4bit", # Qwen3 family (1.7B to 32B)
    "unsloth/Qwen3-4B-unsloth-bnb-4bit",
    "unsloth/Qwen3-8B-unsloth-bnb-4bit",
    "unsloth/Qwen3-14B-unsloth-bnb-4bit",
    "unsloth/Qwen3-32B-unsloth-bnb-4bit",
    

    # 4bit dynamic quants for superior accuracy and low memory use
    # NOTE(review): the entries below without a "-bnb-4bit" suffix do not look like
    # pre-quantized repos, despite the list name; confirm before relying on them.
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    "unsloth/Phi-4",
    "unsloth/Llama-3.1-8B",
    "unsloth/Llama-3.2-3B",
    "unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit" # [NEW] We support TTS models!
] # More models at https://huggingface.co/unsloth

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 07-08 11:34:16 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 07-08 11:34:16 [__init__.py:239] Automatically detected platform cuda.


2025-07-08 11:34:19,300	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


<a name="Inference"></a>
### Inference
Let's run the model via Unsloth native inference! According to the `Qwen-3` team, the recommended settings for reasoning inference are `temperature = 0.6, top_p = 0.95, top_k = 20`.

For normal chat-based inference, use `temperature = 0.7, top_p = 0.8, top_k = 20`. Note that the `infer` helper below uses `temperature = 1.0, top_p = 0.7, top_k = 20`, which matches neither preset.

In [2]:
def infer(model, tokenizer, messages, mlo=500, temperature=1.0, top_p=0.7, top_k=20,
          enable_thinking=True, device="cuda"):
    """Render a chat with the tokenizer's template, generate, and return the decoded text.

    Args:
        model: Object exposing ``generate(**inputs, max_new_tokens=..., ...)``.
        tokenizer: Object exposing ``apply_chat_template``, ``__call__`` and ``decode``.
        messages: Conversation handed to ``apply_chat_template``.
        mlo: Maximum number of new tokens to generate.
        temperature: Sampling temperature forwarded to ``generate``.
        top_p: Nucleus-sampling threshold forwarded to ``generate``.
        top_k: Top-k cutoff forwarded to ``generate``.
        enable_thinking: Forwarded to the chat template. ``True`` keeps the
            thinking block enabled; pass ``False`` to disable it.
        device: Device the tokenized inputs are moved to before generation.

    Returns:
        str: The first returned sequence decoded with ``skip_special_tokens=True``.
        Callers in this notebook split the result on ``</think>`` to keep only
        the final answer.

    The keyword defaults reproduce the values that used to be hard-coded here,
    so existing calls behave exactly as before.
    """
    text = tokenizer.apply_chat_template(
        messages,
        tokenize = False,
        add_generation_prompt = True, # Must add for generation
        enable_thinking = enable_thinking, # True keeps thinking ON
    )

    inputs = tokenizer(text, return_tensors = "pt").to(device)
    output_tokens = model.generate(
        **inputs,
        max_new_tokens = mlo, # Increase for longer outputs!
        temperature = temperature, top_p = top_p, top_k = top_k,
        # streamer = TextStreamer(tokenizer, skip_prompt = True),
    )
    output_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

    return output_text  # Return the generated text instead of token IDs

In [3]:
# Linear Regression
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
import pandas as pd
import json
import re
def get_linest(y_values_poly, x_values_poly, degree=1):
    """Fit a least-squares polynomial that predicts the y values from the x values.

    Note the argument order: targets first, inputs second.

    Args:
        y_values_poly: Target values, one per sample.
        x_values_poly: Input values, one scalar per sample.
        degree: Polynomial degree. Defaults to 1 (a straight line), which is
            the value that used to be hard-coded here.

    Returns:
        tuple: ``(coef_, intercept_)`` of the fitted ``LinearRegression``.
        ``coef_`` holds one coefficient per column produced by
        ``PolynomialFeatures(degree)``; ``get_mapping`` treats position ``k``
        as the coefficient of ``x**k``.
    """
    # sklearn wants a 2-D feature matrix: one row per sample, a single column.
    X_poly = np.array(x_values_poly).reshape(-1, 1)
    Y_poly = np.array(y_values_poly)

    # Expand x into polynomial feature columns up to `degree`.
    poly = PolynomialFeatures(degree=degree)
    X_poly_transformed = poly.fit_transform(X_poly)

    # Ordinary least squares on the expanded features.
    model_poly = LinearRegression()
    model_poly.fit(X_poly_transformed, Y_poly)

    return model_poly.coef_, model_poly.intercept_

def get_mapping(coefs, intercept, x_values):
    """Apply a fitted polynomial to each x value and clamp the rounded result to 1..9.

    Args:
        coefs: Coefficients where position ``k`` multiplies ``x**k``.
        intercept: Constant added to the polynomial sum before rounding.
        x_values: Values to map.

    Returns:
        list: One rounded value per input, limited to the range 1 to 9.
    """
    mapped = []
    for x_val in x_values:
        # Evaluate sum(coef_k * x**k) in coefficient order.
        poly_sum = sum([coef * x_val ** power for power, coef in enumerate(coefs)])
        rounded = round(intercept + poly_sum)
        # Clamp to the 1..9 band range.
        mapped.append(max(1, min(9, rounded)))
    return mapped

def read_jsonfile(file_path="/home/ducanh/data/nemoWS/datasets/hey2/ielts-marking/raw_input4unsloth_gra.json"):
    """Read a JSON Lines file (one JSON document per line) into a list.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file.

    Returns:
        list: The parsed object of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    all_samples = []
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. an extra newline at the end of the file) are not
                # JSON documents; json.loads would raise on them, so skip them.
                continue
            all_samples.append(json.loads(line))
    return all_samples

def test(model_path="/home/ducanh/nvidia-llm-pipeline/unslot/saves/lora_Qwen14b_qrpo", mlo=400, lora_rank=16, datafile="/home/ducanh/data/nemoWS/datasets/hey2/ielts-marking/writing/task1-test.json"):
    """Generate predictions for a JSONL test set, then score them against the labels.

    Steps:
      1. Load the model/tokenizer found at ``model_path``.
      2. For every sample in ``datafile`` generate an answer with ``infer`` and
         keep the text after the last ``</think>``.
      3. Write the predictions as JSON Lines to
         ``{model_path}/{basename(datafile)}.json``.
      4. Re-read that file, take the last number in each prediction as its
         score, fit a linear mapping from predicted to target scores, and
         report the share of predictions within one band of the target
         (raw and mapped) on stdout and in ``{model_path}/log.txt``.

    Predictions that contain no number cannot be scored. They are left out of
    the regression fit and counted as misses in both accuracy figures, instead
    of crashing the evaluation.

    NOTE: a later cell defines another ``test`` with the same signature; once
    that cell has run, it shadows this definition.

    Args:
        model_path: Directory of the saved model; also receives the output files.
        mlo: Maximum number of new tokens per generation.
        lora_rank: Passed as ``max_lora_rank`` when loading the model.
        datafile: JSONL file whose records carry ``input`` and ``output`` keys.

    Raises:
        ValueError: If no prediction contains a number to score.
    """
    import os  # local import, as in the original cell

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_path ,
        max_seq_length = 2048,   # Context length - can be longer, but uses more memory
        load_in_4bit = True, # 4bit uses much less memory
        load_in_8bit = False,    # A bit more accurate, uses 2x memory
        full_finetuning = False, # We have full finetuning now!
        max_lora_rank = lora_rank,
        gpu_memory_utilization = 0.6, # Reduce if out of memory
        # token = os.environ["HF_TOKEN"],  # only for gated models; never hardcode a token
    )
    filename = os.path.basename(datafile)
    pred_path = f"{model_path}/{filename }.json"
    gtests = read_jsonfile(datafile)
    predictions = []
    for i in gtests:
        messages = [{"role" : "user", "content" : i['input']}]
        output = i['output']
        # Keep only what follows the final </think> marker.
        predict = infer(model, tokenizer, messages, mlo=mlo).split("</think>")[-1].strip()
        print(predict)
        print('-------------------\n\n')
        predictions.append({"input":i['input'], "label":output, "pred":predict})
        print({"label":output, "pred":predict})
    with open(pred_path, "w", encoding="utf-8") as test_f:
        for sample in predictions:
            json.dump(sample, test_f)
            test_f.write("\n")

    def get_score(text):
        """
        Extracts the last numerical score from the input text.

        Parameters:
            text (str): The input text containing scores.

        Returns:
            float or None: The last score found, or None if not present.
        """
        # Finds all numbers including decimals
        matches = re.findall(r'\d+(?:\.\d+)?', text)
        print("-->",matches)
        if matches:
            return float(matches[-1])  # Return the last one
        return None

    def _within_one_band(target, value):
        """True when `value` is a score no more than one band away from `target`."""
        return value is not None and target-1 <= value <= target+1

    # The prediction file has 'input', 'label' and 'pred' fields per line.
    gra_data = read_jsonfile(pred_path)
    # Labels are either a number or a string whose last space-separated token is the score.
    gra_target = [float(gra_item['label'].split(" ")[-1] if isinstance(gra_item['label'], str) else gra_item['label']) for gra_item in gra_data]
    gra_preds = [get_score(gra_item['pred']) for gra_item in gra_data]

    # Fit the mapping only on samples whose prediction contained a score.
    scored = [(target, pred) for target, pred in zip(gra_target, gra_preds) if pred is not None]
    if not scored:
        raise ValueError(f"No prediction in {pred_path} contains a numeric score")
    gra_linest = get_linest([target for target, _ in scored], [pred for _, pred in scored])

    # Map once, then re-align with the full sample list (None where unscored).
    mapped_iter = iter(get_mapping(gra_linest[0], gra_linest[1], [pred for _, pred in scored]))
    gra_mapped = [None if pred is None else next(mapped_iter) for pred in gra_preds]

    gra_df = pd.DataFrame({
        "target": gra_target,
        "pred": gra_preds,
        "acc": [_within_one_band(item_target, item) for item_target, item in zip(gra_target, gra_preds)],
        "pred_mapped": gra_mapped,
        "acc_mapped": [_within_one_band(item_target, item) for item_target, item in zip(gra_target, gra_mapped)],
    })

    # Build each report once so stdout and log.txt cannot drift apart.
    mapping_report = f"Grammatical Range and Accuracy mapping coef and intercept:\nCoefficients: {gra_linest[0]}\nIntercept: {gra_linest[1]}"
    # Accuracy = accurate predictions / total predictions.
    accuracy_report = f"Grammatical Range and Accuracy rubric accuracy:\nUnmapped: {round(gra_df['acc'].sum()/gra_df['acc'].count()*100,2)}%\nMapped: {round(gra_df['acc_mapped'].sum()/gra_df['acc_mapped'].count()*100,2)}%"

    print(mapping_report)
    print()
    print(accuracy_report)
    print()

    with open(f"{model_path}/log.txt", "w") as file:
        file.write(mapping_report + "\n\n")
        file.write(accuracy_report + "\n\n")

In [5]:
def test(model_path="/home/ducanh/nvidia-llm-pipeline/unslot/saves/lora_Qwen14b_qrpo", mlo=400, lora_rank=16, datafile="/home/ducanh/data/nemoWS/datasets/hey2/ielts-marking/writing/task1-test.json"):
    """Generate a prediction for every sample of a JSONL test set and save them.

    The model at ``model_path`` is loaded, each record's ``input`` is sent to
    ``infer`` as a single user message, and the text after the last
    ``</think>`` is kept as the prediction. Results are written as JSON Lines
    to ``{model_path}/{basename(datafile)}.json``; a record's ``qid`` is
    carried over when present. No scoring happens here.

    Args:
        model_path: Directory of the saved model; also receives the output file.
        mlo: Maximum number of new tokens per generation.
        lora_rank: Passed as ``max_lora_rank`` when loading the model.
        datafile: JSONL file whose records carry ``input`` and ``output`` keys.
    """
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_path,
        max_seq_length=2048,          # context length; larger costs more memory
        load_in_4bit=True,            # 4-bit weights keep memory use low
        load_in_8bit=False,
        full_finetuning=False,
        max_lora_rank=lora_rank,
        gpu_memory_utilization=0.6,   # lower this on out-of-memory errors
        # token=os.environ["HF_TOKEN"],  # only for gated models; never hardcode a token
    )
    import os
    filename = os.path.basename(datafile)
    samples = read_jsonfile(datafile)

    records = []
    for idx, sample in enumerate(samples):
        print("==========", idx, "=======")
        chat = [{"role" : "user", "content" : sample['input']}]
        label = sample['output']
        generated = infer(model, tokenizer, chat, mlo=mlo)
        # Drop everything up to and including the last </think> marker.
        predict = generated.split("</think>")[-1].strip()
        print(predict)
        print('-------------------\n\n')
        record = {"input":sample['input'], "label":label, "pred":predict}
        if 'qid' in sample:
            record['qid'] = sample['qid']
        records.append(record)
        print({"label":label, "pred":predict})

    # One JSON document per line.
    with open(f"{model_path}/{filename }.json", "w", encoding="utf-8") as test_f:
        for record in records:
            test_f.write(json.dumps(record))
            test_f.write("\n")
# Run test() on each listed checkpoint; test() writes its predictions into the
# checkpoint directory itself.
# NOTE(review): mlo=4000 is larger than the max_seq_length = 2048 that test()
# passes to from_pretrained. Confirm that long generations behave as intended.
paths = [
    "/home/ducanh/nvidia-llm-pipeline/unslot/saves/lora_Qwen14b_simpleft_full16_writingT1_s2000",
]
for path in paths:
    print("\n\n")
    print("\t\t===="*2)
    print("load model from ", path)
    test(path, mlo=4000, datafile="/home/ducanh/nvidia-llm-pipeline/unslot/ft_qwen/simple/data/test-task1-4qwen.json")




		====		====
load model from  /home/ducanh/nvidia-llm-pipeline/unslot/saves/lora_Qwen14b_simpleft_full16_writingT1_s2000
==((====))==  Unsloth 2025.4.8: Fast Qwen3 patching. Transformers: 4.51.3. vLLM: 0.8.5.post1.
   \\   /|    NVIDIA A100-SXM4-80GB. Num GPUs = 1. Max memory: 79.252 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.47s/it]
Unsloth 2025.4.8 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.


{
        "mark": 1,
        "overal-feedback": "Task Achievement: The response completely fails to address the task. The prompt is about a bar chart showing educational attainment levels in the United States, but the student's response talks about money, the government paying for healthcare, and poverty. There's no relevant information provided about the chart or its trends. The response is off-topic.

Coherence & Cohesion: The response lacks any logical organization or clear progression of ideas. The sentences are loosely connected and jump from one topic to another without coherence. There's minimal use of cohesive devices, making it difficult to follow the argument. Paragraphing is rudimentary and doesn't structure the response effectively.

Lexical Resource: The vocabulary is very basic and limited. There's frequent repetition of simple words and phrases. The language used is inappropriate for an academic task, and there's a lack of precision in word choice. The response relies he

In [6]:
# Same driver as the previous cell, pointed at the writingT2 checkpoint and the
# test-task2 data file. Rebinds the notebook-global `paths`.
# NOTE(review): mlo=4000 is larger than the max_seq_length = 2048 used inside test().
paths = [
    "/home/ducanh/nvidia-llm-pipeline/unslot/saves/lora_Qwen14b_simpleft_full16_writingT2_s2000",
]
for path in paths:
    print("\n\n")
    print("\t\t===="*2)
    print("load model from ", path)
    test(path, mlo=4000, datafile="/home/ducanh/nvidia-llm-pipeline/unslot/ft_qwen/simple/data/test-task2-4qwen.json")




		====		====
load model from  /home/ducanh/nvidia-llm-pipeline/unslot/saves/lora_Qwen14b_simpleft_full16_writingT2_s2000


==((====))==  Unsloth 2025.4.8: Fast Qwen3 patching. Transformers: 4.51.3. vLLM: 0.8.5.post1.
   \\   /|    NVIDIA A100-SXM4-80GB. Num GPUs = 1. Max memory: 79.252 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 3/3 [00:06<00:00,  2.19s/it]


{
        "mark": 1,
        "overal-feedback": "Task Achievement: The response is completely unrelated to the task. It discusses home ownership and renting, which has no connection to the prompt about students' success and the role of teachers versus student attitude. The student expresses a strong preference for owning a home, believing it allows for more freedom and happiness. This does not address the prompt's core question of whether a student's success in education depends more on their teachers or their own attitude and effort.

Coherence & Cohesion: The response lacks logical organization and coherence. Ideas are presented in a disjointed manner, with minimal use of cohesive devices. The paragraphs are short and lack a clear progression of thought. The relationships between ideas are unclear, and there is no clear argumentative structure.

Lexical Resource: The response demonstrates a very limited vocabulary. Words are basic and repetitive, with frequent errors in word choice. 

In [None]:
# Checkpoints to evaluate on task2-test.json; only the uncommented entry is run.
# Which `test` is called depends on which definition cell was executed last.
paths = [
    # "/home/ducanh/nvidia-llm-pipeline/unslot/saves/lora_Qwen14b_simpleft_full16_gra",
    # "/home/ducanh/nvidia-llm-pipeline/unslot/ft_qwen/qrpo/outputs/lora_Qwen14b_qrpo_qkvo16/checkpoint-300",
    # "/home/ducanh/nvidia-llm-pipeline/unslot/ft_qwen/qrpo/outputs/lora_Qwen14b_qrpo_full16_ogra/checkpoint-150",
    "/home/ducanh/nvidia-llm-pipeline/unslot/saves/lora_Qwen14b_simpleft_full16_writingT2_s400",
    # "/home/ducanh/nvidia-llm-pipeline/unslot/saves/lora_Qwen14b_simpleft_full16_writingT1_s400",
    # "/home/ducanh/nvidia-llm-pipeline/unslot/saves/lora_Qwen14b_simpleft_full16_writingT1_s100",

]
for path in paths:
    print("\n\n")
    print("\t\t===="*2)
    print("load model from ", path)
    test(path, mlo=4000, datafile="/home/ducanh/data/nemoWS/datasets/hey2/ielts-marking/writing/task2-test.json")

In [None]:
# Prediction files (JSON Lines) to score in the loop further down this cell.
# Unlike the earlier cells, these entries are files, not checkpoint directories.
paths = [
    # "/home/ducanh/nvidia-llm-pipeline/unslot/ft_qwen/qrpo/outputs/lora_Qwen14b_qrpo_qkvo16/checkpoint-600/gap.json",
    # "/home/ducanh/nvidia-llm-pipeline/unslot/ft_qwen/qrpo/outputs/lora_Qwen14b_qrpo_qkvo16/checkpoint-550/gap.json",
    "/home/ducanh/nvidia-llm-pipeline/unslot/ft_qwen/qrpo/outputs/lora_Qwen14b_qrpo_full16_ogra/checkpoint-200/gap.json",
    "/home/ducanh/nvidia-llm-pipeline/unslot/saves/llamaFactory/gap_phi4.json",
         
]

def get_score(text:str):
    """Parse the number that follows the last ':' in a prediction string.

    Args:
        text: Prediction text, e.g. ``"Score: 6.5"``. Without a colon the whole
            string is parsed.

    Returns:
        float: The parsed score, or ``0`` when the text cannot be parsed. The
        offending value is printed so bad predictions stay visible.
    """
    try:
        return float(text.split(":")[-1].strip())
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: `text` is not a str; ValueError: the tail is
        # not a number. Anything else (e.g. KeyboardInterrupt) must propagate.
        print("Error:", text)
        return 0

# Score every prediction file in `paths`: fit a linear mapping from predicted to
# target scores and print the share of predictions within one band of the target,
# before and after applying that mapping.
for p in paths:
    print("\n===============")
    print(p)
    # Each line of the file is expected to carry 'label' and 'pred' fields.
    gra_data = read_jsonfile(p)
    # Labels are either a number or a string whose last space-separated token is the score.
    gra_target = [float(gra_item['label'].split(" ")[-1] if isinstance(gra_item['label'], str) else gra_item['label']) for gra_item in gra_data]
    gra_preds = [get_score(gra_item['pred']) for gra_item in gra_data]
    gra_linest = get_linest(gra_target, gra_preds)
    # Map once and reuse the result for both the column and its accuracy.
    gra_mapped = get_mapping(gra_linest[0], gra_linest[1], gra_preds)

    gra_df = pd.DataFrame({
        "target": gra_target,
        "pred": gra_preds,
        "acc":[item_target-1<=item<=item_target+1 for item_target,item in zip(gra_target,gra_preds)],
        "pred_mapped": gra_mapped,
        "acc_mapped": [item_target-1<=item<=item_target+1 for item_target,item in zip(gra_target,gra_mapped)],
    })

    # print(f"Grammatical Range and Accuracy mapping coef and intercept:\nCoefficients: {gra_linest[0]}\nIntercept: {gra_linest[1]}")
    print(f"Grammatical Range and Accuracy rubric accuracy:\nUnmapped: {round(gra_df['acc'].sum()/gra_df['acc'].count()*100,2)}%\nMapped: {round(gra_df['acc_mapped'].sum()/gra_df['acc_mapped'].count()*100,2)}%")
    print()

In [None]:
# Manual inspection helper: print each test input, then wait for Enter.
# NOTE(review): input() blocks on every sample, so this cell stalls an unattended
# "Run All".
gtests = read_jsonfile("/home/ducanh/nvidia-llm-pipeline/unslot/ft_qwen/test_gra.json")
for i in gtests:
    print(i['input'])
    print("\n\t\t=============\n")
    input()

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Hugging Face's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
# Save the model and tokenizer to a local directory.
# NOTE(review): in the visible cells `model` and `tokenizer` only exist as locals
# inside test(), so on a fresh kernel this cell would raise NameError. Confirm
# where they are expected to come from.
model.save_pretrained("/home/ducanh/nvidia-llm-pipeline/unslot/saves/lora_test_model")  # Local saving
tokenizer.save_pretrained("/home/ducanh/nvidia-llm-pipeline/unslot/saves/lora_test_model")
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

Now, if you want to load the LoRA adapters we just saved for inference, change `False` to `True` in the cell below:

In [None]:
# Disabled by default: change the condition to True to reload saved adapters.
# NOTE(review): "lora_model" does not match the directory used by the save cell
# above (".../saves/lora_test_model"); presumably it should point there. Confirm.
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = 2048,
        load_in_4bit = True,
    )