In [1]:
%%capture
!pip install datasets trl peft bitsandbytes==0.45.0 
!pip install --upgrade pip
!pip install unsloth unsloth_zoo --no-cache-dir --upgrade
!pip install vllm
!pip install --upgrade pillow

## 1. Load the trained model

In [2]:
import unsloth
import torch
from unsloth import FastLanguageModel
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import AutoTokenizer

# Load the finetuned model
def load_model_tokenizer_ft(base_model_id="Qwen/Qwen2.5-3B-Instruct", adapter_path="grpo_lora"):
    # Load the base model
    base_model, _ = FastLanguageModel.from_pretrained(
        model_name=base_model_id,
        max_seq_length=1024,         # Same as during training
        load_in_4bit=True,           # Load in 4-bit for memory efficiency
    )

    # Apply LoRA weights (need to be the same as before)
    model = FastLanguageModel.get_peft_model(
        base_model,
        r=16,  # Same rank as during training
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        lora_alpha=32,  # Same alpha as during training
        use_gradient_checkpointing="unsloth",  # Enable gradient checkpointing if needed
        random_state=3407,  # Same random state as during training
    )
    # Load the saved LoRA weights
    model.load_adapter(adapter_path,adapter_name="default")
    
    # Set the model to evaluation mode
    model.eval()

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(adapter_path)
    return model, tokenizer

model,tokenizer=  load_model_tokenizer_ft()

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-24 09:44:17 __init__.py:207] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.19: Fast Qwen2 patching. Transformers: 4.49.0. vLLM: 0.7.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.168 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.3.19 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


## 2. Prompt routing with embedding model
When a question is given, it is important to tell the model which knowledge it needs to use. Since the model is equiped with math and medical knowledge, there will be 3 choices
- math
- medical
- others

We will use word embedding model (from HuggingFace) to determine which type of system_prompt (math, medical, others) should be applied on the given question.

In [3]:
# Use SentenceTransformer as a LangChain-compatible embedding function
embedding = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")

  embedding = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")


In [16]:
import numpy as np

prompt_names=["math", "medical", "others"]

prompt_template_math=(
    "You are an expert in solving math problems. "
    "You first think through the reasoning process step-by-step in your mind and then provide the user with the answer."
)

prompt_template_medical=(
    "You are a medical expert with advanced knowledge in clinical reasoning, diagonstics, and treatment planning. "
    "You first think through the reasoning process step-by-step in your mind and then provide the user with the answer."
)

prompt_template_others = (
    f"You are great at answering all questions not from the following themes: {prompt_names[:-1]}. "
    "You first think through the reasoning process step-by-step in your mind and then provide the user with the answer."
)

prompt_templates = [
    prompt_template_math,
    prompt_template_medical,
    prompt_template_others,
]

prompt_embeddings = embedding.embed_documents(prompt_templates)
prompt_embeddings = np.asarray(prompt_embeddings)

print("Embedding dimension: ",len(prompt_embeddings[0]))


NameError: name 'embedding' is not defined

In [6]:

def select_prompt(question):
    """ Select prompt based on question's content: use embedding model to compute the distance between question and prompt templates
        Return:
          (1) prompt template and
          (2) temperature for generation
              Lower temperature for math response (more deterministic)
    """
    question_embedding=embedding.embed_documents([question])
    question_embedding=np.asarray(question_embedding)

    scores=question_embedding@prompt_embeddings.T
    selected_prompt_idx=scores.squeeze(0).argmax()
    # math index is 0
    if selected_prompt_idx==0: # lower temperature 0.1 for math prompt
        return prompt_templates[selected_prompt_idx], 0.1
    # little higher temperature 0.4 for medical prompt
    elif selected_prompt_idx==1:
        return prompt_templates[selected_prompt_idx], 0.4
    # default temperature 0.7 for the rest
    else:
        return prompt_templates[selected_prompt_idx],0.7


In [7]:
question="Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?"
print("Selected prompt:\n", select_prompt(question)[0])
print("Temperature: ", select_prompt(question)[1])

Selected prompt:
 You are an expert in solving math problems. You first think through the reasoning process step-by-step in your mind and then provide the user with the answer.
Temperature:  0.1


In [8]:
from transformers import TextStreamer

def generate(question):
    # select the type of prompt and temperature
    system_prompt, temperature=select_prompt(question)
    user_prompt=(
        f"Solve the following question:\n{question}\n"
        "Show your reasoning in <think> </think> tags. And return the final answer in <answer> </answer> tags.\n"
    )
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
        {"role": "assistant", "content": "Let me solve this step by step.\n<think>"}]
    # chat tempate
    inputs=tokenizer.apply_chat_template(messages, tokenize=False, continue_final_message=True)
    # tokenize
    model_inputs=tokenizer([inputs],return_tensors="pt").to(model.device)
    # generate
    generated_ids=model.generate(
        **model_inputs,
        max_new_tokens=512,           # Adjust for longer responses
        temperature=temperature,      # lower for more deterministic
        streamer = TextStreamer(tokenizer, skip_special_tokens=True)
    )
    # only record the part generated by the model
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    # decode to text
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response


In [9]:
# math question
question="Baldur gets water from a well. He gets 5 pails of water every morning and 6 pails of water every afternoon. If each pail contains 5 liters of water, how many liters of water does he get every day?"
response=generate(question)

system
You are an expert in solving math problems. You first think through the reasoning process step-by-step in your mind and then provide the user with the answer.
user
Solve the following question:
Baldur gets water from a well. He gets 5 pails of water every morning and 6 pails of water every afternoon. If each pail contains 5 liters of water, how many liters of water does he get every day?
Show your reasoning in <think> </think> tags. And return the final answer in <answer> </answer> tags.

assistant
Let me solve this step by 

step.
<think> First, we need to calculate the total number of pails Baldur gets in a day. He gets 5 pails in the morning and 6 pails in the afternoon, so that's 5 + 6 = 11 pails per day. Each pail contains 5 liters of water, so to find out the total liters of water he gets, we multiply the total number of pails by the liters per pail: 11 * 5 = 55 liters.</think>
<answer>55</answer>


In [10]:
# medical question
question="Which type of thyroid cancer is commonly associated with conditions such as pancreatitis, pituitary tumor, and pheochromocytoma?"
response=generate(question)

system
You are a medical expert with advanced knowledge in clinical reasoning, diagonstics, and treatment planning. You first think through the reasoning process step-by-step in your mind and then provide the user with the answer.
user
Solve the following question:
Which type of thyroid cancer is commonly associated with conditions such as pancreatitis, pituitary tumor, and pheochromocytoma?
Show your reasoning in <think> </think> tags. And return the final answer in <answer> </answer> tags.

assistant
Let me solve this step by step.
<think> First, 

I need to consider the symptoms and conditions mentioned: pancreatitis, pituitary tumor, and pheochromocytoma. These conditions can be related to certain types of tumors that may arise from the parathyroid or adrenal glands. However, the question specifically asks about thyroid cancer. Among the common types of thyroid cancer (apart from medullary thyroid cancer which is not typically associated with these other conditions), the one that might fit this profile is anaplastic thyroid cancer. Anaplastic thyroid cancer is a rare but aggressive form of thyroid cancer that can occur at any age. It's known to have a strong association with other endocrine tumors, particularly parathyroid adenomas, pituitary tumors, and pheochromocarcinomas. These tumors often coexist with anaplastic thyroid cancer. </think>
<answer> Anaplastic thyroid cancer </answer>


In [11]:
# other types
question="What is the capital of Vietnam? Answer it short"
response=generate(question)

system
You are great at answering all questions not from the following themes: ['math', 'medical']. You first think through the reasoning process step-by-step in your mind and then provide the user with the answer.
user
Solve the following question:
What is the capital of Vietnam? Answer it short
Show your reasoning in <think> </think> tags. And return the final answer in <answer> </answer> tags.

assistant
Let me solve this step by step.
<think> The question asks 

for the capital of Vietnam. I know that Vietnam is a country in Southeast Asia, and like many countries, it has a capital city. To find the capital, I can recall geographical information or search if necessary, but for this question, I'll use my knowledge base. The capital of Vietnam is Hanoi. This information is widely known and consistently correct across various sources.</think>
<answer>Hanoi</answer>


## 3. Prompt routing with LLM

In [12]:
from transformers import TextStreamer

def select_prompt_llm(model, tokenizer, question):
    # system_message as few-shot prompt
    system_message = """\
    You are a text classifier.  Given an input text, respond with exactly one word: math, medical, or others.  Output only that word—no extra text.

    Examples:
    Text: What is the capital of Vietnam?
    Category: others

    Text: What are symptoms of Covid?
    Category: medical

    Text: How to solve a quadratic equation?
    Category: math"""

    user_message=(
        "Now classify this:\n"
        f"Text: {question}\nCategory: "
    )
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user",   "content": user_message}
    ]

    # apply chat template and tokenize
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    # Generate at most 2 tokens
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=2,
        temperature=0.0,
        do_sample=False,
        streamer=TextStreamer(tokenizer,skip_special_tokens=True)
    )
    # only record the part generated by the model
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, generated_ids)
    ]
    # decode to text
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    return response

In [13]:
question="Explain Pythagoras theorem?"
response=select_prompt_llm(model,tokenizer,question)

system
    You are a text classifier.  Given an input text, respond with exactly one word: math, medical, or others.  Output only that word—no extra text.

    Examples:
    Text: What is the capital of Vietnam?
    Category: others

    Text: What are symptoms of Covid?
    Category: medical

    Text: How to solve a quadratic equation?
    Category: math
user
Now classify this:
Text: Explain Pythagoras theorem?
Category: 
assistant
math


In [14]:
question="What is LLMs?"
response=select_prompt_llm(model,tokenizer,question)

system
    You are a text classifier.  Given an input text, respond with exactly one word: math, medical, or others.  Output only that word—no extra text.

    Examples:
    Text: What is the capital of Vietnam?
    Category: others

    Text: What are symptoms of Covid?
    Category: medical

    Text: How to solve a quadratic equation?
    Category: math
user
Now classify this:
Text: What is LLMs?
Category: 
assistant
others


In [17]:
prompt_type_temperature={"math": 0.1, "medical": 0.4, "others": 0.7}
prompt_type_system={"math": prompt_template_math, "medical": prompt_template_medical, "others": prompt_template_others}


def generate_llm_routing(question):
    question_type=select_prompt_llm(model,tokenizer,question)

    # get system prompt and temperature
    system_prompt=prompt_type_system[question_type]
    temperature=prompt_type_temperature[question_type]
    user_prompt=(
        f"Solve the following question:\n {question}\n"
        "Show your reasoning in <think> </think> tags. And return the final answer in <answer> </answer> tags.\n"
    )
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
        {"role": "assistant", "content": "Let me solve this step by step.\n<think>"}]
    # chat tempate
    inputs=tokenizer.apply_chat_template(messages, tokenize=False, continue_final_message=True)
    # tokenize
    model_inputs=tokenizer([inputs],return_tensors="pt").to(model.device)
    # generate
    generated_ids=model.generate(
        **model_inputs,
        max_new_tokens=512,           # Adjust for longer responses
        temperature=temperature,      # lower for more deterministic
        streamer = TextStreamer(tokenizer, skip_special_tokens=True)
    )
    # only record the part generated by the model
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    # decode to text
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response


In [18]:
question="What is LLMs?"
generate_llm_routing(question)

system
    You are a text classifier.  Given an input text, respond with exactly one word: math, medical, or others.  Output only that word—no extra text.

    Examples:
    Text: What is the capital of Vietnam?
    Category: others

    Text: What are symptoms of Covid?
    Category: medical

    Text: How to solve a quadratic equation?
    Category: math
user
Now classify this:
Text: What is LLMs?
Category: 
assistant
others
system
You are great at answering all questions not from the following themes: ['math', 'medical']. You first think through the reasoning process step-by-step in your mind and then provide the user with the answer.
user
Solve the following question:
 What is LLMs?
Show your reasoning in <think> </think> tags. And return the final answer in <answer> </answer> tags.

assistant
Let me solve this step by step.
<think> The question "What is LLMs?" is quite broad and could have multiple interpretations depending on the context. However, it's likely referring to Large L

' The question "What is LLMs?" is quite broad and could have multiple interpretations depending on the context. However, it\'s likely referring to Large Language Models, which are a type of artificial intelligence models used in natural language processing. These models are trained on vast amounts of text data to understand and generate human-like language. However, without additional context, it\'s hard to give a more specific definition. </think>\n<answer> Large Language Models (LLMs) are a type of artificial intelligence model that are trained on large datasets of text to understand and generate human-like language. They are widely used in natural language processing tasks such as text generation, translation, and summarization. </answer>'

In [20]:
# math question
question="Baldur gets water from a well. He gets 5 pails of water every morning and 6 pails of water every afternoon. If each pail contains 5 liters of water, how many liters of water does he get every day?"
response=generate_llm_routing(question)

system
    You are a text classifier.  Given an input text, respond with exactly one word: math, medical, or others.  Output only that word—no extra text.

    Examples:
    Text: What is the capital of Vietnam?
    Category: others

    Text: What are symptoms of Covid?
    Category: medical

    Text: How to solve a quadratic equation?
    Category: math
user
Now classify this:
Text: Baldur gets water from a well. He gets 5 pails of water every morning and 6 pails of water every afternoon. If each pail contains 5 liters of water, how many liters of water does he get every day?
Category: 
assistant
math
system
You are an expert in solving math problems. You first think through the reasoning process step-by-step in your mind and then provide the user with the answer.
user
Solve the following question:
 Baldur gets water from a well. He gets 5 pails of water every morning and 6 pails of water every afternoon. If each pail contains 5 liters of water, how many liters of water does he get 

In [21]:
# medicine question
question="How to treat Covid?"
response=generate_llm_routing(question)

system
    You are a text classifier.  Given an input text, respond with exactly one word: math, medical, or others.  Output only that word—no extra text.

    Examples:
    Text: What is the capital of Vietnam?
    Category: others

    Text: What are symptoms of Covid?
    Category: medical

    Text: How to solve a quadratic equation?
    Category: math
user
Now classify this:
Text: How to treat Covid?
Category: 
assistant
medical
system
You are a medical expert with advanced knowledge in clinical reasoning, diagonstics, and treatment planning. You first think through the reasoning process step-by-step in your mind and then provide the user with the answer.
user
Solve the following question:
 How to treat Covid?
Show your reasoning in <think> </think> tags. And return the final answer in <answer> </answer> tags.

assistant
Let me solve this step by step.
<think> First, it's important to understand that treating COVID-19 involves a combination of supportive care and specific treatmen