In [None]:
import unsloth
import torch
from unsloth import FastLanguageModel
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import AutoTokenizer

# Load the finetuned model
def load_model_tokenizer_ft(base_model_id="Qwen/Qwen2.5-3B-Instruct", adapter_path="grpo_lora"):
    """Load the 4-bit base model, attach the saved LoRA adapter, and load the tokenizer.

    Args:
        base_model_id: Model name handed to ``FastLanguageModel.from_pretrained``.
        adapter_path: Location of the saved LoRA adapter. The tokenizer is
            loaded from this same location.

    Returns:
        A ``(model, tokenizer)`` tuple. The model has been put in eval mode.
    """
    # LoRA configuration. Per the original notebook comments these values
    # mirror the ones used during training -- NOTE(review): confirm against
    # the training script before changing any of them.
    lora_settings = {
        "r": 16,
        "target_modules": [
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        "lora_alpha": 32,
        "use_gradient_checkpointing": "unsloth",
        "random_state": 3407,
    }

    # The tokenizer returned with the base model is discarded; the one saved
    # next to the adapter is used instead (loaded at the end).
    quantized_base, _unused_tokenizer = FastLanguageModel.from_pretrained(
        model_name=base_model_id,
        max_seq_length=1024,
        load_in_4bit=True,
    )

    # Wrap the base model with LoRA layers, then load the saved weights into
    # the adapter named "default".
    peft_model = FastLanguageModel.get_peft_model(quantized_base, **lora_settings)
    peft_model.load_adapter(adapter_path, adapter_name="default")
    peft_model.eval()

    return peft_model, AutoTokenizer.from_pretrained(adapter_path)

# Instantiate the fine-tuned model and tokenizer once; later cells use these globals.
model, tokenizer = load_model_tokenizer_ft()

## 1. Prompt routing with embedding model
When a question is given, it is important to tell the model which knowledge it needs to use. Since the model is equipped with math and medical knowledge, there are three choices:
- math
- medical
- others

We will use a sentence-embedding model to determine which type of system prompt (math, medical, others) should be applied to the given question.

In [None]:
# Sentence-embedding model exposed through LangChain's HuggingFaceEmbeddings wrapper.
# Embeddings are explicitly L2-normalised so that the plain dot product used for
# prompt routing is a cosine similarity, independent of the checkpoint's defaults.
embedding = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en",
    encode_kwargs={"normalize_embeddings": True},
)

In [None]:
import numpy as np

# Routing categories; their order matches the order of `prompt_templates` below.
prompt_names = ["math", "medical", "others"]

# Closing sentence shared verbatim by every system prompt.
_REASONING_INSTRUCTION = (
    "You first think through the reasoning process step-by-step in your mind and then provide the user with the answer."
)

prompt_template_math = "You are an expert in solving math problems. " + _REASONING_INSTRUCTION

# NOTE(review): this prompt contains typos ("You a", "diagonstics"). They are kept
# byte-for-byte in case the adapter was trained with this exact text -- confirm
# before correcting them.
prompt_template_medical = (
    "You a medical expert with advanced knowledge in clinical reasoning, diagonstics, and treatment planning. "
    + _REASONING_INSTRUCTION
)

# The f-string interpolates the Python list repr of every theme except the last one.
prompt_template_others = (
    f"You are great at answering all questions not from the following themes: {prompt_names[:-1]}. "
    + _REASONING_INSTRUCTION
)

prompt_templates = [prompt_template_math, prompt_template_medical, prompt_template_others]

# One embedding per template, converted to an array so questions can be scored
# against all templates with a single matrix product.
prompt_embeddings = np.asarray(embedding.embed_documents(prompt_templates))
print("Embedding dimension: ", prompt_embeddings.shape[1])


In [2]:

def select_prompt(question):
    """Pick the system prompt whose embedding scores highest against the question.

    The question is embedded with the module-level ``embedding`` model and
    scored against every row of ``prompt_embeddings`` with a dot product.

    Args:
        question: The user question to route.

    Returns:
        A ``(system_prompt, temperature)`` tuple. The temperature is 0.1 when
        template 0 (math) wins, 0.4 when template 1 (medical) wins, and 0.7
        for any other template.
    """
    # Lower temperatures for the math and medical templates (more deterministic).
    temperature_by_index = {0: 0.1, 1: 0.4}
    default_temperature = 0.7

    query_vector = np.asarray(embedding.embed_documents([question]))
    similarity = query_vector @ prompt_embeddings.T
    # A single question gives one row of scores; drop that axis before argmax.
    best_idx = np.argmax(similarity.squeeze(0))

    temperature = temperature_by_index.get(int(best_idx), default_temperature)
    return prompt_templates[best_idx], temperature


In [None]:
question = (
    "Weng earns $12 an hour for babysitting. "
    "Yesterday, she just did 50 minutes of babysitting. "
    "How much did she earn?"
)
# Route once and reuse both results, so the question is not embedded twice.
selected_prompt, selected_temperature = select_prompt(question)
print("Selected prompt:\n", selected_prompt)
print("Temperature: ", selected_temperature)

In [None]:
from transformers import TextStreamer

def generate(question, max_new_tokens=1024):
    """Answer a question using the system prompt chosen by embedding-based routing.

    ``select_prompt`` supplies the system prompt and sampling temperature. The
    assistant turn is pre-filled with an opening ``<think>`` tag and the model
    continues from there; tokens are streamed to stdout while generating.

    Args:
        question: The user question.
        max_new_tokens: Upper bound on generated tokens (default 1024).

    Returns:
        The decoded text produced by the model, excluding the prompt and the
        pre-filled assistant prefix.
    """
    system_prompt, temperature = select_prompt(question)
    user_prompt = (
        f"Solve the following question:\n {question}\n"
        "Show your reasoning in <think> </think> tags. And return the final answer in <answer> </answer> tags.\n"
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
        {"role": "assistant", "content": "Let me solve this step by step.\n<think>"},
    ]
    # Render the chat template as text; the final assistant message is left
    # open so the model continues it instead of starting a new turn.
    prompt_text = tokenizer.apply_chat_template(messages, tokenize=False, continue_final_message=True)
    model_inputs = tokenizer([prompt_text], return_tensors="pt").to(model.device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=max_new_tokens,
        # Temperature only has an effect when sampling; enable it explicitly
        # rather than relying on the checkpoint's generation config.
        do_sample=True,
        temperature=temperature,
        streamer=TextStreamer(tokenizer, skip_special_tokens=True),
    )
    # Keep only the tokens generated after the prompt.
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]


In [None]:
# Math word problem; the generated answer is streamed to stdout by `generate`.
question = (
    "Baldur gets water from a well. "
    "He gets 5 pails of water every morning and 6 pails of water every afternoon. "
    "If each pail contains 5 liters of water, how many liters of water does he get every day?"
)
response = generate(question)

In [None]:
# Medical question example.
question = (
    "Which type of thyroid cancer is commonly associated with conditions "
    "such as pancreatitis, pituitary tumor, and pheochromocytoma?"
)
response = generate(question)

In [None]:
# General-knowledge question (neither math nor medical).
question = "What is the capital of Vietnam? Answer it short"
response = generate(question)

## 2. Prompt routing with LLM

In [None]:

def select_prompt_llm(model, tokenizer, question):
    """Ask the language model to label a question as math, medical, or others.

    A few-shot system prompt and the question are rendered with the tokenizer's
    chat template, then at most two new tokens are generated without sampling.
    The generated tokens are also streamed to stdout through ``TextStreamer``.

    Args:
        model: Object exposing ``device`` and ``generate``.
        tokenizer: Tokenizer exposing ``apply_chat_template``, ``batch_decode``
            and ``eos_token_id``; it is also called directly to encode text.
        question: Text to classify.

    Returns:
        The decoded continuation with surrounding whitespace stripped. Nothing
        in this function checks that it is one of the three expected labels.
    """
    # Few-shot instructions, used as the system message.
    few_shot_instructions = """\
    You are a text classifier.  Given an input text, respond with exactly one word: math, medical, or others.  Output only that word—no extra text.

    Examples:
    Text: What is the capital of Vietnam?
    Category: others

    Text: What are symptoms of Covid?
    Category: medical

    Text: How to solve a quadratic equation?
    Category: math"""

    classification_request = f"Now classify this:\nText: {question}\nCategory: "
    chat = [
        {"role": "system", "content": few_shot_instructions},
        {"role": "user", "content": classification_request},
    ]

    # Render the chat as text, then encode it on the model's device.
    # NOTE(review): continue_final_message=True is applied to a *user* message
    # here, so the model continues the text after "Category: " -- confirm this
    # is intended rather than add_generation_prompt=True.
    rendered_chat = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        continue_final_message=True,
    )
    encoded = tokenizer([rendered_chat], return_tensors="pt").to(model.device)

    # Greedy decoding, capped at two new tokens.
    decoding_options = {
        "max_new_tokens": 2,
        "eos_token_id": tokenizer.eos_token_id,
        "temperature": 0.0,
        "do_sample": False,
        "streamer": TextStreamer(tokenizer, skip_special_tokens=True),
    }
    sequences = model.generate(**encoded, **decoding_options)

    # Drop the prompt tokens so only the newly generated ones are decoded.
    completions = []
    for prompt_ids, sequence in zip(encoded.input_ids, sequences):
        completions.append(sequence[len(prompt_ids):])

    return tokenizer.batch_decode(completions, skip_special_tokens=True)[0].strip()


In [None]:
# Quick check of the LLM classifier; the predicted label is also streamed to stdout.
question = "What is the pythagoras theorem?"
response = select_prompt_llm(model, tokenizer, question)

In [None]:
# Sampling temperature for each label the LLM classifier may return.
prompt_type_temperature = {
    "math": 0.1,
    "medical": 0.4,
    "others": 0.7,
}
# System prompt for each label the LLM classifier may return.
prompt_type_system = {
    "math": prompt_template_math,
    "medical": prompt_template_medical,
    "others": prompt_template_others,
}


def generate_llm_routing(question, max_new_tokens=512):
    """Answer a question using the system prompt chosen by the LLM classifier.

    ``select_prompt_llm`` labels the question. The label is used to look up the
    system prompt in ``prompt_type_system`` and the sampling temperature in
    ``prompt_type_temperature``. The assistant turn is pre-filled with an
    opening ``<think>`` tag; tokens are streamed to stdout while generating.

    Args:
        question: The user question.
        max_new_tokens: Upper bound on generated tokens (default 512).

    Returns:
        The decoded text produced by the model, excluding the prompt and the
        pre-filled assistant prefix.
    """
    print("Determining the type of question ...")
    raw_label = select_prompt_llm(model, tokenizer, question)

    # The classifier output is free text capped at two tokens, so it may differ
    # from the dictionary keys in case or punctuation, or be something else
    # entirely. Normalise it and fall back to "others" rather than raising
    # KeyError on an unexpected label.
    question_type = str(raw_label).strip().lower().strip(" .,:;!?\"'`")
    if question_type not in prompt_type_system or question_type not in prompt_type_temperature:
        print(f"Unrecognized question type {raw_label!r}; falling back to 'others'.")
        question_type = "others"

    system_prompt = prompt_type_system[question_type]
    temperature = prompt_type_temperature[question_type]
    user_prompt = (
        f"Solve the following question:\n {question}\n"
        "Show your reasoning in <think> </think> tags. And return the final answer in <answer> </answer> tags.\n"
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
        {"role": "assistant", "content": "Let me solve this step by step.\n<think>"},
    ]
    # Render the chat template as text; the final assistant message is left
    # open so the model continues it instead of starting a new turn.
    prompt_text = tokenizer.apply_chat_template(messages, tokenize=False, continue_final_message=True)
    model_inputs = tokenizer([prompt_text], return_tensors="pt").to(model.device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=max_new_tokens,
        # Temperature only has an effect when sampling; enable it explicitly
        # rather than relying on the checkpoint's generation config.
        do_sample=True,
        temperature=temperature,
        streamer=TextStreamer(tokenizer, skip_special_tokens=True),
    )
    # Keep only the tokens generated after the prompt.
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]


In [None]:
# Medical question answered through the LLM-based routing path.
question = "How to treat Covid?"
response = generate_llm_routing(question)