In [1]:
%%capture
!pip install datasets trl peft bitsandbytes==0.45.0 
!pip install --upgrade pip
!pip install unsloth unsloth_zoo --no-cache-dir --upgrade
!pip install vllm
!pip install --upgrade pillow

In [2]:
import unsloth, torch
from transformers import AutoTokenizer, TextStreamer
from unsloth import FastLanguageModel

def load_model_tokenizer_ft(base_model_id="Qwen/Qwen2.5-3B-Instruct", adapter_path="grpo_lora"):
    # Load the base model
    base_model, _ = FastLanguageModel.from_pretrained(
        model_name=base_model_id,
        max_seq_length=1024,         # Same as during training
        load_in_4bit=True,           # Load in 4-bit for memory efficiency
        fast_inference=True,         # Enable fast inference
)

    # Apply LoRA weights (need to be the same as before)
    model = FastLanguageModel.get_peft_model(
        base_model,
        r=16,  # Same rank as during training
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        lora_alpha=32,  # Same alpha as during training
        use_gradient_checkpointing="unsloth",  # Enable gradient checkpointing if needed
        random_state=3407,  # Same random state as during training
    )
    # Load the saved LoRA weights
    model.load_adapter(adapter_path,adapter_name="default")
    # Set the model to evaluation mode
    model.eval()

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(adapter_path)
    return base_model, model, tokenizer

model_base, model_ft, tokenizer =  load_model_tokenizer_ft()


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-07 08:37:31 __init__.py:207] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.19: Fast Qwen2 patching. Transformers: 4.49.0. vLLM: 0.7.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.168 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit with actual GPU utilization = 49.49%
Unsloth: Your GPU has CUDA compute capability 8.9 with VRAM = 22.17 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 1024. Num Sequences = 224.
Unsloth: vLLM's KV Cache can use up to 8.55 GB. Al



INFO 04-07 08:37:45 weight_utils.py:254] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-07 08:37:46 model_runner.py:1115] Loading model weights took 2.2160 GB
INFO 04-07 08:37:46 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 04-07 08:37:49 worker.py:267] Memory profiling takes 2.26 seconds
INFO 04-07 08:37:49 worker.py:267] the current vLLM instance can use total_gpu_memory (22.17GiB) x gpu_memory_utilization (0.49) = 10.97GiB
INFO 04-07 08:37:49 worker.py:267] model weights take 2.22GiB; non_torch_memory takes 0.04GiB; PyTorch activation peak memory takes 1.23GiB; the rest of the memory reserved for KV Cache is 7.48GiB.
INFO 04-07 08:37:49 executor_base.py:111] # cuda blocks: 13623, # CPU blocks: 10922
INFO 04-07 08:37:49 executor_base.py:116] Maximum concurrency for 1024 tokens per request: 212.86x
INFO 04-07 08:37:55 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error

Capturing CUDA graph shapes: 100%|██████████| 31/31 [00:26<00:00,  1.19it/s]

INFO 04-07 08:38:21 model_runner.py:1562] Graph capturing finished in 26 secs, took 4.41 GiB
INFO 04-07 08:38:21 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 34.42 seconds



Unsloth 2025.3.19 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


In [4]:
import re
import math


# extract tool call from the response
def extract_tool_call(text):
  import io
  from contextlib import redirect_stdout
  # pattern: tool_code + white spaces + character + white spaces
  pattern=r"```tool_code\s*(.*?)\s*```"
  matches=re.search(pattern, text, re.DOTALL)
  if matches:
      tool_code=matches.group(1).strip()
      # capture stdout in a string buffer: to capture the output generated by the function eval(tool_code), e.g. function contains print statement
      f=io.StringIO()
      # eval executes the extracted code as if it were a Python expression
      with redirect_stdout(f):
          result=eval(tool_code)
      output=f.getvalue()
      r=result if output== '' else output
      return f'```tool_output\n{str(r).strip()}\n```'
  return None


def convert(amount: float, currency: str, new_currency: str) -> float:
  return amount * 0.9


def get_exchange_rate(currency: str, new_currency: str) -> float:
    return 1.2


def bezout(a,b):
    # swap a and b if necessary
    swap=False # indicate if we need to swap and b
    if a<b:
        swap=True
        a,b=b,a

    # keep track the list of remainders, coefficients a[i] and b[i], quotients q[i]
    remainders=[a,b]   # store remainders r[0], r[1], r[2], ...
    coeff_a=[1,0]      # store coefficients a[0], a[1], a[2], ...
    coeff_b=[0,1]      # store coefficients b[0], b[1], b[2], ...
    quotients=list()   # store quotients q[0], q[1], q[2], ...

    while b>0:
        # continously divide a by b and update them
        q=a//b
        a,b=b,a-b*q

        # update the lists
        remainders.append(b)
        quotients.append(q)
        coeff_a.append(coeff_a[-2]-q*coeff_a[-1])
        coeff_b.append(coeff_b[-2]-q*coeff_b[-1])

    if swap:
        return coeff_b[-2], coeff_a[-2]
    else:
        return coeff_a[-2], coeff_b[-2]

def solve(a,b,m):
    d=math.gcd(a,m)

    # no solution if b is not divisible by d
    if b%d!=0:
        print("No solution!")

    # divide a,b,m by d and solve the resulting equation
    else:
        a,b,m=a//d, b//d, m//d
        a_inverse=bezout(a,m)[0]
        x=a_inverse*b % m
        print(f"Solution : x = {x} (mod {m})")

instruction_prompt_with_function_calling = '''
At each turn, if you decide to invoke any of the functions, you must wrap the function call with triple backticks and the label "tool_code". You are given several Python functions that you can use, and you are only allowed to call these functions. The generated code should be readable, efficient, and match exactly the provided function signature.

When a function call is made, its output will be wrapped in triple backticks with the label "tool_output". Use that output to guide any further tool calls or to generate a friendly response.

Your task is to determine which of the following functions to call based solely on the user's request. If the request refers to computing the Bezout coefficients, then your answer must be exactly a function call to `bezout` with the appropriate parameters, wrapped as shown below.

For example, if the user asks: "What are bezout coefficients of 12 and 21?" then you must respond with:
```tool_code
bezout(a=12, b=21)
```

The available Python methods are:

```python
def convert(amount: float, currency: str, new_currency: str) -> float:
    """Convert the currency with the latest exchange rate

    Args:
      amount: The amount of currency to convert
      currency: The currency to convert from
      new_currency: The currency to convert to
    """

def get_exchange_rate(currency: str, new_currency: str) -> float:
    """Get the latest exchange rate for the currency pair

    Args:
      currency: The currency to convert from
      new_currency: The currency to convert to
    """

def bezout(a:int , b: int) -> List[int]:
    """Compute the Bezout coefficients of a and b

    Args:
      a: The first number
      b: The second number
    """

def solve(a,b,m):
    """Solve the linear congruence equation ax=b (mod m)

    Args:
      a: The coefficient of x in the equation
      b: The right-hand side of the equation
      m: The modulus of the equation
    """
```

User: {user_message}
'''

## 1. Base model

In [11]:
class Agent_Base:
    def __init__(self, system_message="", stream_output=True):
        self.messages = []
        self.stream_output = stream_output
        if system_message:
            self.messages = [{"role": "system", "content": system_message}]

    def __call__(self, message, stream=None):
        # Use parameter stream if provided, otherwise use instance default
        should_stream = self.stream_output if stream is None else stream
        
        # user message
        self.messages.append({"role": "user", "content": message})
        response = self.execute(stream=should_stream)

        # Add the assistant's response to the message history
        self.messages.append({"role": "assistant", "content": response})
        return response

    def execute(self, stream=True):
        # apply chat template 
        text = tokenizer.apply_chat_template(
            self.messages,
            tokenize=False,
            add_generation_prompt=True
        )
        model_inputs = tokenizer(text, return_tensors="pt").to("cuda")

        # Only use TextStreamer if streaming is enabled
        streamer = TextStreamer(tokenizer, skip_special_tokens=True) if stream else None
        
        generated_ids = model_base.generate(
            **model_inputs,
            # temperature=0.1,  # Control randomness (lower = more deterministic)
            # top_p=0.9,   
            streamer=streamer
        )
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return response

def query(question):
    # initialize agent
    bot = Agent_Base()
    # get instruction prompt
    prompt = instruction_prompt_with_function_calling.format(user_message=question)
    # get tool call - stream this first interaction
    response = bot(prompt, stream=True)         
    # get output of tool function
    call_response = extract_tool_call(response)
    if call_response is None:
        return None
    print("Tool call:\n", call_response)

    # don't stream the second interaction
    answer = bot(call_response, stream=False)
    return answer


# question = "What are bezout coefficients of 12 and 21?"
# answer = query(question)
# print(answer)

In [13]:
question = "Solve the linear congruence equation 12x=1 (mod 27)"
answer = query(question)
print(answer)

system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.
user

At each turn, if you decide to invoke any of the functions, you must wrap the function call with triple backticks and the label "tool_code". You are given several Python functions that you can use, and you are only allowed to call these functions. The generated code should be readable, efficient, and match exactly the provided function signature.

When a function call is made, its output will be wrapped in triple backticks with the label "tool_output". Use that output to guide any further tool calls or to generate a friendly response.

Your task is to determine which of the following functions to call based solely on the user's request. If the request refers to computing the Bezout coefficients, then your answer must be exactly a function call to `bezout` with the appropriate parameters, wrapped as shown below.

For example, if the user asks: "What are bezout coefficients of 12 and 21?" then you must resp

## 2. Fine-tuned model

In [7]:
class Agent_ft:
    def __init__(self, system_message="", stream_output=True):
        self.messages = []
        self.stream_output = stream_output
        if system_message:
            self.messages = [{"role": "system", "content": system_message}]

    def __call__(self, message, stream=None):
        # Use parameter stream if provided, otherwise use instance default
        should_stream = self.stream_output if stream is None else stream
        
        # user message
        self.messages.append({"role": "user", "content": message})
        response = self.execute(stream=should_stream)

        # Add the assistant's response to the message history
        self.messages.append({"role": "assistant", "content": response})
        return response

    def execute(self, stream=True):
        # apply chat template 
        text = tokenizer.apply_chat_template(
            self.messages,
            tokenize=False,
            add_generation_prompt=True
        )
        model_inputs = tokenizer(text, return_tensors="pt").to("cuda")

        # Only use TextStreamer if streaming is enabled
        streamer = TextStreamer(tokenizer, skip_special_tokens=True) if stream else None
        
        generated_ids = model_ft.generate(
            **model_inputs,
            # temperature=0.1,  # Control randomness (lower = more deterministic)
            # top_p=0.9,   
            streamer=streamer
        )
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return response

def query_ft(question):
    # initialize agent
    bot = Agent_ft()
    # get instruction prompt
    prompt = instruction_prompt_with_function_calling.format(user_message=question)
    # get tool call - stream this first interaction
    response = bot(prompt, stream=True)         
    # get output of tool function
    call_response = extract_tool_call(response)
    if call_response is None:
        return None
    print("Tool call:\n", call_response)

    # don't stream the second interaction
    answer = bot(call_response, stream=False)
    return answer


question = "What are bezout coefficients of 12 and 21?"
answer = query_ft(question)
print(answer)

system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.
user

At each turn, if you decide to invoke any of the functions, you must wrap the function call with triple backticks and the label "tool_code". You are given several Python functions that you can use, and you are only allowed to call these functions. The generated code should be readable, efficient, and match exactly the provided function signature.

When a function call is made, its output will be wrapped in triple backticks with the label "tool_output". Use that output to guide any further tool calls or to generate a friendly response.

Your task is to determine which of the following functions to call based solely on the user's request. If the request refers to computing the Bezout coefficients, then your answer must be exactly a function call to `bezout` with the appropriate parameters, wrapped as shown below.

For example, if the user asks: "What are bezout coefficients of 12 and 21?" then you must resp

In [14]:
question = "Solve the linear congruence equation 36x=3 (mod 27)"
answer = query_ft(question)
print(answer)

system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.
user

At each turn, if you decide to invoke any of the functions, you must wrap the function call with triple backticks and the label "tool_code". You are given several Python functions that you can use, and you are only allowed to call these functions. The generated code should be readable, efficient, and match exactly the provided function signature.

When a function call is made, its output will be wrapped in triple backticks with the label "tool_output". Use that output to guide any further tool calls or to generate a friendly response.

Your task is to determine which of the following functions to call based solely on the user's request. If the request refers to computing the Bezout coefficients, then your answer must be exactly a function call to `bezout` with the appropriate parameters, wrapped as shown below.

For example, if the user asks: "What are bezout coefficients of 12 and 21?" then you must resp

## 3. Finetuned model with chain of thought

In [17]:
instruction_prompt = '''
At each turn, if you decide to invoke any of the functions, you must wrap the function call with triple backticks and the label "tool_code". You are given several Python functions that you can use, and you are only allowed to call these functions. 

When a function call is made, its output will be wrapped in triple backticks with the label "tool_output". Use that output to guide any further tool calls or to generate a friendly response.

Your task is to determine which of the following functions to call based solely on the user's request. If the request is not relevant to any function, simply answer it without invoking any function call. 

For example, if the user asks: "What are bezout coefficients of 12 and 21?" then you must respond with:
```tool_code
bezout(a=12, b=21)
```

The available Python methods are:

```python
def convert(amount: float, currency: str, new_currency: str) -> float:
    """Convert the currency with the latest exchange rate

    Args:
      amount: The amount of currency to convert
      currency: The currency to convert from
      new_currency: The currency to convert to
    """

def get_exchange_rate(currency: str, new_currency: str) -> float:
    """Get the latest exchange rate for the currency pair

    Args:
      currency: The currency to convert from
      new_currency: The currency to convert to
    """

def bezout(a:int , b: int) -> List[int]:
    """Compute the Bezout coefficients of a and b

    Args:
      a: The first number
      b: The second number
    """

def solve(a,b,m):
    """Solve the linear congruence equation ax=b (mod m)

    Args:
      a: The coefficient of x in the equation
      b: The right-hand side of the equation
      m: The modulus of the equation
    """
```

User: {user_message}
'''

In [24]:
instruction_prompt = '''
You are a specialized assistant provided with a set of Python functions to perform specific computational tasks. Your task is to decide whether to answer the user's request with a plain text response or to call one of the provided functions.

When a function call is made, its output will be wrapped in triple backticks with the label "tool_output". Use that output to guide any further tool calls or to generate a friendly response.

Guidelines:
1. Only call a function if and only if the user's request clearly maps to one of the functions below.
2. In cases of ambiguity, always choose to answer in plain text rather than risk an unnecessary function call.
3. When you decide to call a function, format your call with triple backticks labeled "tool_code" (see example below).

Example: If the user asks: "What are bezout coefficients of 12 and 21?" then you must respond with:
```tool_code
bezout(a=12, b=21)
```

The available Python methods are:

```python
def convert(amount: float, currency: str, new_currency: str) -> float:
    """Convert the currency with the latest exchange rate

    Args:
      amount: The amount of currency to convert
      currency: The currency to convert from
      new_currency: The currency to convert to
    """

def get_exchange_rate(currency: str, new_currency: str) -> float:
    """Get the latest exchange rate for the currency pair

    Args:
      currency: The currency to convert from
      new_currency: The currency to convert to
    """

def bezout(a:int , b: int) -> List[int]:
    """Compute the Bezout coefficients of a and b

    Args:
      a: The first number
      b: The second number
    """

def solve(a,b,m):
    """Solve the linear congruence equation ax=b (mod m)

    Args:
      a: The coefficient of x in the equation
      b: The right-hand side of the equation
      m: The modulus of the equation
    """
```

User: {user_message}
'''
class Agent_ft2:
    def __init__(self, system_message="", stream_output=True):
        self.messages = []
        self.stream_output = stream_output
        if system_message:
            self.messages = [{"role": "system", "content": system_message}]

    def __call__(self, message, stream=None):
        # Use parameter stream if provided, otherwise use instance default
        should_stream = self.stream_output if stream is None else stream
        
        # user message
        self.messages.append({"role": "user", "content": message})
        response = self.execute(stream=should_stream)

        # Add the assistant's response to the message history
        self.messages.append({"role": "assistant", "content": response})
        return response

    def execute(self, stream=True):
        # apply chat template 
        text = tokenizer.apply_chat_template(
            self.messages,
            tokenize=False,
            add_generation_prompt=True
        )
        model_inputs = tokenizer(text, return_tensors="pt").to("cuda")

        # Only use TextStreamer if streaming is enabled
        streamer = TextStreamer(tokenizer, skip_special_tokens=True) if stream else None
        
        generated_ids = model_ft.generate(
            **model_inputs,
            # temperature=0.1,  # Control randomness (lower = more deterministic)
            # top_p=0.9,   
            streamer=streamer
        )
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return response

def query_ft2(question):
    # initialize agent
    bot = Agent_ft()
    # get instruction prompt
    prompt = instruction_prompt.format(user_message=question)
    # get tool call - stream this first interaction
    response = bot(prompt, stream=True)         
    
    # get output of tool function
    call_response = extract_tool_call(response)

    if call_response is None:
        return 
    print(call_response)

    # don't stream the second interaction
    # call_response is added to messages with "user" role
    answer = bot(call_response, stream=False)
    return answer


question = "Solve the linear congruence equation 12x=10 (mod 34)"
answer = query_ft2(question)
print(answer)

system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.
user

You are a specialized assistant provided with a set of Python functions to perform specific computational tasks. Your task is to decide whether to answer the user's request with a plain text response or to call one of the provided functions.

When a function call is made, its output will be wrapped in triple backticks with the label "tool_output". Use that output to guide any further tool calls or to generate a friendly response.

Guidelines:
1. Only call a function if and only if the user's request clearly maps to one of the functions below.
2. In cases of ambiguity, always choose to answer in plain text rather than risk an unnecessary function call.
3. When you decide to call a function, format your call with triple backticks labeled "tool_code" (see example below).

Example: If the user asks: "What are bezout coefficients of 12 and 21?" then you must respond with:
```tool_code
bezout(a=12, b=21)
```

T

```tool_code
solve(a=12, b=10, m=34)
```
```tool_output
Solution : x = 15 (mod 17)
```
The solution to the linear congruence equation \(12x \equiv 10 \pmod{34}\) is \(x \equiv 15 \pmod{17}\).
