In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Define the system prompt that instructs the model to output its reasoning and final answer.
SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

In [2]:
base_model_name = "Qwen/Qwen2-1.5B-Instruct"
peft_model_path = "Neuranest/Qwen2-1.5B-Instruct-Math"

# Load the tokenizer from the base model.
tokenizer = AutoTokenizer.from_pretrained(peft_model_path)
tokenizer.padding_side = "left"
# Ensure that a pad token is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the base model.
base_model = AutoModelForCausalLM.from_pretrained(base_model_name, trust_remote_code=True,)

# Load the fine-tuned model using PEFT.
model =PeftModel.from_pretrained(base_model, peft_model_path, revision ='73da8570fbaa7503cf80c5ceb4ba9fce8e4f7c0b')

# Set the model to evaluation mode and move it to the appropriate device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


adapter_model.safetensors:   0%|          | 0.00/23.9M [00:00<?, ?B/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(151936, 1536)
        (layers): ModuleList(
          (0-27): 28 x Qwen2DecoderLayer(
            (self_attn): Qwen2Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=1536, out_features=1536, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1536, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=1536, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear(
                (ba

In [3]:
prompt = tokenizer.apply_chat_template(
    [
        {'role': 'system', 'content': SYSTEM_PROMPT},
        {'role': 'user', 'content': '''I have 5 apples and you have 2. how many total apples'''}
    ],
    tokenize=False,             # Return a raw string rather than token IDs.
    add_generation_prompt=True   # Append a cue indicating that generation should start.
)

# Tokenize the prompt for the model.
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True,)
inputs = {k: v.to(device) for k, v in inputs.items()}

# Generate a response from the model.
with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=200,   # Limit the generated response length.
        do_sample=True,       # Enable sampling for diverse output.
        temperature=0.3,      # Controls randomness (lower is more deterministic).
        top_p=0.95             # Controls diversity (higher allows more diversity).
    )

# Decode the generated tokens into a human-readable string.
response = tokenizer.decode(outputs[0], skip_special_tokens=False)


print("\nGenerated Response:")
print(response)


Generated Response:
<|im_start|>system

Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
<|im_end|>
<|im_start|>user
I have 5 apples and you have 2. how many total apples<|im_end|>
<|im_start|>assistant
Reasoning: To determine the total number of apples, we add the number of apples you have (2) to the number of apples I have (5).
Answer: 7
</answer><|im_end|>


In [4]:
from datasets import load_dataset

In [5]:

def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()
def get_gsm8k_questions(split = "train") :
    data = load_dataset('openai/gsm8k', 'main')[split] # type: ignore
    data = data.map(lambda x: { # type: ignore
        'prompt':tokenizer.apply_chat_template(
            [
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': x['question']}
        ], tokenize = False, add_generation_prompt = True),

        'answer': extract_hash_answer(x['answer'])
    }) # type: ignore
    return data # type: ignore

dataset = get_gsm8k_questions()

In [8]:
# prompt = tokenizer.apply_chat_template(
#     [
#         {'role': 'system', 'content': SYSTEM_PROMPT},
#         {'role': 'user', 'content': dataset[]['question']}
#     ],
#     tokenize=False,             # Return a raw string rather than token IDs.
#     add_generation_prompt=True   # Append a cue indicating that generation should start.
# )

# Tokenize the prompt for the model.
i=14
inputs = tokenizer(dataset[i]['prompt'], return_tensors="pt", padding=True, truncation=True,)
inputs = {k: v.to(device) for k, v in inputs.items()}

# Generate a response from the model.
with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        use_cache=True,
        max_new_tokens=200,   # Limit the generated response length.
        do_sample=True,       # Enable sampling for diverse output.
        temperature=0.7,      # Controls randomness (lower is more deterministic).
        top_p=0.95             # Controls diversity (higher allows more diversity).
    )

# Decode the generated tokens into a human-readable string.

response = tokenizer.decode(outputs[0], skip_special_tokens=False)


print("\nGenerated Response:")
print(response)
print("real answer ", dataset[i]['answer'])


Generated Response:
<|im_start|>system

Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
<|im_end|>
<|im_start|>user
Joy can read 8 pages of a book in 20 minutes. How many hours will it take her to read 120 pages?<|im_end|>
<|im_start|>assistant
<reasoning>Joy's reading rate is determined by the number of pages she reads per minute, which is 8/20 = 0.4 pages per minute. To find out how long it takes her to read 120 pages at this rate, we need to divide the total number of pages by her reading rate: 120 / 0.4 = 300 minutes. Since there are 60 minutes in an hour, we convert this time into hours: 300 / 60 = 5 hours.</reasoning>
<answer>It will take Joy 5 hours to read 120 pages.</answer><|im_end|>
real answer  5
