In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset

from eval import evaluator, Task
from eval.eval_configs import NumericConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch

# Pick the best available accelerator: CUDA first, then Apple-silicon MPS, else CPU.
# torch.backends.mps.is_available() is the long-standing documented probe for MPS;
# the torch.mps.is_available() shortcut is not present in older PyTorch releases
# and would raise AttributeError there.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
print(device)

mps


In [3]:
# Model and tokenizer come from the same Hub repo id, so name it once.
qwen_checkpoint = 'Qwen/Qwen3-0.6B'
model = AutoModelForCausalLM.from_pretrained(qwen_checkpoint, device_map='auto')
# The tokenizer is configured to pad on the left
# (presumably so batched prompts end right where generation starts — confirm against Task.evaluate).
tokenizer = AutoTokenizer.from_pretrained(qwen_checkpoint, padding_side='left')

In [4]:
# Prompt pieces: the user turn is the raw question, the reference is the raw answer.
system_prompt = "Solve the given math problem by thinking step by step."
user_template = "{question}"
assistant_template = "{answer}"

# Numeric scoring configured with tolerance=0.
eval_config = NumericConfig(tolerance=0)

# First 100 rows of the GSM8K 'main' test split.
dataset = load_dataset('openai/gsm8k', 'main', split='test[:100]')

# Arguments are passed positionally, in the same order as the original call.
gsm8k_task = Task(
    'gsm8k',
    dataset,
    user_template,
    assistant_template,
    system_prompt,
    eval_config,
)

Generating train split: 100%|██████████| 7473/7473 [00:00<00:00, 1274199.51 examples/s]
Generating test split: 100%|██████████| 1319/1319 [00:00<00:00, 919748.46 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 28439.82 examples/s]


In [5]:
# Run the GSM8K task on the loaded model and tokenizer (batch_size=8) with a progress bar.
result = gsm8k_task.evaluate(model, tokenizer, batch_size=8, progress=True)

100%|██████████| 13/13 [09:03<00:00, 41.84s/it]


In [6]:
# Display the evaluation summary as the cell's output.
# NOTE(review): the saved run reports accuracy=0.08 for Qwen3-0.6B, which looks low given
# that the first prediction ends in \boxed{18} and its reference ends in "#### 18" —
# worth confirming how the numeric answer is extracted from predictions and references.
result

EvaluationResult(NUMERICAL, accuracy=0.0800, answered_rate=1.0000, correct=8, answered=100, total=100, n=100)

In [7]:
# Print the first model prediction as plain text so its line breaks are rendered.
print(result.predictions[0])

To find out how much Janet makes every day at the farmers' market, we need to break down the problem into parts and calculate each component step by step.

---

### **Step 1: Total eggs laid per day**
Janet’s ducks lay **16 eggs per day**.

---

### **Step 2: Eggs eaten for breakfast**
She eats **3 eggs** for breakfast every morning.

So, **3 eggs** are eaten each day.

---

### **Step 3: Eggs used for baking muffins**
She bakes **4 eggs** for her friends every day.

So, **4 eggs** are used for baking muffins.

---

### **Step 4: Eggs left for the market**
The remaining eggs are:
$$
16 \text{ (laid per day)} - 3 \text{ (eaten for breakfast)} - 4 \text{ (baked for muffins)} = 9 \text{ eggs}
$$

---

### **Step 5: Selling eggs at the market**
She sells the remaining **9 eggs** at **$2 per egg**.

So, the total money made at the market is:
$$
9 \times 2 = 18 \text{ dollars}
$$

---

### ✅ **Final Answer:**
$$
\boxed{18} \text{ dollars}
$$


In [9]:
# Print the first reference answer, for comparison with the first prediction.
print(result.references[0])

Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.
She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.
#### 18


In [10]:
# Print the raw question text of the first dataset row.
print(dataset[0]['question'])

Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?


In [2]:
import os
from getpass import getpass

from huggingface_hub import login

# Log in to Hugging Face to access gated models.
# Create a token at https://huggingface.co/settings/tokens
# ('Read' permission is enough; 'Write' only if you plan to push models).
#
# SECURITY: never paste the token into the notebook source. The token that was
# previously hardcoded in this cell must be treated as leaked and revoked on the
# settings page above.
#
# The token is taken from the HF_TOKEN environment variable; if that is unset,
# a hidden interactive prompt is used so the secret never lands in the saved .ipynb.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
if not hf_token:
    raise ValueError("No Hugging Face token provided: set HF_TOKEN or enter it at the prompt.")
login(token=hf_token)

In [17]:
# Test with Non-Chat Model (Gemma 3 270M)
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from eval import Task
from eval.eval_configs import NumericConfig


def _load_causal_lm(name):
    """Load a causal LM and its tokenizer from the Hub repo ``name``.

    Returns a ``(model, tokenizer)`` tuple.

    The tokenizer pads on the left, matching the Qwen tokenizer loaded earlier in
    this notebook. The evaluation below generates in batches, and the Transformers
    generation guide recommends left padding for decoder-only models so that every
    prompt ends right where generation begins.

    ``trust_remote_code`` is deliberately not enabled: it allows code shipped in the
    Hub repo to run locally, and these checkpoints use architectures that ship with
    Transformers itself.
    """
    loaded_model = AutoModelForCausalLM.from_pretrained(name, device_map="auto")
    loaded_tokenizer = AutoTokenizer.from_pretrained(name, padding_side="left")
    return loaded_model, loaded_tokenizer


# Load model and tokenizer.
# NOTE: this rebinds the notebook-level `model` / `tokenizer` (previously the Qwen
# chat model), so every later cell that uses those names sees the Gemma base model.
model_name = "google/gemma-3-270m"
print(f"Loading {model_name}...")
try:
    model, tokenizer = _load_causal_lm(model_name)
except Exception as e:
    print(f"Failed to load {model_name}: {e}")
    # Fallback to a known small model if the requested one fails or doesn't exist in this environment
    print("Falling back to google/gemma-2-2b")
    model_name = "google/gemma-2-2b"
    model, tokenizer = _load_causal_lm(model_name)

# Load dataset (first 10 rows of the GSM8K 'main' test split)
dataset = load_dataset('openai/gsm8k', 'main', split='test[:10]')

# Define templates for base model (few-shot style or direct completion)
# Since it's a base model, we format the prompt as a completion task
user_template = "Question: {question}\nAnswer:"
assistant_template = "{answer}"
system_prompt = None  # Base models typically don't use system prompts in the same way

# Configure evaluation (NumericConfig defaults)
eval_config = NumericConfig()

# Create Task with is_chat_task=False
task = Task(
    task_name="gsm8k_base",
    dataset=dataset,
    user_template=user_template,
    assistant_template=assistant_template,
    system_prompt=system_prompt,
    eval_config=eval_config,
    is_chat_task=False
)

print(f"Created task: {task}")

Loading google/gemma-3-270m...
Created task: Task(task_name=gsm8k_base, num_samples=10)
Created task: Task(task_name=gsm8k_base, num_samples=10)


In [18]:
# Evaluate the base-model task (batch_size=4, max_new_tokens=128) with a progress bar.
print("Starting evaluation...")
result = task.evaluate(
    model,
    tokenizer,
    batch_size=4,
    max_new_tokens=128,
    progress=True,
)

# Summary of the whole run.
print("\nEvaluation Results:", result, sep="\n")

# First sample: the 'X' field of the task dataset, the model output, and the reference.
print(
    "\nSample Prediction:",
    f"Input: {task.dataset[0]['X']}",
    f"Prediction: {result.predictions[0]}",
    f"Reference: {result.references[0]}",
    sep="\n",
)

Starting evaluation...


100%|██████████| 3/3 [00:27<00:00,  9.30s/it]


Evaluation Results:
EvaluationResult(NUMERICAL, accuracy=0.1000, answered_rate=1.0000, correct=1, answered=10, total=10, n=10)

Sample Prediction:
Input: Question: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?
Answer:
Prediction: $ 3.66

Solution:

In the case of a rectangle, she would need two equal parts of chicken meat, and there is a very low ratio of one to two.

Let d be the number of eggs needed to make a rectangle, with each part of chicken meat having a length of d/2, and a width of d. Then, the length of the chicken meat is d.

From the problem, we have two choices to answer d: 1) 16 eggs, or d + 4

or 15 eggs, or 4d.

Since, the chickens were
Reference: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.
She makes 9 * 2 = $<<9*2=18>>18 eve




In [None]:
# Inspect one sample by position; change idx to look at a different row.
idx = 1
print(
    "\nSample Prediction:",
    f"Input: {task.dataset[idx]['X']}",
    f"Prediction: {result.predictions[idx]}",
    f"Reference: {result.references[idx]}",
    sep="\n",
)

{'question': 'A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bolts in total does it take?',
 'answer': 'It takes 2/2=<<2/2=1>>1 bolt of white fiber\nSo the total amount of fabric is 2+1=<<2+1=3>>3 bolts of fabric\n#### 3'}

In [12]:
# Preview only the first few predictions; dumping the whole list floods the cell
# output and bloats the saved notebook.
result.predictions[:3]


['$ 0.16.\n\nThe cost of producing one pound of beans is 1564.00.a.) Find the cost of producing a pound of beans.b.) Find the cost of producing a pound of beans in the first year, if the price of beans increases by 20%. 3\n\nAnswer: b) $ 0.28.\n\nIn a class of 107 students, 84 are on the honor roll and 36 are not on the honor roll. If one student gets an extra 10% on the first day he or she',
 '200  [Blue + White = 100]\n\nI am doing homework in math class and this is a tricky problem, I have a lot of things to do.  So the problem is going to be "The number of bolts"\n\nI have a 1/4" blue thread, 2 bolts of white.  Now if you have 200 bolts it would be 200/4", which is the answer,  1/2" would be 200/1/4", which is 100/2" which is 60.',
 '$75,000Answer:\n\n\nAnswer: Josh should put in 75% of the value for profit.  50% of the value for profit is the difference of $100,000 and $50,000.\n\nAnswer: This works with a lot of money if the house is priced fairly and lots of credit has been give

In [15]:
# Display the first raw dataset record (its 'question' and 'answer' fields).
dataset[0]

{'question': "Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?",
 'answer': 'Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\nShe makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.\n#### 18'}

In [None]:
# Winogrande Evaluation
from eval.eval_configs import MultipleChoiceConfig

# Fetch the first 10 validation rows of the 'winogrande_xl' configuration.
print("Loading Winogrande dataset...")
winogrande_dataset = load_dataset('winogrande', 'winogrande_xl', split='validation[:10]')

# Prompt pieces.
# Winogrande provides a sentence with a blank "_" and two options; the template
# fills in the row's {sentence}, {option1} and {option2} fields, and the reference
# comes from its {answer} field.
user_template = (
    "Complete the following sentence by choosing the correct option:\n"
    "{sentence}\n"
    "Options:\n"
    "1. {option1}\n"
    "2. {option2}\n"
    "Answer:"
)
assistant_template = "{answer}"
system_prompt = "You are a helpful assistant. Choose the correct option (1 or 2) to complete the sentence."

# Score predictions as a choice between '1' and '2'.
eval_config = MultipleChoiceConfig(choices=['1', '2'])

# NOTE(review): is_chat_task=True assumes the `model` / `tokenizer` currently bound in
# the kernel are a chat model. The Gemma cell above rebinds both names to a base
# model, so confirm which model is loaded (or reload the intended one) before running.
winogrande_task = Task(
    task_name="winogrande",
    dataset=winogrande_dataset,
    user_template=user_template,
    assistant_template=assistant_template,
    system_prompt=system_prompt,
    eval_config=eval_config,
    is_chat_task=True,
)

print(f"Created task: {winogrande_task}")

# Evaluate with whatever `model` / `tokenizer` earlier cells left in the kernel
# (batch_size=4, max_new_tokens=10).
print("Starting Winogrande evaluation...")
result = winogrande_task.evaluate(
    model,
    tokenizer,
    batch_size=4,
    max_new_tokens=10,
    progress=True,
)

# Summary of the whole run.
print("\nEvaluation Results:", result, sep="\n")

# First sample: the 'X' field of the task dataset, the model output, and the reference.
print(
    "\nSample Prediction:",
    f"Input: {winogrande_task.dataset[0]['X']}",
    f"Prediction: {result.predictions[0]}",
    f"Reference: {result.references[0]}",
    sep="\n",
)