In [None]:
# !pip install datasets
from datasets import load_dataset, Dataset



In [3]:
def extract_xml_answer(text: str) -> str:
    answer = text.split("<answer>")[-1]
    answer = answer.split("</answer>")[0]
    return answer.strip()

def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()

# uncomment middle messages for 1-shot prompting
def get_gsm8k_questions(split = "train") -> Dataset:
    data = load_dataset('openai/gsm8k', 'main')[split] # type: ignore
    data = data.map(lambda x: { # type: ignore
        'prompt': [
            {'role': 'user', 'content': x['question']}
        ],
        'answer': extract_hash_answer(x['answer'])
    }) # type: ignore
    return data # type: ignore

dataset = get_gsm8k_questions(split='test')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [4]:
dataset

Dataset({
    features: ['question', 'answer', 'prompt'],
    num_rows: 1319
})

In [12]:
for i in range(5):
  print(dataset['question'][i], dataset['answer'][i])


Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May? 72
Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn? 10
Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet? 5
Julie is reading a 120-page book. Yesterday, she was able to read 12 pages and today, she read twice as many pages as yesterday. If she wants to read half of the remaining pages tomorrow, how many pages should she read? 42
James writes a 3-page letter to 2 different friends twice a week.  How many pages does he write a year? 624


In [None]:
#%%
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import logging
import os, glob

from accelerate import infer_auto_device_map

#%%
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def load_model(adapter_path, base_model=None, use_quantization=False):
    """Load a QLoRA fine-tuned model from Hugging Face"""

    # Get base model name from adapter config if not provided
    # Look for model in local directory
    # if glob.glob(f"{adapter_path}") and glob.glob(f"{adapter_path}/adapter_config.json") == []:
    #     adapter_path = glob.glob(f"{adapter_path}/*/*/adapter_config.json")[0].split("/adapter_config.json")[0]

    peft_config = PeftConfig.from_pretrained(adapter_path)
    base_model = base_model or peft_config.base_model_name_or_path
    logger.info(f"Using base model: {base_model}")

    # Load base model with or without quantization
    if use_quantization:
        logger.info("Loading base model with quantization...")
        # Set up 4-bit quantization
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True
        )

        model = AutoModelForCausalLM.from_pretrained(
            base_model,
            quantization_config=quantization_config,
            device_map="auto",
            trust_remote_code=True,
        )


    else:
        # Load base model without quantization
        logger.info("Loading base model without quantization...")
        model = AutoModelForCausalLM.from_pretrained(
            base_model,
            device_map="auto",
            trust_remote_code=True
        )

    # Load and apply adapter weights
    logger.info("Applying LoRA adapters...")
    model = PeftModel.from_pretrained(model, adapter_path)


    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

def load_model_from_hub(model_name):
    """Load a model from Hugging Face Hub"""
    # Load the model
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Set padding token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer


def generate(model, tokenizer, prompt, max_new_tokens=1024, temperature=1.0):
    """Generate text using the loaded model"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate with sampling
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature if temperature>0.0 else None,
            top_p=0.9 if temperature>0.0 else None,
            top_k=20 if temperature>0.0 else None,
            do_sample=temperature>0.0
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def generate_batch(model, tokenizer, prompt, max_new_tokens=512, temperature=0.7):
    """
    Generate text using the loaded model
    Takes input str after chat template has been applied
    """
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, padding_side='right').to(model.device)

    # Generate with sampling
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature if temperature>0.0 else None,
            top_p=0.9 if temperature>0.0 else None,
            top_k=20 if temperature>0.0 else None,
            do_sample=temperature>0.0,
        )

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

import sys
sys.path.append('/cs/student/msc/ml/2024/ycheah/projects/sos/stream-of-search')


In [None]:
context_len = 512
temperature = 0.7
from tqdm import tqdm

adapter="Qwen/Qwen2.5-0.5B-Instruct"
batch_size=128
model, tokenizer = load_model_from_hub(adapter)
model.eval()
model.cuda()
tokenizer.pad_token = tokenizer.eos_token

def message_template(example_question):
    return [{ "content": f"{example_question}.\nConclude with the final result in EXACTLY this format:\n```\nSOLUTION: YES/NO\ \nRESULT: final_value\n```\nThe final_value should be the numerical answer. For example, '611'", "role": "user" }]

output_texts_concat = []

data = dataset.map(lambda x: {
    "test_prompt": message_template(x['question'])
})

data = data.select(range(100))

def eval_dataset(data, column_name, verified_column, discrepancies_column):    # Generate completions for this batch
    for i, data_batch in tqdm(enumerate(data.iter(batch_size=batch_size)), total=len(data)//batch_size):
        chat_inputs = tokenizer.apply_chat_template(data_batch["test_prompt"], return_tensors="pt", padding=True, truncation=True, max_length=context_len, return_length=True, tokenize=False)
        outputs = generate_batch(model, tokenizer, chat_inputs, max_new_tokens=context_len, temperature=temperature)
        output_texts_concat.extend(outputs)
    return data

column_name = f"completions"
verified_column = f"verified"
discrepancies_column = f"discrepancies"
data = eval_dataset(data, column_name, verified_column, discrepancies_column)
data = data.add_column(column_name, output_texts_concat)

# Calculate score
score = data[verified_column].count(True) / len(data) * 100
print(f"score: {score:.2f}%")

results= {}
# Store score and trajectories
results['scores'] = score
results['trajectories'] = []

# Create trajectory data using the correct column names for each key
for i in range(len(data)):
    results['trajectories'].append({
        'completions': data[column_name][i],
        'verified': data[verified_column][i],
        'discrepancies': data[discrepancies_column][i]
    })

import json, os
savepath = f"./results/ood/{adapter}/gsm8k.json"
os.makedirs(os.path.dirname(savepath), exist_ok=True)
with open(savepath, 'w') as f:
    json.dump(results, f, indent=4)

1it [01:47, 107.43s/it]


KeyError: "Column verified not in the dataset. Current columns in the dataset: ['question', 'answer', 'prompt', 'test_prompt', 'completions']"