In [8]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")

In [2]:
from datasets import load_dataset

ds = load_dataset("edinburgh-dawg/mmlu-redux-2.0", "abstract_algebra")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
ds1 = ds["test"]
print(ds1)

Dataset({
    features: ['question', 'choices', 'answer', 'error_type', 'source', 'correct_answer', 'potential_reason'],
    num_rows: 100
})


In [9]:
from string import ascii_uppercase

base_prompt = "TESTESTTESTESTESTESTES"

for row in ds1:
    prompt = base_prompt
    
    question = row["question"]
    choices = row["choices"]

    opts_list = choices
    labeled_opts = []
    for idx, opt in enumerate(opts_list):
        label = f"({ascii_uppercase[idx]})"
        labeled_opts.append(f"{label} {opt}")
    opts_str = ", ".join(labeled_opts)

    prompt += f"Question: {question}\n"
    prompt += f"Options: {opts_str}\n"

    print(prompt)
    

TESTESTTESTESTESTESTESQuestion: Statement 1 | If T: V -> W is a linear transformation and dim(V ) < dim(W) < 1, then T must be injective. Statement 2 | Let dim(V) = n and suppose that T: V -> V is linear. If T is injective, then it is a bijection.
Options: (A) True, True, (B) False, False, (C) True, False, (D) False, True

TESTESTTESTESTESTESTESQuestion: Statement 1 | A ring homomorphism is one to one if and only if the kernel is {0}. Statement 2 | Q is an ideal in R.
Options: (A) True, True, (B) False, False, (C) True, False, (D) False, True

TESTESTTESTESTESTESTESQuestion: Determine whether the polynomial in Z[x] satisfies an Eisenstein criterion for irreducibility over Q. x^2 - 12
Options: (A) Yes, with p=2., (B) Yes, with p=3., (C) Yes, with p=5., (D) No.

TESTESTTESTESTESTESTESQuestion: Statement 1 | If H is a subgroup of G and a belongs to G then aH is a subgroup of G if and only if a is in H. Statement 2 | If H is a subgroup of G and a and b belong to G then aH = bH if and only 

In [12]:
from string import ascii_uppercase

def generate_prompt_single(
    data, 
    instruction_str, 
    format_dict,
    n_shots=5, 
    category=""
):
    """
    Generates n-shot prompts for a single category

    format_dict: A dictionary mapping standard field names to the actual keys in the dataset.
                    For example:
                        {
                            "question": "question", 
                            "options": "choices", 
                            "category": "category", 
                            "cot_content": "cot_content"
                        }
    n_shots: The number of examples (shots) to include in the prompt for each category.
    categories: Either a single category (str) or a list of categories to build prompts for.
                If None, the function will use all unique categories found in `data` using the key provided in format_dict.

    Returns:
        Returns the prompt string
    """

    prompt = ""

    prompt += instruction_str + "\n"
    
    question_key = format_dict["question"]
    options_key = format_dict["options"]
    cot_key = format_dict["cot_content"]
    
    
    

    for i in range(n_shots):
        example = data[i]
        q_text = example[question_key]
        
        opts_list = example[options_key]
        labeled_opts = []
        for idx, opt in enumerate(opts_list):
            label = f"({ascii_uppercase[idx]})"
            labeled_opts.append(f"{label} {opt}")
        opts_str = ", ".join(labeled_opts)
        
        cot_content = example.get(cot_key, "")
        
        prompt += f"Question: {q_text}\n"
        prompt += f"Options: {opts_str}\n"
        if cot_content:
            prompt += f"{cot_content}\n\n"
        else:
            prompt += "\n"
                

    return prompt



In [10]:
def get_answer(model, tokenizer, prompt, max_tokens = 1000):

    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_length=max_tokens)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)
    

In [8]:
entry_dict = {
    "question": "question",
    "options": "choices",
    "category": "",
    "cot_content": ""
}

In [13]:
instruct = """
The following is a multiple choice question.
Think step by step and then finish your answer with "the answer is (X)" where X is the correct letter choice.
"""
d = generate_prompt_single(ds1, instruct, entry_dict, 1, "abstract_algebra")

In [5]:
from string import ascii_uppercase
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import json

def load_model(name):

    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModelForCausalLM.from_pretrained(name)

    return model, tokenizer

def generate_prompt_single(
    data, 
    instruction_str, 
    format_dict,
    n_shots=5, 
    category=""
):
    """
    Generates n-shot prompts for a single category

    format_dict: A dictionary mapping standard field names to the actual keys in the dataset.
                    For example:
                        {
                            "question": "question", 
                            "options": "choices", 
                            "category": "category", 
                            "cot_content": "cot_content"
                        }
    n_shots: The number of examples (shots) to include in the prompt for each category.
    categories: Either a single category (str) or a list of categories to build prompts for.
                If None, the function will use all unique categories found in `data` using the key provided in format_dict.

    Returns:
        Returns the prompt string
    """

    prompt = ""

    prompt += instruction_str + "\n"
    
    question_key = format_dict["question"]
    options_key = format_dict["options"]
    cot_key = format_dict["cot_content"]
    
    for i in range(n_shots):
        example = data[i]
        q_text = example[question_key]
        
        opts_list = example[options_key]
        labeled_opts = []
        for idx, opt in enumerate(opts_list):
            label = f"({ascii_uppercase[idx]})"
            labeled_opts.append(f"{label} {opt}")
        opts_str = ", ".join(labeled_opts)
        
        cot_content = example.get(cot_key, "")
        
        prompt += f"Question: {q_text}\n"
        prompt += f"Options: {opts_str}\n"
        if cot_content:
            prompt += f"{cot_content}\n\n"
        else:
            prompt += "\n"
                

    return prompt

def get_answer(model, tokenizer, prompt, max_tokens = 1000):

    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_length=max_tokens)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)


names =  [
    "abstract_algebra",
    "anatomy",
    "astronomy",
    "business_ethics",
    "clinical_knowledge",
    "college_biology",
    "college_chemistry",
    "college_computer_science",
    "college_mathematics",
    "college_medicine",
    "college_physics",
    "computer_security",
    "conceptual_physics",
    "econometrics",
    "electrical_engineering",
    "elementary_mathematics",
    "formal_logic",
    "global_facts",
    "high_school_biology",
    "high_school_chemistry",
    "high_school_computer_science",
    "high_school_european_history",
    "high_school_geography",
    "high_school_government_and_politics",
    "high_school_macroeconomics",
    "high_school_mathematics",
    "high_school_microeconomics",
    "high_school_physics",
    "high_school_psychology",
    "high_school_statistics",
    "high_school_us_history",
    "high_school_world_history",
    "human_aging",
    "human_sexuality",
    "international_law",
    "jurisprudence",
    "logical_fallacies",
    "machine_learning",
    "management",
    "marketing",
    "medical_genetics",
    "miscellaneous",
    "moral_disputes",
    "moral_scenarios",
    "nutrition",
    "philosophy",
    "prehistory",
    "professional_accounting",
    "professional_law",
    "professional_medicine",
    "professional_psychology",
    "public_relations",
    "security_studies",
    "sociology",
    "us_foreign_policy",
    "virology",
    "world_religions"
]

instruction_string = """The following is a multiple choice question. Think step by step and then finish your answer with "the answer is (X)" where X is the correct letter choice. \n"""

model, tokenizer = load_model("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")

redux = "edinburgh-dawg/mmlu-redux-2.0"

entry_dict = {
    "question": "question",
    "options": "choices",
    "category": "",
    "cot_content": ""
}

results = {}

for dataset_name in names:
    ds = load_dataset(redux, dataset_name)
    ds = ds["test"]

    base_prompt = instruction_string # 0 shot

    dataset_results = {}

    for idx, row in enumerate(ds):
        prompt = base_prompt
    
        question = row["question"]
        choices = row["choices"]

        opts_list = choices
        labeled_opts = [f"({ascii_uppercase[i]}) {opt}" for i, opt in enumerate(choices)]
        opts_str = ", ".join(labeled_opts)

        prompt += f"Question: {question}\n"
        prompt += f"Options: {opts_str}\n"

        answer = get_answer(prompt)

        dataset_results[idx] = {
            "question": question,
            "answer": answer
        }
    
    results[dataset_name] = dataset_results

with open("results.json", "w") as f:
    json.dump(results, f, indent = 4)


SyntaxError: unterminated string literal (detected at line 151) (1142229269.py, line 151)