In [5]:
!pip install groq python-dotenv numpy tqdm datasets



In [6]:
from groq import Groq
from dotenv import load_dotenv
from datasets import load_dataset

import os
from tqdm import tqdm
import re
import random
import pprint

from typing import List, Dict, Any

load_dotenv()
random.seed(0)

client = Groq()
gsm8k_dataset = load_dataset("gsm8k", "main")

gsm8k_train = gsm8k_dataset["train"]
gsm8k_test  = gsm8k_dataset["test"]

In [37]:
def generate_response_using_Llama(
        prompt: str,
        model: str = "llama3-8b-8192"
    ):
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that solves math problems."
                },
                {
                    "role": "user", 
                    "content": prompt
                }
            ],
            model=model,
            temperature = 0.3, ### 수정해도 됩니다!
            stream=False
        )
        return chat_completion.choices[0].message.content
    
    except Exception as e:
        print(f"API call error: {str(e)}")
        return None

#### 응답 잘 나오는지 확인해보기

In [7]:
response = generate_response_using_Llama(
    prompt="Hello world!",
)
print(response)

Hello! I'm here to help you with any math problems you might have. Whether it's algebra, geometry, calculus, or anything in between, I'm here to assist you. What kind of math problem are you working on?


#### GSM8K 데이터셋 확인해보기

In [8]:
print("[Question]")
for l in gsm8k_test['question'][0].split("."):
    print(l)
print("="*100)
print("[Answer]")
print(gsm8k_test['answer'][0])

[Question]
Janet’s ducks lay 16 eggs per day
 She eats three for breakfast every morning and bakes muffins for her friends every day with four
 She sells the remainder at the farmers' market daily for $2 per fresh duck egg
 How much in dollars does she make every day at the farmers' market?
[Answer]
Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.
She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.
#### 18


#### Util 함수들
- extract_final_answer: LLM의 응답을 parse하여 최종 결과만 추출 (정답과 비교하기 위해)
- run_benchmark_test: 벤치마크 테스트
- save_final_result: 결과물 제출을 위한 함수

In [9]:
### 수정해도 됩니다!
def extract_final_answer(response: str):
    regex = r"(?:Answer:|Model response:)\s*\$?([0-9,]+)\b|([0-9,]+)\s*(meters|cups|miles|minutes)"
    matches = re.finditer(regex, response, re.MULTILINE)
    results = [match.group(1) if match.group(1) else match.group(2).replace(",", "") for match in matches]

    if len(results) == 0:
        additional_regex = r"\$?([0-9,]+)"
        additional_matches = re.finditer(additional_regex, response, re.MULTILINE)
        results.extend([match.group(1).replace(",", "") for match in additional_matches])

    results = [r for r in results if r.strip() != ""]

    return results[-1] if results else None

In [10]:
### 수정해도 됩니다!
def run_benchmark_test(
        dataset,
        prompt: str,
        model: str = "llama3-8b-8192",
        num_samples: int = 50,
        VERBOSE: bool = False
    ):
    correct = 0
    total   = 0
    results = []

    for i in tqdm(range(min(num_samples, len(dataset)))):
        question = dataset[i]["question"]
        correct_answer = float(re.findall(r'\d+(?:\.\d+)?', dataset[i]["answer"].split('####')[-1])[0])

        response = generate_response_using_Llama(
            prompt=prompt.format(question=question),
            model=model
        )

        if response:
            if VERBOSE:
                print("="*50)
                print(response)
                print("="*50)
            predicted_answer = extract_final_answer(response)

            if isinstance(predicted_answer, str):
                predicted_answer = float(predicted_answer.replace(",", ""))
            
            diff = abs(predicted_answer - correct_answer)
            is_correct = diff < 1e-5 if predicted_answer is not None else False
            
            if is_correct:
                correct += 1
            total += 1
            
            results.append({
                'question': question,
                'correct_answer': correct_answer,
                'predicted_answer': predicted_answer,
                'response': response,
                'correct': is_correct
            })

            if (i + 1) % 5 == 0:
                current_acc = correct/total if total > 0 else 0
                print(f"Progress: [{i+1}/{num_samples}]")
                print(f"Current Acc.: [{current_acc:.2%}]")

    return results, correct/total if total > 0 else 0

In [11]:
def save_final_result(results: List[Dict[str, Any]], accuracy: float, filename: str) -> None:
    result_str = f"====== ACCURACY: {accuracy} ======\n\n"
    result_str += f"[Details]\n"
    
    for idx, result in enumerate(results):
        result_str += f"Question {idx+1}: {result['question']}\n"
        result_str += f"Correct Answer: {result['correct_answer']}\n"
        result_str += f"Predicted Answer: {result['predicted_answer']}\n"
        result_str += f"Correct: {result['correct']}\n\n"
    
    with open(filename, "w", encoding="utf-8") as f:
        f.write(result_str)

#### Direct prompting with few-shot example

In [11]:
def construct_direct_prompt(num_examples: int = 3) -> str:
    train_dataset = gsm8k_train

    sampled_indices = random.sample(
        [i for i in range(len(train_dataset['question']))],
        num_examples
    )

    prompt = "Instruction:\nSolve the following mathematical question and generate ONLY the answer after a tag, 'Answer:' without any rationale.\n"

    for i in range(num_examples):
        cur_question = train_dataset['question'][i]
        cur_answer = train_dataset['answer'][i].split("####")[-1].strip()

        prompt += f"\n[Example {i+1}]\n"
        prompt += f"Question:\n{cur_question}\n"
        prompt += f"Answer:{cur_answer}\n"

    prompt += "\nQuestion:\n{question}\nAnswer:"

    return prompt

In [12]:
### 어떤 방식으로 저장되는지 확인해보세요!
PROMPT = construct_direct_prompt(3)
VERBOSE = False

results, accuracy = run_benchmark_test(
    dataset=gsm8k_test,
    prompt=PROMPT,
    VERBOSE=VERBOSE,
    num_samples=10
)
save_final_result(results, accuracy, "example.txt")

 50%|█████     | 5/10 [00:01<00:01,  3.74it/s]

Progress: [5/10]
Current Acc.: [20.00%]


100%|██████████| 10/10 [00:02<00:00,  3.59it/s]

Progress: [10/10]
Current Acc.: [30.00%]





In [13]:
# TODO: 0 shot, 3 shot, 5 shot direct prompting을 통해 벤치마크 테스트를 한 후, 각각 direct_prompting_{shot: int}.txt로 저장해주세요!
# 예시: shot이 5인 경우 direct_prompting_5.txt
# 항상 num_samples=50 입니다!

for shot in [0, 3, 5]:
    print(f"\n=== Running Direct Prompting {shot}-shot ===")
    prompt = construct_direct_prompt(shot)
    results, acc = run_benchmark_test(
        dataset=gsm8k_test,
        prompt=prompt,
        VERBOSE=False,
        num_samples=50
    )
    filename = f"direct_prompting_{shot}.txt"
    save_final_result(results, acc, filename)
    print(f"Saved results to {filename} with accuracy {acc:.2%}")


=== Running Direct Prompting 0-shot ===


 10%|█         | 5/50 [00:01<00:10,  4.16it/s]

Progress: [5/50]
Current Acc.: [40.00%]


 20%|██        | 10/50 [00:02<00:10,  3.70it/s]

Progress: [10/50]
Current Acc.: [40.00%]


 30%|███       | 15/50 [00:03<00:10,  3.44it/s]

Progress: [15/50]
Current Acc.: [26.67%]


 40%|████      | 20/50 [00:05<00:08,  3.73it/s]

Progress: [20/50]
Current Acc.: [30.00%]


 50%|█████     | 25/50 [00:06<00:06,  4.14it/s]

Progress: [25/50]
Current Acc.: [28.00%]


 60%|██████    | 30/50 [00:08<00:07,  2.73it/s]

Progress: [30/50]
Current Acc.: [26.67%]


 70%|███████   | 35/50 [00:11<00:10,  1.50it/s]

Progress: [35/50]
Current Acc.: [22.86%]


 80%|████████  | 40/50 [00:13<00:03,  2.85it/s]

Progress: [40/50]
Current Acc.: [20.00%]


 90%|█████████ | 45/50 [00:14<00:01,  3.73it/s]

Progress: [45/50]
Current Acc.: [22.22%]


100%|██████████| 50/50 [00:15<00:00,  3.14it/s]


Progress: [50/50]
Current Acc.: [24.00%]
Saved results to direct_prompting_0.txt with accuracy 24.00%

=== Running Direct Prompting 3-shot ===


 10%|█         | 5/50 [00:01<00:12,  3.62it/s]

Progress: [5/50]
Current Acc.: [20.00%]


 20%|██        | 10/50 [00:02<00:10,  3.64it/s]

Progress: [10/50]
Current Acc.: [30.00%]


 30%|███       | 15/50 [00:04<00:10,  3.35it/s]

Progress: [15/50]
Current Acc.: [26.67%]


 40%|████      | 20/50 [00:05<00:08,  3.42it/s]

Progress: [20/50]
Current Acc.: [30.00%]


 50%|█████     | 25/50 [00:07<00:09,  2.66it/s]

Progress: [25/50]
Current Acc.: [28.00%]


 60%|██████    | 30/50 [00:09<00:06,  3.22it/s]

Progress: [30/50]
Current Acc.: [26.67%]


 70%|███████   | 35/50 [00:10<00:04,  3.47it/s]

Progress: [35/50]
Current Acc.: [22.86%]


 80%|████████  | 40/50 [00:12<00:02,  3.35it/s]

Progress: [40/50]
Current Acc.: [22.50%]


 90%|█████████ | 45/50 [00:13<00:01,  3.68it/s]

Progress: [45/50]
Current Acc.: [22.22%]


100%|██████████| 50/50 [00:17<00:00,  2.93it/s]


Progress: [50/50]
Current Acc.: [24.00%]
Saved results to direct_prompting_3.txt with accuracy 24.00%

=== Running Direct Prompting 5-shot ===


 10%|█         | 5/50 [00:03<00:32,  1.40it/s]

Progress: [5/50]
Current Acc.: [40.00%]


 20%|██        | 10/50 [00:08<00:38,  1.03it/s]

Progress: [10/50]
Current Acc.: [20.00%]


 30%|███       | 15/50 [00:12<00:26,  1.32it/s]

Progress: [15/50]
Current Acc.: [13.33%]


 40%|████      | 20/50 [00:17<00:29,  1.01it/s]

Progress: [20/50]
Current Acc.: [15.00%]


 50%|█████     | 25/50 [00:21<00:19,  1.29it/s]

Progress: [25/50]
Current Acc.: [20.00%]


 60%|██████    | 30/50 [00:26<00:19,  1.01it/s]

Progress: [30/50]
Current Acc.: [23.33%]


 70%|███████   | 35/50 [00:30<00:12,  1.21it/s]

Progress: [35/50]
Current Acc.: [20.00%]


 80%|████████  | 40/50 [00:35<00:10,  1.00s/it]

Progress: [40/50]
Current Acc.: [20.00%]


 90%|█████████ | 45/50 [00:39<00:04,  1.23it/s]

Progress: [45/50]
Current Acc.: [17.78%]


100%|██████████| 50/50 [00:44<00:00,  1.12it/s]

Progress: [50/50]
Current Acc.: [18.00%]
Saved results to direct_prompting_5.txt with accuracy 18.00%





### Chain-of-Thought prompting with few-shot example
```text
[Question]
Janet’s ducks lay 16 eggs per day
 She eats three for breakfast every morning and bakes muffins for her friends every day with four
 She sells the remainder at the farmers' market daily for $2 per fresh duck egg
 How much in dollars does she make every day at the farmers' market?
====================================================================================================
[Answer]
Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.
She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.
#### 18
```

[Answer] 아래의 정답을 도출하는 과정을 예시로 달아주면 CoT의 few shot이 되겠죠?

In [31]:
def construct_CoT_prompt(num_examples: int = 3) -> str:
    train_dataset = gsm8k_train

    sampled_indices = random.sample(
        [i for i in range(len(train_dataset['question']))],
        num_examples
    )
    prompt = "Solve the following math problems step-by-step.\nShow your reasoning first. On the final line, give the final answer after a tag, '####'.\n" 
    #TODO: 프롬프트를 작성해주세요! #step-by-step reasoning 유도하는 instruction

    for i in range(num_examples):
        #TODO: CoT example을 만들어주세요!
        cur_question = train_dataset["question"][i]
        full_answer = train_dataset["answer"][i].strip()
        cur_rationale = full_answer.split("####")[0].strip()
        cur_answer = full_answer.split("####")[-1].strip()


        prompt += f"\n[Example {i+1}]\n"
        prompt += f"Question:\n{cur_question}\n"
        prompt += f"Rationale: {cur_rationale}\n"
        prompt += f"#### {cur_answer}\n"

    prompt += "\nQuestion:\n{question}\nLet's think step by step.\n####"
    
    return prompt

In [32]:
# TODO: 0 shot, 3 shot, 5 shot CoT prompting을 통해 벤치마크 테스트를 한 후, 각각 CoT_prompting_{shot: int}.txt로 저장해주세요!
# 예시: shot이 5인 경우 CoT_prompting_5.txt
# 항상 num_samples=50 입니다!

for shot in [0, 3, 5]:
    print(f"\n=== Running CoT Prompting {shot}-shot ===")
    
    prompt = construct_CoT_prompt(shot)  
    results, acc = run_benchmark_test(
        dataset=gsm8k_test,
        prompt=prompt,
        VERBOSE=VERBOSE,
        num_samples=50  
    )

    filename = f"CoT_prompting_{shot}.txt"
    save_final_result(results, acc, filename)
    print(f"Saved results to {filename} with accuracy {acc:.2%}")


=== Running CoT Prompting 0-shot ===


 10%|█         | 5/50 [00:03<00:29,  1.54it/s]

Progress: [5/50]
Current Acc.: [60.00%]


 20%|██        | 10/50 [00:06<00:26,  1.53it/s]

Progress: [10/50]
Current Acc.: [60.00%]


 30%|███       | 15/50 [00:09<00:22,  1.57it/s]

Progress: [15/50]
Current Acc.: [66.67%]


 40%|████      | 20/50 [00:13<00:24,  1.22it/s]

Progress: [20/50]
Current Acc.: [65.00%]


 50%|█████     | 25/50 [00:16<00:15,  1.62it/s]

Progress: [25/50]
Current Acc.: [60.00%]


 60%|██████    | 30/50 [00:19<00:12,  1.66it/s]

Progress: [30/50]
Current Acc.: [60.00%]


 70%|███████   | 35/50 [00:22<00:09,  1.63it/s]

Progress: [35/50]
Current Acc.: [65.71%]


 80%|████████  | 40/50 [00:26<00:08,  1.23it/s]

Progress: [40/50]
Current Acc.: [62.50%]


 90%|█████████ | 45/50 [00:30<00:04,  1.23it/s]

Progress: [45/50]
Current Acc.: [62.22%]


100%|██████████| 50/50 [00:34<00:00,  1.47it/s]


Progress: [50/50]
Current Acc.: [64.00%]
Saved results to CoT_prompting_0.txt with accuracy 64.00%

=== Running CoT Prompting 3-shot ===


 10%|█         | 5/50 [00:04<00:35,  1.28it/s]

Progress: [5/50]
Current Acc.: [60.00%]


 20%|██        | 10/50 [00:07<00:28,  1.40it/s]

Progress: [10/50]
Current Acc.: [50.00%]


 30%|███       | 15/50 [00:10<00:22,  1.52it/s]

Progress: [15/50]
Current Acc.: [46.67%]


 40%|████      | 20/50 [00:14<00:24,  1.24it/s]

Progress: [20/50]
Current Acc.: [45.00%]


 50%|█████     | 25/50 [00:18<00:20,  1.24it/s]

Progress: [25/50]
Current Acc.: [48.00%]


 60%|██████    | 30/50 [00:22<00:14,  1.38it/s]

Progress: [30/50]
Current Acc.: [53.33%]


 70%|███████   | 35/50 [00:25<00:09,  1.57it/s]

Progress: [35/50]
Current Acc.: [60.00%]


 80%|████████  | 40/50 [00:29<00:06,  1.48it/s]

Progress: [40/50]
Current Acc.: [60.00%]


 90%|█████████ | 45/50 [00:32<00:03,  1.56it/s]

Progress: [45/50]
Current Acc.: [62.22%]


100%|██████████| 50/50 [00:36<00:00,  1.36it/s]


Progress: [50/50]
Current Acc.: [64.00%]
Saved results to CoT_prompting_3.txt with accuracy 64.00%

=== Running CoT Prompting 5-shot ===


 10%|█         | 5/50 [00:03<00:32,  1.38it/s]

Progress: [5/50]
Current Acc.: [60.00%]


 20%|██        | 10/50 [00:07<00:29,  1.37it/s]

Progress: [10/50]
Current Acc.: [70.00%]


 30%|███       | 15/50 [00:11<00:25,  1.39it/s]

Progress: [15/50]
Current Acc.: [66.67%]


 40%|████      | 20/50 [00:14<00:22,  1.32it/s]

Progress: [20/50]
Current Acc.: [65.00%]


 50%|█████     | 25/50 [00:18<00:19,  1.30it/s]

Progress: [25/50]
Current Acc.: [64.00%]


 60%|██████    | 30/50 [00:22<00:14,  1.40it/s]

Progress: [30/50]
Current Acc.: [66.67%]


 70%|███████   | 35/50 [00:25<00:10,  1.41it/s]

Progress: [35/50]
Current Acc.: [68.57%]


 80%|████████  | 40/50 [00:28<00:06,  1.50it/s]

Progress: [40/50]
Current Acc.: [62.50%]


 90%|█████████ | 45/50 [00:32<00:03,  1.25it/s]

Progress: [45/50]
Current Acc.: [64.44%]


100%|██████████| 50/50 [00:36<00:00,  1.36it/s]

Progress: [50/50]
Current Acc.: [66.00%]
Saved results to CoT_prompting_5.txt with accuracy 66.00%





### Construct your prompt!!

목표: 본인만의 프롬프트를 통해 정답률을 더 끌어올려보기!
- gsm8k의 train 데이터셋에서 예시를 가져온 다음 (자유롭게!)
- 그 예시들에 대한 풀이 과정을 만들어주세요!
- 모든 것들이 자유입니다! Direct Prompting, CoT Prompting을 한 결과보다 정답률만 높으면 돼요.

In [16]:
def generate_response_using_Llama(
        prompt: str,
        model: str = "llama3-8b-8192"
    ):
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that solves math problems."
                },
                {
                    "role": "user", 
                    "content": prompt
                }
            ],
            model=model,
            temperature = 0.7, ### 수정해도 됩니다!
            stream=False
        )
        return chat_completion.choices[0].message.content
    
    except Exception as e:
        print(f"API call error: {str(e)}")
        return None

In [17]:
### 자유롭게 수정해도 됩니다! 완전히 새로 함수를 만들어도 돼요.
import re
from collections import Counter
from typing import List, Tuple
from tqdm import tqdm
import numpy as np


#정답 추출
hash_re = re.compile(r"####\s*(-?[\d,]+(?:\.\d+)?)")
therefore_re = re.compile(r"Therefore[^0-9\-]*(-?[\d,]+(?:\.\d+)?)", re.IGNORECASE)
main_re = re.compile(r"(?:Answer:|Model response:)?\s*\$?(-?[\d,]+(?:\.\d+)?)(?=\s*(?:meters|cups|miles|minutes)?\b)")
fallback_re = re.compile(r"\$?(-?[\d,]+(?:\.\d+)?)")

def extract_and_normalize_answer(response: str) -> str | None:
    for pattern in [hash_re, therefore_re, main_re, fallback_re]:
        match = pattern.findall(response)
        if match:
            last = match[-1]
            if isinstance(last, tuple):
                last = next((x for x in last if x), None)
            return last.replace(",", "") if last else None
    return None


#프롬프트 생성 함수
def construct_my_prompt(example_list: List[str], num_examples: int) -> str:

    
    prompt = """You are an expert mathematician and problem solver. Your task is to solve math word problems with extremely high accuracy.

INSTRUCTIONS:
1. Read the problem carefully and identify ALL given information
2. Break down the problem into clear, logical steps
3. Show ALL calculations explicitly with intermediate results
4. Double-check your work before giving the final answer
5. Always end with "#### [number]" where [number] is your final answer
6. **MANDATORY VERIFICATION**: After solving, you MUST verify your answer by:
   - Recalculating key steps
   - Checking if the answer makes logical sense
   - Ensuring all problem conditions are met
   - Confirming units are correct
7. Only provide final answer after verification


SOLUTION FORMAT:
- Start with "Let me solve this step by step:"
- Number each step clearly (1), 2), 3), etc.)
- Show intermediate calculations with clear labels
- End with "Therefore, [conclusion]"
- "Let me verify my answer:" followed by verification
- Final line: "#### [answer]"

EXAMPLES:"""

    for i, example in enumerate(example_list[:num_examples]):
        prompt += f"\n\n{example}\n"
    
    prompt += """

Now solve the following problem:

Question:
{question}

Let me solve this step by step:
Let me verify my answer:

#### """
    
    return prompt

#평가 함수
def run_prompt_evaluation(
    dataset: List[dict],
    example_list: List[str],
    model_func,
    num_shot: int = 5,
    num_generations: int = 5,  
    num_samples: int = 50,
    VERBOSE: bool = True
) -> Tuple[List[Tuple[str, str, str]], float]:

    prompt_template = construct_my_prompt(example_list, num_shot)
    results = []
    correct = 0
    pseudo_labels = []

    for i in tqdm(range(min(num_samples, len(dataset))), ncols=90):
        question = dataset[i]["question"]
        gt_answer = dataset[i]["answer"].split("####")[-1].strip().replace(",", "")
        prompt_filled = prompt_template.replace("{question}", question.strip())

        # 더 많은 생성으로 Self-Consistency 강화
        generations = [model_func(prompt_filled) for _ in range(num_generations)]
        
        #정답 추출
        extracted = []
        answer_to_rationale = {}  # dict: answer → rationale

        for gen in generations:
            answer = extract_and_normalize_answer(gen)
            if answer is not None:
                extracted.append(answer)
                answer_to_rationale[answer] = gen.strip()

        if not extracted:
            pred = None
        else:
            count = Counter(extracted)
            most_common = count.most_common()
            max_freq = most_common[0][1]
            candidates = [val for val, freq in most_common if freq == max_freq]
            pred = next((val for val in reversed(extracted) if val in candidates), extracted[-1])

        results.append((question, gt_answer, pred))

        try:
            if pred is not None and abs(float(pred) - float(gt_answer)) < 1e-3:
                correct += 1

                # pseudo label 저장 (정답 맞춘 CoT만)
                rationale = answer_to_rationale.get(pred)
                if rationale:
                    pseudo_labels.append({
                        "question": question,
                        "rationale": rationale,
                        "answer": pred
                    })

        except ValueError:
            pass

        if VERBOSE and (i + 1) % 5 == 0:
            current_acc = correct / (i + 1)
            print(f"Progress: [{i+1}/{num_samples}]")
            print(f"Current Acc.: [{current_acc:.2%}]")

    return results, correct / num_samples

In [None]:
example_list = [

"""Question:
Sophie works 40 hours a week at a rate of $15 per hour. Any overtime is paid at 1.5x the regular rate. If she worked 48 hours in a week, what is her total pay?

Let me solve this step by step:
1) Regular hours = 40, Overtime hours = 48 - 40 = 8
2) Regular rate = $15/hour, Overtime rate = $15 × 1.5 = $22.5/hour
3) Regular pay = 40 × $15 = $600
4) Overtime pay = 8 × $22.5 = $180
5) Total pay = $600 + $180 = $780

Let me verify my answer:
- Regular hours: 40 ✓, Overtime: 8 ✓
- Regular rate: $15, Overtime: $22.5 ✓
- Total = $780 ✓

#### 780""",


"""Question:
A train travels 120 miles in 2 hours. If it continues at the same speed, how many miles will it travel in 5 hours?

Let me solve this step by step:
1) Distance traveled = 120 miles
2) Time taken = 2 hours
3) Speed = 120 ÷ 2 = 60 mph
4) Time to travel = 5 hours
5) Distance = 60 × 5 = 300 miles

Let me verify my answer:
- Speed calculation: 120 ÷ 2 = 60 mph ✓
- Distance calculation: 60 × 5 = 300 miles ✓
- Units are consistent (miles) ✓
- Answer makes sense (longer time = more distance) ✓

#### 300""",


"""Question:
There are 20 students in a class. 25% of them bring their lunch from home. How many students bring lunch from home?

Let me solve this step by step:
1) Total students = 20
2) 25% of 20 = 0.25 × 20 = 5

Let me verify my answer:
- Percentage: 25% = 0.25 ✓
- 0.25 × 20 = 5 ✓

#### 5""",


"""Question:
A rectangle has a length of 12 meters and a width of 8 meters. What is the area of the rectangle in square meters?

Let me solve this step by step:
1) Area of rectangle = length × width
2) Length = 12 meters, Width = 8 meters
3) Area = 12 × 8 = 96 square meters

Let me verify my answer:
- Area formula: length × width ✓
- Calculation: 12 × 8 = 96 ✓
- Units: meters × meters = square meters ✓
- Answer is reasonable for given dimensions ✓

#### 96""",


"""Question:
A jacket is on sale for 20% off. If the original price is $80, what is the sale price?

Let me solve this step by step:
1) Discount = 20% of $80 = 0.2 × 80 = $16
2) Sale price = 80 - 16 = $64

Let me verify my answer:
- 20% of 80 = 16 ✓
- Final price = 80 - 16 = 64 ✓

#### 64""",


"""Question:
A store sells shirts for $15 each and pants for $25 each. If a customer buys 3 shirts and 2 pairs of pants, how much does the customer pay in total?

Let me solve this step by step:
1) Cost of shirts = 3 × $15 = $45
2) Cost of pants = 2 × $25 = $50
3) Total cost = $45 + $50 = $95

Let me verify my answer:
- Shirt cost: 3 × $15 = $45 ✓
- Pant cost: 2 × $25 = $50 ✓
- Total: $45 + $50 = $95 ✓
- All calculations are correct ✓
- Units are consistent (dollars) ✓

#### 95""",
]

In [18]:
# TODO: 만든 0 shot, 3 shot, 5 shot example과 프롬프트를 통해 벤치마크 테스트를 한 후, 각각 My_prompting_{shot: int}.txt로 저장해주세요!
# 예시: shot이 5인 경우 My_prompting_5.txt
# 항상 num_samples=50 입니다!

for shot in [0, 3, 5]:
    print(f"\n=== Running My Prompt - {shot}-shot ===")
    results, acc = run_prompt_evaluation(
        dataset=gsm8k_test,
        example_list=example_list,
        model_func=generate_response_using_Llama,
        num_shot=shot,
        num_generations=5,
        num_samples=50,
        VERBOSE=True
    )

    results_dict = [
        {"question": q, "correct_answer": gt, "predicted_answer": pred, "correct": (
            False if pred is None or gt is None else
            (
                abs(float(pred.strip().replace(",", "")) - float(gt.strip().replace(",", ""))) < 1e-3
                if pred.strip() != "" and gt.strip() != "" else False
            )
        )}
        for q, gt, pred in results
    ]
    
    filename = f"My_prompting_{shot}.txt"
    save_final_result(results_dict, acc, filename)
    print(f"Saved to {filename} | Accuracy: {acc:.2%}")


=== Running My Prompt - 0-shot ===


 10%|█████▍                                                | 5/50 [00:24<03:40,  4.90s/it]

Progress: [5/50]
Current Acc.: [100.00%]


 20%|██████████▌                                          | 10/50 [00:46<02:55,  4.38s/it]

Progress: [10/50]
Current Acc.: [80.00%]


 30%|███████████████▉                                     | 15/50 [01:09<02:42,  4.63s/it]

Progress: [15/50]
Current Acc.: [80.00%]


 40%|█████████████████████▏                               | 20/50 [01:34<02:26,  4.87s/it]

Progress: [20/50]
Current Acc.: [75.00%]


 50%|██████████████████████████▌                          | 25/50 [01:56<01:46,  4.26s/it]

Progress: [25/50]
Current Acc.: [76.00%]


 60%|███████████████████████████████▊                     | 30/50 [02:12<01:07,  3.37s/it]

Progress: [30/50]
Current Acc.: [80.00%]


 70%|█████████████████████████████████████                | 35/50 [02:37<01:09,  4.65s/it]

Progress: [35/50]
Current Acc.: [82.86%]


 80%|██████████████████████████████████████████▍          | 40/50 [02:59<00:41,  4.20s/it]

Progress: [40/50]
Current Acc.: [82.50%]


 90%|███████████████████████████████████████████████▋     | 45/50 [03:16<00:18,  3.76s/it]

Progress: [45/50]
Current Acc.: [82.22%]


100%|█████████████████████████████████████████████████████| 50/50 [03:33<00:00,  4.28s/it]


Progress: [50/50]
Current Acc.: [82.00%]
Saved to My_prompting_0.txt | Accuracy: 82.00%

=== Running My Prompt - 3-shot ===


 10%|█████▍                                                | 5/50 [00:17<02:37,  3.51s/it]

Progress: [5/50]
Current Acc.: [100.00%]


 20%|██████████▌                                          | 10/50 [00:39<03:03,  4.58s/it]

Progress: [10/50]
Current Acc.: [80.00%]


 30%|███████████████▉                                     | 15/50 [01:06<03:05,  5.29s/it]

Progress: [15/50]
Current Acc.: [73.33%]


 40%|█████████████████████▏                               | 20/50 [01:29<02:24,  4.82s/it]

Progress: [20/50]
Current Acc.: [75.00%]


 50%|██████████████████████████▌                          | 25/50 [01:47<01:31,  3.65s/it]

Progress: [25/50]
Current Acc.: [76.00%]


 60%|███████████████████████████████▊                     | 30/50 [02:05<01:11,  3.56s/it]

Progress: [30/50]
Current Acc.: [80.00%]


 70%|█████████████████████████████████████                | 35/50 [02:23<00:53,  3.58s/it]

Progress: [35/50]
Current Acc.: [82.86%]


 80%|██████████████████████████████████████████▍          | 40/50 [02:44<00:42,  4.23s/it]

Progress: [40/50]
Current Acc.: [82.50%]


 90%|███████████████████████████████████████████████▋     | 45/50 [03:04<00:19,  3.90s/it]

Progress: [45/50]
Current Acc.: [82.22%]


100%|█████████████████████████████████████████████████████| 50/50 [03:24<00:00,  4.08s/it]


Progress: [50/50]
Current Acc.: [82.00%]
Saved to My_prompting_3.txt | Accuracy: 82.00%

=== Running My Prompt - 5-shot ===


 10%|█████▍                                                | 5/50 [00:18<02:48,  3.74s/it]

Progress: [5/50]
Current Acc.: [100.00%]


 20%|██████████▌                                          | 10/50 [00:39<02:45,  4.13s/it]

Progress: [10/50]
Current Acc.: [90.00%]


 30%|███████████████▉                                     | 15/50 [01:00<02:31,  4.34s/it]

Progress: [15/50]
Current Acc.: [80.00%]


 40%|█████████████████████▏                               | 20/50 [01:20<02:03,  4.13s/it]

Progress: [20/50]
Current Acc.: [80.00%]


 50%|██████████████████████████▌                          | 25/50 [01:43<01:53,  4.54s/it]

Progress: [25/50]
Current Acc.: [80.00%]


 60%|███████████████████████████████▊                     | 30/50 [02:06<01:31,  4.58s/it]

Progress: [30/50]
Current Acc.: [83.33%]


 70%|█████████████████████████████████████                | 35/50 [02:28<01:06,  4.41s/it]

Progress: [35/50]
Current Acc.: [85.71%]


 80%|██████████████████████████████████████████▍          | 40/50 [02:54<00:51,  5.18s/it]

Progress: [40/50]
Current Acc.: [87.50%]


 90%|███████████████████████████████████████████████▋     | 45/50 [03:17<00:24,  4.88s/it]

Progress: [45/50]
Current Acc.: [86.67%]


100%|█████████████████████████████████████████████████████| 50/50 [03:41<00:00,  4.42s/it]

Progress: [50/50]
Current Acc.: [86.00%]
Saved to My_prompting_5.txt | Accuracy: 86.00%





### 보고서 작성하기
#### 아래의 내용이 포함되면 됩니다!

1. Direct Prompting, CoT Prompting, My Prompting을 0 shot, 3 shot, 5 shot 정답률을 표로 보여주세요!
2. CoT Prompting이 Direct Prompting에 비해 왜 좋을 수 있는지에 대해서 서술해주세요!
3. 본인이 작성한 프롬프트 기법이 CoT에 비해서 왜 더 좋을 수 있는지에 대해서 설명해주세요!
4. 최종적으로, `PROMPTING.md`에 보고서를 작성해주세요!