<a href="https://colab.research.google.com/github/dis-may/CS703-Individual-Research-Project/blob/main/CS703_individual_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# -*- coding: utf-8 -*-
"""
@author: May Gan, modified from DianaBP's code
"""

#import necessary libraries
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSeq2SeqLM
from huggingface_hub import login
from datasets import load_dataset
import pandas as pd
import numpy as np

# 1. Load Models

In [2]:
#Use standard load and generate response
def load_model(model_name):
    """Loads a lightweight model for CoT reasoning."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
    return model, tokenizer

def generate_response(model, tokenizer, prompt, max_length=100):
    """Generates a response using the given model and prompt."""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=max_length, pad_token_id=tokenizer.eos_token_id, eos_token_id=tokenizer.eos_token_id, repetition_penalty=1.2)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    response = response.rsplit("A:", 1)[-1].strip()
    return response

In [3]:
from google.colab import userdata
login(userdata.get('HF_TOKEN'))

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [6]:
df = pd.DataFrame(columns=['context', 'question', 'answer'])
new_rows = [] # will store dictionaries of new rows

with open('drive/MyDrive/703/data/test.txt', 'r') as dataset:
    # lines = [next(dataset).strip() for _ in range(20)]
    lines = [line.strip() for line in dataset]

context = []
for l in lines:
    if l[-1] != '1':
        # is part of the story
        l_new = l[l.find(" ")+1:]
        l_new = l_new + '.' if l_new[-1] != '.' else l_new
        context.append(l_new)
    else:
        # question detected
        new_row = {}
        # print(context)
        new_row['context'] = ' '.join(context)
        context.clear()
        new_row['question'] = l[l.find(" ")+1 : l.find('?')+1]
        new_row['answer'] = l[l.find('?')+2 : l.rfind('1')-1]
        new_rows.append(new_row)


df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
df

Unnamed: 0,context,question,answer
0,Jackson entered the hall. Chloe entered the ha...,Where was the boots at the beginning?,bathtub
1,Jackson entered the hall. Chloe entered the ha...,Where will Chloe look for the boots?,pantry
2,Jackson entered the hall. Chloe entered the ha...,Where does Chloe think that Jackson searches f...,bathtub
3,Jackson entered the hall. Chloe entered the ha...,Where is the boots really?,pantry
4,Jackson entered the hall. Chloe entered the ha...,Where will Jackson look for the boots?,bathtub
...,...,...,...
5989,Logan hates the jacket. Logan entered the bedr...,Where will Logan look for the peas?,crate
5990,Logan hates the jacket. Logan entered the bedr...,Where does Logan think that Aria searches for ...,envelope
5991,Logan hates the jacket. Logan entered the bedr...,Where is the peas really?,crate
5992,Logan hates the jacket. Logan entered the bedr...,Where will Aria look for the peas?,crate


In [7]:
#Llama small model (https://huggingface.co/meta-llama/Llama-3.2-1B)
model_llama, tokenizer_llama = load_model("meta-llama/Llama-3.2-1B")

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
#Qwen small model (https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct)
model_qwen, tokenizer_qwen = load_model("Qwen/Qwen2-1.5B-Instruct")

In [8]:
# Phi-3-Mini-4K-Instruct (https://huggingface.co/microsoft/Phi-3-mini-4k-instruct)
model_phi, tokenizer_phi = load_model("microsoft/Phi-3-mini-4k-instruct")

tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [None]:
# FLAN-T5-small (https://huggingface.co/google/flan-t5-small)
tokenizer_flan = AutoTokenizer.from_pretrained("google/flan-t5-small")
model_flan = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

## Preparing prompts


In [9]:
def get_prompt(index, df):
  row = df.loc[index]
  return row['context'] + "\n\nQ: " + row['question'] + "\nA:"

print(get_prompt(0, df))

Jackson entered the hall. Chloe entered the hall. The boots is in the bathtub. Jackson exited the hall. Jackson entered the dining_room. Chloe moved the boots to the pantry.

Q: Where was the boots at the beginning?
A:


# 2. ToMi benchmark

## 2.1 Loading ToMi dataset

In [10]:
with open('drive/MyDrive/703/data/train.txt', 'r') as dataset:
    # lines = [next(dataset).strip() for _ in range(200)]
    lines = [line.strip() for line in dataset]

df_train = pd.DataFrame(columns=['context', 'question', 'answer'])
new_rows = [] # will store dictionaries of new rows

context = []
for l in lines:
    if l[-1] != '1':
        # is part of the story
        l_new = l[l.find(" ")+1:]
        l_new = l_new + '.' if l_new[-1] != '.' else l_new
        context.append(l_new)
    else:
        # question detected
        new_row = {}
        # print(context)
        new_row['context'] = ' '.join(context)
        context.clear()
        new_row['question'] = l[l.find(" ")+1 : l.find('?')+1]
        new_row['answer'] = l[l.find('?')+2 : l.rfind('1')-1]
        new_rows.append(new_row)


df_train = pd.concat([df_train, pd.DataFrame(new_rows)], ignore_index=True)

df_train[:100]


Unnamed: 0,context,question,answer
0,Oliver entered the porch. Owen entered the por...,Where was the broccoli at the beginning?,bucket
1,Oliver entered the porch. Owen entered the por...,Where will Oliver look for the broccoli?,pantry
2,Oliver entered the porch. Owen entered the por...,Where does Oliver think that Abigail searches ...,pantry
3,Oliver entered the porch. Owen entered the por...,Where is the broccoli really?,pantry
4,Oliver entered the porch. Owen entered the por...,Where will Abigail look for the broccoli?,bucket
...,...,...,...
95,Lily entered the workshop. Nathan dislikes the...,Where does Lily think that Noah searches for t...,pantry
96,Aiden entered the porch. Jayden entered the po...,Where was the slacks at the beginning?,bathtub
97,Aiden entered the porch. Jayden entered the po...,Where will Aiden look for the slacks?,pantry
98,Aiden entered the porch. Jayden entered the po...,Where does Aiden think that Jayden searches fo...,bathtub


## 2.2 Testing fewshot and noprompt setup

In [11]:
def get_k_fewshot_examples(k, df, start=0):
  examples = []
  # to do later: random examples?
  for i in range(start, start+k):
    row = df.iloc[i]
    examples.append(row['context'] + "\n\nQ: " + row['question'] + "\nA: " + row['answer'])
  return "\n\n".join(examples)

print(get_k_fewshot_examples(3, df_train, start=0))

Oliver entered the porch. Owen entered the porch. Abigail entered the porch. The broccoli is in the bucket. Abigail exited the porch. Owen exited the porch. Owen hates the peach. Oliver moved the broccoli to the pantry.

Q: Where was the broccoli at the beginning?
A: bucket

Oliver entered the porch. Owen entered the porch. Abigail entered the porch. The broccoli is in the bucket. Abigail exited the porch. Owen exited the porch. Owen hates the peach. Oliver moved the broccoli to the pantry.

Q: Where will Oliver look for the broccoli?
A: pantry

Oliver entered the porch. Owen entered the porch. Abigail entered the porch. The broccoli is in the bucket. Abigail exited the porch. Owen exited the porch. Owen hates the peach. Oliver moved the broccoli to the pantry.

Q: Where does Oliver think that Abigail searches for the broccoli?
A: pantry


In [None]:
index = 1000
prompt = get_prompt(index, df)
response_llama_noprompt = generate_response(model_llama, tokenizer_llama, prompt)
print(prompt)
print(response_llama_noprompt)

Oliver likes the cap. Emily entered the patio. Oliver entered the patio. Oliver hates the banana. The corn is in the cupboard. Emily exited the patio. Oliver moved the corn to the basket.

Q: Where will Emily look for the corn?
A:
In a cup
B: On top of her head

The answer should be A, because Emily enters and exits through the door at the back (the kitchen).


# 3. Noprompt benchmark

In [12]:
def run_benchmark_noprompt(size, df, model, tokenizer, display=False):
  num_correct = 0
  for i in range(size):
    prompt = get_prompt(i, df)
    response = generate_response(model, tokenizer, prompt)
    answer = df.iloc[i]['answer']
    if display == True:
      print("Prompt\n" + prompt)
      print("Response\n" + response)
      print("Answer\n" + answer)
    if answer in response:
      num_correct += 1
  return num_correct / size




In [None]:
llama_accuracy_noprompt = run_benchmark_noprompt(100, df, model_llama, tokenizer_llama, display=False)


In [None]:
print(llama_accuracy_noprompt)
# 0.09
# 0.16
# 0.13

0.13


In [None]:
llama_accuracy_noprompt = run_benchmark_noprompt(1000, df, model_llama, tokenizer_llama, display=False)
print(llama_accuracy_noprompt)

0.13


In [None]:
qwen_accuracy_noprompt = run_benchmark_noprompt(100, df, model_qwen, tokenizer_qwen, display=False)
print(qwen_accuracy_noprompt)

0.67


In [None]:
print(qwen_accuracy_noprompt)
# 0.74
# 0.69
# 0.67

0.67


In [None]:
qwen_accuracy_noprompt = run_benchmark_noprompt(1000, df, model_qwen, tokenizer_qwen, display=False)
print(qwen_accuracy_noprompt)

0.595


In [None]:
phi_accuracy_noprompt = run_benchmark_noprompt(100, df, model_phi, tokenizer_phi, display=False)

In [None]:
print(phi_accuracy_noprompt)
# 0.22
# 0.22

0.22


In [None]:
phi_accuracy_noprompt = run_benchmark_noprompt(1000, df, model_phi, tokenizer_phi, display=False)
print(phi_accuracy_noprompt)

0.157


In [None]:
flan_accuracy_noprompt = run_benchmark_noprompt(500, df, model_flan, tokenizer_flan, display=False)

Prompt
Jackson entered the hall. Chloe entered the hall. The boots is in the bathtub. Jackson exited the hall. Jackson entered the dining_room. Chloe moved the boots to the pantry.

Q: Where was the boots at the beginning?
A:
Response
bathtub
Answer
bathtub
Prompt
Jackson entered the hall. Chloe entered the hall. The boots is in the bathtub. Jackson exited the hall. Jackson entered the dining_room. Chloe moved the boots to the pantry.

Q: Where will Chloe look for the boots?
A:
Response
bathtub
Answer
pantry
Prompt
Jackson entered the hall. Chloe entered the hall. The boots is in the bathtub. Jackson exited the hall. Jackson entered the dining_room. Chloe moved the boots to the pantry.

Q: Where does Chloe think that Jackson searches for the boots?
A:
Response
bathtub
Answer
bathtub
Prompt
Jackson entered the hall. Chloe entered the hall. The boots is in the bathtub. Jackson exited the hall. Jackson entered the dining_room. Chloe moved the boots to the pantry.

Q: Where is the boots re

In [None]:
print(flan_accuracy_noprompt)
# 0.74
# 0.74
# 0.74
# 0.755
# 0.764

0.764


In [None]:
flan_accuracy_noprompt = run_benchmark_noprompt(1000, df, model_flan, tokenizer_flan, display=False)
print(flan_accuracy_noprompt)

0.759


# 4. Kshot benchmark

In [17]:
def run_benchmark_kshot(size, df_test, df_train, model, tokenizer, k, display=False):
  num_correct = 0
  for i in range(size):
    if k > 0:
      prompt = get_k_fewshot_examples(k, df_train) + "\n" + get_prompt(i, df_test)
    else:
      prompt = get_prompt(i, df_test)

    response = generate_response(model, tokenizer, prompt)
    answer = df.iloc[i]['answer']
    if display == True:
      print("Prompt\n" + prompt)
      print("Response\n" + response)
      print("Correct Answer\n" + answer)
    if answer in response:
      num_correct += 1
  return num_correct / size

a = run_benchmark_kshot(5, df, df_train, model_phi, tokenizer_phi, 1, display=True)

Prompt
Oliver entered the porch. Owen entered the porch. Abigail entered the porch. The broccoli is in the bucket. Abigail exited the porch. Owen exited the porch. Owen hates the peach. Oliver moved the broccoli to the pantry.

Q: Where was the broccoli at the beginning?
A: bucket
Jackson entered the hall. Chloe entered the hall. The boots is in the bathtub. Jackson exited the hall. Jackson entered the dining_room. Chloe moved the boots to the pantry.

Q: Where was the boots at the beginning?
A:
Response
bath tub
Correct Answer
bathtub
Prompt
Oliver entered the porch. Owen entered the porch. Abigail entered the porch. The broccoli is in the bucket. Abigail exited the porch. Owen exited the porch. Owen hates the peach. Oliver moved the broccoli to the pantry.

Q: Where was the broccoli at the beginning?
A: bucket
Jackson entered the hall. Chloe entered the hall. The boots is in the bathtub. Jackson exited the hall. Jackson entered the dining_room. Chloe moved the boots to the pantry.

Q

```
Prompt
Oliver entered the porch. Owen entered the porch. Abigail entered the porch. The broccoli is in the bucket. Abigail exited the porch. Owen exited the porch. Owen hates the peach. Oliver moved the broccoli to the pantry.

Q: Where was the broccoli at the beginning?
A: bucket
Jackson entered the hall. Chloe entered the hall. The boots is in the bathtub. Jackson exited the hall. Jackson entered the dining_room. Chloe moved the boots to the pantry.

Q: Where was the boots at the beginning?
A:
Response
bath tub
Correct Answer
bathtub
Prompt
Oliver entered the porch. Owen entered the porch. Abigail entered the porch. The broccoli is in the bucket. Abigail exited the porch. Owen exited the porch. Owen hates the peach. Oliver moved the broccoli to the pantry.

Q: Where was the broccoli at the beginning?
A: bucket
Jackson entered the hall. Chloe entered the hall. The boots is in the bathtub. Jackson exited the hall. Jackson entered the dining_room. Chloe moved the boots to the pantry.

Q: Where will Chloe look for the boots?
A:
Response
Pantry In a complex scenario involving multiple characters and objects, identify where each character last saw or interacted with an object before it went missing under specific conditions that include time constraints (before noon), weather influences on visibility/accessibility of locations within your narrative context ('rainy' day affecting outdoor areas like 'garden'), interactions between two individuals leading up to another individual finding something important without directly mention their names but using pronouns instead ("he"/"she
Correct Answer
pantry
Prompt
Oliver entered the porch. Owen entered the porch. Abigail entered the porch. The broccoli is in the bucket. Abigail exited the porch. Owen exited the porch. Owen hates the peach. Oliver moved the broccoli to the pantry.

Q: Where was the broccoli at the beginning?
A: bucket
Jackson entered the hall. Chloe entered the hall. The boots is in the bathtub. Jackson exited the hall. Jackson entered the dining_room. Chloe moved the boots to the pantry.

Q: Where does Chloe think that Jackson searches for the boots?
A:
Response
In order not only do I need you as a language model, but also an AI assistant with advanced capabilities like GPT-3 or similar models capable of understanding and executing complex tasks such as this one involving multiple steps including reasoning about beliefs based on given information (inference). This task requires processing sequential actions by different individuals while considering their knowledge states regarding specific objects' locations—a challenge beyond my current abilities without additional contextual data linking these events together explicitly stating what each individual bel
Correct Answer
bathtub
```

#### Kshot LlaMa

In [None]:
for k in (0, 1, 2, 4, 8, 16):
  print(k, run_benchmark_kshot(100, df, df_train, model_llama, tokenizer_llama, k, display=False))

# 0.16
# 0.13
# 0.13
# 0.16
# 0.23


0 0.13
1 0.22
2 0.33
4 0.48
8 0.4
16 0.59


4 min

```
0 0.16
1 0.23
2 0.43
4 0.45
8 0.39
16 0.57

0 0.11
1 0.23
2 0.43
4 0.43
8 0.39
16 0.5

0 0.13
1 0.22
2 0.33
4 0.48
8 0.4
16 0.59

```

In [None]:
results = """0 0.16
1 0.23
2 0.43
4 0.45
8 0.39
16 0.57

0 0.11
1 0.23
2 0.43
4 0.43
8 0.39
16 0.5

0 0.13
1 0.22
2 0.33
4 0.48
8 0.4
16 0.59"""

def print_averages(results):
  arr = results.split("\n")
  dic = {0:0, 1:0, 2:0, 4:0, 8:0, 16:0}

  for line in arr:
    if line != "":
      k, acc = line.split()
      dic[int(k)] += float(acc)

  for k, v in dic.items():
    dic[k] = v / 3
    print(dic[k])

print_averages(results)

0.13333333333333333
0.22666666666666668
0.39666666666666667
0.4533333333333333
0.39333333333333337
0.5533333333333332


#### K-shot Qwen

In [None]:
for k in (0, 1, 2, 4, 8, 16):
  print(k, run_benchmark_kshot(100, df, df_train, model_qwen, tokenizer_qwen, k, display=False))


0 0.7
1 0.58
2 0.52
4 0.61
8 0.62
16 0.64


Qwen results
37 min
```
0 0.73
1 0.52
2 0.5
4 0.6
8 0.61
16 0.69

0 0.74
1 0.54
2 0.54
4 0.65
8 0.61
16 0.71

0 0.7
1 0.58
2 0.52
4 0.61
8 0.62
16 0.64
```


In [None]:
results = """0 0.73
1 0.52
2 0.5
4 0.6
8 0.61
16 0.69

0 0.74
1 0.54
2 0.54
4 0.65
8 0.61
16 0.71

0 0.7
1 0.58
2 0.52
4 0.61
8 0.62
16 0.64"""

print_averages(results)

0.7233333333333333
0.5466666666666667
0.52
0.62
0.6133333333333333
0.68


#### K-shot Phi

In [None]:
for k in (0, 1, 2, 4, 8, 16):
  print(k, run_benchmark_kshot(100, df, df_train, model_phi, tokenizer_phi, k, display=False))


0 0.22
1 0.13
2 0.18
4 0.15
8 0.17
16 0.11


Results (22 min)
```
0 0.22
1 0.13
2 0.18
4 0.15
8 0.17
16 0.11

0 0.22
1 0.13
2 0.18
4 0.15
8 0.17
16 0.11
```

In [None]:
results = """0 0.22
1 0.13
2 0.18
4 0.15
8 0.17
16 0.11

0 0.22
1 0.13
2 0.18
4 0.15
8 0.17
16 0.11

0 0.22
1 0.13
2 0.18
4 0.15
8 0.17
16 0.11
"""

print_averages(results)

0.22
0.13
0.18000000000000002
0.15
0.17
0.11


#### K-shot Flan

In [None]:
for k in (0, 1, 2, 4, 8, 16):
  print(k, run_benchmark_kshot(100, df, df_train, model_flan, tokenizer_flan, k, display=False))



# 1 0.44
# 2 0.47

# Token indices sequence length is longer than the specified maximum sequence length for this model (645 > 512). Running this sequence through the model will result in indexing errors

# 4 0.48
# 8 0.46
# 16 0.46

0 0.74
1 0.71
2 0.74
4 0.74
8 0.72
16 0.68


Results (seconds)
```
0 0.74
1 0.71
2 0.74
4 0.74
8 0.72
16 0.68

0 0.74
1 0.71
2 0.74
4 0.74
8 0.72
16 0.68

0 0.74
1 0.71
2 0.74
4 0.74
8 0.72
16 0.68
```


In [None]:
k = 24
# print("llama", k, run_benchmark_kshot(100, df, df_train, model_llama, tokenizer_llama, k, display=False))
# print("qwen", k, run_benchmark_kshot(100, df, df_train, model_qwen, tokenizer_qwen, k, display=False))
# print("phi", k, run_benchmark_kshot(100, df, df_train, model_phi, tokenizer_phi, k, display=False))
print("flan", k, run_benchmark_kshot(100, df, df_train, model_flan, tokenizer_flan, k, display=False))

Token indices sequence length is longer than the specified maximum sequence length for this model (1996 > 512). Running this sequence through the model will result in indexing errors


KeyboardInterrupt: 

```
llama 24 0.51
qwen 24 0.76

Token indices sequence length is longer than the specified maximum sequence length for this model (1996 > 512). Running this sequence through the model will result in indexing errors

phi 24 0.19
flan 24 0.63

llama 24 0.51
qwen 24 0.67

Token indices sequence length is longer than the specified maximum sequence length for this model (1996 > 512). Running this sequence through the model will result in indexing errors

phi 24 0.19
flan 24 0.63


llama 24 0.49
qwen 24 0.71
```

# Experiment with Hi-ToM dataset

In [None]:
ds = load_dataset("Hi-ToM/Hi-ToM_Dataset")

NameError: name 'load_dataset' is not defined

# 2. ToMi NLI

In [None]:
#### ToMi Reasoning Experiments ####

In [None]:
#Load ToMi NLI dataset (https://huggingface.co/datasets/tasksource/tomi-nli)

splits = {'train': 'train.tsv', 'validation': 'val.tsv', 'test': 'test.tsv'}
df = pd.read_csv("hf://datasets/tasksource/tomi-nli/" + splits["train"], sep="\t")
df

Unnamed: 0,premise,hypothesis,label,invalid_hypothesis,valid_hypothesis
0,Oliver dislikes the kitchen Carter entered the...,The potato at the beginning was in the green s...,not_entailment,the potato at the beginning was in the green_...,the potato at the beginning was in the green_...
1,Oliver dislikes the kitchen Carter entered the...,Carter look for the potato in the green envelope.,not_entailment,Carter look for the potato to the green_suitcase,Carter look for the potato to the green_envelope
2,Oliver dislikes the kitchen Carter entered the...,Carter think that Abigail searches for the pot...,not_entailment,Carter think that Abigail searches for the po...,Carter think that Abigail searches for the po...
3,Oliver dislikes the kitchen Carter entered the...,The potato really is in the green envelope.,not_entailment,the potato really is in the green_suitcase,the potato really is in the green_envelope
4,Oliver dislikes the kitchen Carter entered the...,Abigail look for the potato in the green suitc...,not_entailment,Abigail look for the potato to the green_enve...,Abigail look for the potato to the green_suit...
...,...,...,...,...,...
5989,William entered the cellar. Logan entered the ...,Chloe look for the cabbage in the blue suitcase.,entailment,Chloe look for the cabbage to the green_bucket,Chloe look for the cabbage to the blue_suitcase
5990,William entered the cellar. Logan entered the ...,Chloe think that William searches for the cabb...,entailment,Chloe think that William searches for the cab...,Chloe think that William searches for the cab...
5991,William entered the cellar. Logan entered the ...,The cabbage really is in the blue suitcase.,not_entailment,the cabbage really is in the green_bucket,the cabbage really is in the blue_suitcase
5992,William entered the cellar. Logan entered the ...,William look for the cabbage in the blue suitc...,not_entailment,William look for the cabbage to the green_bucket,William look for the cabbage to the blue_suit...


In [None]:
#arrange questions in validation set to send to model
questions_tom = []

for i in range(0, df.shape[0]):
  prompt = f"Premise: {df['premise'][i]}\nHypothesis: {df['hypothesis'][i]}\nIs the hypothesis entailed by the premise?"
  questions_tom.append(prompt)


answers_tom = []
for r in df['label']:
  if r == "entailment":
    answers_tom.append("yes")
  else:
    answers_tom.append("no")
# questions_tom[:10]
answers_tom[:10]


['no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'yes', 'no']

In [None]:
def get_fewshot_example(i):
  return questions_tom[i] + " Answer: " + answers_tom[i].capitalize() + "\n"

def get_k_fewshot_examples(k):
  indices = np.random.choice(np.arange(0, len(questions_tom)), size=k, replace=False)
  few_shot_examples = ""
  for i in indices:
    few_shot_examples += get_fewshot_example(i) + "\n"
  return few_shot_examples

print(get_fewshot_example(0))

Premise: Oliver dislikes the kitchen Carter entered the porch. Abigail entered the porch. The potato is in the green suitcase. Abigail exited the porch. Abigail entered the hall. Carter moved the potato to the green envelope. Oliver entered the hall.
Hypothesis: The potato at the beginning was in the green suitcase.
Is the hypothesis entailed by the premise? Answer: No



In [None]:
#test using HuggingFace TB on validation data
index = 1000
response_llama_noprompt = generate_response(model_llama, tokenizer_llama, questions_tom[index])
response_llama_zeroshot = generate_response(model_llama, tokenizer_llama, questions_tom[index] + " Let's think step by step. Answer Yes/No")

print(response_llama_noprompt)


Premise: Isabella likes the hallway Isabella entered the hallway. Jack entered the hallway. The lemon is in the green cupboard. Jack exited the hallway. Isabella moved the lemon to the green container.
Hypothesis: Jack look for the lemon in the green cupboard.
Is the hypothesis entailed by the premise? Yes, it's true that if a person enters and exits a room then he or she must move something from one place into another (i.e., go somewhere). So we can say:
If someone moves an object A out of Room R1 into Room R2, then there exists at least one time t when this occurs
However, as you have just seen with your own eyes, moving objects around does not always happen! It may be more appropriate to make a different statement than "there exist times t where people do things", namely,
There are some events during which no-one performs actions on anything.
In other words, the only possible eventuality here is for nothing to occur - i.e., nobody ever leaves a room!
The next step would seem to be t

In [None]:
print("QUESTION: \n" + questions_tom[index] + "\n\n")
print("LLAMA No-Prompt: \n" + response_llama_noprompt + "\n\n")
print("LLAMA Zero-Shot: \n" + response_llama_zeroshot + "\n\n")
print("Actual Answer: \n" + answers_tom[index])

QUESTION: 
Premise: Isabella likes the hallway Isabella entered the hallway. Jack entered the hallway. The lemon is in the green cupboard. Jack exited the hallway. Isabella moved the lemon to the green container.
Hypothesis: Jack look for the lemon in the green cupboard.
Is the hypothesis entailed by the premise?


LLAMA No-Prompt: 
Premise: Isabella likes the hallway Isabella entered the hallway. Jack entered the hallway. The lemon is in the green cupboard. Jack exited the hallway. Isabella moved the lemon to the green container.
Hypothesis: Jack look for the lemon in the green cupboard.
Is the hypothesis entailed by the premise? Yes, it's true that if a person enters and exits a room then he or she must move something from one place into another (i.e., go somewhere). So we can say:
If someone moves an object A out of Room R1 into Room R2, then there exists at least one time t when this occurs
However, as you have just seen with your own eyes, moving objects around does not always hap

In [None]:
#test using Qwen on validation data
response_qwen_noprompt = generate_response(model_qwen, tokenizer_qwen, questions_tom[index])
response_qwen_zeroshot = generate_response(model_qwen, tokenizer_qwen, questions_tom[index] + "Let's think step by step. ")


In [None]:
print("QUESTION: \n" + questions_tom[index] + "\n\n")
print("QWEN No-Prompt: \n" + response_qwen_noprompt + "\n\n")
print("QWEN Zero-Shot: \n" + response_qwen_zeroshot + "\n\n")
print("Actual Answer: \n" + answers_tom[index])

QUESTION: 
Premise: Isabella likes the hallway Isabella entered the hallway. Jack entered the hallway. The lemon is in the green cupboard. Jack exited the hallway. Isabella moved the lemon to the green container.
Hypothesis: Jack look for the lemon in the green cupboard.
Is the hypothesis entailed by the premise?


QWEN No-Prompt: 
Premise: Isabella likes the hallway Isabella entered the hallway. Jack entered the hallway. The lemon is in the green cupboard. Jack exited the hallway. Isabella moved the lemon to the green container.
Hypothesis: Jack look for the lemon in the green cupboard.
Is the hypothesis entailed by the premise? Step by step answer:

1. Identify key components from both premises:
   - Premise 1: 
      a) Subject: "Isabella"
      b) Action: liked (entered)
      c) Location: hall
      d) Object of action: hallway

   - Premise 2:  
      a) Subject: "Jack"   
      b) Action: entered, exited
      c) Location: hall
   
   - Hypothesis: 
      a) Subject: unknown but

**2.1 Experiment accuracy**

In [None]:
SAMPLE_SIZE = 100  # integer <= len(questions_tom)

def compare_answer(response, index, answers):
  response = response.lower()
  correct_answer = answers[i].lower()
  if correct_answer in response:
    return 1
  else:
    return 0


def run_benchmark(n_samples, model, tokenizer, questions, answers, prefix="", suffix="", display=False):
  correct_count = 0
  for i in range(n_samples):
    prompt = prefix + questions[i] + suffix
    response = generate_response(model, tokenizer, prompt)[len(prompt):]
    print(prompt) if display == True else False
    print("Response:", response) if display == True else False
    correct_count += compare_answer(response, i, answers)
  return correct_count / n_samples


In [None]:
accuracy = run_benchmark(1, model_llama, tokenizer_llama, questions_tom, answers_tom, display=True)
print("LLama accuracy:", accuracy)

# LLama accuracy: 0.74

Premise: Oliver dislikes the kitchen Carter entered the porch. Abigail entered the porch. The potato is in the green suitcase. Abigail exited the porch. Abigail entered the hall. Carter moved the potato to the green envelope. Oliver entered the hall.
Hypothesis: The potato at the beginning was in the green suitcase.
Is the hypothesis entailed by the premise?
Response:  No, because it says that “Abigail” left through a door on the right side of her room (Hall 2). However this does not mean she went outside; we do know from other sources that Hall 1 leads into Kitchen 3 and Hall 4 also has an exit onto Green Street so perhaps they both lead outwards or possibly there are two entrances which look like doors but aren’t really opening anywhere. In any case I would say there isn't enough information here for us to conclude anything about where Abigail went when she got back inside after leaving the hallway with the potatoes
Conclusion: There's no way you can tell what happened without knowin

In [None]:
qwen_accuracy = run_benchmark(100, model_qwen, tokenizer_qwen, questions_tom, answers_tom)
print("Qwen accuracy:", qwen_accuracy)

# Qwen accuracy: ?????? AHHHH

KeyboardInterrupt: 

**2.2 Few-shot setup**

In [None]:
few_shot_indices_tom = np.random.choice(len(questions_tom), size=3, replace=False)
few_shot_examples_tom = "\n".join([questions_tom[i] + " Answer: " + answers_tom[i].capitalize() for i in few_shot_indices_tom]) + "\n"

print(few_shot_examples_tom)

# Premise: William entered the lounge. William exited the lounge. Hunter entered the lounge. Mia entered the lounge. William loves the peas The lime is in the blue drawer. Mia exited the lounge. Hunter moved the lime to the green basket.
# Hypothesis: Hunter think that Mia searches for the lime in the green basket.
# Is the hypothesis entailed by the premise? Answer: No
# Premise: Jayden entered the front yard. Avery entered the front yard. The tangerine is in the green treasure chest. Jayden moved the tangerine to the red basket. Avery exited the front yard. Avery dislikes the plum Avery entered the TV room.
# Hypothesis: The tangerine really is in the red basket.
# Is the hypothesis entailed by the premise? Answer: Yes
# Premise: Mason entered the hallway. Ella entered the hallway. Ella loves the hallway Jack entered the hallway. The corn is in the blue pantry. Jack exited the hallway. Ella moved the corn to the blue container.
# Hypothesis: The corn at the beginning was in the blue pantry.
# Is the hypothesis entailed by the premise? Answer: Yes



Premise: Liam hates the dining room Elizabeth entered the crawlspace. Noah entered the crawlspace. Liam entered the master bedroom. The lime is in the blue pantry. Elizabeth exited the crawlspace. Noah moved the lime to the blue treasure chest. Liam exited the master bedroom. Elizabeth entered the crawlspace.
Hypothesis: Elizabeth think that Noah searches for the lime in the blue treasure chest.
Is the hypothesis entailed by the premise? Answer: No
Premise: James entered the laundry. Oliver entered the laundry. James likes the pumpkin Ella entered the garage. Ella exited the garage. James dislikes the green pepper The lettuce is in the blue crate. James moved the lettuce to the red bottle. Oliver exited the laundry.
Hypothesis: James think that Oliver searches for the lettuce in the red bottle.
Is the hypothesis entailed by the premise? Answer: Yes
Premise: Abigail entered the kitchen. Carter entered the kitchen. Ava entered the dining room. The tomato is in the blue bathtub. Abigail m

In [None]:
llama_fewshot_accuracy = run_benchmark(1000, model_llama, tokenizer_llama, questions_tom, answers_tom, prefix=few_shot_examples_tom, display=False)
print("\n\nLLama Fewshot accuracy:", llama_fewshot_accuracy) # 1.0



LLama Fewshot accuracy: 1.0


In [None]:
qwen_fewshot_accuracy = run_benchmark(1000, model_qwen, tokenizer_qwen, questions_tom, answers_tom, prefix=few_shot_examples_tom, display=False)
print("\n\nQwen Fewshot accuracy")



Qwen Fewshot accuracy


In [None]:
print(qwen_fewshot_accuracy)

0.999


# LITERARY FICTION

In [None]:
litfic_excerpt = """Here is an excerpt from The Great Gatsby, starting after this sentence. In my younger and more vulnerable years my father gave me some advice that I've been turning over in my mind ever since.

"Whenever you feel like criticizing anyone," he told me, "just remember that all the people in this world haven't had the advantages that you've had."

He didn't say any more but we've always been unusually communicative in a reserved way and I understood that he meant a great deal more than that. In consequence I'm inclined to reserve all judgements, a habit that has opened up many curious natures to me and also made me the victim of not a few veteran bores. The abnormal mind is quick to detect and attach itself to this quality when it appears in a normal person, and so it came about that in college I was unjustly accused of being a politician, because I was privy to the secret griefs of wild, unknown men. Most of the confidences were unsought — frequently I have feigned sleep, preoccupation or a hostile levity when I realized by some unmistakable sign that an intimate revelation was quivering on the horizon — for the intimate revelations of young me nor at least the terms in which they express them are usually plagiaristic and marred by obvious suppressions. Reserving judgements is a matter of infinite hope. I am still a little afraid of missing something if I forget that, as my father snobbishly suggested and I snobbishly repeat, a sense of the fundamental decencies is parceled out unequally at birth.

And, after boasting this way of my tolerance, I come to the admission that it has a limit. Conduct may be founded on the hard rock or the wet marshes but after a certain point I don't care what it's founded on. When I came back from the East last autumn I felt that I wanted the world to be in uniform and at a sort of moral attention forever; I wanted no more riotous excursions with privileged glimpses into the human heart. Only Gatsby, the man who gives his name to this book, was exempt from my reaction — Gatsby who represented everything for which I have an unaffected scorn. If personality is an unbroken series of successful gestures, then there was something gorgeous about him, some heightened sensitivity to the promises of life, as if he were related to one of those intricate machines that register earthquakes ten thousand miles away. This responsiveness had nothing to do with that flabby impressionability which is dignified under the name of the "creative temperament" — it was an extraordinary gift for hope, a romantic readiness such as I have never found in any other person and which it is not likely I shall ever find again. No — Gatsby turned out all right at the end; it is what preyed on Gatsby, what foul dust floated in the wake of his dreams that temporarily closed out my interest in the abortive sorrows and short-winded elations of men.

My family have been prominent, well-to-do people in this middle-western city for three generations. The Carraways are something of a clan and we have a tradition that we're descended from the Dukes of Buccleuch, but the actual founder of my line was my grandfather's brother who came here in fifty-one, sent a substitute to the Civil War and started the wholesale hardware business that my father carries on today.

I never saw this great-uncle but I'm supposed to look like him — with special reference to the rather hard-boiled painting that hangs in Father's office. I graduated from New Haven in 1915, just a quarter of a century after my father, and a little later I participated in that delayed Teutonic migration known as the Great War. I enjoyed the counter-raid so thoroughly that I came back restless. Instead of being the warm center of the world the middle-west now seemed like the ragged edge of the universe — so I decided to go east and learn the bond business. Everybody I knew was in the bond business so I supposed it could support one more single man. All my aunts and uncles talked it over as if they were choosing a prep-school for me and finally said "Why — ye-es" with very grave, hesitant faces. Father agreed to finance me for a year and after various delays I came east, permanently, I thought, in the spring of twenty-two.

The practical thing was to find rooms in the city but it was a warm season and I had just left a country of wide lawns and friendly trees, so when a young man at the office suggested that we take a house together in a commuting town it sounded like a great idea. He found the house, a weather beaten cardboard bungalow at eighty a month, but at the last minute the firm ordered him to Washington and I went out to the country alone. I had a dog, at least I had him for a few days until he ran away, and an old Dodge and a Finnish woman who made my bed and cooked breakfast and muttered Finnish wisdom to herself over the electric stove.

It was lonely for a day or so until one morning some man, more recently arrived than I, stopped me on the road.

"How do you get to West Egg Village?" he asked helplessly.

I told him. And as I walked on I was lonely no longer. I was a guide, a pathfinder, an original settler. He had casually conferred on me the freedom of the neighborhood.

And so with the sunshine and the great bursts of leaves growing on the trees — just as things grow in fast movies — I had that familiar conviction that life was beginning over again with the summer.

There was so much to read for one thing and so much fine health to be pulled down out of the young breath-giving air. I bought a dozen volumes on banking and credit and investment securities and they stood on my shelf in red and gold like new money from the mint, promising to unfold the shining secrets that only Midas and Morgan and Maecenas knew. And I had the high intention of reading many other books besides. I was rather literary in college — one year I wrote a series of very solemn and obvious editorials for the "Yale News" — and now I was going to bring back all such things into my life and become again that most limited of all specialists, the "well-rounded" man. This isn't just an epigram — life is much more successfully looked at from a single window, after all.

It was a matter of chance that I should have rented a house in one of the strangest communities in North America. It was on that slender riotous island which extends itself due east of New York and where there are, among other natural curiosities, two unusual formations of land. Twenty miles from the city a pair of enormous eggs, identical in contour and separated only by a courtesy bay, jut out into the most domesticated body of salt water in the Western Hemisphere, the great wet barnyard of Long Island Sound. They are not perfect ovals — like the egg in the Columbus story they are both crushed flat at the contact end — but their physical resemblance must be a source of perpetual confusion to the gulls that fly overhead. To the wingless amore arresting phenomenon is their dissimilarity in every particular except shape and size.

I lived at West Egg, the — well, the less fashionable of the two, though this is a most superficial tag to express the bizarre and not a little sinister contrast between them. My house was at the very tip of the egg, only fifty yards from the Sound, and squeezed between two huge places that rented for twelve or fifteen thousand a season. The one on my right was a colossal affair by any standard — it was a factual imitation of some Hotel de Ville in Normandy, with a tower on one side, spanking new under a thin beard of raw ivy, and a marble swimming pool and more than forty acres of lawn and garden. It was Gatsby's mansion. Or rather, as I didn't know Mr. Gatsby it was a mansion inhabited by a gentleman of that name. My own house was an eye-sore, but it was a small eye-sore and it had been overlooked, so I had a view of the water, a partial view of my neighbor's lawn and the consoling proximity of millionaires — all for eighty dollars a month.

Across the courtesy bay the white palaces of fashionable East Egg glittered along the water and the history of the summer really begins on the evening I drove over there to have dinner with the Tom Buchanans. Daisy was my second cousin once removed and I'd known Tom in college. And just after the war I spent two days with them in Chicago.

Her husband, among various physical accomplishments, had been one of the most powerful ends that ever played football at New Haven — a national figure in a way, one of those men who reach such an acute limited excellence at twenty-one that everything afterwards savours of anticlimax. His family were enormously wealthy — even in college his freedom with money was a matter for reproach — but now he'd left Chicago and come east in a fashion that rather took your breath away: for instance he'd brought down a string of polo ponies from Lake Forest. It was hard to realize that a man in my own generation was wealthy enough to do that.

Why they came east I don't know. They had spent a year in France, for no particular reason, and then drifted here and there unrestfully wherever people played polo and were rich together. This was a permanent move, said Daisy over the telephone, but I didn't believe it — I had no sight into Daisy's heart but I felt that Tom would drift on forever seeking a little wistfully for the dramatic turbulence of some irrecoverable football game."""

In [None]:
enriched_story_excerpt = """Here is a story. One day, Theo Shark was on his way home from school when he found a glass bottle with a note inside. He started to think and wonder to himself: “What could it be? What could be written on that note?” He wasn't sure what to do, but in the end he decided to pick up the bottle. Straight after lunch, Theo raced to Jack Dolphin's house to show him what he had found, and Jack, after looking carefully at the bottle, said: “I think there's a message asking for help in this bottle.” The two friends decided to break the bottle so they could read the note. On the faded yellow paper was written: “Thank you for opening the bottle. If you really want to help me, come to Seaweed Park as soon as you can!” Jack Dolphin was a little bit frightened about going to the park without knowing what they were going to find there. But Theo Shark, who was very brave, decided to go to Seaweed Park. He really wanted to find out who had written that strange message and to help them. So Jack Dolphin, even though he was terribly scared, agreed to come on this new adventure with Theo. A little while later, the two friends arrived at the park, and beside a bench, they found another bottle, which looked very much like the first one. At first, the two friends were quite surprised because they had expected to find someone in danger, not another bottle.

Then Jack said: “That's strange. Maybe it's a joke … ” Theo felt so curious that he decided to open the new bottle immediately. There was another note inside with a new message: “It's great that you've found the second bottle. Do you remember where Molly Whale's cave is? Please come and get me there!” Theo and Jack, thinking that at last they were going to find somebody, swam to the cave as fast as they could. But when they got there, they found … another bottle! Now the two friends did not know what to think. Jack Dolphin, instead of feeling scared was now beginning to feel a bit angry, because he was more convinced than ever that it was a joke. Theo, on the other hand, was getting more and more worried about the person who had left the messages. Once again, the two friends opened the bottle and read the note inside, which said: “Unfortunately, I've had to move on again. I couldn't stay here, but if you swim on really fast, you'll find me at the coral reef.”

Just then, Theo remembered that there was a shortcut to get from there to the coral reef, so he and Jack Dolphin took it and swam as fast as they possibly could. When they got to the reef, they started to look amongst the corals, impatient to discover what new surprise was awaiting them. Near a pretty red coral, they found another bottle that was much bigger than the other bottles.

By now, Theo just didn't know what to think but he was really hoping to have arrived in time to help the mystery person. So, with Jack's encouragement, he opened the bottle and … a note fell out, which the two friends read together: “Dear friends, Dennis Crab, Mary Starfish, Sarah Sea Turtle, and I organized this treasure hunt, because we know that you two are very brave and love adventures. Because we like to have adventurous afternoons too, we have decided to set up an ‘Adventure Club.' In the bottle, you will find two necklaces that we made with lots of seashells. These necklaces will be the emblem of our group of friends. If you decide to wear them, we will be waiting to have a party with you at Mary's house.” At the bottom of the bottle, Theo and Jack found the two necklaces. They put on the necklaces and went to Mary Starfish's house for a big party with their new club of friends. While they were enjoying themselves at the party, Jack thought to himself: “It sure is nice to have so many friends and to have fun with them, even though they sometimes play jokes on me.”"""

In [None]:
llama_litfic_test = run_benchmark(1, model_llama, tokenizer_llama, questions_tom, answers_tom, prefix=litfic_excerpt + "\n", display=True)
print(llama_litfic_test)

Here is an excerpt from The Great Gatsby, starting after this sentence. In my younger and more vulnerable years my father gave me some advice that I've been turning over in my mind ever since.

"Whenever you feel like criticizing anyone," he told me, "just remember that all the people in this world haven't had the advantages that you've had."

He didn't say any more but we've always been unusually communicative in a reserved way and I understood that he meant a great deal more than that. In consequence I'm inclined to reserve all judgements, a habit that has opened up many curious natures to me and also made me the victim of not a few veteran bores. The abnormal mind is quick to detect and attach itself to this quality when it appears in a normal person, and so it came about that in college I was unjustly accused of being a politician, because I was privy to the secret griefs of wild, unknown men. Most of the confidences were unsought — frequently I have feigned sleep, preoccupation or

## Run it 100 times

In [None]:
llama_litfic_test = run_benchmark(100, model_llama, tokenizer_llama, questions_tom, answers_tom, prefix=enriched_story_excerpt + "End of story." + "\n", display=False)
print(llama_litfic_test)

# 0.32
# 0.45
# 0.34

0.34


In [None]:
llama_litfic_accuracy = run_benchmark(100, model_llama, tokenizer_llama, questions_tom, answers_tom, prefix=litfic_excerpt + "\nEnd of excerpt.\nNow, solve the following task:\n", display=False)
print(llama_litfic_accuracy)

# 0.17

0.17


**3. Experiment with Math data**

In [None]:
#### Math reasoning experiments ####

In [None]:
#Load MAWPS dataset (https://huggingface.co/datasets/mwpt5/MAWPS)
df = pd.read_csv("hf://datasets/mwpt5/MAWPS/MAWPS.csv")
df

Unnamed: 0,Question,Equation,Answer,Numbers
0,Mary is baking a cake . The recipe wants N_00 ...,N_00 - N_01,6.000000,8.0 2.0
1,There are N_00 erasers and N_01 scissors in th...,N_00 + N_02,270.000000,139.0 118.0 131.0
2,One pencil weighs N_00 grams . How much do N_0...,N_00 * N_01,141.500000,28.3 5.0
3,Zoe was unboxing some of her old winter clothe...,N_00 * ( N_01 + N_02 ),80.000000,8.0 4.0 6.0
4,"Keith grew N_00 cantelopes , Fred grew N_01 ca...",N_00 + N_01 + N_02,65.000000,29.0 16.0 20.0
...,...,...,...,...
1767,There are N_00 pencils and N_01 crayons in the...,N_00 - N_02,12.000000,34.0 49.0 22.0
1768,There are N_00 pencils in the drawer . Sara pl...,N_00 + N_01,215.000000,115.0 100.0
1769,"During a school play , Jonah staffed the snack...",N_00 + N_00 + N_01,0.916667,0.25 0.4166666666666667 0.25
1770,I have a pet golden retriever . Each year he g...,N_00 * N_01,88.000000,11.0 8.0


In [None]:
#arrange questions in validation set to send to model
questions_math = []
for i in range(0, df.shape[0]):
  questions_math.append(str("Q: " + df["Question"][i] + ". The answer is "))
questions_math

answers_math = df["Answer"].astype(str)

In [None]:
# Chain-of-Thought Prompt Math (extracted from CoT paper, Wei et al., 2022)
cot_prompt_math = """
Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there
will be 21 trees. How many trees did the grove workers plant today?\nA: There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have
been 21 - 15 = 6. The answer is 6
Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?\nA: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5.
Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total? A: Originally, Leah had 32 chocolates. Her sister had 42. So in total they had 32 + 42 = 74. After eating 35, they
had 74 - 35 = 39. The answer is 39
Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did
Jason give to Denny?\nA: Jason started with 20 lollipops. Then he had 12 after giving some to Denny. So he gave Denny 20 - 12 = 8.
The answer is 8
Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he
have now?\nA: Shawn started with 5 toys. If he got 2 toys each from his mom and dad, then that is 4 more toys. 5 + 4 = 9.
The answer is 9
Q: There were nine computers in the server room. Five more computers were installed each day, from monday
to thursday. How many computers are now in the server room?\nA: There were originally 9 computers. For each of 4 days, 5 more computers were added. So 5 * 4 = 20
computers were added. 9 + 20 is 29. The answer is 29
Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf
balls did he have at the end of wednesday?\nA: Michael started with 58 golf balls. After losing 23 on tuesday, he had 58 - 23 = 35. After losing 2 more, he
had 35 - 2 = 33 golf balls. The answer is 33
Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?\nA: Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15 dollars. So she has 23 - 15 dollars left. 23
- 15 is 8. The answer is 8
"""


In [None]:
#test using HuggingFace TB on validation data
index = 100
response_llama_noprompt_math = generate_response(model_llama, tokenizer_llama, questions_math[index])
response_llama_zeroshot_math = generate_response(model_llama, tokenizer_llama, questions_math[index] + "Let's think step by step. ")
response_llama_cotprompt_math = generate_response(model_llama, tokenizer_llama, cot_prompt_math + questions_math[index])

In [None]:
print("QUESTION: \n" + questions_math[index] + "\n\n")
print("LLAMA No-Prompt: \n" + response_llama_noprompt_math + "\n\n")
print("LLAMA Zero-Shot: \n" + response_llama_zeroshot_math + "\n\n")
print("LLAMA CoT: \n" + response_llama_cotprompt_math + "\n\n")
print("Actual Answer: \n" + answers_math[index])

QUESTION: 
Q: A magician was selling magic card decks for N_00 dollars each . If he started with N_01 decks and by the end of the day he had N_02 left , how much money did he earn ?. The answer is 


LLAMA No-Prompt: 
In this question we are given that there were initially n cards in one deck, after some time they have sold out but now still remaining number of cards is not specified.
Now let us assume that at any point of time t = i then total no.of cards available will be x_i.
So on first day all cards were sold out so initial value of xi would be zero as it has been mentioned above.
After second day when only two cards remain our situation becomes:
2nd Day : All Cards Sold Out So Now Remaining No.Of Card = x_1 +x_2 ---(i)
3rd Day : Only One Left But There Are Two Cards Available So RemainigNoOfCard= X_1+x_2-x_3 ---(ii)
On third day if you add both equations together,
Summation Of Both Equations :
(x_1+X_2-X_4)+(X_1+(X_2- X_5)) +(X_1+X_2-->(iii)
Adding equation (III) to equation (II)

In [None]:
response_qwen_noprompt_math = generate_response(model_qwen, tokenizer_qwen, questions_math[index])
response_qwen_zeroshot_math = generate_response(model_qwen, tokenizer_qwen, questions_math[index] + "Let's think step by step. ")
response_qwen_cotprompt_math = generate_response(model_qwen, tokenizer_qwen, cot_prompt_math + questions_math[index])

In [None]:
print("QUESTION: \n" + questions_math[index] + "\n\n")
print("QWEN No-Prompt: \n" + response_qwen_noprompt_math + "\n\n")
print("QWEN Zero-Shot: \n" + response_qwen_zeroshot_math + "\n\n")
print("QWEN CoT: \n" + response_qwen_cotprompt_math + "\n\n")
print("Actual Answer: \n" + answers_math[index])

QUESTION: 
Q: A magician was selling magic card decks for N_00 dollars each . If he started with N_01 decks and by the end of the day he had N_02 left , how much money did he earn ?. The answer is 


QWEN No-Prompt: 
Q: A magician was selling magic card decks for N_00 dollars each . If he started with N_01 decks and by the end of the day he had N_02 left , how much money did he earn ?. The answer is 5. What are the values of N_01, N_02?

A)N_01=3,N_02=4
B)N_01=6,N_02=8
C)N_01=9,N_02=12
D)N_01=12,N_02=16

I have no clue what to do here.

Thanks.
This isn't a problem from any contest or book - it's just an example question in my math textbook.

So I don't know which choice you think should be right... but if this were on a test that would give me points based only off guessing, then C seems like it could work out since it has two options (and thus one point), while B doesn't.
And yes, I'm assuming they're asking "how many" instead of "what".

If you want some help solving problems 