In [1]:
import os
import pandas as pd
import random
random_seed = 42
import numpy as np

import openai
from openai import OpenAI

from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

import torch
from sklearn.neighbors import NearestNeighbors

from tqdm import tqdm
from functools import partial

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#load mistral-7b api through together.ai
with open("./api-key-new") as f:
    OPENAI_API_KEY = f.read().strip()

client = OpenAI(api_key=OPENAI_API_KEY, base_url='https://api.together.xyz')

### 1. combine the dev and val data as training data

In [3]:
# # read and combine the data
# def combine_data(folder_path:str)-> pd.DataFrame:
#     dataset_types = ['dev', 'val']
#     csv_files = [file for file in os.listdir(folder_path) if any(file.endswith(f'{type}.csv') for type in dataset_types)]

#     # combine all data in the dataframe
#     dfs = []
#     for file in csv_files:
#         file_path = os.path.join(folder_path, file)
#         df = pd.read_csv(file_path, names=['text', 'A', 'B', 'C', 'D', 'answer'])
#         print(file_path, len(df))
#         dfs.append(df)

#     combined_df = pd.concat(dfs, ignore_index=True)

#     return combined_df


In [4]:
# math_training_data = combine_data("../mmlu/elementary_math/")
# len(math_training_data)

In [5]:
# moral_training_data = combine_data("../mmlu/moral_scenarios/")
# len(moral_training_data)

In [6]:
# math_training_data.to_csv("./math_train_data.csv")
# moral_training_data.to_csv("./moral_train_data.csv")

In [7]:
# df_math_test = pd.read_csv("./data/math_test_data.csv", names=['text', 'A', 'B', 'C', 'D', 'answer'])
# df_moral_test = pd.read_csv("./data/moral_test_data.csv", names=['text', 'A', 'B', 'C', 'D', 'answer'])

# df_math_test = df_math_test.sample(100, random_state=42)
# df_math_test.to_csv("./data/math_test_data_sample.csv")

# df_moral_test = df_moral_test.sample(100, random_state=42)
# df_moral_test.to_csv("./data/moral_test_data_sample.csv")

### 2. read data as dfs

In [8]:
df_math_train = pd.read_csv("./data/math_train_data.csv", index_col=0)
df_moral_train = pd.read_csv("./data/moral_train_data.csv", index_col=0)

In [9]:
df_math_test = pd.read_csv("./data/math_test_data_sample.csv", index_col=0)
df_moral_test = pd.read_csv("./data/moral_test_data_sample.csv", index_col=0)

### 3. Experimental setting

#### LLM setting

In [10]:
def send_prompt(MESSAGES: list) -> str:

    completion = client.chat.completions.create(
      model="mistralai/Mixtral-8x7B-Instruct-v0.1",
      # model="zephyr-chat",
      messages=MESSAGES,
      temperature=1.0,
      max_tokens=1,
      logprobs=True,
      # top_logprobs=4,
      # logit_bias=dict_logit_bias
    )

    return completion

In [11]:
def completion_print_and_save(completion, do_print:bool):
  top_logprobs = completion.choices[0].logprobs.token_logprobs

  max_probs = {'A': float('-inf'), 'B': float('-inf'), 'C': float('-inf'), 'D': float('-inf')}

  for token, logprob in zip(completion.choices[0].logprobs.tokens, top_logprobs):
      if token in max_probs and logprob > max_probs[token]:
          max_probs[token] = logprob
 
  max_option = max(max_probs, key=max_probs.get)

  if do_print:
    print("Output: \t", completion.choices[0].message.content)
    print("\nTop Logprobs:")
    for option, max_logprob in max_probs.items():
        print(f"{option} \t {max_logprob} \t {round(np.exp(max_logprob), 2)}")
    print("Max Probability Option:", max_option)

  return completion.choices[0].message.content

In [12]:
def process_item(prompting_method, row):
    messages = prompting_method(row)
    completion = send_prompt(messages[0])
    pred = completion_print_and_save(completion, do_print=False)
    return pred

In [13]:
def dynamic_process_item(dynamic_prompt):
    completion = send_prompt(dynamic_prompt[0])
    pred = completion_print_and_save(completion, do_print=False)
    return pred

In [14]:
def acc_calculator(df, pred_col_name):
    if df.isnull().values.any():
        print("Value missing!")
    else:
        total_samples = len(df)
        correct_predictions = sum(df['answer'] == df[pred_col_name].str.strip().str.upper())
        accuracy = correct_predictions / total_samples * 100
        print(f"accuracy: {accuracy:.2f}%")

    return accuracy

In [15]:
def inval_ans_calculator(df, pred_col_name):
    total_samples = len(df)
    valid_answers = df[df[pred_col_name].str.strip().str.upper().isin(['A', 'B', 'C', 'D'])]
    invalid_answers_count = total_samples - len(valid_answers)
    invalid_answers_percentage = (invalid_answers_count / total_samples) * 100

    print(f"invalid answer percentage: {invalid_answers_percentage:.2f}%")
    return invalid_answers_percentage

#### Dynamic Prepare

In [16]:
def get_embeddings(texts):
    tokenized_texts = [tokenizer.encode(text, add_special_tokens=True, truncation=True, max_length=128) for text in texts]
    padded_texts = torch.tensor([tokens + [0] * (128 - len(tokens)) for tokens in tokenized_texts])

    with torch.no_grad():
        embeddings = model(padded_texts)[0][:, 0, :].numpy()

    return embeddings

In [17]:
# find the nearest samples in the training set and store their indices
def find_dynamic_samples(k, train_embeddings, test_embeddings):
    knn = NearestNeighbors(n_neighbors=k, metric='cosine')
    knn.fit(train_embeddings)

    nearest_samples_indices = []
    for embedding in test_embeddings:
        _, indices = knn.kneighbors([embedding])
        nearest_index = indices[0][:k]
        nearest_samples_indices.append(nearest_index)

    return nearest_samples_indices

In [18]:
math_train_embeddings = get_embeddings(df_math_train['text'].to_list())
math_test_embeddings = get_embeddings(df_math_test['text'].to_list())

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


In [19]:
moral_train_embeddings = get_embeddings(df_moral_train['text'].to_list())
moral_test_embeddings = get_embeddings(df_moral_test['text'].to_list())

In [20]:
#store_learning_samples should be initialized as an empty list
#different from random, we conduct batch process here
def dynamic_few_shot_prompt_batch(df_train, df_test, train_embeddings, test_embeddings, sample_number, store_learning_samples):

  batch_messages = []

  nearest_samples_indices = find_dynamic_samples(sample_number, train_embeddings, test_embeddings)

  for item_index, sample_indices in enumerate(nearest_samples_indices):
    item = df_test.iloc[item_index]
    question = f"Question: {item.text}, A: {item.A}, B: {item.B}, C: {item.C}, D: {item.D}"

    other_prompt = ""

    for i, sample_index in enumerate(sample_indices):
      sample = df_train.iloc[sample_index]
      one_example = f"Example{i+1}：Question: {sample.text}, A: {sample.A}, B: {sample.B}, C: {sample.C}, D: {sample.D}\n Correct answer: {sample.answer}\n"
      other_prompt += one_example

    store_learning_samples.append(other_prompt)

    prompt_1 = [
      {
        "role": "system",
        "content": f"{other_prompt}\nNow answer the following question, and you must only output a single character from ('A', 'B', 'C', 'D')",
      },
      {
        "role": "user",
        "content": question,
      }
    ]
    messages = [prompt_1]
    batch_messages.append(messages)

  return batch_messages

### 4. Experiments

#### 4.1 Zero-shot learning

In [21]:
def zero_shot_prompt(item):
  DEFAULT_SYSTEM_PROMPT= "You are a helpful chatbot"
  #print(item.A)
  other_prompt = "Give out the correct answer for the following questions. You must only output a single character from ('A', 'B', 'C', 'D')."
  question = f"Question: {item.text}, A: {item.A}, B: {item.B}, C: {item.C}, D: {item.D}"
  prompt_1 = [
      {
        "role": "system",
        "content": other_prompt,
      },
      {
        "role": "user",
        "content": question,
      }
    ]
  messages = [prompt_1]
  return messages

##### Math

In [22]:
df_math_test['zero_shot_pred'] = df_math_test.apply(lambda row: process_item(zero_shot_prompt, row), axis=1)

zero_math_acc = acc_calculator(df_math_test, 'zero_shot_pred')
zero_math_invalid = inval_ans_calculator(df_math_test, 'zero_shot_pred')

# df_math_test.to_csv("zero_shot_math_results.csv", index=False)

accuracy: 16.00%
invalid answer percentage: 71.00%


##### Moral

In [23]:
df_moral_test['zero_shot_pred'] = df_moral_test.apply(lambda row: process_item(zero_shot_prompt, row), axis=1)

zero_moral_acc = acc_calculator(df_moral_test, 'zero_shot_pred')
zero_moral_invalid = inval_ans_calculator(df_moral_test, 'zero_shot_pred')

# df_moral_test.to_csv("zero_shot_moral_results.csv", index=False)

accuracy: 29.00%
invalid answer percentage: 22.00%


#### 4.2 Random Few-shot

In [24]:
#store_learning_samples should be initialized as an empty list
def random_few_shot_prompt(item, df_train, sample_number, store_learning_samples):
  question = f"Question: {item.text}, A: {item.A}, B: {item.B}, C: {item.C}, D: {item.D}"

  other_prompt = ""

  learning_samples = df_train.sample(sample_number)
  for i, (_, sample) in enumerate(learning_samples.iterrows()):
      one_example = f"Example{i+1}：Question: {sample.text}, A: {sample.A}, B: {sample.B}, C: {sample.C}, D: {sample.D}\n Correct answer: {sample.answer}\n"
      other_prompt += one_example

  store_learning_samples.append(other_prompt)

  prompt_1 = [
    {
      "role": "system",
      "content": f"{other_prompt}\nNow answer the following question, and you must only output a single character from ('A', 'B', 'C', 'D')",
    },
    {
      "role": "user",
      "content": question,
    }
  ]
  messages = [prompt_1]
  return messages

##### Math

In [25]:
math_random_one_samples = []

partial_few_shot_prompt = partial(random_few_shot_prompt, df_train=df_math_train, sample_number=1, store_learning_samples=math_random_one_samples)
df_math_test['random_one_pred'] = df_math_test.apply(lambda row: process_item(partial_few_shot_prompt, row), axis=1)

random_one_math_acc = acc_calculator(df_math_test, 'random_one_pred')
random_one_math_invalid = inval_ans_calculator(df_math_test, 'random_one_pred')

df_math_test['random_one_sample'] = math_random_one_samples

# df_math_test.to_csv("random_one_shot_math_results.csv", index=False)

accuracy: 35.00%
invalid answer percentage: 12.00%


##### Moral

In [26]:
moral_random_one_samples = []

partial_few_shot_prompt = partial(random_few_shot_prompt, df_train=df_moral_train, sample_number=1, store_learning_samples=moral_random_one_samples)
df_moral_test['random_one_pred'] = df_moral_test.apply(lambda row: process_item(partial_few_shot_prompt, row), axis=1)

random_one_moral_acc = acc_calculator(df_moral_test, 'random_one_pred')
random_one_moral_invalid = inval_ans_calculator(df_moral_test, 'random_one_pred')

df_moral_test['random_one_sample'] = moral_random_one_samples

# df_moral_test.to_csv("random_one_shot_moral_results.csv", index=False)

accuracy: 47.00%
invalid answer percentage: 1.00%


#### 4.3 Random Three-shot

##### Math

In [27]:
math_random_three_samples = []

partial_few_shot_prompt = partial(random_few_shot_prompt, df_train=df_math_train, sample_number=3, store_learning_samples=math_random_three_samples)
df_math_test['random_three_pred'] = df_math_test.apply(lambda row: process_item(partial_few_shot_prompt, row), axis=1)

df_math_test['random_three_sample'] = math_random_three_samples

random_three_math_acc = acc_calculator(df_math_test, 'random_three_pred')
random_three_math_invalid = inval_ans_calculator(df_math_test, 'random_three_pred')

# df_math_test.to_csv("random_three_shot_math_results.csv", index=False)

accuracy: 39.00%
invalid answer percentage: 13.00%


##### Moral

In [28]:
moral_random_three_samples = []

partial_few_shot_prompt = partial(random_few_shot_prompt, df_train=df_moral_train, sample_number=3, store_learning_samples=moral_random_three_samples)
df_moral_test['random_three_pred'] = df_moral_test.apply(lambda row: process_item(partial_few_shot_prompt, row), axis=1)

df_moral_test['random_three_sample'] = moral_random_three_samples

random_three_moral_acc = acc_calculator(df_moral_test, 'random_three_pred')
random_three_moral_invalid = inval_ans_calculator(df_moral_test, 'random_three_pred')

# df_moral_test.to_csv("random_three_shot_moral_results.csv", index=False)

accuracy: 48.00%
invalid answer percentage: 8.00%


#### Dynamic One-shot

##### Math

In [29]:
dynamic_one_shot_math_samples = []
dynamic_one_shot_math_prompts = dynamic_few_shot_prompt_batch(df_math_train, df_math_test, math_train_embeddings, math_test_embeddings, 1, dynamic_one_shot_math_samples)

dynamic_one_shot_math_results = []
for m in tqdm(dynamic_one_shot_math_prompts):
    result = dynamic_process_item(m)
    dynamic_one_shot_math_results.append(result)

df_math_test['dynamic_one_sample'] = dynamic_one_shot_math_samples
df_math_test['dynamic_one_pred'] = dynamic_one_shot_math_results

dynamic_one_math_acc = acc_calculator(df_math_test, 'dynamic_one_pred')
dynamic_one_math_invalid = inval_ans_calculator(df_math_test, 'dynamic_one_pred')

# df_math_test.to_csv("dynamic_one_shot_math_results.csv", index=False)

100%|██████████| 100/100 [02:46<00:00,  1.67s/it]

accuracy: 38.00%
invalid answer percentage: 10.00%





##### Moral

In [30]:
dynamic_one_shot_moral_samples = []
dynamic_one_shot_moral_prompts = dynamic_few_shot_prompt_batch(df_moral_train, df_moral_test, moral_train_embeddings, moral_test_embeddings, 1, dynamic_one_shot_moral_samples)

dynamic_one_shot_moral_results = []
for m in tqdm(dynamic_one_shot_moral_prompts):
    result = dynamic_process_item(m)
    dynamic_one_shot_moral_results.append(result)

df_moral_test['dynamic_one_sample'] = dynamic_one_shot_moral_samples
df_moral_test['dynamic_one_pred'] = dynamic_one_shot_moral_results

dynamic_one_moral_acc = acc_calculator(df_moral_test, 'dynamic_one_pred')
dynamic_one_moral_invalid = inval_ans_calculator(df_moral_test, 'dynamic_one_pred')

# df_moral_test.to_csv("dynamic_one_shot_moral_results.csv", index=False)

100%|██████████| 100/100 [02:48<00:00,  1.69s/it]

accuracy: 40.00%
invalid answer percentage: 1.00%





#### Dynamic Three-shot

##### Math

In [31]:
dynamic_three_shot_math_samples = []

dynamic_three_shot_math_prompts = dynamic_few_shot_prompt_batch(df_math_train, df_math_test, math_train_embeddings, math_test_embeddings, 3, dynamic_three_shot_math_samples)

In [32]:
dynamic_three_shot_math_results = []
for m in tqdm(dynamic_three_shot_math_prompts):
    result = dynamic_process_item(m)
    dynamic_three_shot_math_results.append(result)

100%|██████████| 100/100 [02:49<00:00,  1.70s/it]


In [33]:
df_math_test['dynamic_three_sample'] = dynamic_three_shot_math_samples
df_math_test['dynamic_three_pred'] = dynamic_three_shot_math_results

In [34]:
dynamic_three_math_acc = acc_calculator(df_math_test, 'dynamic_three_pred')
dynamic_three_math_invalid = inval_ans_calculator(df_math_test, 'dynamic_three_pred')

accuracy: 40.00%
invalid answer percentage: 13.00%


##### Moral

In [35]:
dynamic_three_shot_moral_samples = []

dynamic_three_shot_moral_prompts = dynamic_few_shot_prompt_batch(df_moral_train, df_moral_test, moral_train_embeddings, moral_test_embeddings, 3, dynamic_three_shot_moral_samples)

In [36]:
dynamic_three_shot_moral_results = []
for m in tqdm(dynamic_three_shot_moral_prompts):
    result = dynamic_process_item(m)
    dynamic_three_shot_moral_results.append(result)

100%|██████████| 100/100 [02:48<00:00,  1.68s/it]


In [37]:
df_moral_test['dynamic_three_sample'] = dynamic_three_shot_moral_samples
df_moral_test['dynamic_three_pred'] = dynamic_three_shot_moral_results

In [38]:
dynamic_three_moral_acc = acc_calculator(df_moral_test, 'dynamic_three_pred')
dynamic_three_moral_invalid = inval_ans_calculator(df_moral_test, 'dynamic_three_pred')

accuracy: 53.00%
invalid answer percentage: 1.00%


### 5. Save results

In [39]:
df_math_test.to_csv("math_RQ1_results.csv", index=False)

In [40]:
df_moral_test.to_csv("moral_RQ1_results.csv", index=False)