## Prepare KB for Indexing

In [1]:
import pandas as pd
from string import Template

# Load the generated knowledge base; each row becomes one indexed document.
kb = pd.read_csv("data/generated_knowledge/all-knowledge.csv")

# Text layout of a single document: title line, then summary, then description.
doc_template = Template(
    "$title\nSummary: $summary\nDescription: $context\n"
)

# Render every KB row through the template, preserving row order so that
# `documents[k]` corresponds to the k-th row of `kb`.
documents = [
    doc_template.substitute(
        title=kb_row["title"],
        summary=kb_row["description"],
        context=kb_row["linearized_paragraph"],
    )
    for _, kb_row in kb.iterrows()
]

## Setup Vector DB

In [None]:
import chromadb
from chromadb.utils import embedding_functions
# Sentence-transformers model used to embed both the KB documents and the queries.
embedder = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="multi-qa-mpnet-base-dot-v1"
)

# The client persists to disk, so the collection survives kernel restarts.
client = chromadb.PersistentClient(path="kb-cache/")

# get_or_create_collection keeps this cell re-runnable: create_collection raises
# once "causal-kb" already exists in the persisted store.
db = client.get_or_create_collection(
    name="causal-kb",
    embedding_function=embedder,
    metadata={
        "hnsw:space": "ip",  # rank neighbours by inner product
        "embedding": "multi-qa-mpnet-base-dot-v1"  # records which model built the index
    }
)

In [None]:
# Index the rendered documents. Each KB row is attached as metadata and the
# row's `uid` column is used as the record id.
kb_metadata = kb.to_dict(orient="records")
kb_ids = kb["uid"].tolist()

db.add(documents=documents, metadatas=kb_metadata, ids=kb_ids)

## RAG Pipeline

### LLM Query Methods

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch 
from openai import OpenAI
import re
from string import Template
from tenacity import retry, stop_after_attempt, wait_fixed

class QueryEngine:
    """Send a templated prompt to either a local causal LM or the OpenAI chat API.

    The backend is chosen from ``model_alias``: an alias containing ``"gpt"``
    is served through the OpenAI client, any other alias is loaded with
    ``transformers``. The prompt template must contain an ``$input``
    placeholder, which is filled with the text passed to :meth:`query`.
    """

    def __init__(
            self,
            model_alias: str,
            template: Template,
            quant_config: "BitsAndBytesConfig | dict | None" = None,
        ):
        """Load the backend for ``model_alias``.

        Args:
            model_alias: Hugging Face model id, or an OpenAI model name
                containing ``"gpt"``.
            template: ``string.Template`` with an ``$input`` placeholder.
            quant_config: Optional quantization config for local models. It is
                only forwarded to ``from_pretrained`` when provided, so the
                default loads the model unquantized. Ignored for OpenAI models.
        """
        # Function-scope import so the class does not rely on another cell.
        import os

        self.model_alias = model_alias
        self.prompt_template = template

        if "gpt" not in model_alias:
            model_kwargs = {"trust_remote_code": True, "torch_dtype": "auto"}
            if quant_config is not None:
                model_kwargs["quantization_config"] = quant_config

            self.tokenizer = AutoTokenizer.from_pretrained(
                model_alias,
                trust_remote_code=True
            )
            self.model = AutoModelForCausalLM.from_pretrained(
                model_alias,
                **model_kwargs
            )
        else:
            # Read the key from the environment instead of hard-coding it in
            # the notebook; export OPENAI_API_KEY before running this cell.
            self.client = OpenAI(
                api_key=os.environ.get("OPENAI_API_KEY")
            )

    def extract_output(self, text: str) -> str:
        """Return the lower-cased text following the first ``output:`` marker.

        Only the remainder of the marker's line is returned (after skipping
        whitespace, including newlines, that directly follows the marker).
        Returns an empty string when no marker is present.
        """
        text = text.lower().strip()

        # `.` does not match newlines, so the capture stops at the end of the line.
        match = re.search(r'output:\s*(.*)', text, re.IGNORECASE)
        if match:
            return match.group(1).strip()
        return ""

    @retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
    def query_gpt(self, prompt: str) -> str:
        """Query the OpenAI chat model named by ``model_alias``.

        The call is attempted up to three times, one second apart. Returns the
        raw message content (no ``output:`` stripping), or an empty string when
        the API returns no content.
        """
        prompt = self.prompt_template.safe_substitute({"input": prompt})
        response = self.client.chat.completions.create(
            # Use the configured alias rather than a fixed model name, so any
            # "gpt" alias queries the model it names.
            model=self.model_alias,
            messages=[
                {
                    "role": "system",
                    "content": "You are an expert in causal reasoning, logical reasoning, and commonsense question-answering. Do not provide an intro or concluding remarks in your response. Be as concise as you can be when responding. "
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            temperature=1,
            max_tokens=20,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        content = response.choices[0].message.content
        # Callers treat the result as a string (.lower(), .strip()).
        return content if content is not None else ""

    def query_llm(self, text: str) -> str:
        """Generate up to 20 new tokens with the local model and extract the answer.

        The decoded sequence contains the prompt as well as the completion, so
        everything up to the first ``[/INST]`` marker is discarded when the
        marker is present; templates without the marker are passed through
        whole. The answer is then taken from the ``output:`` line.
        """
        prompt = self.prompt_template.safe_substitute({"input": text})
        # Move the prompt tensors to wherever the model weights were placed.
        encoded_prompt = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        output = self.model.generate(**encoded_prompt, max_new_tokens=20)
        output_text = self.tokenizer.decode(output[0], skip_special_tokens=True)

        # The previous split on "[\INST]" never matched the "[/INST]" marker used
        # by the templates, so indexing [1] raised IndexError.
        before, marker, after = output_text.partition("[/INST]")
        completion = after if marker else before
        return self.extract_output(completion)

    def query(self, query: str):
        """Dispatch ``query`` to the OpenAI or the local backend based on the alias."""
        if "gpt" in self.model_alias:
            return self.query_gpt(query)
        return self.query_llm(query)

In [6]:
# Multiple-choice prompt in "Instruct: ... Output:" form. `$input` is the
# placeholder QueryEngine fills via safe_substitute.
phi_template = Template(
    "Instruct: Answer the question provided the scenario below. Do not provide an intro or concluding remarks in your response. Do not provide an explanation. Just provide an answer. For multiple-choice return the letter and answer only. $input\nOutput:"
)

# Multiple-choice prompt that asks the model to prefix its answer with
# "output:"; the question text is substituted for `$input`.
gpt_template = Template(
    'Answer the question below. Do not provide an explanation. Provide both the letter and answer option. Use the prefix "output:" and then provide the answer. \n $input'
)

# One-shot free-text prompt for ROPES-style questions; `$input` is replaced by
# the scenario and question. The worked example now matches the one used in
# the Mistral and Llama ROPES templates (small deerland, large bearland), so
# the labelled answer "bearland" is the larger population.
gpt_template_ropes = Template(
"""
Answer the question below. Do not provide an explanation. Just provide the answer and nothing more.
Example:
Situation: Deerland had a small population, while bearland had a large population. Both peoples lived mostly off the land.
Question: which country had less food per individual?
Output: bearland

Scenario:
$input
Output:
"""
)
# Multiple-choice prompt wrapped in [INST] ... [/INST]; the text after
# [/INST] ends with "Output:" so the completion continues the answer line.
mistral_template = Template(
    "<s>[INST]Answer the question provided the scenario below. Do not provide an intro or concluding remarks in your response. Do not provide an explanation. Just provide an answer. For multiple-choice return the letter and answer only. $input\n[/INST]\nOutput:"
) 
# One-shot free-text variant of the [INST] prompt with a worked example
# before the `$input` scenario.
mistral_template_ropes = Template(
     """<s>[INST]Answer the question below. Do not provide an explanation.\nExample:\nsituation: deerland had a small population, while bearland had a large population. both peoples lived mostly off the land.\n question: which country had less food per individual?\nOutput: bearland\nScenario:\n$input\n[/INST]Output:"""
)

# Multiple-choice prompt with a <<SYS>> system section inside [INST] ... [/INST]
# and one worked example; `$input` is replaced by the task text.
# NOTE(review): the "\n" escape at the end of the task sentence adds an extra
# blank line on top of the literal line break — confirm that is intended.
llama_template = Template(
"""
<s>[INST] <<SYS>>
You are an expert in causal reasoning, logical reasoning, and commonsense question-answering. Do not provide an intro or concluding remarks in your response.  Be as concise as you can be when responding. 
<</SYS>>
Task: 
Answer the question provided the scenario below. Do not provide an intro or concluding remarks in your response. Do not provide an explanation. Just provide an answer. For multiple-choice return the correct answer.\n

Example:
What is the capital of France?
Options: 
a) Paris b) London c) Berlin d) Rome
Output:
a) Paris

Task Input:
$input
[/INST]    
Output:
"""
)


# One-shot free-text prompt with a <<SYS>> system section; `$input` is replaced
# by the scenario and question. The example's situation and question sit on
# separate lines (a trailing backslash previously joined them into
# "...off the land.question: ..."), and no stray ":" follows the input.
llama_ropes_template = Template(
"""
<s>[INST] <<SYS>>
You are an expert in causal reasoning, logical reasoning, and commonsense question-answering. Do not provide an intro or concluding remarks in your response.  Be as concise as you can be when responding. 
<</SYS>>
Answer the question below. Do not provide an explanation.

Example:
situation: 
deerland had a small population, while bearland had a large population. both peoples lived mostly off the land.
question: which country had less food per individual?
Output: bearland

Scenario:
$input
[/INST]    
Output:
"""
)


# Model selection: exactly one alias should be active. An alias containing
# "gpt" makes QueryEngine use the OpenAI client; any other alias is loaded
# locally with transformers.
#alias = "microsoft/phi-2"
alias = "gpt-3.5-turbo"
#alias = "mistralai/Mistral-7B-Instruct-v0.2"
#alias = "NousResearch/Llama-2-13b-chat-hf"

# 4-bit NF4 quantization settings, computing in bfloat16. QueryEngine only
# forwards this to from_pretrained on the local-model path.
nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   #bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)
# NOTE(review): the active alias is a GPT model but the template passed here is
# `mistral_template` (with <s>[INST] ... [/INST] markers) — confirm whether
# `gpt_template` / `gpt_template_ropes` was intended for this run.
engine = QueryEngine(alias, mistral_template, nf4_config)

In [None]:
def get_choice(input_string):
    """Map a model response onto one of the option letters 'a'-'d'.

    The response is lower-cased and stripped; an optional leading "output:"
    prefix and an opening bracket are skipped. A letter is accepted only when
    it stands alone or is followed by a non-alphanumeric character ("b", "b)",
    "b.", "b less"), so ordinary words such as "because" or "depends" are no
    longer mistaken for options.

    Args:
        input_string: Raw model response; ``None`` is treated as empty.

    Returns:
        The chosen letter, or 'a' when no option letter can be recognised
        (the same default guess as before).
    """
    text = (input_string or "").strip().lower()

    # Responses prompted with 'Use the prefix "output:"' start with that marker.
    if text.startswith("output:"):
        text = text[len("output:"):]
    text = text.lstrip(" \t\n([")

    if text and text[0] in "abcd":
        letter_stands_alone = len(text) == 1 or not text[1].isalnum()
        if letter_stands_alone:
            return text[0]

    return 'a'

In [13]:
import chromadb
from datasets import load_from_disk
from tqdm.notebook import tqdm 
from chromadb.utils import embedding_functions
import pandas as pd 
from transformers.utils import logging

# Only show transformers log messages at error level during the evaluation run.
logging.set_verbosity_error()

# Re-open the persisted knowledge-base collection with the same embedding
# model name that was used when the collection was created.
client = chromadb.PersistentClient(path="kb-cache/")
embedder = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="multi-qa-mpnet-base-dot-v1"
)
db = client.get_collection("causal-kb", embedding_function=embedder)

# Tasks to evaluate; entries are toggled by (un)commenting.
tasks = [
   #"copa"
   #"ropes", 
   #"wiqa"
   #"ecare", "anli", 
   #"cosmosqa", 
   "ropes", 
   #"wiqa"
]
log_path = "experiment_results/gpt-3.5"
for task in tasks:
    # Each task is stored as one dataset on disk with a `split` column.
    ds = load_from_disk(f"data/calm-bench/datasets/{task}")
    # NOTE(review): `train` is not used anywhere in the visible notebook.
    train = ds.filter(lambda x: x["split"] == "train") 
    test = ds.filter(lambda x: x["split"] == "test")    

    # Re-created for every task, so after the loop it holds only the last
    # task's rows.
    updated_rows = []
    for i, row in tqdm(enumerate(test), total=len(test)):
        
        # NOTE(review): the active prompt reads ROPES-style fields
        # (background / situation / question) regardless of `task`; the
        # commented line is the multiple-choice variant. Swap them when
        # evaluating a multiple-choice task.
        #example = f"Context:\n{row['context']}\nQuestion:\n{row['question']}\nOptions: a) {row['option1']} b) {row['option2']} c) {row['option3']} d) {row['option4']}"
        example = f"Background:\n{row['background']}\nSituation:\n{row['situation']}\nQuestion: {row['question']}"

        # Baseline prediction: the example alone, no retrieved knowledge.
        baseline_pred = engine.query(example)
        row["baseline_pred"] = baseline_pred
        
        # Generate knowledge-augmented prediction
        # Retrieval is keyed on the row's `input` field; the single best
        # matching document is prepended to the example.
        top_context = db.query(query_texts=[row["input"]], n_results=1)["documents"][0][0]
        prompt_with_context = f"Background Knowledge:\n{top_context}\n{example}"
        know_pred = engine.query(prompt_with_context)
        row["know_pred"] = know_pred
        
        # Calculate accuracy
        if task != "ropes":
            # Multiple-choice tasks: compare the parsed option letter with
            # `output_label`.
            row["baseline_choice"] = get_choice(baseline_pred)
            row["baseline_is_correct"] = int(row["baseline_choice"] == row["output_label"])

            row["know_choice"] = get_choice(know_pred)
            row["know_pred_is_correct"] = int(row["know_choice"] == row["output_label"])

        else:
            # ROPES: case-insensitive exact match against `output_text`
            # after trimming surrounding whitespace.
            row["baseline_is_correct"] = int(baseline_pred.lower().strip() == row["output_text"].lower().strip())
            row["know_pred_is_correct"] = int(know_pred.lower().strip() == row["output_text"].lower().strip())
        updated_rows.append(row)

    # test_df = pd.DataFrame(updated_rows)
    # test_df.to_csv(f"{log_path}/{task}-results-2.csv", index=False)

    # with open(f"{log_path}/overall_results.txt", "a") as f:
    #     f.write(f"Task: {task}\n")
    #     f.write(f"Baseline Accuracy: {test_df['baseline_is_correct'].mean()}\n")
    #     f.write(f"Knowledge-Augmented Accuracy: {test_df['know_pred_is_correct'].mean()}\n")
    #     f.write("\n")




  0%|          | 0/500 [00:00<?, ?it/s]

In [23]:
# Per-example results collected by the evaluation loop.
df = pd.DataFrame(updated_rows)

# Annotation columns for the ROPES test set. Only the label columns are taken:
# `df` already has an `input` column, and concatenating a second one produced
# duplicate `input` columns in the selection below.
# NOTE(review): the concat aligns rows by index position, which assumes the CSV
# lists the test examples in the same order as `updated_rows` — confirm.
ropes_test = pd.read_csv("data/calm-bench/test-sets/ropes.csv")
ropes_test = ropes_test[["domain_label", "question_type"]]

df = pd.concat([df, ropes_test], axis=1, join="inner")
df = df[["task",  "domain_label", "question_type", "input", "output_text", "know_pred", "know_pred_is_correct", "baseline_is_correct", "baseline_pred"]]

# Accuracy of both settings on a fixed-seed sample of 100 examples.
df.sample(100, random_state=42)[["baseline_is_correct", "know_pred_is_correct"]].mean().round(2)

baseline_is_correct     0.40
know_pred_is_correct    0.45
dtype: float64

In [30]:
# Accuracy of both settings on the saved sample, rounded to two decimals.
s = pd.read_csv("experiment_results/gpt-3.5/sample.csv")
sample_accuracy = s.loc[:, ["baseline_is_correct", "know_pred_is_correct"]].mean()
sample_accuracy.round(2)

baseline_is_correct     0.54
know_pred_is_correct    0.65
dtype: float64

## Wiqa Eval

In [None]:
import chromadb
from datasets import load_from_disk
from tqdm.notebook import tqdm 
from chromadb.utils import embedding_functions
import pandas as pd 
from transformers.utils import logging
from ast import literal_eval
import numpy as np

# Only show transformers log messages at error level during the evaluation run.
logging.set_verbosity_error()

# Re-open the persisted knowledge-base collection.
embedder = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="multi-qa-mpnet-base-dot-v1"
)
client = chromadb.PersistentClient(path="kb-cache/")
db = client.get_collection("causal-kb", embedding_function=embedder)

log_path = "experiment_results/phi-2"
test = pd.read_csv("data/calm-bench/test-sets/wiqa.csv")

# Gold answers map onto the option letters used in the prompt; anything other
# than "more" / "less" is treated as option c.
answer_letters = {"more": "a", "less": "b"}

updated_rows = []
for _, row in tqdm(test.iterrows(), total=len(test)):
    gold_choice = answer_letters.get(row["updated_answer"], "c")

    # The paragraph steps are stored as a stringified list; render them as a
    # numbered list, one step per line.
    steps = literal_eval(row["question_para_step"])
    context = "\n".join(
        ". ".join((str(number), step))
        for number, step in enumerate(steps, start=1)
    )

    example = f"Context:\n{context}\nQuestion: {row['question']}\nOptions: a) more b) less c) no effect"

    # Baseline prediction: the example alone.
    baseline_pred = engine.query(example)
    row["baseline_pred"] = baseline_pred
    row["baseline_choice"] = get_choice(baseline_pred)
    row["baseline_is_correct"] = int(row["baseline_choice"] == gold_choice)

    # Knowledge-augmented prediction: prepend the single best-matching KB
    # document retrieved for the example text.
    retrieved = db.query(query_texts=[example], n_results=1)
    top_context = retrieved["documents"][0][0]
    know_pred = engine.query(f"{top_context}\n{example}")
    row["know_pred"] = know_pred
    row["know_choice"] = get_choice(know_pred)
    row["know_is_correct"] = int(row["know_choice"] == gold_choice)

    updated_rows.append(row)
    
# test_df = pd.DataFrame(updated_rows)
# test_df.to_csv(f"{log_path}/{task}-results.csv", index=False)

# with open(f"{log_path}/overall_results.txt", "a") as f:
#     f.write(f"Task: {task}\n")
#     f.write(f"Baseline Accuracy: {test_df['baseline_is_correct'].mean()}\n")
#     f.write(f"Knowledge-Augmented Accuracy: {test_df['know_pred_is_correct'].mean()}\n")
#     f.write("\n")

In [None]:
# Collect the evaluated rows and report accuracy for both settings.
test_df = pd.DataFrame(updated_rows)

accuracy_columns = ["baseline_is_correct", "know_is_correct"]
test_df[accuracy_columns].mean().round(2)

In [None]:
# Persist the per-example results under the configured log directory.
wiqa_results_path = f"{log_path}/wiqa-results.csv"
test_df.to_csv(wiqa_results_path, index=False)


In [1]:
# NOTE(review): ad-hoc ratio of two summed pairs of counts; where these four
# numbers come from is not shown in this notebook — label them or remove the cell.
(901 + 146) / (3868 + 1041)

0.21328172743939702