In [14]:
import json

# Load the records to evaluate (`data`) and the worked examples used by the
# few-shot prompts (`samples`).
# The encoding is pinned to UTF-8 so that decoding does not depend on the
# platform's locale codec (e.g. cp1252 on Windows), which would corrupt or
# reject non-ASCII text in the dialogues.
with open("./data/input.json", "r", encoding="utf-8") as f:
    data = json.load(f)

with open("./data/sample.json", "r", encoding="utf-8") as f:
    samples = json.load(f)

In [15]:
def generate_input_str(data):
    """Render one record as the "Dialogue: ... Summary: ..." text block.

    Args:
        data: Mapping with a "dialogue" entry (an iterable of mappings that
            each have "speaker" and "utterance") and a "summary" entry.

    Returns:
        A newline-joined string: the "Dialogue:" header, one
        "<speaker>: <utterance>" line per turn, then "Summary:" followed by
        the summary on the next line.
    """
    turns = [f"{turn['speaker']}: {turn['utterance']}" for turn in data["dialogue"]]
    summary_block = f"Summary:\n{data['summary']}"
    return "\n".join(["Dialogue:", *turns, summary_block])

def generate_sample_str(data):
    """Render a worked example: the record text followed by its answer.

    Args:
        data: Record accepted by `generate_input_str` that additionally has
            an "answer" entry.

    Returns:
        The text from `generate_input_str(data)`, then an "Answer:" line,
        the value of data["answer"], and a trailing newline.
    """
    dialogue_and_summary = generate_input_str(data)
    worked_answer = data["answer"]
    return f"{dialogue_and_summary}\nAnswer:\n{worked_answer}\n"

def generate_zeroshot_prompt(input):
    """Build the zero-shot prompt for a single dialogue/summary record.

    Args:
        input: Record accepted by `generate_input_str`.

    Returns:
        The task instruction, the rendered record, and an open "Answer:"
        line terminated by a newline.
    """
    instruction = (
        "Given the data consists of dialogue and summary below, determine whether the summary accurately reflects the dialogue or not. "
        "A summary is accurate when all of its content are consistent with the dialogue. "
        "In contrast, if there is a part of the summary that is inconsistent or not supported by the dialogue, even if its only one part, then the summary is inaccurate. "
        'Answer with "Yes" or "No".'
    )
    prompt_lines = [
        instruction,
        "    ",  # whitespace-only separator line, kept exactly as in the template
        generate_input_str(input),
        "Answer:",
        "",  # yields the trailing newline after "Answer:"
    ]
    return "\n".join(prompt_lines)

def generate_fewshot_prompt(samples, input):
    """Build the few-shot prompt: two answered examples, then the query.

    Args:
        samples: Sequence whose first two items are records accepted by
            `generate_input_str`. The first is shown with the fixed answer
            "Yes" and the second with the fixed answer "No".
            NOTE(review): this assumes samples[0] is an accurate summary and
            samples[1] an inaccurate one — confirm against the sample file.
        input: Record to be judged.

    Returns:
        The task instruction, both answered examples, the rendered query
        record, and an open "Answer:" line terminated by a newline.
    """
    instruction = (
        "Given the data consists of dialogue and summary below, determine whether the summary accurately reflects the dialogue or not. "
        "A summary is accurate when all of its content are consistent with the dialogue. "
        "In contrast, if there is a part of the summary that is inconsistent or not supported by the dialogue, even if its only one part, then the summary is inaccurate. "
        'Answer with "Yes" or "No".'
    )
    positive_example = generate_input_str(samples[0])
    negative_example = generate_input_str(samples[1])
    query = generate_input_str(input)
    prompt_lines = [
        instruction,
        "",
        positive_example,
        "Answer: Yes",
        "",
        negative_example,
        "Answer: No",
        "",
        query,
        "Answer:",
        "",  # yields the trailing newline after "Answer:"
    ]
    return "\n".join(prompt_lines)

def generate_zeroshot_cot_prompt(input):
    """Build the zero-shot chain-of-thought prompt for one record.

    Args:
        input: Record accepted by `generate_input_str`.

    Returns:
        The task instruction, the rendered record, a blank line, "Answer:"
        and the reasoning trigger sentence (no trailing newline).
    """
    instruction = (
        "Given the data consists of dialogue and summary below, determine whether the summary accurately reflects the dialogue or not. "
        "A summary is accurate when all of its content are consistent with the dialogue. "
        "In contrast, if there is a part of the summary that is inconsistent or not supported by the dialogue, even if its only one part, then the summary is inaccurate. "
        'Answer with "Yes" or "No".'
    )
    reasoning_trigger = "Let's think step by step"
    prompt_lines = [
        instruction,
        "    ",  # whitespace-only separator line, kept exactly as in the template
        generate_input_str(input),
        "",
        "Answer:",
        reasoning_trigger,
    ]
    return "\n".join(prompt_lines)

def generate_fewshot_cot_prompt(samples, input):
    """Build the few-shot prompt whose examples carry their own answers.

    Unlike `generate_fewshot_prompt`, each example is rendered with
    `generate_sample_str`, so the text stored under the sample's "answer"
    key is shown after "Answer:".

    Args:
        samples: Sequence whose first two items are records accepted by
            `generate_sample_str`.
        input: Record to be judged.

    Returns:
        The task instruction, both rendered examples, the rendered query
        record, and an open "Answer:" line terminated by a newline.
    """
    instruction = (
        "Given the data consists of dialogue and summary below, determine whether the summary accurately reflects the dialogue or not. "
        "A summary is accurate when all of its content are consistent with the dialogue. "
        "In contrast, if there is a part of the summary that is inconsistent or not supported by the dialogue, even if its only one part, then the summary is inaccurate. "
        'Answer with "Yes" or "No".'
    )
    first_example = generate_sample_str(samples[0])
    second_example = generate_sample_str(samples[1])
    query = generate_input_str(input)
    return (
        instruction
        + "\n\n"
        + first_example
        + "\n"
        + second_example
        + "\n"
        + query
        + "\nAnswer:\n"
    )

def generate_analogical_prompt(input):
    """Build the analogical prompt for one dialogue/summary record.

    The prompt asks the model to first recall two related dialogue/summary
    examples of its own and explain their answers, and only then judge the
    given record.

    Args:
        input: Record accepted by `generate_input_str`.

    Returns:
        The instruction text, the "Relevant Problems" and "Solve the initial
        problem" sections, the rendered record, and an open "Answer:" line
        terminated by a newline.
    """
    # The opening sentence previously read "Your will be given"; the typo is
    # corrected here. The remaining wording is intentionally left identical to
    # the other prompt builders so the prompt types stay comparable.
    return f"""You will be given a dialogue and a summary. Your task is to determine whether the summary accurately reflects the dialogue or not. A summary is accurate when all of its content are consistent with the dialogue. In contrast, if there is a part of the summary that is inconsistent or not supported by the dialogue, even if its only one part, then the summary is inaccurate. When presented with the dialogue and summary, recall relevant dialogues and summaries as examples. Afterward, proceed with the answer "Yes" or "No".

# Instructions:
## Relevant Problems:
Recall two examples of dialogues and summaries that are relevant to the initial problem. The examples should be distinct from each other and from the initial problem. For each sample:
- After "Dialogue: ", describe the dialogue
- After "Summary: ", describe the summary
- After "Answer: ", explain the solution on determining whether the summary accurately reflects the dialogue or not.
## Solve the initial problem:
{generate_input_str(input)}
Answer:
"""

def generate_ps_prompt(input):
    """Build the plan-and-solve prompt for one dialogue/summary record.

    Args:
        input: Record accepted by `generate_input_str`.

    Returns:
        The task instruction, the rendered record, a blank line, "Answer:"
        and the plan-and-solve trigger sentence (no trailing newline).
    """
    instruction = (
        "Given the data consists of dialogue and summary below, determine whether the summary accurately reflects the dialogue or not. "
        "A summary is accurate when all of its content are consistent with the dialogue. "
        "In contrast, if there is a part of the summary that is inconsistent or not supported by the dialogue, even if its only one part, then the summary is inaccurate. "
        'Answer with "Yes" or "No".'
    )
    plan_trigger = (
        "Let's first understand the problem and devise a plan to solve the problem. "
        "Then let's carry out the plan and solve the problem step by step"
    )
    prompt_lines = [
        instruction,
        "    ",  # whitespace-only separator line, kept exactly as in the template
        generate_input_str(input),
        "",
        "Answer:",
        plan_trigger,
    ]
    return "\n".join(prompt_lines)

def generate_psplus_prompt(input):
    """Build the extended plan-and-solve ("PS+") prompt for one record.

    Args:
        input: Record accepted by `generate_input_str`.

    Returns:
        The task instruction, the rendered record, a blank line, an
        "Answer: " line (with its trailing space) and the detailed
        plan-and-solve trigger text (no trailing newline).
    """
    instruction = (
        "Given the data consists of dialogue and summary below, determine whether the summary accurately reflects the dialogue or not. "
        "A summary is accurate when all of its content are consistent with the dialogue. "
        "In contrast, if there is a part of the summary that is inconsistent or not supported by the dialogue, even if its only one part, then the summary is inaccurate. "
        'Answer with "Yes" or "No".'
    )
    plan_trigger = (
        "Let's first read and understand the dialogue carefully, extract facts from each of the dialogue utterances, read the summary, and last compare your understandings of the summary with facts from the dialogue. "
        "Devise a plan to answer the instruction, then carry out the plan to solve the problem step by step."
    )
    prompt_lines = [
        instruction,
        "    ",  # whitespace-only separator line, kept exactly as in the template
        generate_input_str(input),
        "",
        "Answer: ",  # the trailing space is part of the template
        plan_trigger,
    ]
    return "\n".join(prompt_lines)

In [19]:
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)

# Local GGUF model wrapped by llama_index's LlamaCPP; the generation loop
# sends every prompt through `llm.complete`.
llm = LlamaCPP(
    # Which model to load.
    model_path="./../simple-rag/llm/zephyr-7b-beta.Q4_K_M.gguf",
    model_kwargs={"n_gpu_layers": -1},  # presumably "offload all layers to GPU" — confirm in llama-cpp-python docs
    # Size limits.
    context_window=3900,
    max_new_tokens=2048,
    # Sampling.
    temperature=0.1,
    generate_kwargs={},
    # Print llama.cpp diagnostics to the cell output.
    verbose=True,
)

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


In [20]:
from transformers import AutoTokenizer

# Hugging Face checkpoint whose tokenizer is loaded here; the generation loop
# uses it only for `apply_chat_template` (with tokenize=False) to wrap each
# prompt as a chat message.
checkpoint = "HuggingFaceH4/zephyr-7b-beta"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)

In [21]:
import os

from tqdm import tqdm

model = "zephyr-7b-beta"
output_dir = f"./results/verify_summary_v2/{model}"
os.makedirs(output_dir, exist_ok=True)

# One builder per prompting strategy. Dict order is the order in which the
# strategies are run for each record and matches the result file names.
# The few-shot builders also need the worked examples, so they are wrapped to
# expose the same single-argument call as the others.
prompt_builders = {
    "zeroshot": generate_zeroshot_prompt,
    "fewshot": lambda record: generate_fewshot_prompt(samples, record),
    "zeroshot_cot": generate_zeroshot_cot_prompt,
    "fewshot_cot": lambda record: generate_fewshot_cot_prompt(samples, record),
    "analogical": generate_analogical_prompt,
    "ps": generate_ps_prompt,
    "psplus": generate_psplus_prompt,
}

# results[prompt_type] is a list with one {"prompt", "response"} dict per record.
results = {prompt_type: [] for prompt_type in prompt_builders}


def _save_results():
    """Write the current contents of `results` to one JSON file per prompt type."""
    for prompt_type, result in results.items():
        with open(f"{output_dir}/{prompt_type}.json", "w", encoding="utf-8") as f:
            json.dump(result, f, indent=4)


for d in tqdm(data):
    for prompt_type, build_prompt in prompt_builders.items():
        # Wrap the raw prompt as a single user turn and let the tokenizer's
        # chat template add the generation prefix; the result stays a string.
        prompt = tokenizer.apply_chat_template(
            [{"role": "user", "content": build_prompt(d)}],
            tokenize=False,
            add_generation_prompt=True,
        )
        results[prompt_type].append({
            "prompt": prompt,
            "response": llm.complete(prompt).text,
        })

    # Checkpoint after every record: a full run takes hours, and writing only
    # at the very end would lose all responses if the kernel dies midway.
    _save_results()

# Final write; also covers the case where `data` is empty and the loop body
# never ran, so the (empty) result files are still produced.
_save_results()

  0%|          | 0/20 [00:00<?, ?it/s]Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
  5%|▌         | 1/20 [02:53<55:02, 173.84s/it]Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
 10%|█         | 2/20 [11:36<1:53:37, 378.77s/it]Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
 15%|█▌        | 3/20 [27:50<3:04:22, 650.76s/it]Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-ma

In [11]:
import json
import glob

# Sanity check over the v1 result files: for every record, the stored
# "verdict" must equal whether "predict" matches "is_accurate". Prints the
# index and path of the first inconsistent record and stops.
for fpath in glob.glob("./results/verify_summary_v1/*/*.json"):
    with open(fpath, "r", encoding="utf-8") as f:
        result = json.load(f)

    found = False
    for idx, r in enumerate(result):
        try:
            mismatch = (r["is_accurate"] == r["predict"]) != r["verdict"]
        except KeyError as exc:
            # Only a missing field is reported as KeyError(fpath); the original
            # bare `except:` also converted unrelated errors (including
            # KeyboardInterrupt) into a misleading KeyError.
            raise KeyError(fpath) from exc
        if mismatch:
            found = True
            print(idx)
            break
    if found:
        print(fpath)
        break

In [24]:
import os

# Aggregate per-model, per-prompt-type confusion counts and annotation tallies
# from the v2 result files into a single verdict.json.
# NOTE(review): the records are expected to carry "is_accurate", "predict",
# "explanation", "no_answer", "plan_and_steps" and "relevant_samples"; the
# generation cell only writes "prompt"/"response", so these are presumably
# added by a separate annotation step — confirm.
all_verdict = {}

for model_dir in glob.glob("./results/verify_summary_v2/*/"):
    # normpath drops the trailing separator and basename then works with both
    # "/" and "\\"; the previous split("\\")[1] raised IndexError on POSIX paths.
    model_type = os.path.basename(os.path.normpath(model_dir))
    # Directories whose name contains "beta" are skipped.
    # NOTE(review): this excludes zephyr-7b-beta — confirm that is intended.
    if "beta" in model_type:
        continue

    verdict = {}
    for fpath in glob.glob(os.path.join(model_dir, "*.json")):
        # File name up to the first dot, e.g. "fewshot_cot.json" -> "fewshot_cot".
        prompt_type = os.path.basename(fpath).split(".")[0]
        with open(fpath, "r", encoding="utf-8") as f:
            result = json.load(f)

        TP = FN = FP = TN = EXP = NO_ANS = PLAN = SAMPLES = 0
        for r in result:
            # Explicit == True / == False comparisons: a record whose fields
            # are neither (e.g. null) falls into none of the four cells.
            if r["is_accurate"] == True and r["predict"] == True:
                TP += 1
            elif r["is_accurate"] == True and r["predict"] == False:
                FN += 1
            elif r["is_accurate"] == False and r["predict"] == True:
                FP += 1
            elif r["is_accurate"] == False and r["predict"] == False:
                TN += 1

            if r["explanation"]:
                EXP += 1
            if r["no_answer"]:
                NO_ANS += 1
            if r["plan_and_steps"]:
                PLAN += 1
            if r["relevant_samples"]:
                SAMPLES += 1

        # Accuracy is undefined when no record landed in the confusion matrix
        # (empty file or no usable predictions); store null instead of raising
        # ZeroDivisionError.
        classified = TP + FN + FP + TN
        accuracy = (TP + TN) / classified * 100.0 if classified else None

        verdict[prompt_type] = {
            "TP": TP,
            "FN": FN,
            "FP": FP,
            "TN": TN,
            "accuracy": accuracy,
            "explanation": EXP,
            "no_answer": NO_ANS,
            "plan_and_steps": PLAN,
            "relevant_samples": SAMPLES,
        }

    all_verdict[model_type] = verdict

with open("./results/verify_summary_v2/verdict.json", "w", encoding="utf-8") as f:
    json.dump(all_verdict, f, indent=4)