In [2]:
import re
import openai
import json
import numpy as np
import time
from datasets import load_dataset
from tqdm.notebook import tqdm
from sklearn.metrics import confusion_matrix, accuracy_score
import random
random.seed(42)  # seed the stdlib RNG with a fixed value

# Read the OpenAI API key from a local text file instead of hard-coding it in the notebook.
with open("../../openai_api_key.txt", "r") as f:
    openai.api_key = f.read().strip()

In [2]:
# system_message = \
# """
# ### Instruction:\n
# Please verify whether the reference can support the answer to the question.
# Options: 'attributable' or 'not attributable'.
# Note that, there are usually two criteria to judge the answer: (i) informativeness: whether the
# claim provides a clear and direct answer that can clearly answer the question, and (ii) 
# attributability: whether the explanation is attributable to the references, which means there are no factual errors
# in the claim that is not supported or conflict to the references.
# Your task is to judge the attributability, instead of the informativeness, and you must not confuse them.
# \n\n
# ### Input:
# """
# Instruction text and input template (slots: question, claim, reference) for the attribution prompt.
# NOTE(review): the name `input` shadows the built-in input() at notebook scope.
instruction = "### Instruction:\nPlease verify whether the reference can support the claim to the question. Options: 'attributable' or 'not attributable'.\n"
input = "### Input:\nQuestion: {}\n\nClaim: {}\n\nReference: {}"

In [12]:
# Run configuration: which subset to evaluate, which model to query, and the prompting mode.
# test_subset_name = "test"
test_subset_name = "test_all_subset_balanced"
model = "gpt-3.5-turbo"
# model = "gpt-3.5-turbo-16k"

# mode = "few_shot"
mode = "zero_shot"

# test_data = [row for row in load_dataset("osunlp/AttributionBench", split=test_subset_name)][:1000]
# Load the local JSONL dump (one JSON object per line). The context manager closes
# the file deterministically, the encoding is explicit so non-ASCII text in the
# references decodes the same on every platform, and blank lines are skipped
# instead of crashing json.loads.
with open("../../data/hf_data/test_all_subset_balanced.jsonl", encoding="utf-8") as data_file:
    test_data = [json.loads(line) for line in data_file if line.strip()]

# TASK_PROMPTS = {p['task_name']:[p['prompt_template'],p['input_template'],p['label_map']] for p in json.load(open("task_prompts.json"))}

In [13]:
# Display one loaded record to inspect its fields.
test_data[1]

{'question': 'who wrote the theme song for mission impossible',
 'claim': 'Lalo Schifrin',
 'claim_raw_string': 'Lalo Schifrin',
 'response': 'Lalo Schifrin',
 'references': ['Title: Mission: Impossible (1966 TV series)\nSection: Inspirations and innovations\n\nMission: Impossible is still recognized for its innovative use of music. Composer Lalo Schifrin wrote several distinctive pieces for the series. The visual cuts in the main title sequence were timed to the beats and measures of the theme tune—written in (unusual) 54 time—while an animated burning fuse moved across the screen. Most episodes included fairly long dialogue-free sequences showing the team members—particularly electronics expert Barney Collier—making technical preparations for the mission, usually to the accompaniment of another easily recognizable tune called "The Plot." Lalo Schifrin also wrote a theme piece for each main character and the sound track for each episode incorporated variations of these throughout. Eve

In [14]:
def format_prompt(example, mode="zero_shot"):
    """Build the attribution-verification prompt for one example.

    Args:
        example: Mapping with ``question`` and ``claim`` entries and a
            ``references`` entry holding an iterable of strings. A question
            or claim that is falsy or equal to the string ``"nan"`` is
            rendered as an empty string.
        mode: Kept for call-site compatibility; this prompt is the same for
            every mode.

    Returns:
        The instruction text followed by the filled input template, ending
        with ``"### Response:"``.

    Raises:
        KeyError: If ``question``, ``claim`` or ``references`` is missing.
    """
    task_prompt = "### Instruction:\nPlease verify whether the reference can support the claim to the question. Options: 'attributable' or 'not attributable'.\n"
    input_template = "### Input:\nQuestion: {}\n\nClaim: {}\n\nReference: {}\n\n### Response:"

    def _clean(value):
        # Falsy values and the literal "nan" are treated as missing.
        return value if value and value not in ["nan", "", None] else ""

    # Only the fields that appear in the template are read. The whole response
    # is not part of this prompt, so a record without a 'response' key is fine.
    query = _clean(example['question'])
    answer = _clean(example['claim'])
    documents_concatenation = "\n\n\n".join(example["references"])

    # Named `filled_input` rather than `input` to avoid shadowing the builtin.
    filled_input = input_template.format(query, answer, documents_concatenation)

    return "{}{}".format(task_prompt, filled_input)

In [15]:
# Print the prompt built for one example.
print(format_prompt(test_data[1]))

### Instruction:
Please verify whether the reference can support the claim to the question. Options: 'attributable' or 'not attributable'.
### Input:
Question: who wrote the theme song for mission impossible

Claim: Lalo Schifrin

Reference: Title: Mission: Impossible (1966 TV series)
Section: Inspirations and innovations

Mission: Impossible is still recognized for its innovative use of music. Composer Lalo Schifrin wrote several distinctive pieces for the series. The visual cuts in the main title sequence were timed to the beats and measures of the theme tune—written in (unusual) 54 time—while an animated burning fuse moved across the screen. Most episodes included fairly long dialogue-free sequences showing the team members—particularly electronics expert Barney Collier—making technical preparations for the mission, usually to the accompaniment of another easily recognizable tune called "The Plot." Lalo Schifrin also wrote a theme piece for each main character and the sound track fo

In [16]:
import traceback

def get_attr_from_chatgpt(prompt, model="gpt-3.5-turbo"):
    """Send ``prompt`` as a single user message and return the reply text, stripped.

    The call is retried without limit: any ``Exception`` is printed together
    with its traceback, and the request is re-sent after a 5 second pause.
    """
    request_kwargs = dict(
        model=model,
        messages=[
            # {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        temperature=0,
        top_p=0.9,
        max_tokens=512,
        n=1,
    )
    while True:
        try:
            completion = openai.ChatCompletion.create(**request_kwargs)
            # print(completion)
            time.sleep(1)  # pause after every successful call
            first_choice = completion['choices'][0]
            return first_choice['message']['content'].strip()
        except Exception as err:
            print("Error:", str(err))
            # Print the detailed stack trace.
            traceback.print_exc()
            time.sleep(5)

In [17]:
# Build the prompt for one example, then print its character count and its text.
prompt = format_prompt(test_data[1], mode=mode)
print(len(prompt))
print(prompt)

1525
### Instruction:
Please verify whether the reference can support the claim to the question. Options: 'attributable' or 'not attributable'.
### Input:
Question: who wrote the theme song for mission impossible

Claim: Lalo Schifrin

Reference: Title: Mission: Impossible (1966 TV series)
Section: Inspirations and innovations

Mission: Impossible is still recognized for its innovative use of music. Composer Lalo Schifrin wrote several distinctive pieces for the series. The visual cuts in the main title sequence were timed to the beats and measures of the theme tune—written in (unusual) 54 time—while an animated burning fuse moved across the screen. Most episodes included fairly long dialogue-free sequences showing the team members—particularly electronics expert Barney Collier—making technical preparations for the mission, usually to the accompaniment of another easily recognizable tune called "The Plot." Lalo Schifrin also wrote a theme piece for each main character and the sound tra

In [18]:
# Single-example check: build the prompt, query the model, and print the reply
# alongside the example's attribution_label.
# prompt = format_prompt(test_data[91], prompt_type='attribution-binary-with-whole-response-and-informativeness-definition', input_has_query=True, input_has_response=True, mode=mode)
prompt = format_prompt(test_data[1], mode=mode)
response = get_attr_from_chatgpt(prompt, model)
print("prompt:", prompt)
print("response:", response)
print("ground_truth", test_data[1]["attribution_label"])

KeyboardInterrupt: 

In [28]:
# Result path is derived from the model name, the subset name and the prompting mode.
output_file = f"./{model}_{test_subset_name}_{mode}_old_document_result.json"
print(output_file)

./gpt-4_expertqa_stanford_strlen_within20k_random500_zero_shot_old_document_result.json


In [10]:

#downsample to save cost

# random.shuffle(test_data)
# test_data = test_data[:500]

prompt_list = []

# for task_name in ["attribution-binary-with-whole-response-and-informativeness-definition"]:
for task_name in ["attribution-binary-with-definition"]:
    # Key under which the model's raw reply is stored on each example.
    res_key = '{}.eval.{}'.format(model,task_name)

    for example in tqdm(test_data):
        # The format_prompt defined earlier in this notebook accepts only
        # (example, mode); passing any other keyword raises TypeError.
        prompt = format_prompt(example, mode=mode)
        prompt_list.append(prompt)
        example[res_key] = get_attr_from_chatgpt(prompt,model)

    # Overwrite instead of append: appending a second JSON array to an existing
    # file produces a document json.load cannot parse. test_data accumulates
    # every res_key, so the last write always holds the complete results.
    with open(output_file, 'w') as out_file:
        json.dump(test_data, out_file)

# print(np.mean([len(x) for x in prompt_list]))
# print(np.std([len(x) for x in prompt_list]))
# print(np.median([len(x) for x in prompt_list]))
# print(np.max([len(x) for x in prompt_list]))
# print(np.min([len(x) for x in prompt_list]))

  0%|          | 0/500 [00:00<?, ?it/s]

In [None]:
def read_few_shot_demo(prompt_type):
    """Return the contents of the few-shot demonstration file for ``prompt_type``.

    Raises:
        KeyError: If ``prompt_type`` has no demonstration file registered.
    """
    demo_files = {
        "attribution-no-definition": "demo_attr.txt",
        "attribution-with-definition": "demo_attr.txt",
        "fact-checking": "demo_fact-checking.txt",
        "nli": "demo_NLI.txt",
        "summarization": "demo_sum.txt"
    }
    demo_path = f"../few-shot-demo/{demo_files[prompt_type]}"
    # The context manager closes the file; no explicit close() is needed.
    with open(demo_path) as demo_file:
        return demo_file.read()

In [None]:
def format_prompt(example, prompt_type = "attribution-binary-with-definition", input_has_query = True, mode="zero_shot"):
    """Build a prompt from the templates registered for ``prompt_type`` in TASK_PROMPTS.

    The input section holds the claim (prefixed with the question when
    ``input_has_query`` is true) and the example's ``documents_concatenation``.
    In ``"few_shot"`` mode the demonstration text from ``read_few_shot_demo``
    is placed between the task prompt and the input section.
    """
    task_prompt, input_template, _ = TASK_PROMPTS[prompt_type]

    def _field(key):
        # Falsy values and the literal "nan" are rendered as an empty string.
        value = example[key]
        return value if value and value not in ["nan",""] else ""

    if input_has_query:
        question_text = _field('question')
        claim_text = _field('claim')
        lead = "Question: " + question_text + " " + "Answer: " + claim_text
    else:
        claim_text = _field('claim')
        lead = "Answer: " + claim_text

    filled_input = input_template.format(lead, "Reference: " + example['documents_concatenation'])

    if mode != "few_shot":
        return "\n{}\n\n### Input: \n{}\n### Response:".format(task_prompt,filled_input)

    demo_str = read_few_shot_demo(prompt_type)
    return "\n{}\n{}\n\n### Input: \n{}\n### Response:".format(task_prompt,demo_str,filled_input)

In [None]:
def get_attr_from_chatgpt(prompt, model="gpt-3.5-turbo"):
    """Send ``prompt`` as a single user message and return the reply text, stripped.

    The request is retried without limit. After a failed attempt the error is
    printed and the call is re-sent following a 5 second pause.
    """
    messages=[
        # {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    while True:
        try:
            response = openai.ChatCompletion.create(
                model=model,
                messages=messages,
                temperature=0,
                top_p=0.9,
                max_tokens=512,
                n=1
            )
            # print(response)
            time.sleep(0.75)
            return response['choices'][0]['message']['content'].strip()
        except Exception as e:
            # Catch Exception rather than using a bare `except:` so that
            # KeyboardInterrupt / SystemExit can still stop this otherwise
            # endless retry loop; also report what actually went wrong.
            print("error", repr(e))
            time.sleep(5)

In [None]:
# Single-example check: build one prompt, query the model, and print both.
prompt = format_prompt(test_data[91],'attribution-binary-with-definition',input_has_query=True,mode=mode)
response = get_attr_from_chatgpt(prompt,model)
print(prompt)
print(response)

In [None]:
# Result path is derived from the model name, the subset name and the prompting mode.
output_file = f"./{model}_{test_subset_name}_{mode}_result.json"
print(output_file)

In [None]:

#downsample to save cost

# random.shuffle(test_data)
# test_data = test_data[:500]

for task_name in ["attribution-binary-with-definition"]:
    # Key under which the model's raw reply is stored on each example.
    res_key = '{}.eval.{}'.format(model,task_name)

    for example in tqdm(test_data):
        prompt = format_prompt(example,prompt_type=task_name,mode=mode)
        example[res_key] = get_attr_from_chatgpt(prompt,model)

    # Overwrite instead of append: appending a second JSON array to an existing
    # file produces a document json.load cannot parse. The context manager also
    # guarantees the data is flushed and the handle closed.
    with open(output_file, 'w') as out_file:
        json.dump(test_data, out_file)

In [None]:
# Display one record from test_data.
test_data[400]

In [11]:
def extract_pred_label(prediction, prompt_type="attribution-binary-with-whole-response-and-informativeness-definition", label_map=None):
    """Map a free-text model reply to a canonical label.

    Args:
        prediction: Raw reply text. ``None`` or an empty string yields ``"None"``.
        prompt_type: Task name used to look up the label map in ``TASK_PROMPTS``
            when ``label_map`` is not supplied.
        label_map: Optional mapping from label surface form to canonical label.
            When given, ``TASK_PROMPTS`` is not consulted.

    Returns:
        The canonical label for the first surface form found in ``prediction``
        (case-insensitive), or the string ``"None"`` when nothing matches.
    """
    if label_map is None:
        label_map = TASK_PROMPTS[prompt_type][-1]
    if not prediction:
        return "None"

    # Look labels up case-insensitively, matching the IGNORECASE search below.
    normalized = {key.lower(): value for key, value in label_map.items()}
    if not normalized:
        return "None"

    # Escape the keys so regex metacharacters in a label are matched literally,
    # and try longer labels first so a label is never cut short by one that is
    # a prefix of it.
    alternatives = sorted(normalized, key=len, reverse=True)
    label_regex = "|".join(re.escape(key) for key in alternatives)

    match = re.search(label_regex, prediction, re.IGNORECASE)
    if match is None:
        return "None"
    return normalized.get(match.group().lower(), "None")

# extract_pred_label(test_data[1][res_key], prompt_type="attribution-binary-with-definition")

In [12]:
def _safe_divide(numerator, denominator):
    """Element-wise division that yields 0.0 wherever the denominator is not positive."""
    return np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator, dtype=float),
        where=denominator > 0,
    )


def evaluate_confusion_matrix(confusion_matrix):
    """Compute per-class precision/recall/F1 plus micro- and macro-F1.

    Args:
        confusion_matrix: Square array-like with rows indexed by true class
            and columns by predicted class.

    Returns:
        ``(precision, recall, f1, micro_f1, macro_f1)`` where the first three
        are arrays with one entry per class. A metric whose denominator is
        zero (class never predicted, class without support, or precision and
        recall both zero) is reported as 0.0 instead of NaN, so ``macro_f1``
        stays finite.
    """
    matrix = np.asarray(confusion_matrix, dtype=float)

    true_positives = np.diag(matrix)
    predicted_totals = matrix.sum(axis=0)   # TP + FP per class (column sums)
    actual_totals = matrix.sum(axis=1)      # TP + FN per class (row sums)

    precision = _safe_divide(true_positives, predicted_totals)
    recall = _safe_divide(true_positives, actual_totals)
    f1 = _safe_divide(2 * precision * recall, precision + recall)

    # The summed false positives are every off-diagonal entry, so
    # TP / (TP + FP) over all classes is the diagonal sum over the grand total.
    grand_total = matrix.sum()
    micro_f1 = true_positives.sum() / grand_total if grand_total > 0 else 0.0
    macro_f1 = np.mean(f1) if f1.size else 0.0

    return precision, recall, f1, micro_f1, macro_f1

In [22]:
# Display the current value of output_file.
output_file

'./gpt-4_expertqa_stanford_strlen_within20k_random500_zero_shot_new_document_result.json'

In [None]:
# Display the parsed contents of the result file.
# NOTE(review): the handle returned by open() is never explicitly closed here.
json.load(open(output_file))

In [29]:
# for task_name in ["attribution-binary-with-whole-response-and-informativeness-definition"]:
for task_name in ["attribution-binary-with-definition"]:
    res_key = '{}.eval.{}'.format(model, task_name)

    # Parse the result file once and close it, instead of re-reading it through
    # a fresh unclosed handle for every list that is built from it.
    with open(output_file) as result_file:
        results = json.load(result_file)

    pred_labels = [extract_pred_label(example[res_key], prompt_type=task_name) for example in results]
    true_labels = [example['attribution_label'] for example in results]
    acc = accuracy_score(true_labels, pred_labels)
    # Only the two listed labels form the matrix; any other predicted value is
    # left out of it but still counts as an error in the accuracy above.
    conf_matrix = confusion_matrix(true_labels, pred_labels, labels=["attributable", "not attributable"])

    precision, recall, f1, micro_f1, macro_f1 = evaluate_confusion_matrix(conf_matrix)

    print(task_name)
    print(conf_matrix)
    print("Accuracy:", acc)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1:", f1)
    print("micro_f1:", micro_f1)
    print("macro_f1:", macro_f1)

attribution-binary-with-definition
[[597 165]
 [102 136]]
Accuracy: 0.733
Precision: [0.85407725 0.45182724]
Recall: [0.78346457 0.57142857]
F1: [0.81724846 0.50463822]
micro_f1: 0.733
macro_f1: 0.6609433394414327


In [None]:
# Count the saved results whose src_dataset is "hagrid_dev" or "hagrid_train".
len([example for example in json.load(open(output_file)) if example["src_dataset"] in ["hagrid_dev", "hagrid_train"]])

In [None]:
# Parse the result file once and close it, then count each gold label.
with open(output_file) as result_file:
    gold_labels = [example["attribution_label"] for example in json.load(result_file)]
print(gold_labels.count("attributable"))
print(gold_labels.count("not attributable"))

In [None]:
# Combine the two per-split accuracies into one figure, weighting each by `train` / `dev`.
# NOTE(review): 108 and 166 are presumably the example counts of the two HAGRID splits,
# and the accuracies are hand-entered constants - confirm against the runs they came from.
train = 108
dev = 166
hagrid_train_acc = 0.81
hagrid_dev_acc = 0.75
hagrid_acc = (hagrid_train_acc * train + hagrid_dev_acc * dev) / (train + dev)
print(hagrid_acc)

In [26]:
# Concatenate the ExpertQA and Stanford result lists and save them as one file.
# Context managers close every handle; for the output file this also guarantees
# the JSON is fully flushed to disk before anything reads it back.
with open("/research/nfs_sun_397/li.14042/projects/attribution-eval/code/gpt-4_expertqa_strlen_within20k_random500_zero_shot_old_document_result.json") as expertqa_file:
    expertqa_old = json.load(expertqa_file)
with open("/research/nfs_sun_397/li.14042/projects/attribution-eval/code/gpt-4_stanford_strlen_within20k_random500_zero_shot_stanford_old_document_result.json") as stanford_file:
    stanford_old = json.load(stanford_file)
old = expertqa_old + stanford_old
with open("/research/nfs_sun_397/li.14042/projects/attribution-eval/code/gpt-4_expertqa_stanford_strlen_within20k_random500_zero_shot_old_document_result.json",'w') as merged_file:
    json.dump(old, merged_file)