In [1]:
from dataset_loader import DatasetLoader
import json
import pandas as pd
import os
import json
import utils
from tqdm import tqdm
import random
random.seed(42)

In [2]:
# Load the three splits of the merged dataset (JSON Lines: one JSON object per line).
# Each file is opened in a `with` block so its handle is closed as soon as the
# split has been parsed, and the encoding is pinned to UTF-8 so decoding does
# not depend on the platform default.
with open("../hf_data/merged_train.jsonl", encoding="utf-8") as f:
    merged_train = [json.loads(line) for line in f]
with open("../hf_data/merged_dev.jsonl", encoding="utf-8") as f:
    merged_dev = [json.loads(line) for line in f]
with open("../hf_data/merged_test.jsonl", encoding="utf-8") as f:
    merged_test = [json.loads(line) for line in f]

In [5]:
# Spot-check one training record to see which fields an example carries.
merged_train[15000]

{'question': 'What can severe pain in the right side of the belly button indicate?',
 'claim': 'Other possible symptoms include fever, vomiting, and loss of appetite[4].',
 'claim_raw_string': 'Other possible symptoms include fever, vomiting, and loss of appetite[4].',
 'response': 'Severe pain in the right side of the belly button could potentially indicate appendicitis. This condition is characterized by the inflammation of the appendix, a body part which scientists believe used to play an important role in the human digestive system but has no use in the human body now[4]. One of the early symptoms of appendicitis is a severe and sudden pain that starts near the belly button and may move toward the lower right side of the abdomen[3]. Other possible symptoms include fever, vomiting, and loss of appetite[4]. If suspected, appendicitis requires immediate medical care as it can be fatal[3]. Diagnosis of appendicitis usually involves doctors performing tests to rule out other potential c

In [3]:
def display_ratio(data_list, src_dataset="any"):
    """Print the attributable / not-attributable label balance of ``data_list``.

    Args:
        data_list: Iterable of dict records. Every record must have an
            ``'attribution_label'`` key. ``'src_dataset'`` is only read when
            ``src_dataset`` is not ``"any"`` and the record carries one of the
            two counted labels.
        src_dataset: ``"any"`` counts every record; any other value restricts
            the counts to records whose ``'src_dataset'`` equals it.

    Records whose label is neither ``'attributable'`` nor ``'not attributable'``
    are ignored. Nothing is returned; the summary is written to stdout.

    Degenerate inputs no longer raise ``ZeroDivisionError``: with no matching
    records both percentages print as 0.00% and the ratio as ``nan``; with
    attributable records but no not-attributable ones the ratio prints as ``inf``.
    """
    attributable_count = 0
    not_attributable_count = 0
    for item in data_list:
        label = item['attribution_label']
        if label not in ('attributable', 'not attributable'):
            continue
        if src_dataset != "any" and item["src_dataset"] != src_dataset:
            continue
        if label == 'attributable':
            attributable_count += 1
        else:
            not_attributable_count += 1
    total_count = attributable_count + not_attributable_count

    if total_count:
        attributable_ratio = attributable_count / total_count
        not_attributable_ratio = not_attributable_count / total_count
    else:
        # No record matched: report 0% for both classes instead of dividing by zero.
        attributable_ratio = not_attributable_ratio = 0.0

    if not_attributable_count:
        ratio_ratio = attributable_ratio / not_attributable_ratio
    elif attributable_count:
        # Only attributable records: the class ratio is unbounded.
        ratio_ratio = float("inf")
    else:
        ratio_ratio = float("nan")

    print(f"Total Count: {total_count}")
    print(f"Attributable Count: {attributable_count} ({attributable_ratio:.2%})")
    print(f"Not Attributable Count: {not_attributable_count} ({not_attributable_ratio:.2%})")
    print(f"Attributable / Not Attributable Ratio: {ratio_ratio:.2f}")
    print("")

# Label balance of the whole training split, then of each source dataset in turn.
display_ratio(merged_train)
for dataset_name in ("AttributedQA", "HAGRID", "ExpertQA", "Stanford-GenSearch"):
    display_ratio(merged_train, src_dataset=dataset_name)

Total Count: 28058
Attributable Count: 18639 (66.43%)
Not Attributable Count: 9419 (33.57%)
Attributable / Not Attributable Ratio: 1.98

Total Count: 8253
Attributable Count: 3053 (36.99%)
Not Attributable Count: 5200 (63.01%)
Attributable / Not Attributable Ratio: 0.59

Total Count: 2977
Attributable Count: 2248 (75.51%)
Not Attributable Count: 729 (24.49%)
Attributable / Not Attributable Ratio: 3.08

Total Count: 8827
Attributable Count: 6092 (69.02%)
Not Attributable Count: 2735 (30.98%)
Attributable / Not Attributable Ratio: 2.23

Total Count: 8001
Attributable Count: 7246 (90.56%)
Not Attributable Count: 755 (9.44%)
Attributable / Not Attributable Ratio: 9.60



In [4]:
# Slice every split (train / dev / test) by source dataset. Each unpacking
# yields the train, dev and test subsets of one dataset, in that order.
attributedqa_train, attributedqa_dev, attributedqa_test = (
    [rec for rec in split if rec["src_dataset"] == "AttributedQA"]
    for split in (merged_train, merged_dev, merged_test)
)
hagrid_train, hagrid_dev, hagrid_test = (
    [rec for rec in split if rec["src_dataset"] == "HAGRID"]
    for split in (merged_train, merged_dev, merged_test)
)
expertqa_train, expertqa_dev, expertqa_test = (
    [rec for rec in split if rec["src_dataset"] == "ExpertQA"]
    for split in (merged_train, merged_dev, merged_test)
)
stanford_train, stanford_dev, stanford_test = (
    [rec for rec in split if rec["src_dataset"] == "Stanford-GenSearch"]
    for split in (merged_train, merged_dev, merged_test)
)

In [5]:
import random

def balance_data(data_list, src_dataset="any"):
    """Downsample ``data_list`` to an equal number of records per label.

    Args:
        data_list: Records (dicts) carrying an ``'attribution_label'`` key.
        src_dataset: When equal to ``"AttributedQA"``, the per-class sample
            size is additionally capped at 1000; any other value applies no cap.

    Returns:
        A new list: the sampled ``'attributable'`` records followed by the same
        number of sampled ``'not attributable'`` records (the two groups are
        not interleaved). Sampling uses the global ``random`` state.
    """
    def _with_label(label):
        # Keep only the records carrying the requested attribution label.
        return [rec for rec in data_list if rec['attribution_label'] == label]

    positives = _with_label('attributable')
    negatives = _with_label('not attributable')

    # The per-class size is bounded by the smaller class (and by the cap, if any).
    size_limits = [len(positives), len(negatives)]
    if src_dataset == "AttributedQA":
        size_limits.append(1000)
    per_class = min(size_limits)

    # Draw the same number of records from each class, attributable first.
    sampled_positives = random.sample(positives, per_class)
    sampled_negatives = random.sample(negatives, per_class)
    return sampled_positives + sampled_negatives

# Balance every (dataset, split) subset independently. Only the AttributedQA
# training subset passes its dataset name, which triggers the per-class cap
# inside balance_data. The call order is kept fixed because each call consumes
# the shared random state.
(
    balanced_attributedqa_train,
    balanced_hagrid_train,
    balanced_expertqa_train,
    balanced_stanford_train,
) = (
    balance_data(attributedqa_train, src_dataset="AttributedQA"),
    balance_data(hagrid_train),
    balance_data(expertqa_train),
    balance_data(stanford_train),
)
(
    balanced_attributed_dev,
    balanced_hagrid_dev,
    balanced_expertqa_dev,
    balanced_stanford_dev,
) = (
    balance_data(attributedqa_dev),
    balance_data(hagrid_dev),
    balance_data(expertqa_dev),
    balance_data(stanford_dev),
)
(
    balanced_attributed_test,
    balanced_hagrid_test,
    balanced_expertqa_test,
    balanced_stanford_test,
) = (
    balance_data(attributedqa_test),
    balance_data(hagrid_test),
    balance_data(expertqa_test),
    balance_data(stanford_test),
)

# Concatenate the per-dataset subsets back into one list per split.
balanced_train = [
    *balanced_attributedqa_train,
    *balanced_hagrid_train,
    *balanced_expertqa_train,
    *balanced_stanford_train,
]
balanced_dev = [
    *balanced_attributed_dev,
    *balanced_hagrid_dev,
    *balanced_expertqa_dev,
    *balanced_stanford_dev,
]
balanced_test = [
    *balanced_attributed_test,
    *balanced_hagrid_test,
    *balanced_expertqa_test,
    *balanced_stanford_test,
]

print("After balancing:")
for balanced_subset in (
    balanced_attributedqa_train,
    balanced_hagrid_train,
    balanced_expertqa_train,
    balanced_stanford_train,
    balanced_attributed_dev,
    balanced_hagrid_dev,
    balanced_expertqa_dev,
    balanced_stanford_dev,
    balanced_attributed_test,
    balanced_hagrid_test,
    balanced_expertqa_test,
    balanced_stanford_test,
):
    display_ratio(balanced_subset)

After balancing:
Total Count: 2000
Attributable Count: 1000 (50.00%)
Not Attributable Count: 1000 (50.00%)
Attributable / Not Attributable Ratio: 1.00

Total Count: 1458
Attributable Count: 729 (50.00%)
Not Attributable Count: 729 (50.00%)
Attributable / Not Attributable Ratio: 1.00

Total Count: 5470
Attributable Count: 2735 (50.00%)
Not Attributable Count: 2735 (50.00%)
Attributable / Not Attributable Ratio: 1.00

Total Count: 1510
Attributable Count: 755 (50.00%)
Not Attributable Count: 755 (50.00%)
Attributable / Not Attributable Ratio: 1.00

Total Count: 336
Attributable Count: 168 (50.00%)
Not Attributable Count: 168 (50.00%)
Attributable / Not Attributable Ratio: 1.00

Total Count: 64
Attributable Count: 32 (50.00%)
Not Attributable Count: 32 (50.00%)
Attributable / Not Attributable Ratio: 1.00

Total Count: 334
Attributable Count: 167 (50.00%)
Not Attributable Count: 167 (50.00%)
Attributable / Not Attributable Ratio: 1.00

Total Count: 68
Attributable Count: 34 (50.00%)
Not At

In [7]:
# Persist each balanced split as JSON Lines (one JSON object per line).
for out_path, records in (
    ("../hf_data/train_all_subset_balanced.jsonl", balanced_train),
    ("../hf_data/dev_all_subset_balanced.jsonl", balanced_dev),
    ("../hf_data/test_all_subset_balanced.jsonl", balanced_test),
):
    with open(out_path, "w") as f:
        f.writelines(json.dumps(item) + "\n" for item in records)

In [8]:
# Sizes of the balanced train / dev / test splits, one per line.
print(len(balanced_train), len(balanced_dev), len(balanced_test), sep="\n")

10438
802
1658


In [17]:
# Draw 10500 records uniformly at random (no label balancing) from the merged
# training split and save them as JSON Lines.
# NOTE(review): 10500 is presumably chosen to be close to len(balanced_train) — confirm.
merged_train_sampled = random.sample(merged_train, k=10500)
with open("../hf_data/merged_train_sampled.jsonl", "w") as f:
    f.writelines(json.dumps(item) + "\n" for item in merged_train_sampled)

In [6]:
# Number of balanced training records contributed by each source dataset.
print(
    *(
        sum(1 for rec in balanced_train if rec["src_dataset"] == dataset_name)
        for dataset_name in ("AttributedQA", "HAGRID", "ExpertQA", "Stanford-GenSearch")
    ),
    sep="\n",
)

2000
1458
5470
1510


In [10]:
# Number of balanced dev records contributed by each source dataset.
print(
    *(
        sum(1 for rec in balanced_dev if rec["src_dataset"] == dataset_name)
        for dataset_name in ("AttributedQA", "HAGRID", "ExpertQA", "Stanford-GenSearch")
    ),
    sep="\n",
)

336
64
334
68


In [7]:
# Number of balanced test records contributed by each source dataset.
print(
    *(
        sum(1 for rec in balanced_test if rec["src_dataset"] == dataset_name)
        for dataset_name in ("AttributedQA", "HAGRID", "ExpertQA", "Stanford-GenSearch")
    ),
    sep="\n",
)

716
174
620
148


In [11]:
import random

def balance_data_overall(data_list, shrink_by=4000):
    """Balance the two attribution labels across the whole of ``data_list``.

    Unlike a per-dataset balance, the records are pooled first, so the label
    ratio inside any single source dataset is not equalised.

    Args:
        data_list: Records (dicts) carrying an ``'attribution_label'`` key.
        shrink_by: How many records fewer than the smaller class to draw per
            label. Defaults to 4000, the value previously hard-coded here.

    Returns:
        A new list: the sampled ``'attributable'`` records followed by the same
        number of sampled ``'not attributable'`` records. Sampling uses the
        global ``random`` state.

    Raises:
        ValueError: If ``shrink_by`` is negative or exceeds the size of the
            smaller class (a negative sample size would otherwise be requested).
    """
    # Split the records by attribution label.
    attributable_data = [item for item in data_list if item['attribution_label'] == 'attributable']
    not_attributable_data = [item for item in data_list if item['attribution_label'] == 'not attributable']

    if shrink_by < 0:
        raise ValueError(f"shrink_by must be non-negative, got {shrink_by}")

    # Per-class sample size: the smaller class, reduced by ``shrink_by``.
    smaller_class = min(len(attributable_data), len(not_attributable_data))
    min_count = smaller_class - shrink_by
    if min_count < 0:
        raise ValueError(
            f"shrink_by={shrink_by} exceeds the smaller class size ({smaller_class}); "
            "cannot sample a negative number of records"
        )

    # Draw the same number of records from each class, attributable first.
    balanced_data = random.sample(attributable_data, min_count) + random.sample(not_attributable_data, min_count)

    return balanced_data

# Balance labels over the pooled training split (not per source dataset).
balanced_train_overall = balance_data_overall(merged_train)

In [12]:
# Label balance of the pooled-balanced training set, overall and per source dataset.
display_ratio(balanced_train_overall)
for dataset_name in ("AttributedQA", "HAGRID", "ExpertQA", "Stanford-GenSearch"):
    display_ratio(balanced_train_overall, src_dataset=dataset_name)

Total Count: 10838
Attributable Count: 5419 (50.00%)
Not Attributable Count: 5419 (50.00%)
Attributable / Not Attributable Ratio: 1.00

Total Count: 3833
Attributable Count: 854 (22.28%)
Not Attributable Count: 2979 (77.72%)
Attributable / Not Attributable Ratio: 0.29

Total Count: 1117
Attributable Count: 684 (61.24%)
Not Attributable Count: 433 (38.76%)
Attributable / Not Attributable Ratio: 1.58

Total Count: 3346
Attributable Count: 1772 (52.96%)
Not Attributable Count: 1574 (47.04%)
Attributable / Not Attributable Ratio: 1.13

Total Count: 2542
Attributable Count: 2109 (82.97%)
Not Attributable Count: 433 (17.03%)
Attributable / Not Attributable Ratio: 4.87



In [13]:
# Save the pooled-balanced training set as JSON Lines (one JSON object per line).
with open("../hf_data/train_overall_balanced.jsonl", "w") as f:
    f.writelines(json.dumps(item) + "\n" for item in balanced_train_overall)

In [14]:
# Label balance of the original (unbalanced) merged training split, for comparison.
display_ratio(merged_train)

Total Count: 28058
Attributable Count: 18639 (66.43%)
Not Attributable Count: 9419 (33.57%)
Attributable / Not Attributable Ratio: 1.98



In [15]:
# Count, per source dataset, the training records whose reference list contains
# an empty string or None, printing every offending record as it is found.
cnt = {
    dataset_name: 0
    for dataset_name in ("AttributedQA", "HAGRID", "ExpertQA", "Stanford-GenSearch")
}
for data in merged_train:
    references = data["references"]
    if any(missing in references for missing in ("", None)):
        cnt[data["src_dataset"]] += 1
        print(data)
print(cnt)

{'AttributedQA': 0, 'HAGRID': 0, 'ExpertQA': 0, 'Stanford-GenSearch': 0}


In [22]:
# Reload every training variant and the balanced test split from disk.
# Each file is read inside a `with` block so its handle is closed right after
# parsing, and the encoding is pinned to UTF-8 rather than the platform default.
with open("../hf_data/train_all_subset_balanced.jsonl", encoding="utf-8") as f:
    train_v20 = [json.loads(line) for line in f]  # balanced within each source dataset
with open("../hf_data/train_overall_balanced.jsonl", encoding="utf-8") as f:
    train_v21 = [json.loads(line) for line in f]  # balanced over the pooled data
with open("../hf_data/merged_train_sampled.jsonl", encoding="utf-8") as f:
    train_v22 = [json.loads(line) for line in f]  # random sample, no balancing
with open("../hf_data/merged_train.jsonl", encoding="utf-8") as f:
    train_v23 = [json.loads(line) for line in f]  # full merged training split
with open("../hf_data/test_all_subset_balanced.jsonl", encoding="utf-8") as f:
    test = [json.loads(line) for line in f]

# Compare the ExpertQA label balance across all variants.
display_ratio(train_v20, src_dataset="ExpertQA")
display_ratio(train_v21, src_dataset="ExpertQA")
display_ratio(train_v22, src_dataset="ExpertQA")
display_ratio(train_v23, src_dataset="ExpertQA")
display_ratio(test, src_dataset="ExpertQA")

Total Count: 5470
Attributable Count: 2735 (50.00%)
Not Attributable Count: 2735 (50.00%)
Attributable / Not Attributable Ratio: 1.00

Total Count: 3346
Attributable Count: 1772 (52.96%)
Not Attributable Count: 1574 (47.04%)
Attributable / Not Attributable Ratio: 1.13

Total Count: 3333
Attributable Count: 2326 (69.79%)
Not Attributable Count: 1007 (30.21%)
Attributable / Not Attributable Ratio: 2.31

Total Count: 8827
Attributable Count: 6092 (69.02%)
Not Attributable Count: 2735 (30.98%)
Attributable / Not Attributable Ratio: 2.23

Total Count: 620
Attributable Count: 310 (50.00%)
Not Attributable Count: 310 (50.00%)
Attributable / Not Attributable Ratio: 1.00



## Downsample the test data

In [23]:
import json
import random
random.seed(42)
from math import ceil

# Load the per-dataset balanced test split. This rebinds `merged_test`, so from
# here on the name refers to the balanced test data.
# The file is read inside a `with` block so the handle is closed after parsing.
# NOTE(review): absolute, machine-specific path — other cells use "../hf_data/...";
# confirm both locations hold the same file and switch to the relative form.
with open("/ML-A100/home/xiangyue/lyf/AttributionBench/data/hf_data/test_all_subset_balanced.jsonl", encoding="utf-8") as f:
    merged_test = [json.loads(line) for line in f]

def balance_data_merged_test(data_list, src_dataset="any", ratio=0):
    """Draw a label-balanced sample that is a fraction of the smaller class.

    Args:
        data_list: Records (dicts) carrying ``'attribution_label'``; the
            ``'src_dataset'`` key is read only when ``src_dataset`` is not ``"any"``.
        src_dataset: ``"any"`` keeps every record; any other value first
            restricts ``data_list`` to records from that source dataset.
        ratio: Fraction of the smaller class to draw per label. With the
            default of 0 the result is empty.

    Returns:
        A new list: the sampled ``'attributable'`` records followed by the same
        number of sampled ``'not attributable'`` records. Sampling uses the
        global ``random`` state.
    """
    if src_dataset == "any":
        pool = data_list
    else:
        pool = [rec for rec in data_list if rec["src_dataset"] == src_dataset]

    def _with_label(label):
        # Keep only the records carrying the requested attribution label.
        return [rec for rec in pool if rec['attribution_label'] == label]

    positives = _with_label('attributable')
    negatives = _with_label('not attributable')

    # Per-class sample size. The +0.1 offset is applied before rounding; the
    # original author added it so the combined sample totals 500.
    smaller_class = min(len(positives), len(negatives))
    per_class = round(smaller_class * ratio + 0.1)

    # Draw the same number of records from each class, attributable first.
    return random.sample(positives, per_class) + random.sample(negatives, per_class)
# Target size of the downsampled test set, and the fraction of the balanced
# test data that this target represents.
sample_num = 500
ratio = sample_num / len(merged_test)

# Sample each source dataset with the same fraction. The order is kept fixed
# because every call consumes the shared random state.
(
    attributedqa_test_sampled,
    hagrid_test_sampled,
    expertqa_test_sampled,
    stanford_test_sampled,
) = (
    balance_data_merged_test(merged_test, src_dataset=dataset_name, ratio=ratio)
    for dataset_name in ("AttributedQA", "HAGRID", "ExpertQA", "Stanford-GenSearch")
)
merged_test_sampled = [
    *attributedqa_test_sampled,
    *hagrid_test_sampled,
    *expertqa_test_sampled,
    *stanford_test_sampled,
]

# Report the label balance of the combined sample, then of each dataset's share.
for sampled_subset in (
    merged_test_sampled,
    attributedqa_test_sampled,
    hagrid_test_sampled,
    expertqa_test_sampled,
    stanford_test_sampled,
):
    display_ratio(sampled_subset)

# Save the downsampled test set as JSON Lines (one JSON object per line).
with open("../hf_data/test_all_subset_balanced_sampled500.jsonl", "w") as f:
    f.writelines(json.dumps(item) + "\n" for item in merged_test_sampled)

Total Count: 500
Attributable Count: 250 (50.00%)
Not Attributable Count: 250 (50.00%)
Attributable / Not Attributable Ratio: 1.00

Total Count: 216
Attributable Count: 108 (50.00%)
Not Attributable Count: 108 (50.00%)
Attributable / Not Attributable Ratio: 1.00

Total Count: 52
Attributable Count: 26 (50.00%)
Not Attributable Count: 26 (50.00%)
Attributable / Not Attributable Ratio: 1.00

Total Count: 188
Attributable Count: 94 (50.00%)
Not Attributable Count: 94 (50.00%)
Attributable / Not Attributable Ratio: 1.00

Total Count: 44
Attributable Count: 22 (50.00%)
Not Attributable Count: 22 (50.00%)
Attributable / Not Attributable Ratio: 1.00



Total Count: 500
Attributable Count: 250 (50.00%)
Not Attributable Count: 250 (50.00%)
Attributable / Not Attributable Ratio: 1.00

