In [1]:
import json
import pandas as pd
import numpy as np
import os
import json
import utils
from tqdm import tqdm
import re
import uuid
import random

## AttributedQA
Only pick those data items with human_rating
Divide into train / dev / test

In [2]:
df = pd.read_csv("../raw_data/AttributedQA/ratings.csv", index_col=0)

filtered_df = df[df['human_rating'].isin(['Y', 'N'])]
print("len of filtered_df: ", len(filtered_df))

# 随机排列 DataFrame 的行
shuffled_df = filtered_df.sample(frac=1, random_state=42)

# 重置索引，以确保索引是连续的
shuffled_df = shuffled_df.reset_index(drop=True)

question_grouped_df = shuffled_df.groupby('question')
question_grouped_df.head()
print("len of question_grouped_df: ", len(question_grouped_df))
print(question_grouped_df.size().mean())

len of filtered_df:  21189
len of question_grouped_df:  1000
21.189


In [17]:
filtered_df.head()

Unnamed: 0,question,answer,passage,nli_score,human_rating,auto_ais,system_name,attribution
0,who played hyde in league of extraordinary gen...,Jason Flemyng,Title: Jason Flemyng\nSection: Television and ...,0.981469,Y,Y,Post-4,http://en.wikipedia.org/wiki/Jason_Flemyng#Jas...
1,who played hyde in league of extraordinary gen...,Jason Flemyng,Title: Jason Flemyng\nSection: Television and ...,0.981469,Y,Y,RTR+auto_ais_reranking,http://en.wikipedia.org/wiki/Jason_Flemyng#Jas...
2,who played hyde in league of extraordinary gen...,Jason Flemyng,Title: Jason Flemyng\nSection: Television and ...,0.981469,Y,Y,Post-2,http://en.wikipedia.org/wiki/Jason_Flemyng#Jas...
3,who played hyde in league of extraordinary gen...,Jason Flemyng,Title: Jason Flemyng\nSection: Television and ...,0.981469,Y,Y,Post-3,http://en.wikipedia.org/wiki/Jason_Flemyng#Jas...
4,who played hyde in league of extraordinary gen...,Jason Flemyng,Title: Jason Flemyng\nSection: Television and ...,0.981469,Y,Y,Post+auto_ais_reranking,http://en.wikipedia.org/wiki/Jason_Flemyng#Jas...


In [27]:
seen = set()
unique_data = []
cnt = 0
for i in range(len(filtered_df)):
    item = filtered_df.iloc[i]
    key = item["question"] + str(item["answer"]) + item["passage"]
    if key not in seen:
        seen.add(key)
        unique_data.append(item)
    else:
        if cnt < 2:
            print("question:", item["question"])
            print("answer:", item["answer"])
            print("passage:", item["passage"])
            print("label:", item["human_rating"])
            cnt += 1
print(f"Dataset Name: AttributedQA, total items: {len(filtered_df)}, total unique <question, claim, references> pairs: {len(unique_data)}")


question: who played hyde in league of extraordinary gentlemen
answer: Jason Flemyng
passage: Title: Jason Flemyng
Section: Television and film work

In the early 2000s he featured in two big-budget Hollywood films which were adaptations of Alan Moore comic books; as John Netley in 2001's From Hell, with Johnny Depp, and 2003's The League of Extraordinary Gentlemen, with Sean Connery, in which Flemyng played Dr. Henry Jekyll and Edward Hyde. The latter film was a disappointment, but Flemyng commented that: "It was a bit of a nightmare... the film cost a fortune and didn't make back the money it was meant to... But I still get a huge kick out of doing films like that and From Hell. Any day you walk onto a set and Sean Connery or Johnny Depp or Brad Pitt is there has to be a good day."
label: Y
question: who played hyde in league of extraordinary gentlemen
answer: Jason Flemyng
passage: Title: Jason Flemyng
Section: Television and film work

In the early 2000s he featured in two big-budg

Dataset Name: AttributedQA, total items: 21189, total unique <question, claim, references> pairs: 9432


In [3]:
# 定义比例
train_ratio = 0.85
dev_ratio = 0.05

# 创建用于存储划分后数据的新 DataFrame
train_data = pd.DataFrame(columns=shuffled_df.columns)
dev_data = pd.DataFrame(columns=shuffled_df.columns)
test_data = pd.DataFrame(columns=shuffled_df.columns)

total_count = len(shuffled_df)

# 遍历分组
for name, group in question_grouped_df:
    # 计算划分数量
    train_count = int(train_ratio * total_count)
    dev_count = int(dev_ratio * total_count)
    test_count = total_count - train_count - dev_count
    
    # 将数据添加到相应的集合中
    if len(train_data) < train_count:
        train_data = pd.concat([train_data, group])
    elif len(dev_data) < dev_count:
        dev_data = pd.concat([dev_data, group])
    else:
        test_data = pd.concat([test_data, group])

# 确保数据集的索引是连续的
train_data = train_data.reset_index(drop=True)
dev_data = dev_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

# 打印划分后的数据集大小
print("Train Data Size:", len(train_data))
print("Dev Data Size:", len(dev_data))
print("Test Data Size:", len(test_data))

# 假设 train_data、test_data 和 dev_data 是包含数据的 DataFrame
# 请将以下代码中的变量名替换为您的实际变量名

# 提取每个数据集中的 question 列，并转换为集合
train_questions = set(train_data['question'])
test_questions = set(test_data['question'])
dev_questions = set(dev_data['question'])

# 计算 question 重叠数量
overlap_train_test = len(train_questions.intersection(test_questions))
overlap_train_dev = len(train_questions.intersection(dev_questions))
overlap_test_dev = len(test_questions.intersection(dev_questions))

# 打印重叠数量
print("Question Overlap between Train and Test:", overlap_train_test)
print("Question Overlap between Train and Dev:", overlap_train_dev)
print("Question Overlap between Test and Dev:", overlap_test_dev)

Train Data Size: 18024
Dev Data Size: 1070
Test Data Size: 2095
Question Overlap between Train and Test: 0
Question Overlap between Train and Dev: 0
Question Overlap between Test and Dev: 0


In [4]:
shuffled_train_data = train_data.sample(frac=1, random_state=42)
shuffled_dev_data = dev_data.sample(frac=1, random_state=42)
shuffled_test_data = test_data.sample(frac=1, random_state=42)

shuffled_train_data.to_csv("../our_data/AttributedQA/train.csv")
shuffled_dev_data.to_csv("../our_data/AttributedQA/dev.csv")
shuffled_test_data.to_csv("../our_data/AttributedQA/test.csv")

## ExpertQA

In [12]:
data = [json.loads(line) for line in open("../raw_data/ExpertQA/r2_compiled_anon.jsonl", 'r')]
question_to_data = {x["question"]: [] for x in data}
for x in data:
    question_to_data[x["question"]].append(x)
print(len(data))
print(len(question_to_data))

shuffled_question_to_data = list(question_to_data.items())
random.shuffle(shuffled_question_to_data)

total_count = len(shuffled_question_to_data)
train_count = int(0.85 * total_count)
dev_count = int(0.05 * total_count)
test_count = total_count - train_count - dev_count

# 划分数据集
train_dict = dict(shuffled_question_to_data[:train_count])
dev_dict = dict(shuffled_question_to_data[train_count:train_count+dev_count])
test_dict = dict(shuffled_question_to_data[train_count+dev_count:])

# 打印划分后的数据集大小
print("Train Data Size:", len(train_dict))
print("Dev Data Size:", len(dev_dict))
print("Test Data Size:", len(test_dict))

# 假设 train_dict、dev_dict 和 test_dict 分别是包含数据的字典
# 请将以下代码中的变量名替换为您的实际字典变量

# 创建集合来存储各个数据集中的 "question" 值
train_questions = set(train_dict.keys())
dev_questions = set(dev_dict.keys())
test_questions = set(test_dict.keys())

# 检查是否有 "question" 重叠
overlap_train_dev = train_questions.intersection(dev_questions)
overlap_train_test = train_questions.intersection(test_questions)
overlap_dev_test = dev_questions.intersection(test_questions)

# 打印重叠情况
print("Question Overlap between Train and Dev:", overlap_train_dev)
print("Question Overlap between Train and Test:", overlap_train_test)
print("Question Overlap between Dev and Test:", overlap_dev_test)

# 随机打乱 train_dict 中的元素
shuffled_train_data = []
for k in train_dict.keys():
    for x in train_dict[k]:
        shuffled_train_data.append(x)
random.shuffle(shuffled_train_data)

# 指定保存 train 数据的文件名，例如 "shuffled_train_data.jsonl"
train_output_file = "../our_data/ExpertQA/train.jsonl"

# 将随机打乱后的 train 数据写入 JSONL 文件
with open(train_output_file, 'w', encoding='utf-8') as train_file:
    for item in shuffled_train_data:
        json.dump(item, train_file)
        train_file.write('\n')

# 随机打乱 dev_dict 中的元素
shuffled_dev_data = []
for k in dev_dict.keys():
    for x in dev_dict[k]:
        shuffled_dev_data.append(x)
random.shuffle(shuffled_dev_data)

# 指定保存 dev 数据的文件名，例如 "shuffled_dev_data.jsonl"
dev_output_file = "../our_data/ExpertQA/dev.jsonl"

# 将随机打乱后的 dev 数据写入 JSONL 文件
with open(dev_output_file, 'w', encoding='utf-8') as dev_file:
    for item in shuffled_dev_data:
        json.dump(item, dev_file)
        dev_file.write('\n')

# 随机打乱 test_dict 中的元素
shuffled_test_data = []
for k in test_dict.keys():
    for x in test_dict[k]:
        shuffled_test_data.append(x)
random.shuffle(shuffled_test_data)

# 指定保存 test 数据的文件名，例如 "shuffled_test_data.jsonl"
test_output_file = "../our_data/ExpertQA/test.jsonl"

# 将随机打乱后的 test 数据写入 JSONL 文件
with open(test_output_file, 'w', encoding='utf-8') as test_file:
    for item in shuffled_test_data:
        json.dump(item, test_file)
        test_file.write('\n')

2177
2177
Train Data Size: 1850
Dev Data Size: 108
Test Data Size: 219
Question Overlap between Train and Dev: set()
Question Overlap between Train and Test: set()
Question Overlap between Dev and Test: set()


## HAGRID

In [13]:
def check_hagrid_data_item(data_item):
    for answer in data_item["answers"]:
        if "attributable" in answer:
            return True
    return False

train_data = [json.loads(line) for line in tqdm(open("../raw_data/hagrid/hagrid-v1.0-en/train.jsonl"))]
print(len(train_data))
print(len([item for item in train_data if check_hagrid_data_item(item)]))
dev_data = [json.loads(line) for line in tqdm(open("../raw_data/hagrid/hagrid-v1.0-en/dev.jsonl"))]
print(len(dev_data))
print(len([item for item in dev_data if check_hagrid_data_item(item)]))

1922it [00:00, 58878.39it/s]


1922
419


716it [00:00, 54261.84it/s]

716
623





In [14]:
# loading data from original dataset
train_data = [json.loads(line) for line in tqdm(open("../raw_data/hagrid/hagrid-v1.0-en/train.jsonl"))]
dev_data = [json.loads(line) for line in tqdm(open("../raw_data/hagrid/hagrid-v1.0-en/dev.jsonl"))]
data = train_data + dev_data
data = [_ for _ in data if check_hagrid_data_item(_)]
question_to_data = {x["query"]: [] for x in data}
for x in data:
    question_to_data[x["query"]].append(x)
print(len(data))
print(len(question_to_data))

import random

# 假设 question_to_data 是包含数据的字典
# 请将以下代码中的 "question_to_data" 替换为您的实际字典变量

# 随机打乱字典的键值对
shuffled_question_to_data = list(question_to_data.items())
random.shuffle(shuffled_question_to_data)

# 计算划分的数据量
total_count = len(shuffled_question_to_data)
train_count = int(0.85 * total_count)
dev_count = int(0.05 * total_count)
test_count = total_count - train_count - dev_count

# 划分数据集
train_dict = dict(shuffled_question_to_data[:train_count])
dev_dict = dict(shuffled_question_to_data[train_count:train_count+dev_count])
test_dict = dict(shuffled_question_to_data[train_count+dev_count:])

# 打印划分后的数据集大小
print("Train Data Size:", len(train_dict))
print("Dev Data Size:", len(dev_dict))
print("Test Data Size:", len(test_dict))

# 创建集合来存储各个数据集中的 "question" 值
train_questions = set(train_dict.keys())
dev_questions = set(dev_dict.keys())
test_questions = set(test_dict.keys())

# 检查是否有 "question" 重叠
overlap_train_dev = train_questions.intersection(dev_questions)
overlap_train_test = train_questions.intersection(test_questions)
overlap_dev_test = dev_questions.intersection(test_questions)

# 打印重叠情况
print("Question Overlap between Train and Dev:", overlap_train_dev)
print("Question Overlap between Train and Test:", overlap_train_test)
print("Question Overlap between Dev and Test:", overlap_dev_test)

# 随机打乱 train_dict 中的元素
shuffled_train_data = []
for k in train_dict.keys():
    for x in train_dict[k]:
        shuffled_train_data.append(x)
random.shuffle(shuffled_train_data)

# 指定保存 train 数据的文件名，例如 "shuffled_train_data.jsonl"
train_output_file = "../our_data/hagrid/train.jsonl"

# 将随机打乱后的 train 数据写入 JSONL 文件
with open(train_output_file, 'w', encoding='utf-8') as train_file:
    for item in shuffled_train_data:
        json.dump(item, train_file)
        train_file.write('\n')

# 随机打乱 dev_dict 中的元素
shuffled_dev_data = []
for k in dev_dict.keys():
    for x in dev_dict[k]:
        shuffled_dev_data.append(x)
random.shuffle(shuffled_dev_data)

# 指定保存 dev 数据的文件名，例如 "shuffled_dev_data.jsonl"
dev_output_file = "../our_data/hagrid/dev.jsonl"

# 将随机打乱后的 dev 数据写入 JSONL 文件
with open(dev_output_file, 'w', encoding='utf-8') as dev_file:
    for item in shuffled_dev_data:
        json.dump(item, dev_file)
        dev_file.write('\n')

# 随机打乱 test_dict 中的元素
shuffled_test_data = []
for k in test_dict.keys():
    for x in test_dict[k]:
        shuffled_test_data.append(x)
random.shuffle(shuffled_test_data)

# 指定保存 test 数据的文件名，例如 "shuffled_test_data.jsonl"
test_output_file = "../our_data/hagrid/test.jsonl"

# 将随机打乱后的 test 数据写入 JSONL 文件
with open(test_output_file, 'w', encoding='utf-8') as test_file:
    for item in shuffled_test_data:
        json.dump(item, test_file)
        test_file.write('\n')


1922it [00:00, 16020.53it/s]
716it [00:00, 53781.80it/s]


1042
1042
Train Data Size: 885
Dev Data Size: 52
Test Data Size: 105
Question Overlap between Train and Dev: set()
Question Overlap between Train and Test: set()
Question Overlap between Dev and Test: set()


## Stanford

In [16]:
data = [json.loads(line) for line in tqdm(open("../raw_data/Stanford-GenSearch/human_evaluation_annotations.jsonl"))]
question_to_data = {x["query"]: [] for x in data}
for x in data:
    question_to_data[x["query"]].append(x)
print(len(data))
print(len(question_to_data))

shuffled_question_to_data = list(question_to_data.items())
random.shuffle(shuffled_question_to_data)

total_count = len(shuffled_question_to_data)
train_count = int(0.85 * total_count)
dev_count = int(0.05 * total_count)
test_count = total_count - train_count - dev_count

# 划分数据集
train_dict = dict(shuffled_question_to_data[:train_count])
dev_dict = dict(shuffled_question_to_data[train_count:train_count+dev_count])
test_dict = dict(shuffled_question_to_data[train_count+dev_count:])

# 打印划分后的数据集大小
print("Train Data Size:", len(train_dict))
print("Dev Data Size:", len(dev_dict))
print("Test Data Size:", len(test_dict))

# 假设 train_dict、dev_dict 和 test_dict 分别是包含数据的字典
# 请将以下代码中的变量名替换为您的实际字典变量

# 创建集合来存储各个数据集中的 "question" 值
train_questions = set(train_dict.keys())
dev_questions = set(dev_dict.keys())
test_questions = set(test_dict.keys())

# 检查是否有 "question" 重叠
overlap_train_dev = train_questions.intersection(dev_questions)
overlap_train_test = train_questions.intersection(test_questions)
overlap_dev_test = dev_questions.intersection(test_questions)

# 打印重叠情况
print("Question Overlap between Train and Dev:", overlap_train_dev)
print("Question Overlap between Train and Test:", overlap_train_test)
print("Question Overlap between Dev and Test:", overlap_dev_test)

# 随机打乱 train_dict 中的元素
shuffled_train_data = []
for k in train_dict.keys():
    for x in train_dict[k]:
        shuffled_train_data.append(x)
random.shuffle(shuffled_train_data)

# 指定保存 train 数据的文件名，例如 "shuffled_train_data.jsonl"
train_output_file = "../our_data/Stanford-GenSearch/train.jsonl"

# 将随机打乱后的 train 数据写入 JSONL 文件
with open(train_output_file, 'w', encoding='utf-8') as train_file:
    for item in shuffled_train_data:
        json.dump(item, train_file)
        train_file.write('\n')

# 随机打乱 dev_dict 中的元素
shuffled_dev_data = []
for k in dev_dict.keys():
    for x in dev_dict[k]:
        shuffled_dev_data.append(x)
random.shuffle(shuffled_dev_data)

# 指定保存 dev 数据的文件名，例如 "shuffled_dev_data.jsonl"
dev_output_file = "../our_data/Stanford-GenSearch/dev.jsonl"

# 将随机打乱后的 dev 数据写入 JSONL 文件
with open(dev_output_file, 'w', encoding='utf-8') as dev_file:
    for item in shuffled_dev_data:
        json.dump(item, dev_file)
        dev_file.write('\n')

# 随机打乱 test_dict 中的元素
shuffled_test_data = []
for k in test_dict.keys():
    for x in test_dict[k]:
        shuffled_test_data.append(x)
random.shuffle(shuffled_test_data)

# 指定保存 test 数据的文件名，例如 "shuffled_test_data.jsonl"
test_output_file = "../our_data/Stanford-GenSearch/test.jsonl"

# 将随机打乱后的 test 数据写入 JSONL 文件
with open(test_output_file, 'w', encoding='utf-8') as test_file:
    for item in shuffled_test_data:
        json.dump(item, test_file)
        test_file.write('\n')

print("Train Data Size:", len(shuffled_train_data))
print("Dev Data Size:", len(shuffled_dev_data))
print("Test Data Size:", len(shuffled_test_data))

5528it [00:00, 30120.91it/s]


5528
1450
Train Data Size: 1232
Dev Data Size: 72
Test Data Size: 146
Question Overlap between Train and Dev: set()
Question Overlap between Train and Test: set()
Question Overlap between Dev and Test: set()
Train Data Size: 4704
Dev Data Size: 272
Test Data Size: 552


In [None]:
data[0]