In [1]:
from datasets import load_dataset

ds = load_dataset("refcocog") # read from local storage

In [2]:
# Show the container type, then its split / feature summary on the next line.
print(type(ds), ds, sep="\n")

<class 'datasets.dataset_dict.DatasetDict'>
DatasetDict({
    test: Dataset({
        features: ['question_id', 'question', 'image', 'bbox', 'image_size'],
        num_rows: 9602
    })
    val: Dataset({
        features: ['question_id', 'question', 'image', 'bbox', 'image_size'],
        num_rows: 4896
    })
})


In [3]:
# Number of examples in the 'test' split.
print(len(ds['test']))

9602


In [4]:
# Inspect the Python type of every field in the first test example.
first_test_item = ds['test'][0]
for field_name in first_test_item:
    print(field_name, type(first_test_item[field_name]))

question_id <class 'int'>
question <class 'str'>
image <class 'PIL.JpegImagePlugin.JpegImageFile'>
bbox <class 'str'>
image_size <class 'list'>


In [6]:
import os
from PIL import Image
import json
from tqdm import tqdm

# Splits to export; each one gets its own JSONL file and image sub-directory.
split_names = ['test', 'val']

# Total number of examples across all splits, used to size the progress bar.
total_items = sum(len(ds[dataset_key]) for dataset_key in split_names)

# Root directory for the JSONL files and the per-split image folders.
save_dir = "refcocog"

# A single, manually advanced progress bar shared by every split.
with tqdm(total=total_items, desc="Processing items") as pbar:
    for dataset_key in split_names:
        image_dir = os.path.join(save_dir, dataset_key)
        os.makedirs(image_dir, exist_ok=True)
        jsonl_path = os.path.join(save_dir, f"data_{dataset_key}.jsonl")

        with open(jsonl_path, "w", encoding='utf-8') as f:
            for item in ds[dataset_key]:
                question = item['question']
                img_name = f"{dataset_key}-{item['question_id']}.jpeg"
                img_save_path = os.path.join(image_dir, img_name)

                # The record below points at this file, so make sure it exists.
                # Skipping files written by an earlier run keeps re-runs fast.
                if not os.path.exists(img_save_path):
                    item['image'].save(img_save_path, format="JPEG")

                # One two-turn conversation per example: the user supplies the
                # image plus the referring expression, the assistant answers
                # with the grounded box.
                # NOTE(review): the bbox is interpolated exactly as stored in
                # the dataset (a string); confirm its coordinate convention
                # matches what the target model expects.
                new_data_format = [
                    {
                        "role": "user",
                        "content": [
                            {"type": "image", "image": img_save_path},
                            {
                                "type": "text",
                                "text": f"Please provide the bounding box for the following description: {question}",
                            },
                        ],
                    },
                    {
                        "role": "assistant",
                        "content": f'<|object_ref_start|>{question}<|object_ref_end|> is located at <|box_start|>{item["bbox"]}<|box_end|>',
                    },
                ]
                f.write(json.dumps(new_data_format, ensure_ascii=False) + "\n")
                pbar.update(1)
print("Data has been saved to data_test.jsonl and data_val.jsonl")

Processing items: 100%|██████████| 14498/14498 [00:44<00:00, 329.10it/s]

Data has been saved to data_test.jsonl and data_val.jsonl





In [9]:
import json
# Parse each exported JSONL file into a list holding one decoded record per line.
with open("refcocog/data_test.jsonl","r",encoding = 'utf-8') as test_file:
    load_data_test = [json.loads(raw_line) for raw_line in test_file]
with open("refcocog/data_val.jsonl","r",encoding = 'utf-8') as val_file:
    load_data_val = [json.loads(raw_line) for raw_line in val_file]

In [10]:
# Show the container and element types, then the first validation record in full.
first_val_record = load_data_val[0]
print(type(load_data_val), type(first_val_record))
print(first_val_record)

<class 'list'> <class 'list'>
[{'role': 'user', 'content': [{'type': 'image', 'image': 'refcocog/val/val-61.jpeg'}, {'type': 'text', 'text': 'Please provide the bounding box for the following description: a bush of plant behind middle woman'}]}, {'role': 'assistant', 'content': '<|object_ref_start|>a bush of plant behind middle woman<|object_ref_end|> is located at <|box_start|>[285, 23, 424, 146]<|box_end|>'}]


In [12]:
import json
from datasets import Dataset

def load_and_convert_data(file_path):
    """Read a chat-format JSONL file and flatten each record into one dict.

    Every non-blank line must be a JSON array whose first element is the user
    turn (its ``content`` is a list of ``{"type": ...}`` parts) and whose
    second element is the assistant turn (its ``content`` is copied as-is).
    Blank lines, such as a trailing empty line, are skipped.

    Args:
        file_path: Path to the UTF-8 encoded JSONL file.

    Returns:
        A list with one dict per record, holding the keys ``role`` (always
        ``"user"``), ``image_path``, ``question`` and ``assistant_answer``.
        ``image_path`` / ``question`` are ``None`` when the user turn contains
        no image / text part.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    dataset_dicts = []
    with open(file_path, "r", encoding='utf-8') as f:
        for line in f:
            # A blank line is not a record; json.loads would raise on it.
            if not line.strip():
                continue

            item = json.loads(line)
            user_content = item[0]['content']
            assistant_content = item[1]['content']

            # First image part and first text part of the user turn, if any.
            image_info = next((x for x in user_content if x['type'] == 'image'), None)
            text_info = next((x for x in user_content if x['type'] == 'text'), None)

            dataset_dicts.append({
                "role": "user",
                "image_path": image_info['image'] if image_info else None,
                "question": text_info['text'] if text_info else None,
                'assistant_answer': assistant_content,
            })
    return dataset_dicts

# Convert the exported test and val JSONL files into flat record lists.
test_data_path = 'refcocog/data_test.jsonl'
val_data_path = 'refcocog/data_val.jsonl'

test_dataset_dicts, val_dataset_dicts = (
    load_and_convert_data(jsonl_path)
    for jsonl_path in (test_data_path, val_data_path)
)

In [13]:
# Build one Dataset object per split from the flattened record lists.
test_dataset, val_dataset = (
    Dataset.from_list(split_records)
    for split_records in (test_dataset_dicts, val_dataset_dicts)
)

In [14]:
# Confirm the object type, then show the first three converted test records.
print(type(test_dataset))
for row_index in range(3):
    print(test_dataset[row_index])

<class 'datasets.arrow_dataset.Dataset'>
{'role': 'user', 'image_path': 'refcocog/test/test-8.jpeg', 'question': 'Please provide the bounding box for the following description: the man in yellow coat', 'assistant_answer': '<|object_ref_start|>the man in yellow coat<|object_ref_end|> is located at <|box_start|>[374, 65, 510, 266]<|box_end|>'}
{'role': 'user', 'image_path': 'refcocog/test/test-9.jpeg', 'question': 'Please provide the bounding box for the following description: skiier in red pants', 'assistant_answer': '<|object_ref_start|>skiier in red pants<|object_ref_end|> is located at <|box_start|>[374, 65, 510, 266]<|box_end|>'}
{'role': 'user', 'image_path': 'refcocog/test/test-10.jpeg', 'question': 'Please provide the bounding box for the following description: there is red colored truck in between the other trucks', 'assistant_answer': '<|object_ref_start|>there is red colored truck in between the other trucks<|object_ref_end|> is located at <|box_start|>[93, 83, 597, 373]<|box_