In [1]:
from datasets import load_dataset

# Load the dataset identified by "small-coco". The value printed by the next
# cell is a DatasetDict with "train" and "validation" splits.
# NOTE(review): whether "small-coco" resolves to a local directory or a Hub
# repository depends on the working directory — confirm.
coco_ds = load_dataset("small-coco")

In [3]:
# Show the container type first, then its summary (splits, features, row counts).
print(type(coco_ds), coco_ds, sep="\n")

<class 'datasets.dataset_dict.DatasetDict'>
DatasetDict({
    train: Dataset({
        features: ['image', 'caption', 'url', 'key', 'status', 'error_message', 'width', 'height', 'original_width', 'original_height', 'exif', 'sha256'],
        num_rows: 9890
    })
    validation: Dataset({
        features: ['image', 'caption', 'url', 'key', 'status', 'error_message', 'width', 'height', 'original_width', 'original_height', 'exif', 'sha256'],
        num_rows: 9893
    })
})


In [4]:
# The "validation" split is treated as the held-out set from here on, so it is
# exposed under the key "test" instead.
# The membership check keeps the cell idempotent: once the key has been moved,
# running the cell again is a no-op rather than a KeyError from pop().
if 'validation' in coco_ds:
    coco_ds['test'] = coco_ds.pop('validation')
print(coco_ds)

DatasetDict({
    train: Dataset({
        features: ['image', 'caption', 'url', 'key', 'status', 'error_message', 'width', 'height', 'original_width', 'original_height', 'exif', 'sha256'],
        num_rows: 9890
    })
    test: Dataset({
        features: ['image', 'caption', 'url', 'key', 'status', 'error_message', 'width', 'height', 'original_width', 'original_height', 'exif', 'sha256'],
        num_rows: 9893
    })
})


In [6]:
# Quick look at the data: available splits, the Python type of a decoded image
# (the images are PNG files), and the key/caption of one training sample.
print(
    coco_ds.keys(),
    type(coco_ds['train'][0]['image']),
    coco_ds['train'][10]['key'],
    coco_ds['train'][10]['caption'],
    sep="\n",
)

dict_keys(['train', 'test'])
<class 'PIL.PngImagePlugin.PngImageFile'>
000000011
A young man holding an umbrella next to a herd of cattle.


In [8]:
# Export the dataset to disk: every sample's image is written as a PNG under
# <save_dir>/<split>/, and each split gets a JSON index file describing its
# samples as two-turn conversations (user = image reference, assistant = caption).
import os
from PIL import Image
import json
from tqdm import tqdm

# One progress bar spans all splits, so its total is the sum of their sizes.
total_item = sum(len(coco_ds[data_key]) for data_key in coco_ds.keys())
# NOTE(review): this is the same name that was passed to load_dataset earlier;
# if that name resolves to a local directory, the exports land inside the
# dataset folder — confirm that is intended.
save_dir = "small-coco"
# JSON index files that were actually written; reported once the bar has closed
# so the messages always match the splits that exist in coco_ds.
written_files = []

with tqdm(total=total_item, desc="Extract data to local") as pbar:
    for data_key in coco_ds.keys():
        prefix = f"{data_key}-"
        output_file = f"{save_dir}/data_{data_key}.json"  # per-split JSON index
        os.makedirs(os.path.join(save_dir, data_key), exist_ok=True)
        save_json = []
        for item in coco_ds[data_key]:
            # The file name is derived from the split name and the sample key, so
            # a re-run targets the same path instead of producing a second copy.
            img_name = f"{prefix}{item['key']}.png"
            img_save_path = os.path.join(save_dir, data_key, img_name)
            item['image'].save(img_save_path, format="PNG")
            new_json = {
                "id": item['key'],
                "conversations": [
                    {
                        "from": "user",
                        # The image is referenced by path, wrapped in vision markers.
                        "value": f"<|vision_start|>{img_save_path}<|vision_end|>"
                    },
                    {
                        "from": "assistant",
                        "value": item['caption']
                    }
                ]
            }
            save_json.append(new_json)
            pbar.update(1)
        # ensure_ascii=False keeps non-ASCII caption text readable in the file.
        with open(output_file, "w", encoding='utf-8') as f:
            json.dump(save_json, f, indent=4, ensure_ascii=False)
        written_files.append(output_file)

# Report exactly the files produced above rather than assuming which splits exist.
for output_file in written_files:
    print(f"Successfully save data to {output_file}")

Extract data to local: 100%|██████████| 19783/19783 [04:48<00:00, 68.59it/s]

Successfully save data to small-coco/data_train.json
Successfully save data to small-coco/data_test.json





In [19]:
import json

def cvt_data(file_path, question="Please describle this image."):
    """Convert an exported conversation JSON file into flat records for a Dataset.

    Args:
        file_path: Path to a UTF-8 JSON file containing a list of items. Each
            item has a ``conversations`` list whose first entry is the user turn
            (``<|vision_start|>IMAGE_PATH<|vision_end|>``) and whose second
            entry is the assistant turn (the caption text).
        question: Prompt stored with every record. The default is the string
            this function has always emitted, spelling included, so data
            regenerated with the default matches earlier runs.

    Returns:
        A list of dicts with the keys ``role``, ``image_path``, ``question``
        and ``assistant_answer``, one per input item and in input order.

    Raises:
        ValueError: If a user turn does not contain both vision markers.
    """
    start_tag = '<|vision_start|>'
    end_tag = '<|vision_end|>'

    with open(file_path, "r", encoding='utf-8') as f:
        loaded_data = json.load(f)

    # Reshape the loaded items into rows suitable for building a Dataset.
    dataset_dicts = []
    for item in loaded_data:
        user_content = item['conversations'][0]
        assistant_content = item['conversations'][1]

        # Pull the image path out from between the vision markers. Checking for
        # the markers first yields an error that names the offending record
        # instead of a bare IndexError or a silently wrong path.
        user_value = user_content['value']
        if start_tag not in user_value or end_tag not in user_value:
            raise ValueError(
                f"Record {item.get('id')!r} in {file_path!r} has a user turn "
                f"without {start_tag}...{end_tag} markers: {user_value!r}"
            )
        image_path = user_value.split(start_tag)[1].split(end_tag)[0]

        # Assemble the flat record.
        dataset_dict = {
            "role": "user",
            "image_path": image_path,
            "question": question,  # same prompt for every sample
            'assistant_answer': assistant_content['value']
        }

        dataset_dicts.append(dataset_dict)
    return dataset_dicts

In [20]:
# Per-split JSON index files produced by the export step.
train_data_json, test_data_json = "small-coco/data_train.json", "small-coco/data_test.json"
# Convert both files, training split first, into lists of flat records.
train_dataset_dicts, test_dataset_dicts = map(cvt_data, (train_data_json, test_data_json))

In [21]:
from datasets import Dataset

# Wrap each list of records in a Dataset, training split first.
train_dataset, test_dataset = (
    Dataset.from_list(records)
    for records in (train_dataset_dicts, test_dataset_dicts)
)

In [22]:
# Sanity check: print the first three training records to confirm each one
# carries the role, image path, question and assistant answer fields.
for i in range(3):
    print(train_dataset[i])

{'role': 'user', 'image_path': 'small-coco/train/train-000000000.png', 'question': 'Please describle this image.', 'assistant_answer': 'A man with a red helmet on a small moped on a dirt road. '}
{'role': 'user', 'image_path': 'small-coco/train/train-000000001.png', 'question': 'Please describle this image.', 'assistant_answer': 'Man riding a motor bike on a dirt road on the countryside.'}
{'role': 'user', 'image_path': 'small-coco/train/train-000000002.png', 'question': 'Please describle this image.', 'assistant_answer': 'A man riding on the back of a motorcycle.'}
