In [2]:
from datasets import Dataset, Features, Value
import json
from PIL import Image
import os
import numpy as np
from tqdm import tqdm
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# load images and save to bytes
def load_image_as_bytes(image_path, size=(512, 512)):
    """Load an image file and return its raw RGB pixel buffer as bytes.

    The image is converted to 3-channel RGB and resized to ``size`` so that
    every returned buffer has the same layout and length
    (``size[0] * size[1] * 3`` bytes), which is what a decoder reshaping to
    ``(height, width, 3)`` expects.

    NOTE: this relies on the module-level name ``Image`` being ``PIL.Image``.
    A later ``from datasets import Image`` rebinds that name, so re-run the
    import cell before calling this function again.

    Args:
        image_path: Path of the image file to read.
        size: Target ``(width, height)`` in pixels. Defaults to ``(512, 512)``.

    Returns:
        bytes: The pixel data of the resized RGB image.
    """
    # Context manager releases the underlying file handle as soon as we are done.
    with Image.open(image_path) as image:
        # Normalise the colour mode first: grayscale, palette or RGBA inputs
        # would otherwise yield buffers with a different number of channels.
        rgb_image = image.convert("RGB")
        # Resize so that all images share the same dimensions.
        resized_image = rgb_image.resize(size)
        image_array = np.array(resized_image)
    # Serialise the NumPy array into a flat byte string.
    return image_array.tobytes()

# load json file to create dataset
def load_custom_dataset(json_file_path, image_folder_path):
    """Build a ``Dataset`` of image names, captions and raw image bytes.

    The JSON file must contain an ``"annotations"`` list; each entry is read
    for its ``"filepath"`` (joined onto ``image_folder_path`` to locate the
    image) and its ``"caption"``.

    Args:
        json_file_path: Path of the annotation JSON file.
        image_folder_path: Folder that the annotation file paths are relative to.

    Returns:
        Dataset: Columns ``image_name`` (string), ``caption`` (string) and
        ``image`` (binary, as produced by ``load_image_as_bytes``).

    Raises:
        KeyError: If ``"annotations"``, ``"filepath"`` or ``"caption"`` is missing.
    """
    # JSON text is UTF-8; be explicit so captions with non-ASCII characters
    # are not decoded with a platform-dependent default encoding.
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    dataset_data = {
        "image_name": [],
        "caption": [],
        "image": [],
    }

    # tqdm shows loading progress; iterate the annotations directly.
    for annotation in tqdm(data["annotations"], desc="Processing dataset"):
        image_path = os.path.join(image_folder_path, annotation["filepath"])

        dataset_data["image_name"].append(annotation["filepath"])
        dataset_data["caption"].append(annotation["caption"])
        dataset_data["image"].append(load_image_as_bytes(image_path))

    # Declare the column types explicitly; images are stored as a byte stream.
    features = Features({
        "image_name": Value("string"),
        "caption": Value("string"),
        "image": Value("binary")
    })

    return Dataset.from_dict(dataset_data, features=features)

# convert Dataset to Parquet file
def dataset_to_parquet(dataset, output_path):
    """Write ``dataset`` to a Parquet file at ``output_path``.

    The dataset is converted to a pandas DataFrame, wrapped in a PyArrow
    Table, and that table is written out with ``pyarrow.parquet``.

    Args:
        dataset: Object exposing ``to_pandas()``.
        output_path: Destination path of the Parquet file.
    """
    # Dataset -> DataFrame -> Arrow table, in a single expression.
    arrow_table = pa.Table.from_pandas(dataset.to_pandas())
    # Persist the table.
    pq.write_table(arrow_table, output_path)

# call to load dataset
# Input locations: the annotation JSON and the folder holding its images.
json_file_path = "./DataPrep/data.json"
image_folder_path = "./DataPrep/images/"
# Destination of the serialised dataset.
output_parquet_path = "./dataset.parquet"

# Build the in-memory dataset, then persist it as a Parquet file.
dataset = load_custom_dataset(json_file_path, image_folder_path)
dataset_to_parquet(dataset, output_parquet_path)
print(f"Dataset saved to {output_parquet_path}")

Processing dataset: 100%|████████████████████████████████████████████████████████████████| 51/51 [00:00<00:00, 87.30it/s]


Dataset saved to ./dataset.parquet


In [3]:
# Read the saved Parquet file back into a pandas DataFrame to verify it loads.
loaded_df = pd.read_parquet("dataset.parquet")

# Show the first five records (the default row count of DataFrame.head()).
print(loaded_df.head())

         image_name                                            caption  \
0  img1_seismic.jpg  This seismic reflection image shows undeformed...   
1  img2_seismic.jpg  This seismic reflection image features a clear...   
2  img3_seismic.jpg  this seismic reflection image displays a layer...   
3  img4_seismic.jpg  This image shows a seismic reflection profile ...   
4  img5_seismic.jpg  The image appears to be a seismic reflection p...   

                                               image  
0  b'\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff...  
1  b'\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff...  
2  b'\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff...  
3  b'\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff...  
4  b'\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff...  


In [4]:
from datasets import load_dataset, Features, Value, Image
import numpy as np
from PIL import Image as PILImage

# load Parquet file
def load_parquet_dataset(file_path):
    """Load a Parquet file through ``datasets.load_dataset``.

    Args:
        file_path: Path of the Parquet file, passed as ``data_files``.

    Returns:
        Whatever ``load_dataset("parquet", ...)`` returns; this notebook
        indexes the result by its ``"train"`` split.
    """
    return load_dataset("parquet", data_files=file_path)

# restore bytes to image
def bytes_to_image(byte_data, shape=(512, 512, 3), dtype=np.uint8):
    """Rebuild a PIL image from a flat buffer of pixel bytes.

    Args:
        byte_data: Bytes-like buffer holding the pixel values.
        shape: Array shape to reshape the buffer into. Defaults to
            ``(512, 512, 3)``.
        dtype: NumPy dtype used to interpret the buffer. Defaults to
            ``np.uint8``.

    Returns:
        The image produced by ``PILImage.fromarray``, or ``None`` when any
        step raises; in that case the error is printed rather than propagated.
    """
    try:
        # Interpret the buffer as a typed array and give it its image shape.
        pixels = np.frombuffer(byte_data, dtype=dtype).reshape(shape)
        converted = PILImage.fromarray(pixels)
    except Exception as err:
        print(f"Error converting byte data to image: {err}")
        return None
    else:
        return converted

parquet_file_path = "./dataset.parquet"
dataset = load_parquet_dataset(parquet_file_path)

# Decode every row's raw pixel bytes back into a PIL image.
try:
    dataset = dataset.map(
        lambda x: {"image": bytes_to_image(x["image"])},
        batched=False,
        num_proc=6  # use 6 worker processes
    )
except Exception as e:
    print(f"Error during map operation: {e}")
    # Do not carry on: the cast and QC below would otherwise run on the
    # unconverted byte column and fail later with a misleading error.
    raise

# Target schema: same text columns, with "image" typed as an image feature.
features = Features({
    "image_name": Value("string"),
    "caption": Value("string"),
    "image": Image()
})

# Apply the new schema to the dataset.
dataset = dataset.cast(features)

# QC: a sanity check on the first training sample only (nothing is printed
# when the split is empty).
for sample in dataset["train"]:
    if isinstance(sample["image"], PILImage.Image):
        print("Image conversion successful.")
        print(sample["image"])  # print image sample
    else:
        print("Image conversion failed.")
    break  # one sample is enough

dataset['train'][0]

Generating train split: 0 examples [00:00, ? examples/s]

Map (num_proc=6):   0%|          | 0/51 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/51 [00:00<?, ? examples/s]

Image conversion successful.
<PIL.PngImagePlugin.PngImageFile image mode=RGB size=512x512 at 0x155E4C550>


{'image_name': 'img1_seismic.jpg',
 'caption': 'This seismic reflection image shows undeformed, continuous, horizontal strata, characteristic of a structurally simple sedimentary sequence. The uniform layering and lack of disruptions suggest minimal tectonic activity, making it a likely representation of a stable depositional environment such as a marine shelf or deep-water basin.',
 'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=512x512>}

In [6]:
from datasets import DatasetDict

# Two-stage split: hold out 20% of the data, then halve that hold-out into
# validation and test, giving roughly 80% / 10% / 10%. Both stages use seed 42.
train_test_split = dataset["train"].train_test_split(test_size=0.2, seed=42)
validation_test_split = train_test_split["test"].train_test_split(test_size=0.5, seed=42)

# Name the three resulting subsets.
train_dataset, validation_dataset, test_dataset = (
    train_test_split["train"],
    validation_test_split["train"],
    validation_test_split["test"],
)

# Bundle the splits into one DatasetDict keyed by split name.
dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset,
    "test": test_dataset
})

dataset_dict["train"][0]

{'image_name': 'P2691_0130.png',
 'caption': 'This is an aerial image of a parking lot. In the bottom right corner of the image, there is a parking lot with many parked cars. There is a white building near the parking lot. In the top left corner of the image, there is a piece of wasteland. A road runs through the wasteland.',
 'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=512x512>}