In [9]:
import os, json
import pandas as pd
import numpy as np
import pyarrow as pa
from PIL import Image
import pyarrow.parquet as pq
import sys
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

In [10]:
def filter_corrupted_images(file_paths):
    valid_images = []
    corrupted_images = []

    for path in file_paths:
        try:
            img = Image.open(path)
            img.verify()  # Verify if the image is valid
            valid_images.append(path)  # Add valid images to the list
        except (IOError, SyntaxError) as e:
            corrupted_images.append(path)  # Add corrupted images to the list
            #print(f"Corrupted image removed: {path} - {e}")

    print(f"Total corrupted images removed: {len(corrupted_images)}")
    return valid_images

In [17]:
import io


batch_size = 100  # Количество изображений в одном батче
image_dir = '../data/coco2017/test2017'  # Директория с изображениями
output_file = 'test2017.parquet'  # Файл, куда будут сохраняться батчи


schema = pa.schema([
    ('image_name', pa.string()),
    ('image_bin', pa.binary())
])

# Initialize the Parquet writer
writer = pq.ParquetWriter(output_file, schema)

# Функция для преобразования изображения в байтовый формат
def image_to_bytes(image_path):
    with Image.open(image_path) as img:
        img_byte_arr = io.BytesIO()
        img.save(img_byte_arr, format=img.format)  # Сохранение в байтовый поток
        return img_byte_arr.getvalue()

# Получение всех файлов изображений
image_files = [os.path.join(image_dir, fname) for fname in os.listdir(image_dir) if fname.endswith(('jpg', 'jpeg', 'png'))]

# Итерация по файлам изображений батчами
for i in range(0, len(image_files), batch_size):
    batch_files = image_files[i:i + batch_size]  # Текущий батч файлов

    batch_files = filter_corrupted_images(batch_files)    

    # Преобразование изображений в байты
    data = [image_to_bytes(path) for path in batch_files]
    
    df = pd.DataFrame(data)

    writer.write_batch(pa.RecordBatch.from_arrays([batch_files, data], names=["image_name", "image_bin"]))

writer.close()
    

Total corrupted images removed: 0
Total corrupted images removed: 0
Total corrupted images removed: 0
Total corrupted images removed: 0
Total corrupted images removed: 0
Total corrupted images removed: 0
Total corrupted images removed: 0
Total corrupted images removed: 0
Total corrupted images removed: 0
Total corrupted images removed: 0
Total corrupted images removed: 0
Total corrupted images removed: 0
Total corrupted images removed: 0
Total corrupted images removed: 0
Total corrupted images removed: 0
Total corrupted images removed: 0
Total corrupted images removed: 0
Total corrupted images removed: 0
Total corrupted images removed: 0
Total corrupted images removed: 0
Total corrupted images removed: 0
Total corrupted images removed: 0
Total corrupted images removed: 0
Total corrupted images removed: 0
Total corrupted images removed: 0
Total corrupted images removed: 0
Total corrupted images removed: 0
Total corrupted images removed: 0
Total corrupted images removed: 0
Total corrupte

In [None]:
import pandas as pd

# Параметры
batch_size = 100  # Количество строк для чтения за один раз

# Функция для преобразования байтовых данных обратно в изображение
def bytes_to_image(image_bytes):
    img_byte_arr = io.BytesIO(image_bytes)
    img = Image.open(img_byte_arr)
    return img

parquet_file = pq.ParquetFile(output_file)
    
for batch in parquet_file.iter_batches(batch_size=batch_size):
    df = batch.to_pandas()
    for _, row in df.iterrows():
        image = bytes_to_image(row['image_bin'])
        image.show()  # Отобразить изображение


KeyboardInterrupt: 

In [16]:
parquet_file.metadata

<pyarrow._parquet.FileMetaData object at 0x11f560180>
  created_by: parquet-cpp-arrow version 17.0.0
  num_columns: 2
  num_rows: 105693
  num_row_groups: 1061
  format_version: 2.6
  serialized_size: 301366