# Image to Parquet

Convert images to binary and save them into a parquet file.

In [1]:
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = (
    SparkSession.builder
    .config("spark.hadoop.fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
    .master("local[*]").appName("Images2Parquet")
    .getOrCreate()
)
print(spark)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/26 14:37:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


<pyspark.sql.session.SparkSession object at 0x7f2440220dc0>


In [2]:
import os
import logging
from pathlib import Path

# Setup logging
logging.basicConfig(level=logging.INFO)

# Base directory using pathlib
curr_dir = Path(os.getcwd())
base_dir = curr_dir.parents[1]
data_dir = base_dir / "data" / "SnakeCLEF2023-small_size" / "2023"
num_folders = 20

# Ensure base directory exists
if not data_dir.is_dir():
    raise FileNotFoundError(f"Data directory {data_dir} does not exist.")

# Getting subfolders
subfolders = sorted([f.name for f in data_dir.iterdir() if f.is_dir()])[:num_folders]

In [3]:
# Schema for the DataFrame
from pyspark.sql.types import StructType, StructField, BinaryType, StringType
schema = StructType([
    StructField("path", StringType(), True),
    StructField("image_binary_data", BinaryType(), True)
])

# Function to convert image to binary
def image_to_binary(image_path):
    with open(image_path, 'rb') as file:
        return file.read()

In [10]:
# Create an empty RDD
image_rdd = spark.sparkContext.emptyRDD()

# Loop through subfolders and process images
for folder in subfolders:
    folder_path = data_dir / folder
    for img_name in os.listdir(folder_path):
        img_path = folder_path / img_name
        relative_path = img_path.relative_to(base_dir)  # Get relative path
        relative_path = str(relative_path).split("data/")[-1]
        print(relative_path)
        binary_data = image_to_binary(str(img_path))
        image_rdd = image_rdd.union(spark.sparkContext.parallelize([(relative_path, binary_data)]))

SnakeCLEF2023-small_size/2023/Acanthophis_antarcticus/250558438.jpeg
SnakeCLEF2023-small_size/2023/Acanthophis_antarcticus/250558444.jpeg
SnakeCLEF2023-small_size/2023/Acanthophis_laevis/250489238.jpeg
SnakeCLEF2023-small_size/2023/Acanthophis_praelongus/252303073.jpg
SnakeCLEF2023-small_size/2023/Acanthophis_rugosus/250956644.jpg
SnakeCLEF2023-small_size/2023/Acrochordus_granulatus/254291444.jpeg
SnakeCLEF2023-small_size/2023/Acrochordus_granulatus/253984428.jpg
SnakeCLEF2023-small_size/2023/Acrochordus_granulatus/253984330.jpg
SnakeCLEF2023-small_size/2023/Acrochordus_granulatus/253984113.jpg
SnakeCLEF2023-small_size/2023/Afrotyphlops_mucruso/252625785.jpeg
SnakeCLEF2023-small_size/2023/Afrotyphlops_mucruso/252624489.jpeg
SnakeCLEF2023-small_size/2023/Afrotyphlops_mucruso/252625813.jpeg
SnakeCLEF2023-small_size/2023/Afrotyphlops_mucruso/252625754.jpeg
SnakeCLEF2023-small_size/2023/Afrotyphlops_schlegelii/250454737.jpeg
SnakeCLEF2023-small_size/2023/Afrotyphlops_schlegelii/250454780.j

In [12]:
# Convert RDD to DataFrame
image_df = spark.createDataFrame(image_rdd, schema)

# Show the first few rows of image_df
image_df.show(n=5)

# Count the number of rows in image_df
row_count = image_df.count()
print(f"Number of rows in image_df: {row_count}")

                                                                                

+--------------------+--------------------+
|                path|   image_binary_data|
+--------------------+--------------------+
|SnakeCLEF2023-sma...|[FF D8 FF E0 00 1...|
|SnakeCLEF2023-sma...|[FF D8 FF E0 00 1...|
|SnakeCLEF2023-sma...|[FF D8 FF E0 00 1...|
|SnakeCLEF2023-sma...|[FF D8 FF E0 00 1...|
|SnakeCLEF2023-sma...|[FF D8 FF E0 00 1...|
+--------------------+--------------------+
only showing top 5 rows





Number of rows in image_df: 72


                                                                                

In [None]:
# Base directory for parquet_files folder
data_dir = Path(os.getcwd()).parents[1] / "data"

# Create "parquet_files" directory if it doesn't exist
parquet_dir = data_dir / "parquet_files"
os.makedirs(parquet_dir, exist_ok=True)

In [None]:
# Path to the Parquet file
parquet_file_path = parquet_dir / "images_data.parquet"

# Save the DataFrame as a Parquet file
image_df.write.mode("overwrite").parquet(str(parquet_file_path))

In [None]:
def get_size_of_parquet(dir_path):
    total_size = 0
    for root, dirs, files in os.walk(dir_path):
        for file in files:
            total_size += os.path.getsize(os.path.join(root, file))
    return total_size

# Get the size of the Parquet file (directory)
parquet_size = get_size_of_parquet(parquet_file_path)
print(f"Size of Parquet file: {parquet_size} bytes")

In [None]:
# Define the GCS path
gcs_path = "gs://dsgt-clef-snakeclef-2024/data/parquet_files/image_data"

# Write the DataFrame to GCS
image_df.write.mode("overwrite").parquet(gcs_path)