# Image to Parquet

Convert images to binary and save them into a parquet file.

In [1]:
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = SparkSession.builder.master("local[*]").appName("Images2Parquet").getOrCreate()
print(spark)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/25 17:33:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


<pyspark.sql.session.SparkSession object at 0x7fd9084d1630>


In [2]:
import os
import logging
from pathlib import Path

# Setup logging
logging.basicConfig(level=logging.INFO)

# Base directory using pathlib
curr_dir = Path(os.getcwd())
base_dir = curr_dir.parents[1] / "data" / "SnakeCLEF2023-small_size" / "2023"
num_folders = 20

# Ensure base directory exists
if not base_dir.is_dir():
    raise FileNotFoundError(f"Base directory {base_dir} does not exist.")

# Getting subfolders
subfolders = sorted([f.name for f in base_dir.iterdir() if f.is_dir()])[:num_folders]

In [3]:
# Schema for the DataFrame
from pyspark.sql.types import StructType, StructField, BinaryType, StringType
schema = StructType([
    StructField("path", StringType(), True),
    StructField("image_binary_data", BinaryType(), True)
])

# Function to convert image to binary
def image_to_binary(image_path):
    with open(image_path, 'rb') as file:
        return file.read()

In [4]:
# Create an empty RDD
image_rdd = spark.sparkContext.emptyRDD()

# Loop through subfolders and process images
for _, folder in enumerate(subfolders):
    folder_path = os.path.join(base_dir, folder)
    for img_name in os.listdir(folder_path):
        img_path = os.path.join(folder_path, img_name)
        binary_data = image_to_binary(img_path)
        image_rdd = image_rdd.union(spark.sparkContext.parallelize([(img_path, binary_data)]))

In [5]:
# Convert RDD to DataFrame
image_df = spark.createDataFrame(image_rdd, schema)

# Show the first few rows of image_df
image_df.show(n=5)

# Count the number of rows in image_df
row_count = image_df.count()
print(f"Number of rows in image_df: {row_count}")

                                                                                

+--------------------+--------------------+
|                path|   image_binary_data|
+--------------------+--------------------+
|/home/mgustine/sn...|[FF D8 FF E0 00 1...|
|/home/mgustine/sn...|[FF D8 FF E0 00 1...|
|/home/mgustine/sn...|[FF D8 FF E0 00 1...|
|/home/mgustine/sn...|[FF D8 FF E0 00 1...|
|/home/mgustine/sn...|[FF D8 FF E0 00 1...|
+--------------------+--------------------+
only showing top 5 rows





Number of rows in image_df: 72


                                                                                

In [6]:
# Base directory for parquet_files folder
data_dir = Path(os.getcwd()).parents[1] / "data"

# Create "parquet_files" directory if it doesn't exist
parquet_dir = data_dir / "parquet_files"
os.makedirs(parquet_dir, exist_ok=True)

In [7]:
# Path to the Parquet file
parquet_file_path = parquet_dir / "images_data.parquet"

# Save the DataFrame as a Parquet file
image_df.write.mode("overwrite").parquet(str(parquet_file_path))

                                                                                

In [8]:
def get_size_of_parquet(dir_path):
    total_size = 0
    for root, dirs, files in os.walk(dir_path):
        for file in files:
            total_size += os.path.getsize(os.path.join(root, file))
    return total_size

# Get the size of the Parquet file (directory)
parquet_size = get_size_of_parquet(parquet_file_path)
print(f"Size of Parquet file: {parquet_size} bytes")

Size of Parquet file: 2897022 bytes


In [None]:
# Define the GCS path
gcs_path = "gs://dsgt-clef-snakeclef-2024/data/parquet_files/image_data"

# Write the DataFrame to GCS
image_df.write.mode("overwrite").parquet(gcs_path)

In [11]:
import site
import os
import subprocess

lib_dir="/home/username/.local/lib/python3.8/site-packages/pyspark/jars"

site_packages = site.getsitepackages()[0]
lib_dir = os.path.join(site_packages, "pyspark", "jars")
if not os.path.exists(lib_dir):
    print(f"PySpark jars directory does not exist: {lib_dir}")
else:
    subprocess.run(["wget", "https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop3-latest.jar", "-P", lib_dir])

PySpark jars directory does not exist: /usr/local/lib/python3.10/dist-packages/pyspark/jars
