# Image to Parquet
Convert images to binary and save them into a parquet file.

In [4]:
import site
from pathlib import Path
from pyspark.sql import SparkSession

%load_ext autoreload
%autoreload 2

In [None]:
def get_gcs_connector_jar() -> str:
    """Return the path of the GCS connector JAR shipped inside PySpark.

    The JAR is searched for in ``pyspark/jars`` below the user site-packages
    directory, i.e. PySpark is assumed to be installed there.

    :return: Path of a ``gcs-connector-hadoop3-*.jar`` file. When several
        files match, the one with the lexicographically greatest name.
    :raises FileNotFoundError: If no matching JAR exists in that directory.
    """
    jars_dir = Path(site.getusersitepackages()) / "pyspark" / "jars"
    pattern = "gcs-connector-hadoop3-*.jar"
    # glob() yields matches in arbitrary order; sort so the pick is deterministic
    candidates = sorted(jars_dir.glob(pattern))
    if not candidates:
        # Fail with an actionable message instead of a bare IndexError
        raise FileNotFoundError(
            f"No GCS connector JAR matching {pattern!r} found in {jars_dir}"
        )
    return str(candidates[-1])


# Resolve the connector JAR first so it can be handed to the session builder
gcs_connector_jar = get_gcs_connector_jar()

# Configure the session step by step: app name, local master, connector JAR
session_builder = SparkSession.builder.appName("Image2Parquet")
session_builder = session_builder.master("local[*]")
session_builder = session_builder.config("spark.jars", gcs_connector_jar)

# Get the existing session or create a new one, then print it
spark = session_builder.getOrCreate()
print(spark)

In [None]:
def get_gcs_connector_jar() -> str:
    """Return the path of the GCS connector JAR shipped inside PySpark.

    The JAR is searched for in ``pyspark/jars`` below the user site-packages
    directory, i.e. PySpark is assumed to be installed there.

    :return: Path of a ``gcs-connector-hadoop3-*.jar`` file. When several
        files match, the one with the lexicographically greatest name.
    :raises FileNotFoundError: If no matching JAR exists in that directory.
    """
    jars_dir = Path(site.getusersitepackages()) / "pyspark" / "jars"
    pattern = "gcs-connector-hadoop3-*.jar"
    # glob() yields matches in arbitrary order; sort so the pick is deterministic
    candidates = sorted(jars_dir.glob(pattern))
    if not candidates:
        # Fail with an actionable message instead of a bare IndexError
        raise FileNotFoundError(
            f"No GCS connector JAR matching {pattern!r} found in {jars_dir}"
        )
    return str(candidates[-1])


# Resolve the connector JAR path and leave it as the cell's last expression
# so the notebook echoes the value
gcs_connector_jar = get_gcs_connector_jar()
gcs_connector_jar

In [None]:
# Point the "gs" scheme at the GCS connector classes in the Hadoop configuration
sc = spark.sparkContext
hadoop_conf = sc._jsc.hadoopConfiguration()
gcs_filesystem_classes = {
    "fs.gs.impl": "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
    "fs.AbstractFileSystem.gs.impl": "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS",
}
for conf_key, class_name in gcs_filesystem_classes.items():
    hadoop_conf.set(conf_key, class_name)

In [None]:
import os
import logging

# Configure root logging at INFO level
logging.basicConfig(level=logging.INFO)

# Locate the dataset two levels above the current working directory
curr_dir = Path.cwd()
base_dir = curr_dir.parents[1]
data_dir = base_dir.joinpath("data", "SnakeCLEF2023-small_size", "2023")
num_folders = 20

# Stop immediately if the dataset directory is missing
if not data_dir.is_dir():
    raise FileNotFoundError(f"Data directory {data_dir} does not exist.")

# Names of the first `num_folders` sub-directories, in sorted order
subfolders = sorted(entry.name for entry in data_dir.iterdir() if entry.is_dir())
subfolders = subfolders[:num_folders]

In [None]:
# Schema for the DataFrame
from pyspark.sql.types import StructType, StructField, BinaryType, StringType

# Every column is nullable; all are strings except the raw image payload
_string_columns = ("path", "folder_name", "year", "binomial_name", "file_name")
schema = StructType(
    [StructField(column, StringType(), True) for column in _string_columns]
    + [StructField("image_binary_data", BinaryType(), True)]
)


# Function to convert image to binary
def image_to_binary(image_path):
    """Return the complete contents of the file at ``image_path`` as bytes."""
    with open(image_path, mode="rb") as image_file:
        payload = image_file.read()
    return payload

In [None]:
# Gather one row per image on the driver and build the RDD with a single
# parallelize() call. Chaining one union() per image makes the RDD's lineage
# and partition count grow with the number of images.
data_root = base_dir / "data"
image_rows = []

for folder in subfolders:
    folder_path = data_dir / folder
    # Sorted iteration keeps the row order reproducible between runs
    for img_path in sorted(folder_path.iterdir()):
        if not img_path.is_file():
            continue  # a nested directory cannot be opened as an image file
        # Path relative to data/: <folder_name>/<year>/<binomial_name>/<file_name>
        relative = img_path.relative_to(data_root)
        folder_name, year, binomial_name = relative.parts[:3]
        image_rows.append(
            (
                relative.as_posix(),  # always "/"-separated, whatever the OS
                folder_name,
                year,
                binomial_name,
                img_path.name,
                image_to_binary(str(img_path)),
            )
        )

# Tuples follow the column order of `schema`
image_rdd = spark.sparkContext.parallelize(image_rows)

In [None]:
# Turn the collected image rows into a DataFrame using the explicit schema
image_df = spark.createDataFrame(data=image_rdd, schema=schema)

# Preview the first five rows
image_df.show(5)

In [None]:
# NOTE: this rebinds `data_dir` (previously the image directory) to the
# top-level "data" directory two levels above the working directory
data_dir = Path.cwd().parents[1].joinpath("data")

# Make sure the "parquet_files" output directory is present
parquet_dir = data_dir.joinpath("parquet_files")
parquet_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Destination of the Parquet output inside the local parquet directory
parquet_file_path = parquet_dir.joinpath("images_data.parquet")

# Persist the DataFrame as Parquet, replacing any earlier output at that path
image_df.write.parquet(str(parquet_file_path), mode="overwrite")

In [None]:
# Print the schema of image_df to check the columns defined for it
image_df.printSchema()

In [None]:
def get_size_of_parquet(dir_path):
    """Return the combined size in bytes of every file below ``dir_path``.

    The directory tree is walked recursively; a path that yields no files
    gives 0.
    """
    return sum(
        os.path.getsize(os.path.join(parent, entry))
        for parent, _subdirs, entries in os.walk(dir_path)
        for entry in entries
    )


# The Parquet output path is a directory, so add up the size of every file
# beneath it and report the total in bytes
parquet_size = get_size_of_parquet(parquet_file_path)
print(f"Size of Parquet file: {parquet_size} bytes")

In [None]:
# Destination in the GCS bucket for the Parquet dataset
gcs_path = "gs://dsgt-clef-snakeclef-2024/data/parquet_files/image_data"

# Upload the DataFrame as Parquet, replacing any data already at that location
image_df.write.parquet(gcs_path, mode="overwrite")

## Load DataFrame from GCS

In [1]:
from snakeclef.utils import get_spark

# Obtain the Spark session from the project helper and render it in the notebook.
# NOTE(review): the recorded output of the following read cell shows
# `No FileSystem for scheme "gs"` with this session — confirm that get_spark()
# registers the GCS connector before reading from gs:// paths.
spark = get_spark()
display(spark)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/03 18:13:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Location of the Parquet dataset in the GCS bucket
gcs_parquet_path = "gs://dsgt-clef-snakeclef-2024/data/parquet_files/image_data"

# Load the Parquet dataset into a DataFrame
parquet_reader = spark.read
df = parquet_reader.parquet(gcs_parquet_path)

# Preview the first three rows
df.show(3)

24/02/03 18:13:25 WARN FileStreamSink: Assume no metadata directory. Error while looking for metadata directory in the path: gs://dsgt-clef-snakeclef-2024/data/parquet_files/image_data.
org.apache.hadoop.fs.UnsupportedFileSystemException: No FileSystem for scheme "gs"
	at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3443)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3466)
	at org.apache.hadoop.fs.FileSystem.access$300(FileSystem.java:174)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3574)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3521)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:540)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:365)
	at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:53)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:366)
	at org.apache.spark.sql.DataFrameReader.loadV1Source

Py4JJavaError: An error occurred while calling o35.parquet.
: org.apache.hadoop.fs.UnsupportedFileSystemException: No FileSystem for scheme "gs"
	at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3443)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3466)
	at org.apache.hadoop.fs.FileSystem.access$300(FileSystem.java:174)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3574)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3521)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:540)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:365)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$checkAndGlobPathIfNecessary$1(DataSource.scala:724)
	at scala.collection.immutable.List.map(List.scala:293)
	at org.apache.spark.sql.execution.datasources.DataSource$.checkAndGlobPathIfNecessary(DataSource.scala:722)
	at org.apache.spark.sql.execution.datasources.DataSource.checkAndGlobPathIfNecessary(DataSource.scala:551)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:404)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:229)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:211)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
	at org.apache.spark.sql.DataFrameReader.parquet(DataFrameReader.scala:563)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)


In [None]:
import io
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image


def display_images_from_binary(image_data_list, binomial_names, grid_size=(3, 3)):
    """
    Display images in a grid with binomial names as labels.

    Images and names are paired by position. If there are more pairs than
    grid cells, the extra pairs are not drawn; grid cells that receive no
    image are hidden instead of being shown as empty axes.

    :param image_data_list: List of binary image data.
    :param binomial_names: List of binomial names corresponding to each image.
    :param grid_size: Tuple (rows, cols) representing the grid size.
    """
    # Unpack the number of rows and columns for the grid
    rows, cols = grid_size

    # squeeze=False always returns a 2-D array of axes (even for a 1x1 grid),
    # so it can be flattened unconditionally
    _fig, axes = plt.subplots(rows, cols, figsize=(12, 12), dpi=80, squeeze=False)
    axes = axes.ravel()

    labelled_images = list(zip(image_data_list, binomial_names))
    for ax, (binary_data, name) in zip(axes, labelled_images):
        # Decode the binary data into an image and draw it
        image = Image.open(io.BytesIO(binary_data))
        ax.imshow(image)
        # Use the binomial name as xlabel, with underscores shown as spaces
        ax.set_xlabel(name.replace("_", " "))
        ax.xaxis.label.set_size(14)  # Set the font size for the xlabel
        ax.set_xticks([])
        ax.set_yticks([])

    # Hide the grid cells that did not receive an image
    for ax in axes[len(labelled_images):]:
        ax.axis("off")

    plt.tight_layout()
    plt.show()

In [None]:
# Collect the images and their names with ONE Spark job. Two separate
# limit().collect() calls transfer the data twice and, without an ordering,
# are not guaranteed to return the same rows, which can mismatch the labels.
rows, cols = 3, 3
sample_rows = (
    df.select("image_binary_data", "binomial_name").limit(rows * cols).collect()
)
image_data_list = [row["image_binary_data"] for row in sample_rows]
binomial_names = [row["binomial_name"] for row in sample_rows]

# Display the images in a grid with binomial names
display_images_from_binary(image_data_list, binomial_names, grid_size=(rows, cols))

## Join image DataFrame with Metadata files

In [None]:
# List the files stored in the cloud bucket
! gcloud storage ls gs://dsgt-clef-snakeclef-2024/raw

In [None]:
# GCS locations of the two train metadata CSV files (HM and iNat)
train_meta_hm_connector = (
    "gs://dsgt-clef-snakeclef-2024/raw/SnakeCLEF2023-TrainMetadata-HM.csv"
)
train_meta_inat_connector = (
    "gs://dsgt-clef-snakeclef-2024/raw/SnakeCLEF2023-TrainMetadata-iNat.csv"
)

# Load each CSV file into a DataFrame, treating the first line as the header
df_train_meta_hm = spark.read.csv(train_meta_hm_connector, header=True)
df_train_meta_inat = spark.read.csv(train_meta_inat_connector, header=True)

# Preview the first three rows of each DataFrame
for metadata_df in (df_train_meta_hm, df_train_meta_inat):
    metadata_df.show(3)

In [None]:
# from pyspark.sql import functions as F

# df_hm_species = df_train_meta_hm.where(F.col("binomial_name") == "Thamnophis butleri")
# df_inat_species = df_train_meta_inat.where(F.col("binomial_name") == "Thamnophis butleri")

# df_hm_species.show(n=1, truncate=100, vertical=True)
# df_inat_species.show(n=1, truncate=100, vertical=True)

In [None]:
# print(f"HM species count:   {df_hm_species.count()}")
# print(f"iNat species count: {df_inat_species.count()}")

In [None]:
# # Base directory using pathlib
# curr_dir = Path(os.getcwd())
# base_dir = curr_dir.parents[1]
# data_dir = base_dir / "data" / "SnakeCLEF2023-small_size"

# # Ensure base directory exists
# if not data_dir.is_dir():
#     raise FileNotFoundError(f"Data directory {data_dir} does not exist.")

# # Getting subfolders
# folders = sorted([f.name for f in data_dir.iterdir() if f.is_dir()])

In [None]:
# # Loop through subfolders
# for folder in folders:
#     folder_path = data_dir / folder
#     subfolders = sorted([f.name for f in folder_path.iterdir() if f.is_dir()])
#     for subfolder in subfolders:
#         subfolder_path = folder_path / subfolder
#         for img_name in os.listdir(subfolder_path):
#             img_path = subfolder_path / img_name
#             if f"{subfolder}/{img_name}" == "Thamnophis_butleri/84733338.jpg":
#                 print(img_path)

In [None]:
# # Get list of HM image paths
# hm_list_rows = df_hm_species.select(["image_path"]).collect()
# hm_list = [row["image_path"] for row in hm_list_rows]
# print(hm_list[:5])

# # Get list of iNat image paths
# inat_list_rows = df_inat_species.select(["image_path"]).collect()
# inat_list = [row["image_path"] for row in inat_list_rows]
# print(inat_list[:5])

In [None]:
# # Check if image exists in dataset
# def check_images_exist(image_list:list) -> list:
#     image_path_list = []
#     for img in image_list:
#         img_split = img.split("/")
#         img_final = f"{img_split[1]}/{img_split[-1]}"
#         for folder in folders:
#             folder_path = data_dir / folder
#             subfolders = sorted([f.name for f in folder_path.iterdir() if f.is_dir()])
#             if "Thamnophis_butleri" not in subfolders:
#                 continue
#             else:
#                 for subfolder in subfolders:
#                     subfolder_path = folder_path / subfolder
#                     for img_name in os.listdir(subfolder_path):
#                         img_path = subfolder_path / img_name
#                         if f"{subfolder}/{img_name}" == img_final:
#                             image_path_list.append(img_path)
#     return image_path_list

In [None]:
# image_path_list = check_images_exist(image_list=inat_list)
# len(image_path_list)