# Image to Parquet
Read images as binary data and save them, together with their metadata, to a Parquet file.

In [None]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# reloaded before each cell executes; edits to local packages are then picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Join image DataFrame with Metadata files

In [None]:
import os
from pathlib import Path
from pyspark.sql.functions import regexp_replace, split, element_at
from pyspark.sql import Row
from snakeclef.utils import get_spark

# Obtain the Spark session from the project helper and render it as the cell
# output (presumably a SparkSession, since `spark.read` is used below).
spark = get_spark()
display(spark)

In [None]:
import re

# Locate the dataset relative to the notebook: two levels above the current
# working directory, under data/SnakeCLEF2023-small_size.
curr_dir = Path(os.getcwd())
base_dir = curr_dir.parents[1] / "data" / "SnakeCLEF2023-small_size"

# Load every *.jpg below base_dir (recursively) as binary data.
# Spark expects a string path, so the Path object is converted first.
image_df = (
    spark.read.format("binaryFile")
    .option("pathGlobFilter", "*.jpg")
    .option("recursiveFileLookup", "true")
    .load(base_dir.as_posix())
)

# Literal prefix to strip from the "path" column: the "file:" scheme followed
# by the parent directory of base_dir. Adjust this if your paths use a
# different scheme or layout.
to_remove = "file:" + base_dir.parent.as_posix()

# regexp_replace interprets its pattern as a regular expression, so the prefix
# is escaped (directory names may contain ".", "+", "(", ...) and anchored to
# the start of the string so it can never match in the middle of a path.
image_df = image_df.withColumn(
    "path", regexp_replace("path", "^" + re.escape(to_remove), "")
)

# Split the remaining path ("/<dataset>/<year>/<binomial_name>/<file>") into
# its components.
split_path = split(image_df["path"], "/")

# Extract metadata from the file path, counting components from the end.
image_final_df = (
    image_df.withColumn("folder_name", element_at(split_path, -4))
    .withColumn("year", element_at(split_path, -3))
    .withColumn("binomial_name", element_at(split_path, -2))
    .withColumn("file_name", element_at(split_path, -1))
)

# Select the columns of the target schema, renaming 'content' to 'data'.
image_final_df = image_final_df.select(
    "path",
    "folder_name",
    "year",
    "binomial_name",
    "file_name",
    image_final_df["content"].alias("data"),
)

# Create "image_path" by removing the leading "/<dataset folder>/" from
# "path"; this is the key used to join against the metadata.
image_final_df = image_final_df.withColumn(
    "image_path",
    regexp_replace("path", f"^/{re.escape(base_dir.name)}/", ""),
)

# Print the resulting schema.
image_final_df.printSchema()

In [None]:
# Preview the first three rows, truncating each cell to 100 characters so the
# binary image payload does not flood the output.
image_final_df.show(n=3, truncate=100)

### Join `image_df` with metadata from GCS

In [None]:
# List the files stored under the raw/ prefix of the cloud bucket
! gcloud storage ls gs://dsgt-clef-snakeclef-2024/raw

In [None]:
raw_root = "gs://dsgt-clef-snakeclef-2024/raw/"
meta_dataset_name = "SnakeCLEF2023-TrainMetadata-iNat"

# raw_root already ends with "/", so strip it before joining to avoid a
# "raw//<name>.csv" object path.
meta_csv_path = f"{raw_root.rstrip('/')}/{meta_dataset_name}.csv"

# Read the iNaturalist metadata CSV file, letting Spark infer column types.
meta_df = spark.read.csv(
    meta_csv_path,
    header=True,
    inferSchema=True,
)

# Keep a single row per 'image_path' so the join below cannot multiply image
# rows. The de-duplicated frame is the one that gets cached: it is reused by
# the preview, the count and the join, and caching it keeps the (arbitrary)
# choice of surviving row identical across those actions.
meta_df = meta_df.dropDuplicates(["image_path"]).cache()

# Drop 'binomial_name' before joining: image_final_df already has a column of
# that name (parsed from the file path), and keeping both would leave the
# joined result with two identically named columns.
meta_final_df = meta_df.drop("binomial_name")

In [None]:
# Preview the first three metadata rows, truncating each cell to 100 characters.
meta_final_df.show(n=3, truncate=100)

In [None]:
# Number of metadata rows remaining after de-duplication on 'image_path'.
meta_df.count()

In [None]:
# Perform an inner join on the 'image_path' column
final_df = image_final_df.join(meta_final_df, "image_path", "inner")

# Show the result to verify the join
final_df.show(n=3, truncate=100)

In [None]:
# Number of image rows that found a matching metadata row in the inner join.
final_df.count()