## Image to Parquet

In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# local project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Testing the Spark setup

In [2]:
# Check that the initial data from Murilo is there
# (list the raw/ prefix of the bucket; `head` keeps only the first lines)
! gcloud storage ls gs://dsgt-clef-plantclef-2024/raw/ | head

gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_images_1.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_images_2.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_images_3.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_images_4.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_metadata.csv
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_metadata.tar.gz
gs://dsgt-clef-plantclef-2024/raw/urls.txt


In [3]:
import os
from pathlib import Path
from pyspark.sql.functions import regexp_replace, split, element_at
from pyspark.sql import Row
from plantclef.utils import get_spark

# Obtain the Spark session from the project helper and display it to confirm
# the session was created.
spark = get_spark()
display(spark)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/26 01:57:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/02/26 01:57:00 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [4]:
# Locate the extracted image folder relative to the notebook's working
# directory: two levels up, then data/PlantCLEF2022_web_training_images_1.
curr_dir = Path.cwd()
base_dir = curr_dir.parents[1].joinpath("data", "PlantCLEF2022_web_training_images_1")
base_dir

PosixPath('/home/mgustine/plantclef-2024/data/PlantCLEF2022_web_training_images_1')

In [5]:
# Read every *.jpg below base_dir (recursively) as binary data.
# PySpark expects a string path, so the Path object is converted first.
image_df = (
    spark.read.format("binaryFile")
    .option("pathGlobFilter", "*.jpg")
    .option("recursiveFileLookup", "true")
    .load(base_dir.as_posix())
)

# Literal prefix to strip from the "path" column: the "file:" scheme followed
# by the parent directory of base_dir, leaving paths that start with
# "/<dataset folder>/...".
to_remove = "file:" + str(base_dir.parents[0])

# regexp_replace interprets its pattern as a Java regular expression, so the
# filesystem path must not be passed in raw: characters such as "." or "+" in
# a directory name would be treated as regex operators. \Q...\E quotes the
# prefix literally, and "^" ensures only a leading occurrence is removed.
image_df = image_df.withColumn(
    "path", regexp_replace("path", f"^\\Q{to_remove}\\E", "")
)

# Split the shortened path into its "/"-separated components
split_path = split(image_df["path"], "/")

# The last path component is the image file name
image_final_df = image_df.withColumn("file_name", element_at(split_path, -1))

# Keep only the columns of the target schema, renaming 'content' to 'data'
image_final_df = image_final_df.select(
    "path",
    "file_name",
    image_final_df["content"].alias("data"),
)

# Create "image_path" by removing the leading "/<dataset folder>/" from "path",
# so it has the same "<subfolder>/<file name>" form as the metadata's
# image_path column. The folder name is quoted for the same reason as above.
image_final_df = image_final_df.withColumn(
    "image_path", regexp_replace("path", f"^/\\Q{base_dir.name}\\E/", "")
)

# Print the resulting schema
image_final_df.printSchema()

                                                                                

root
 |-- path: string (nullable = true)
 |-- file_name: string (nullable = true)
 |-- data: binary (nullable = true)
 |-- image_path: string (nullable = true)



In [6]:
# Preview the first three rows to sanity-check the derived columns
image_final_df.show(n=3)

                                                                                

+--------------------+--------------------+--------------------+--------------------+
|                path|           file_name|                data|          image_path|
+--------------------+--------------------+--------------------+--------------------+
|/PlantCLEF2022_we...|3d7803100f3db8326...|[FF D8 FF E0 00 1...|10757826/3d780310...|
|/PlantCLEF2022_we...|d601bf895e7519356...|[89 50 4E 47 0D 0...|11190409/d601bf89...|
|/PlantCLEF2022_we...|b5dbbb23698330a8c...|[89 50 4E 47 0D 0...|11201833/b5dbbb23...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



### Join image_df with metadata from GCS

In [7]:
# List the files stored under the raw/ prefix of the cloud bucket
# (the metadata CSV read in the next cell should appear here)
! gcloud storage ls gs://dsgt-clef-plantclef-2024/raw

gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_images_1.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_images_2.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_images_3.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_images_4.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_metadata.csv
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_metadata.tar.gz
gs://dsgt-clef-plantclef-2024/raw/urls.txt


In [8]:
raw_root = "gs://dsgt-clef-plantclef-2024/raw/"
meta_dataset_name = "PlantCLEF2022_web_training_metadata.csv"

# raw_root already ends with "/", so joining with another "/" would produce
# ".../raw//<file>". Strip the trailing slash first so the URI contains a
# single separator and does not rely on the filesystem layer collapsing it.
meta_path = f"{raw_root.rstrip('/')}/{meta_dataset_name}"

# Read the PlantCLEF 2022 metadata CSV file
meta_df = spark.read.csv(
    meta_path,
    header=True,  # first line holds the column names
    inferSchema=True,  # let Spark infer column types from the data
    sep=";",  # the file is semicolon-delimited
)

# Cache the DataFrame to optimize subsequent operations
meta_df.cache()

# Drop duplicate entries based on 'image_path' before the join, so that each
# image matches at most one metadata row
meta_final_df = meta_df.dropDuplicates(["image_path"])

                                                                                

In [9]:
# Preview three metadata rows; truncate=100 shortens long cell values (e.g. URLs)
meta_final_df.show(n=3, truncate=100)



+--------+--------------------------------------------+-----------------------------------------------------+---------------------------+---------+----------+---------+-------------+----------+-------------+-------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+
| classid|                                  image_name|                                           image_path|                    species|    genus|    family|    order|        class|manual_tag|predicted_tag|predicted_tag_probability|                                                                                        original_url|                                                                                    image_backup_url|
+--------+--------------------------------------------+-----------------------------------------------------+-------------------

                                                                                

In [10]:
# Number of metadata rows left after de-duplicating on image_path
meta_final_df.count()

                                                                                

1071627

In [11]:
# Inner join on the shared 'image_path' key: only images that have a matching
# metadata row are kept
final_df = image_final_df.join(meta_final_df, on="image_path", how="inner")

# Display a few joined rows to verify the result
final_df.show(n=3, truncate=100)

[Stage 18:>                                                         (0 + 1) / 1]

+-----------------------------------------------------+------------------------------------------------------------------------------------------+--------------------------------------------+----------------------------------------------------------------------------------------------------+--------+--------------------------------------------+----------------------------+---------+----------+---------+-------------+----------+-------------+-------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+
|                                           image_path|                                                                                      path|                                   file_name|                                                                                                data| classid|                         

                                                                                

In [12]:
# Number of rows produced by the inner join of images and metadata
final_df.count()

                                                                                

8290

In [13]:
# Print the schema of the joined DataFrame (image columns plus metadata columns)
final_df.printSchema()

root
 |-- image_path: string (nullable = true)
 |-- path: string (nullable = true)
 |-- file_name: string (nullable = true)
 |-- data: binary (nullable = true)
 |-- classid: integer (nullable = true)
 |-- image_name: string (nullable = true)
 |-- species: string (nullable = true)
 |-- genus: string (nullable = true)
 |-- family: string (nullable = true)
 |-- order: string (nullable = true)
 |-- class: string (nullable = true)
 |-- manual_tag: string (nullable = true)
 |-- predicted_tag: string (nullable = true)
 |-- predicted_tag_probability: double (nullable = true)
 |-- original_url: string (nullable = true)
 |-- image_backup_url: string (nullable = true)

