## Image to Parquet

In [1]:
%load_ext autoreload
%autoreload 2

### testing spark setup

In [2]:
# let's check that the initial data from murilo is there
! gcloud storage ls gs://dsgt-clef-plantclef-2024/raw/ | head

gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_images_1.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_images_2.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_images_3.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_images_4.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_metadata.csv
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_metadata.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2024singleplanttrainingdata.csv
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2024singleplanttrainingdata.tar
gs://dsgt-clef-plantclef-2024/raw/args.yaml
gs://dsgt-clef-plantclef-2024/raw/class_mapping.txt


In [3]:
import os
from pathlib import Path
from pyspark.sql import functions as F
from plantclef.utils import get_spark

# Start the Spark session used throughout the notebook; the extra keyword is
# forwarded to get_spark as the "spark.sql.shuffle.partitions" setting.
spark = get_spark(
    cores=8,
    memory="28g",
    **{"spark.sql.shuffle.partitions": 500},
)
display(spark)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/23 22:12:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/03/23 22:12:02 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [4]:
# Locate the dataset relative to the notebook's working directory:
# climb two levels up, then descend into data/PlantCLEF2024.
curr_dir = Path.cwd()
base_dir = curr_dir.parents[1].joinpath("data", "PlantCLEF2024")
base_dir

PosixPath('/home/mgustine/plantclef-2024/data/PlantCLEF2024')

In [5]:
# Prefix to strip from every file path: the "file:" scheme followed by the
# parent directory of base_dir, so paths start at "/PlantCLEF2024/...".
to_remove = "file:" + str(base_dir.parents[0])

# regexp_replace interprets its pattern as a Java regular expression, so the
# prefix is wrapped in \Q...\E to be matched literally (a directory name
# containing ".", "+", "(" etc. would otherwise change the match) and anchored
# with ^ so only a leading occurrence is removed.
to_remove_pattern = "^\\Q" + to_remove + "\\E"

# Load every *.jpg below the base directory (recursively) as binary data.
# The Path object is converted to a string before being handed to PySpark.
image_df = (
    spark.read.format("binaryFile")
    .option("pathGlobFilter", "*.jpg")
    .option("recursiveFileLookup", "true")
    .load(base_dir.as_posix())
    .withColumn("path", F.regexp_replace("path", to_remove_pattern, ""))
)

# Split the shortened path into its "/"-separated components
split_path = F.split(image_df["path"], "/")

# Build the target schema: keep the path, take the last path component as the
# image name, and rename the binary 'content' column to 'data'
image_final_df = image_df.select(
    "path",
    F.element_at(split_path, -1).alias("image_name"),
    F.col("content").alias("data"),
)

# Print Schema
image_final_df.printSchema()

24/03/23 22:12:50 WARN SharedInMemoryCache: Evicting cached table partition metadata from memory due to size constraints (spark.sql.hive.filesourcePartitionFileCacheSize = 262144000 bytes). This may impact query planning performance.


root
 |-- path: string (nullable = true)
 |-- image_name: string (nullable = true)
 |-- data: binary (nullable = true)



In [6]:
# Preview the first three rows (path, image_name, data) of the image DataFrame
image_final_df.show(n=3)

                                                                                

+--------------------+--------------------+--------------------+
|                path|          image_name|                data|
+--------------------+--------------------+--------------------+
|/PlantCLEF2024/tr...|2a286d5d4daa2daf4...|[FF D8 FF E0 00 1...|
|/PlantCLEF2024/tr...|11ca311532b09f32e...|[FF D8 FF E0 00 1...|
|/PlantCLEF2024/tr...|cf66eb2121e182743...|[FF D8 FF E0 00 1...|
+--------------------+--------------------+--------------------+
only showing top 3 rows



### join image_df with metadata from GCS

In [7]:
# Get list of stored files in cloud bucket
! gcloud storage ls gs://dsgt-clef-plantclef-2024/raw

gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_images_1.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_images_2.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_images_3.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_images_4.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_metadata.csv
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_metadata.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2024singleplanttrainingdata.csv
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2024singleplanttrainingdata.tar
gs://dsgt-clef-plantclef-2024/raw/args.yaml
gs://dsgt-clef-plantclef-2024/raw/class_mapping.txt
gs://dsgt-clef-plantclef-2024/raw/index.html
gs://dsgt-clef-plantclef-2024/raw/model_best.pth.tar
gs://dsgt-clef-plantclef-2024/raw/readme.txt
gs://dsgt-clef-plantclef-2024/raw/summary.csv
gs://dsgt-clef-plantclef-2024/raw/urls.txt


In [8]:
raw_root = "gs://dsgt-clef-plantclef-2024/raw/"
meta_dataset_name = "PlantCLEF2024singleplanttrainingdata.csv"

# raw_root already ends with "/", so strip it before joining; otherwise the
# resulting URI contains a double slash ("raw//PlantCLEF2024...").
meta_path = f"{raw_root.rstrip('/')}/{meta_dataset_name}"

# Read the PlantCLEF 2024 single-plant training metadata CSV file
meta_df = spark.read.csv(
    meta_path,
    header=True,
    inferSchema=True,
    sep=";",  # specify semicolon as delimiter
)

# Drop duplicate entries based on 'image_name' (the join key) before the join
meta_final_df = meta_df.dropDuplicates(["image_name"])

                                                                                

In [9]:
# Preview three metadata rows, truncating each cell to 100 characters
meta_final_df.show(n=3, truncate=100)

24/03/23 22:13:32 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

+--------------------------------------------+------+----------+----------+--------+-------+-------------+--------+-----------------+------------------+---------------+-----------------------------+---------+------------+--------+---------+------------------------------------------------------------------------------+------------------------------------------------------------------------+---------+----------------------------------------------------------------------------------------------------+
|                                  image_name| organ|species_id|    obs_id| license|partner|       author|altitude|         latitude|         longitude|gbif_species_id|                      species|    genus|      family| dataset|publisher|                                                                    references|                                                                     url|learn_tag|                                                                                    image_back

                                                                                

In [10]:
# Number of metadata rows left after de-duplicating on image_name
meta_final_df.count()

                                                                                

1408033

In [11]:
# Inner-join images with their metadata on the shared 'image_name' column,
# then redistribute the result into 500 partitions keyed by species_id
final_df = (
    image_final_df
    .join(meta_final_df, on="image_name", how="inner")
    .repartition(500, "species_id")
)

# Display a few joined rows as a sanity check
final_df.show(n=3, truncate=100)

[Stage 23:>                                                         (0 + 1) / 1]

+--------------------------------------------+-------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+------+----------+----------+--------+-------+---------------------------+--------+-----------------+------------------+---------------+-------------------------+---------+-------------+--------+---------+------------------------------------------------------------------------------+------------------------------------------------------------------------+---------+----------------------------------------------------------------------------------------------------+
|                                  image_name|                                                                     path|                                                                                                data| organ|species_id|    obs_id| license|partner|                     author|altitude|      

                                                                                

In [12]:
# Row count after the inner join; compare with meta_final_df.count() to see
# how many metadata rows found a matching image
final_df.count()

                                                                                

1408033

In [13]:
# Print the schema of the joined DataFrame (image columns plus metadata columns)
final_df.printSchema()

root
 |-- image_name: string (nullable = true)
 |-- path: string (nullable = true)
 |-- data: binary (nullable = true)
 |-- organ: string (nullable = true)
 |-- species_id: integer (nullable = true)
 |-- obs_id: long (nullable = true)
 |-- license: string (nullable = true)
 |-- partner: string (nullable = true)
 |-- author: string (nullable = true)
 |-- altitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- gbif_species_id: string (nullable = true)
 |-- species: string (nullable = true)
 |-- genus: string (nullable = true)
 |-- family: string (nullable = true)
 |-- dataset: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- references: string (nullable = true)
 |-- url: string (nullable = true)
 |-- learn_tag: string (nullable = true)
 |-- image_backup_url: string (nullable = true)



In [14]:
# Stop the Spark session now that the notebook is finished with it
spark.stop()