## Image to Parquet

In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### testing spark setup

In [2]:
# Sanity check: confirm the initial raw data from Murilo is there by listing
# the first entries under the bucket's raw/ prefix (output trimmed by `head`).
! gcloud storage ls gs://dsgt-clef-plantclef-2024/raw/ | head

gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_images_1.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_images_2.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_images_3.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_images_4.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_metadata.csv
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_metadata.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2024singleplanttrainingdata.csv
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2024singleplanttrainingdata.tar
gs://dsgt-clef-plantclef-2024/raw/args.yaml
gs://dsgt-clef-plantclef-2024/raw/class_mapping.txt


In [3]:
import os
from pathlib import Path
from pyspark.sql.functions import regexp_replace, split, element_at
from pyspark.sql import Row
from plantclef.utils import get_spark

# get_spark is a project helper from plantclef.utils; presumably it returns the
# configured SparkSession -- confirm against the helper's source.
spark = get_spark()
# Render the session object so the reader can see it was created.
display(spark)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/23 16:42:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/03/23 16:42:14 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [4]:
# Locate the dataset folder relative to where the notebook is running:
# two levels above the working directory, then data/PlantCLEF2024.
curr_dir = Path.cwd()
base_dir = curr_dir.parents[1].joinpath("data", "PlantCLEF2024")
base_dir

PosixPath('/home/mgustine/plantclef-2024/data/PlantCLEF2024')

In [5]:
# Load every *.jpg under base_dir (searched recursively) as binary rows.
# Spark's reader needs a plain string, so the Path is converted with as_posix().
image_df = (
    spark.read.format("binaryFile")
    .option("pathGlobFilter", "*.jpg")
    .option("recursiveFileLookup", "true")
    .load(base_dir.as_posix())
)

# Literal prefix to strip from each path: the "file:" scheme plus the parent
# directory of base_dir, so paths start at "/<base_dir name>/...".
to_remove = "file:" + str(base_dir.parents[0])

# regexp_replace interprets its pattern as a Java regular expression. Quote the
# literal prefix with \Q...\E and anchor it at the start so characters such as
# "." or "+" in a local directory name are not treated as regex operators and
# the prefix can only match at the beginning of the path.
to_remove_pattern = "^\\Q" + to_remove + "\\E"
image_df = image_df.withColumn("path", regexp_replace("path", to_remove_pattern, ""))

# Split the path into an array of elements
split_path = split(image_df["path"], "/")

# The last path element is the image's file name
image_final_df = image_df.withColumn("file_name", element_at(split_path, -1))

# Keep only the columns needed downstream, renaming 'content' to 'data'
image_final_df = image_final_df.select(
    "path",
    "file_name",
    image_final_df["content"].alias("data"),
)

# Add "image_path": "path" without its leading "/<base_dir name>/" component
# (e.g. "/PlantCLEF2024/train/..." becomes "train/..."). The directory name is
# quoted for the same reason as the prefix above.
image_path_prefix = "^/\\Q" + base_dir.parts[-1] + "\\E/"
image_final_df = image_final_df.withColumn(
    "image_path", regexp_replace("path", image_path_prefix, "")
)

# Print Schema
image_final_df.printSchema()

24/03/23 16:43:37 WARN SharedInMemoryCache: Evicting cached table partition metadata from memory due to size constraints (spark.sql.hive.filesourcePartitionFileCacheSize = 262144000 bytes). This may impact query planning performance.


root
 |-- path: string (nullable = true)
 |-- file_name: string (nullable = true)
 |-- data: binary (nullable = true)
 |-- image_path: string (nullable = true)



In [6]:
# Preview the first three image rows (cell values are truncated by show()).
image_final_df.show(3)

                                                                                

+--------------------+--------------------+--------------------+--------------------+
|                path|           file_name|                data|          image_path|
+--------------------+--------------------+--------------------+--------------------+
|/PlantCLEF2024/tr...|2a286d5d4daa2daf4...|[FF D8 FF E0 00 1...|train/1363945/2a2...|
|/PlantCLEF2024/tr...|11ca311532b09f32e...|[FF D8 FF E0 00 1...|train/1390138/11c...|
|/PlantCLEF2024/tr...|cf66eb2121e182743...|[FF D8 FF E0 00 1...|train/1360943/cf6...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



### join image_df with metadata from GCS

In [7]:
# List every object stored under the raw/ prefix of the project's GCS bucket,
# to find the metadata file that will be joined with the images.
! gcloud storage ls gs://dsgt-clef-plantclef-2024/raw

gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_images_1.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_images_2.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_images_3.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_images_4.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_metadata.csv
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_metadata.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2024singleplanttrainingdata.csv
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2024singleplanttrainingdata.tar
gs://dsgt-clef-plantclef-2024/raw/args.yaml
gs://dsgt-clef-plantclef-2024/raw/class_mapping.txt
gs://dsgt-clef-plantclef-2024/raw/index.html
gs://dsgt-clef-plantclef-2024/raw/model_best.pth.tar
gs://dsgt-clef-plantclef-2024/raw/readme.txt
gs://dsgt-clef-plantclef-2024/raw/summary.csv
gs://dsgt-clef-plantclef-2024/raw/urls.txt


In [9]:
raw_root = "gs://dsgt-clef-plantclef-2024/raw/"
meta_dataset_name = "PlantCLEF2024singleplanttrainingdata.csv"

# raw_root already ends with "/", so strip it before joining; otherwise the
# URI would contain a double slash ("raw//<file>").
meta_path = f"{raw_root.rstrip('/')}/{meta_dataset_name}"

# Read the PlantCLEF 2024 single-plant training metadata CSV
meta_df = spark.read.csv(
    meta_path,
    header=True,
    inferSchema=True,
    sep=";",  # specify semicolon as delimiter
)

# Cache the DataFrame to optimize subsequent operations
meta_df.cache()

# De-duplication is currently disabled, so the metadata is used as-is.
# NOTE(review): if it is re-enabled, key it on a column the CSV actually has
# (e.g. "image_name"); the metadata has no "image_path" column.
meta_final_df = meta_df

24/03/23 16:46:03 WARN CacheManager: Asked to cache already cached data.        


DataFrame[image_name: string, organ: string, species_id: int, obs_id: bigint, license: string, partner: string, author: string, altitude: double, latitude: double, longitude: double, gbif_species_id: string, species: string, genus: string, family: string, dataset: string, publisher: string, references: string, url: string, learn_tag: string, image_backup_url: string]

In [10]:
# Preview three metadata rows, allowing up to 100 characters per cell.
meta_final_df.show(3, truncate=100)

NameError: name 'meta_final_df' is not defined

In [None]:
# Number of rows in the metadata DataFrame. count() is a Spark action, so
# running it executes the read rather than just defining it.
meta_final_df.count()

In [None]:
# Inner-join each image row to its metadata row.
# The metadata CSV exposes "image_name" and has no "image_path" column, so a
# join keyed on "image_path" cannot be resolved. Match the image's file name
# against the metadata's image_name instead.
# NOTE(review): assumes image_name holds the file's basename including the
# extension -- confirm against a few metadata rows.
final_df = image_final_df.join(
    meta_final_df,
    image_final_df["file_name"] == meta_final_df["image_name"],
    "inner",
)

# Show the result to verify the join
final_df.show(n=3, truncate=100)

In [None]:
# Number of rows in the joined DataFrame; compare with the metadata row count
# to see how many records survived the inner join.
final_df.count()

In [None]:
# Print the joined DataFrame's schema to check column names and types
final_df.printSchema()