# Test to Parquet pipeline

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Testing Spark setup

In [2]:
# Check that the initial raw data from Murilo is there: list the first
# entries under the bucket's raw/ prefix.
! gcloud storage ls gs://dsgt-clef-plantclef-2024/raw/ | head

gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_images_1.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_images_2.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_images_3.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_images_4.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_metadata.csv
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2022_web_training_metadata.tar.gz
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2024singleplanttrainingdata.csv
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2024singleplanttrainingdata.tar
gs://dsgt-clef-plantclef-2024/raw/PlantCLEF2024test.tar
gs://dsgt-clef-plantclef-2024/raw/args.yaml


In [3]:
import os
import re
from pathlib import Path

from pyspark.sql import functions as F

from plantclef.utils import get_spark

# Create the Spark session through the project helper.
# NOTE(review): cores/memory presumably size the local session (worker cores
# and driver memory) — confirm against plantclef.utils.get_spark.
spark = get_spark(cores=4, memory="12g")
# Render the session object in the cell output.
display(spark)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/31 14:59:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/03/31 14:59:40 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [4]:
# Resolve the test-image directory relative to the notebook's working
# directory: two levels up, then data/PlantCLEF2024test.
curr_dir = Path.cwd()
base_dir = curr_dir.parents[1].joinpath("data", "PlantCLEF2024test")
base_dir

PosixPath('/home/mgustine/plantclef-2024/data/PlantCLEF2024test')

In [5]:
# Read every .jpg under base_dir (recursively) as binary data.
# The Path object is converted to a POSIX string before being handed to Spark.
image_df = (
    spark.read.format("binaryFile")
    .option("pathGlobFilter", "*.jpg")
    .option("recursiveFileLookup", "true")
    .load(base_dir.as_posix())
)

# Prefix to strip from each file URI: the "file:" scheme plus the parent
# directory of base_dir. Adjust this if the data lives under another scheme.
to_remove = "file:" + str(base_dir.parents[0])

# regexp_replace interprets its pattern as a regular expression, so the
# prefix is escaped (directory names may contain ".", "+", "(" ...) and
# anchored with "^" so only the leading prefix is ever removed.
to_remove_pattern = "^" + re.escape(to_remove)
image_df = image_df.withColumn(
    "path", F.regexp_replace("path", to_remove_pattern, "")
)

# Split the (now relative) path into its "/"-separated components.
split_path = F.split(image_df["path"], "/")

# Project to the target schema: the relative path, the file name (last path
# component) as 'image_name', and the raw bytes with 'content' renamed to 'data'.
image_final_df = image_df.select(
    "path",
    F.element_at(split_path, -1).alias("image_name"),
    F.col("content").alias("data"),
)

# Print the resulting schema.
image_final_df.printSchema()

root
 |-- path: string (nullable = true)
 |-- image_name: string (nullable = true)
 |-- data: binary (nullable = true)



In [6]:
# Preview the first 10 rows; cell values longer than 100 characters are truncated.
image_final_df.show(n=10, truncate=100)

                                                                                

+-------------------------------------------+------------------------+----------------------------------------------------------------------------------------------------+
|                                       path|              image_name|                                                                                                data|
+-------------------------------------------+------------------------+----------------------------------------------------------------------------------------------------+
| /PlantCLEF2024test/CBN-Pla-D4-20140722.jpg| CBN-Pla-D4-20140722.jpg|[FF D8 FF E0 00 10 4A 46 49 46 00 01 01 01 00 48 00 48 00 00 FF E1 28 CE 45 78 69 66 00 00 49 49 ...|
| /PlantCLEF2024test/CBN-Pla-E5-20130904.jpg| CBN-Pla-E5-20130904.jpg|[FF D8 FF E0 00 10 4A 46 49 46 00 01 01 01 00 48 00 48 00 00 FF E1 28 CE 45 78 69 66 00 00 49 49 ...|
|/PlantCLEF2024test/CBN-PdlC-D2-20130903.jpg|CBN-PdlC-D2-20130903.jpg|[FF D8 FF E0 00 10 4A 46 49 46 00 01 01 01 00 48 00 48 00 00 FF E1 28 