# Image to Embedding using DINOv2

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up before each cell runs, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from plantclef.utils import get_spark

# Obtain the Spark session through the project helper and render it in the
# cell output; `spark` is reused below to read the Parquet data.
spark = get_spark()
display(spark)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/31 18:23:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/03/31 18:23:53 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [3]:
# Get list of stored files in cloud bucket
root = "gs://dsgt-clef-plantclef-2024"
# Print the current date so the listing below is timestamped
! date
! gcloud storage ls {root}/data/parquet_files

Sun Mar 31 18:24:02 UTC 2024


gs://dsgt-clef-plantclef-2024/data/parquet_files/
gs://dsgt-clef-plantclef-2024/data/parquet_files/PlantCLEF2022_web_training_images_1/
gs://dsgt-clef-plantclef-2024/data/parquet_files/PlantCLEF2022_web_training_images_4/
gs://dsgt-clef-plantclef-2024/data/parquet_files/PlantCLEF2024_test/
gs://dsgt-clef-plantclef-2024/data/parquet_files/PlantCLEF2024_training/
gs://dsgt-clef-plantclef-2024/data/parquet_files/PlantCLEF2024_training_cropped_resized/
gs://dsgt-clef-plantclef-2024/data/parquet_files/PlantCLEF2024_training_cropped_resized_v2/


In [4]:
# Path and dataset names.
# Build the Parquet directory from `root` (defined in the bucket-listing cell
# above) so the bucket name is declared in exactly one place.
gcs_path = f"{root}/data/parquet_files/"
train = "PlantCLEF2024_training_cropped_resized_v2"

# Full GCS path of the training Parquet dataset
train_gcs_path = f"{gcs_path}{train}"

# Read the Parquet dataset into a Spark DataFrame
train_df = spark.read.parquet(train_gcs_path)

# Preview the first few rows to confirm the data loaded
train_df.show(n=3)

                                                                                

+--------------------+--------------------+------+----------+----------+--------------------+-------+--------------------+--------+-------------+-------------+---------------+--------------------+----------+-------------+--------+-----------+--------------------+--------------------+---------+--------------------+--------------------+
|          image_name|                path| organ|species_id|    obs_id|             license|partner|              author|altitude|     latitude|    longitude|gbif_species_id|             species|     genus|       family| dataset|  publisher|          references|                 url|learn_tag|    image_backup_url|                data|
+--------------------+--------------------+------+----------+----------+--------------------+-------+--------------------+--------+-------------+-------------+---------------+--------------------+----------+-------------+--------+-----------+--------------------+--------------------+---------+--------------------+-----------

### Pipeline

In [5]:
from pyspark.ml import Pipeline
from plantclef.transforms import WrappedDinoV2, DCTN

# Number of rows used for this smoke test; shared by the sampling step and the
# final preview so the two cannot drift apart.
SAMPLE_SIZE = 10

# Get subset of images to test pipeline.
# NOTE(review): `limit` without an ordering does not guarantee which rows are
# returned; the result is cached, presumably so `fit` and `transform` below
# operate on the same sample — confirm if exact reproducibility matters.
train10_df = train_df.limit(SAMPLE_SIZE).cache()

# Init DINOv2 wrapper: reads the "data" column and writes "transformed_data"
dino = WrappedDinoV2(input_col="data", output_col="transformed_data")

# Init Discrete Cosine Transform wrapper: consumes the DINOv2 output column
# and writes "dctn_data"
dctn = DCTN(input_col="transformed_data", output_col="dctn_data")

# Create Pipeline: stages run in order, DINOv2 first and DCT second
pipeline = Pipeline(stages=[dino, dctn])

# Fit pipeline to DF
model = pipeline.fit(train10_df)

# Apply the model to transform the DF; cache the result for reuse
transformed_df = model.transform(train10_df).cache()

# Show results for every sampled row
transformed_df.show(n=SAMPLE_SIZE)

preprocessor_config.json: 100%|██████████| 436/436 [00:00<00:00, 2.07MB/s]) / 1]
config.json: 100%|██████████| 548/548 [00:00<00:00, 3.00MB/s]
model.safetensors: 100%|██████████| 346M/346M [00:01<00:00, 196MB/s]  
                                                                                

+--------------------+--------------------+------+----------+----------+--------------------+-------+--------------------+--------+-----------------+------------------+---------------+--------------------+-----------+-------------+--------+-----------+--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+
|          image_name|                path| organ|species_id|    obs_id|             license|partner|              author|altitude|         latitude|         longitude|gbif_species_id|             species|      genus|       family| dataset|  publisher|          references|                 url|learn_tag|    image_backup_url|                data|    transformed_data|           dctn_data|
+--------------------+--------------------+------+----------+----------+--------------------+-------+--------------------+--------+-----------------+------------------+---------------+--------------------+-----------+-----