# Centroid grid predictions

Classify test data using centroid-based probabilities for a grid of tiles.
In this notebook the probabilities are computed for the entire test image.

In [2]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
from plantclef.spark import get_spark

# Obtain the Spark session from the project helper.
# NOTE(review): `cores=4` presumably caps local parallelism — confirm in plantclef.spark.
spark = get_spark(cores=4)
# Show the session object as the cell's rich output.
display(spark)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/20 15:54:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/04/20 15:54:49 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
25/04/20 15:54:49 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
import os
from pathlib import Path

# Home directory of the current user; the data paths in this notebook are built from it.
root = Path.home()
! date

Sun Apr 20 03:54:51 PM EDT 2025


### Faiss centroid probabilities 

In [5]:
# Base directory under which the embedding parquet datasets are stored
data_path = f"{root}/p-dsgt_clef2025-0/shared/plantclef/data/embeddings"


def get_faiss_embed_path(num_centroids: int = 10):
    """Build the path of the train-centroid parquet data for one centroid count.

    Args:
        num_centroids: value substituted into the ``num_centroids=<k>``
            directory name under ``train_centroids``.

    Returns:
        The path as a string, rooted at the module-level ``data_path``.
    """
    centroid_dir = f"{data_path}/train_centroids/num_centroids={num_centroids}"
    return centroid_dir


# Load the train centroids for each centroid count (10, 20, 50) as Spark DataFrames
faiss10_df, faiss20_df, faiss50_df = (
    spark.read.parquet(get_faiss_embed_path(k)) for k in (10, 20, 50)
)

# Inspect the schema and a few rows of the 10-centroid dataset
faiss10_df.printSchema()
faiss10_df.show(5)

                                                                                

root
 |-- centroid_id: integer (nullable = true)
 |-- species_id: integer (nullable = true)
 |-- embedding: array (nullable = true)
 |    |-- element: float (containsNull = true)



                                                                                

+-----------+----------+--------------------+
|centroid_id|species_id|           embedding|
+-----------+----------+--------------------+
|          6|   1398243|[1.0948665, -0.35...|
|          7|   1647175|[-0.30202305, 0.7...|
|          3|   1360020|[0.049416766, 0.9...|
|          3|   1361527|[0.3046188, 0.885...|
|          8|   1359277|[-0.09973065, 0.6...|
+-----------+----------+--------------------+
only showing top 5 rows



In [6]:
# Location of the 2025 test-set parquet data (embeddings + logits) under data_path
test_path = f"{data_path}/test_2025/test_2025_embed_logits"
# Load it as a Spark DataFrame and print its schema for reference
test_df = spark.read.parquet(test_path)
test_df.printSchema()

root
 |-- image_name: string (nullable = true)
 |-- output: struct (nullable = true)
 |    |-- cls_token: array (nullable = true)
 |    |    |-- element: float (containsNull = true)
 |    |-- logits: array (nullable = true)
 |    |    |-- element: float (containsNull = true)
 |-- sample_id: integer (nullable = true)



In [8]:
import numpy as np
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, FloatType


def avg_embeddings_udf(embeddings):
    """Average a collection of embedding vectors element-wise.

    Args:
        embeddings: sequence of equal-length numeric sequences (one per
            embedding), or ``None``.

    Returns:
        A list of floats holding the per-dimension mean, or ``None`` when
        ``embeddings`` is ``None`` or empty. Returning ``None`` keeps the
        result compatible with an array-typed UDF column; averaging an empty
        input would otherwise yield a scalar NaN instead of a list.
    """
    # Nothing to average: signal "no value" rather than NaN / an exception.
    if embeddings is None or len(embeddings) == 0:
        return None
    array = np.asarray(embeddings)
    # axis=0 averages across embeddings, keeping one value per dimension
    mean_array = array.mean(axis=0)
    return mean_array.tolist()


# Expose the numpy helper to Spark as a UDF that returns array<float>
average_embeddings = F.udf(avg_embeddings_udf, returnType=ArrayType(FloatType()))

# Gather every centroid embedding of a species into one list column,
# then reduce that list to a single averaged embedding per species
embedding_lists = F.collect_list("embedding").alias("embedding_list")
avg_embeddings_df = (
    faiss10_df.groupBy("species_id")
    .agg(embedding_lists)
    .withColumn("avg_embeddings", average_embeddings(F.col("embedding_list")))
)
avg_embeddings_df.printSchema()
avg_embeddings_df.show(10, truncate=50)

root
 |-- species_id: integer (nullable = true)
 |-- embedding_list: array (nullable = false)
 |    |-- element: array (containsNull = false)
 |    |    |-- element: float (containsNull = true)
 |-- avg_embeddings: array (nullable = true)
 |    |-- element: float (containsNull = true)





+----------+--------------------------------------------------+--------------------------------------------------+
|species_id|                                    embedding_list|                                    avg_embeddings|
+----------+--------------------------------------------------+--------------------------------------------------+
|   1355869|[[0.0038829704, -0.013802218, 0.009791469, 0.36...|[0.114047855, 0.4057617, -0.06062866, 0.0549097...|
|   1355870|[[0.18335515, -0.08915622, -0.29222322, 0.81695...|[0.11070547, 0.28042978, -0.69385886, 0.3224701...|
|   1355872|[[0.28893483, 0.20109488, 0.24363865, 0.2530085...|[0.19194208, 0.1617909, -0.013561882, 0.1457786...|
|   1355881|[[0.23531002, -0.26157096, -0.017927974, -1.038...|[0.20774464, -0.2938113, 0.026279427, -0.838810...|
|   1355900|[[1.0873219, -0.07327607, -0.29321098, 0.071418...|[0.5382732, 0.15868045, -0.116654, 0.23893037, ...|
|   1355953|[[-0.4760781, 1.7744097, -1.1470047, -0.3837139...|[-0.016290855, 0.

                                                                                

### classifier-based probabilities

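Calculate per-species probabilities from the similarity (cosine) and distance (Euclidean) between each test embedding and the averaged species embeddings.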

In [10]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from scipy.special import softmax
from plantclef.config import get_class_mappings_file

# load class mappings: one integer species id per line; the line order defines
# the row order of `train_embeddings` below (blank lines are skipped)
class_mappings_file = get_class_mappings_file()
with open(class_mappings_file) as f:
    sorted_species_ids = [int(line.strip()) for line in f if line.strip()]

# get (species_id, avg_embeddings) from Spark
centroids_pd = avg_embeddings_df.select("species_id", "avg_embeddings").toPandas()

# species_id -> averaged embedding lookup
centroids_dict = dict(zip(centroids_pd["species_id"], centroids_pd["avg_embeddings"]))
if not centroids_dict:
    # without at least one centroid the embedding dimension is unknown
    raise ValueError("avg_embeddings_df returned no centroids")

# placeholder for species without a centroid; built once, not per species
zero_embedding = np.zeros_like(next(iter(centroids_dict.values())))

# make the zero-filling visible instead of silent
missing_species_ids = [
    species_id for species_id in sorted_species_ids if species_id not in centroids_dict
]
if missing_species_ids:
    print(
        f"{len(missing_species_ids)} species have no centroid; "
        "using zero placeholder embeddings"
    )

# filter + reorder centroids to match sorted_species_ids
filtered_embeddings = [
    centroids_dict.get(species_id, zero_embedding) for species_id in sorted_species_ids
]

# shape: (num_species, embedding_dim)
train_embeddings = np.stack(filtered_embeddings)
train_embeddings.shape

                                                                                

(7806, 768)

In [None]:
# get test embeddings and image names
test_pd = test_df.select("image_name", "output.cls_token").toPandas()
test_embeddings = np.stack(test_pd["cls_token"].values)
image_names = test_pd["image_name"].values

# cosine similarity against every species centroid, turned into a
# per-image distribution with softmax; shape: (num_test, num_species)
cos_similarities = cosine_similarity(test_embeddings, train_embeddings)
cos_probabilities = softmax(cos_similarities, axis=1)

# inverse squared euclidean distance, normalized so each row sums to 1.
# The squared distance is floored at a tiny epsilon: a test embedding that
# coincides with a centroid would otherwise give 1/0 = inf and turn the whole
# row into NaN after normalization.
# NOTE(review): rows of `train_embeddings` that are all-zero placeholders
# (species without a centroid) still receive a score here — confirm intended.
EUCLIDEAN_EPS = 1e-12
euclidean_dist = euclidean_distances(test_embeddings, train_embeddings)
eucliden_dist = euclidean_dist  # original (misspelled) name kept for later cells
inv_sq_dist = 1.0 / np.maximum(euclidean_dist**2, EUCLIDEAN_EPS)
euclidean_score = inv_sq_dist / inv_sq_dist.sum(axis=1, keepdims=True)

# create final DataFrame with aligned probabilities (one array per image)
final_df = pd.DataFrame(
    {
        "image_name": image_names,
        "cos_probabilities": list(cos_probabilities),
        "euclidean_score": list(euclidean_score),
    }
)
final_df.head()