# Centroid predictions

Classify the test data using centroid probabilities.
The probabilities are computed over each entire test image.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from plantclef.spark import get_spark

# obtain a Spark session from the project helper (called with cores=4) and show it
# NOTE(review): no later cell visible in this notebook references `spark`;
# the processing below is done with pandas — confirm the session is still needed
spark = get_spark(cores=4)
display(spark)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/19 21:40:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/04/19 21:40:07 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
25/04/19 21:40:08 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
import os
from pathlib import Path

# Resolve the current user's home directory; the data path below is built from it
root = Path(os.path.expanduser("~"))
! date

Sat Apr 19 09:40:09 PM EDT 2025


### Faiss centroid probabilities 

In [7]:
import numpy as np
import pandas as pd
from plantclef.config import get_class_mappings_file

# root of the shared PlantCLEF data directory (under the user's home)
data_path = f"{root}/p-dsgt_clef2025-0/shared/plantclef/data"

# locations of the centroid-score parquet dataset and the species metadata CSV
centroid_path = f"{data_path}/clustering/test_2025_centroid_euclidean_scores"
metadata_path = f"{data_path}/species_metadata.csv"

# read both inputs into pandas DataFrames and preview the first rows
cent_df = pd.read_parquet(centroid_path)
species_meta_df = pd.read_csv(metadata_path)
display(cent_df.head(5))
display(species_meta_df.head(5))

# load species_ids in the correct order: one integer ID per line of the mappings file
# NOTE(review): assumes line i holds the species ID for position i of every score
# vector in cent_df, because this array is later indexed by top-k position — TODO confirm
class_mappings_file = get_class_mappings_file()
with open(class_mappings_file) as f:
    sorted_species_ids = np.array([int(line.strip()) for line in f])

Unnamed: 0,image_name,cos_probabilities,euclidean_score
0,CBN-Pla-A1-20190814.jpg,"[0.00013336474, 0.00013364569, 0.00013704262, ...","[0.00013831591, 0.00013568375, 0.0001403753, 0..."
1,CBN-Pla-D6-20190814.jpg,"[0.0001280635, 0.00012812452, 0.0001313897, 0....","[0.00013808624, 0.00013334032, 0.00014012698, ..."
2,CBN-PdlC-C5-20140901.jpg,"[0.0001356341, 0.0001306982, 0.00013256918, 0....","[0.00014182531, 0.00013441913, 0.00013885477, ..."
3,LISAH-BOU-0-37-20230512.jpg,"[0.00013228072, 0.00013178962, 0.00013464248, ...","[0.00014017329, 0.00013580783, 0.00014142484, ..."
4,CBN-Pla-E4-20130808.jpg,"[0.00012787317, 0.00013026498, 0.00012689614, ...","[0.00013487983, 0.00013341759, 0.00013364258, ..."


Unnamed: 0,species_id,species,genus,family
0,1355868,Lactuca virosa L.,Lactuca,Asteraceae
1,1355869,Crepis capillaris (L.) Wallr.,Crepis,Asteraceae
2,1355870,Crepis foetida L.,Crepis,Asteraceae
3,1355871,Hypochaeris glabra L.,Hypochaeris,Asteraceae
4,1355872,Hypochaeris radicata L.,Hypochaeris,Asteraceae


In [9]:
import torch

# run the top-k selection on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def top_k_species(probabilities, top_k: int = 9):
    """Return the species IDs of the ``top_k`` highest-scoring classes.

    Args:
        probabilities: 1-D sequence/array of per-class scores for one image.
            Position ``i`` is looked up in the module-level
            ``sorted_species_ids`` to obtain the species ID.
        top_k: maximum number of species to return. When it exceeds the number
            of available scores, all classes are returned instead of raising.

    Returns:
        List of Python ``int`` species IDs, ordered from highest to lowest score.
    """
    probs_tensor = torch.tensor(probabilities).to(device)
    # torch.topk raises if k is larger than the dimension being ranked,
    # so never ask for more entries than there are scores
    k = min(top_k, probs_tensor.shape[-1])
    # only the positions are needed; the score values themselves are discarded
    _, top_indices = torch.topk(probs_tensor, k=k)
    return [int(sorted_species_ids[i]) for i in top_indices.cpu().tolist()]


# rank every image's score vector and keep the IDs of its best species
top_k_proba = 9
# score column to rank by (the other score column in cent_df is "cos_probabilities")
COL_NAME = "euclidean_score"

# Series.apply forwards extra keyword arguments to the function for each element
cent_df["species_ids"] = cent_df[COL_NAME].apply(top_k_species, top_k=top_k_proba)
cent_df.head(5)

Unnamed: 0,image_name,cos_probabilities,euclidean_score,species_ids
0,CBN-Pla-A1-20190814.jpg,"[0.00013336474, 0.00013364569, 0.00013704262, ...","[0.00013831591, 0.00013568375, 0.0001403753, 0...","[1362331, 1394311, 1390899, 1390793, 1392407, ..."
1,CBN-Pla-D6-20190814.jpg,"[0.0001280635, 0.00012812452, 0.0001313897, 0....","[0.00013808624, 0.00013334032, 0.00014012698, ...","[1741880, 1395807, 1414270, 1397468, 1395870, ..."
2,CBN-PdlC-C5-20140901.jpg,"[0.0001356341, 0.0001306982, 0.00013256918, 0....","[0.00014182531, 0.00013441913, 0.00013885477, ...","[1741880, 1396869, 1392407, 1394311, 1393659, ..."
3,LISAH-BOU-0-37-20230512.jpg,"[0.00013228072, 0.00013178962, 0.00013464248, ...","[0.00014017329, 0.00013580783, 0.00014142484, ...","[1389810, 1721554, 1658529, 1373840, 1744168, ..."
4,CBN-Pla-E4-20130808.jpg,"[0.00012787317, 0.00013026498, 0.00012689614, ...","[0.00013487983, 0.00013341759, 0.00013364258, ...","[1390899, 1390910, 1390793, 1392407, 1362331, ..."


In [10]:
# inspect the full list of top-k species IDs selected for the first image
cent_df["species_ids"].iloc[0]

[1362331,
 1394311,
 1390899,
 1390793,
 1392407,
 1361914,
 1390910,
 1651363,
 1392323]

In [11]:
# keep only the image identifier and its predictions, exposing the
# identifier under the name "quadrat_id"
preds_df = cent_df[["image_name", "species_ids"]].rename(
    columns={"image_name": "quadrat_id"}
)
preds_df.head()

Unnamed: 0,quadrat_id,species_ids
0,CBN-Pla-A1-20190814.jpg,"[1362331, 1394311, 1390899, 1390793, 1392407, ..."
1,CBN-Pla-D6-20190814.jpg,"[1741880, 1395807, 1414270, 1397468, 1395870, ..."
2,CBN-PdlC-C5-20140901.jpg,"[1741880, 1396869, 1392407, 1394311, 1393659, ..."
3,LISAH-BOU-0-37-20230512.jpg,"[1389810, 1721554, 1658529, 1373840, 1744168, ..."
4,CBN-Pla-E4-20130808.jpg,"[1390899, 1390910, 1390793, 1392407, 1362331, ..."


In [12]:
def format_species_ids(species_ids: list) -> str:
    """Render species IDs as one bracketed, comma-separated string, e.g. ``[1, 2, 3]``."""
    joined = ", ".join(map(str, species_ids))
    return "[" + joined + "]"


def prepare_and_write_submission(
    pandas_df: pd.DataFrame, col: str = "image_name"
) -> pd.DataFrame:
    """Build the submission table from per-image predictions.

    Despite its name, this function does not write anything: it only returns
    the formatted frame, and the caller is responsible for saving it.

    Args:
        pandas_df: frame with an identifier column ``col`` and a
            ``species_ids`` column holding the predicted IDs of each row.
        col: name of the column that holds the image file name.

    Returns:
        A new DataFrame with the columns ``quadrat_id`` (the identifier with
        every ".jpg" removed) and ``species_ids`` (the IDs formatted by
        ``format_species_ids``), sorted by ``quadrat_id``. Row labels are the
        positions the rows had in ``pandas_df``.
    """
    output_columns = ["quadrat_id", "species_ids"]

    # an input without rows used to fail with a KeyError, because a frame built
    # from zero records has no "quadrat_id" column; return an empty table instead
    if len(pandas_df) == 0:
        return pd.DataFrame(columns=output_columns)

    # build both columns in one pass instead of iterating row by row
    submission_df = pd.DataFrame(
        {
            "quadrat_id": pandas_df[col].tolist(),
            "species_ids": [
                format_species_ids(ids) for ids in pandas_df["species_ids"]
            ],
        },
        columns=output_columns,
    )
    # remove .jpg from quadrat_id (plain-text replacement, not a regex)
    submission_df["quadrat_id"] = submission_df["quadrat_id"].str.replace(
        ".jpg", "", regex=False
    )
    # sort by quadrat_id
    return submission_df.sort_values(by=["quadrat_id"])


# build the formatted submission table; the identifier column was already
# renamed to "quadrat_id", so pass it explicitly instead of the default
final_df = prepare_and_write_submission(preds_df, col="quadrat_id")
final_df.head(10)

Unnamed: 0,quadrat_id,species_ids
2036,2024-CEV3-20240602,"[1450109, 1617142, 1392662, 1389859, 1390795, ..."
912,CBN-PdlC-A1-20130807,"[1392407, 1412857, 1529289, 1390793, 1362331, ..."
634,CBN-PdlC-A1-20130903,"[1392407, 1412857, 1390793, 1362331, 1391331, ..."
552,CBN-PdlC-A1-20140721,"[1529289, 1392407, 1412857, 1392323, 1362331, ..."
1477,CBN-PdlC-A1-20140811,"[1529289, 1392407, 1412857, 1362331, 1390910, ..."
1587,CBN-PdlC-A1-20140901,"[1392407, 1390793, 1362331, 1529289, 1412857, ..."
67,CBN-PdlC-A1-20150701,"[1392323, 1529289, 1392407, 1390910, 1743235, ..."
688,CBN-PdlC-A1-20150720,"[1412857, 1392407, 1529289, 1742053, 1357575, ..."
1629,CBN-PdlC-A1-20150831,"[1412857, 1390793, 1742053, 1742052, 1392407, ..."
189,CBN-PdlC-A1-20160705,"[1392407, 1412857, 1529289, 1392323, 1390904, ..."


In [13]:
# sanity-check the number of rows, the shape and the column names of the submission
len(final_df), final_df.shape, final_df.columns

(2105, (2105, 2), Index(['quadrat_id', 'species_ids'], dtype='object'))

In [16]:
import csv


def get_plantclef_dir() -> str:
    """Return the shared PlantCLEF project directory under the user's home, as a string."""
    user_home = Path(os.path.expanduser("~"))
    return str(user_home) + "/p-dsgt_clef2025-0/shared/plantclef"


def write_csv_to_pace(df, file_name: str, col: str = "quadrat_id") -> None:
    """Format the predictions and save them as a fully quoted CSV file on PACE.

    The frame is passed through ``prepare_and_write_submission`` and the result
    is written to ``<plantclef dir>/submissions/centroids/<file_name>``. The
    directory that will contain the file is created when it does not exist.

    Args:
        df: predictions with an identifier column ``col`` and ``species_ids``.
        file_name: name of the CSV file to create.
        col: name of the identifier column in ``df``.
    """
    formatted_df = prepare_and_write_submission(df, col)

    # assemble <plantclef dir>/submissions/centroids/<file_name>
    plantclef_dir = get_plantclef_dir()
    centroids_dir = f"{plantclef_dir}/submissions/centroids"
    output_path = f"{centroids_dir}/{file_name}"

    # make sure the folder holding the output file is there before writing
    parent_dir = os.path.dirname(output_path)
    os.makedirs(parent_dir, exist_ok=True)

    # every field is quoted; the row index is left out of the file
    formatted_df.to_csv(output_path, sep=",", index=False, quoting=csv.QUOTE_ALL)
    print(f"Submission file saved to: {output_path}")


# name the file after the score type and the top-k cut-off, then write it
# NOTE(review): "euclidean_scores" is hard-coded here while the ranked column
# is chosen through COL_NAME — keep the two in sync if COL_NAME changes
file_name = f"dsgt_centroid_euclidean_scores_topk{top_k_proba}.csv"
write_csv_to_pace(preds_df, file_name)

Submission file saved to: /storage/home/hcoda1/9/mgustineli3/p-dsgt_clef2025-0/shared/plantclef/submissions/centroids/dsgt_centroid_euclidean_scores_topk9.csv


In [17]:
# read the submission back from disk to check what was actually written;
# the path is built with the same helper used for writing, so the read and
# write locations cannot drift apart
submission_path = f"{get_plantclef_dir()}/submissions/centroids/{file_name}"
df = pd.read_csv(submission_path)
df.head(10)

Unnamed: 0,quadrat_id,species_ids
0,2024-CEV3-20240602,"[1450109, 1617142, 1392662, 1389859, 1390795, ..."
1,CBN-PdlC-A1-20130807,"[1392407, 1412857, 1529289, 1390793, 1362331, ..."
2,CBN-PdlC-A1-20130903,"[1392407, 1412857, 1390793, 1362331, 1391331, ..."
3,CBN-PdlC-A1-20140721,"[1529289, 1392407, 1412857, 1392323, 1362331, ..."
4,CBN-PdlC-A1-20140811,"[1529289, 1392407, 1412857, 1362331, 1390910, ..."
5,CBN-PdlC-A1-20140901,"[1392407, 1390793, 1362331, 1529289, 1412857, ..."
6,CBN-PdlC-A1-20150701,"[1392323, 1529289, 1392407, 1390910, 1743235, ..."
7,CBN-PdlC-A1-20150720,"[1412857, 1392407, 1529289, 1742053, 1357575, ..."
8,CBN-PdlC-A1-20150831,"[1412857, 1390793, 1742053, 1742052, 1392407, ..."
9,CBN-PdlC-A1-20160705,"[1392407, 1412857, 1529289, 1392323, 1390904, ..."
