# Centroid predictions

Classify test data using centroid probabilities.
We're using probabilities for the entire test image.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from plantclef.spark import get_spark

spark = get_spark(cores=4)
display(spark)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/20 22:07:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/04/20 22:07:37 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
25/04/20 22:07:37 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/04/20 22:07:37 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [3]:
import os
from pathlib import Path

# Get list of stored filed in cloud bucket
root = Path(os.path.expanduser("~"))
! date

Sun Apr 20 10:07:39 PM EDT 2025


### Faiss centroid probabilities 

In [4]:
import numpy as np
import pandas as pd
from plantclef.config import get_class_mappings_file

# path and dataset names
data_path = f"{root}/p-dsgt_clef2025-0/shared/plantclef/data"

# read the parquet files into a spark DataFrame
centroid_path = f"{data_path}/prior/test_2025_tile_prior_probabilities"
metadata_path = f"{data_path}/species_metadata.csv"

# read data
cent_df = pd.read_parquet(centroid_path)
species_meta_df = pd.read_csv(metadata_path)
display(cent_df.head(5))
display(species_meta_df.head(5))

# load species_ids in the correct order
class_mappings_file = get_class_mappings_file()
with open(class_mappings_file) as f:
    sorted_species_ids = np.array([int(line.strip()) for line in f])

Unnamed: 0,image_name,prior_probabilities
0,2024-CEV3-20240602.jpg,"[0.00013247246, 0.0001367882, 0.00013204469, 0..."
1,2024-CEV3-20240602.jpg,"[0.00013578957, 0.0001351673, 0.0001316694, 0...."
2,2024-CEV3-20240602.jpg,"[0.00013204673, 0.00013252249, 0.00013235239, ..."
3,2024-CEV3-20240602.jpg,"[0.00013307533, 0.00013671836, 0.00013264263, ..."
4,2024-CEV3-20240602.jpg,"[0.00013457047, 0.00013656596, 0.00013702415, ..."


Unnamed: 0,species_id,species,genus,family
0,1355868,Lactuca virosa L.,Lactuca,Asteraceae
1,1355869,Crepis capillaris (L.) Wallr.,Crepis,Asteraceae
2,1355870,Crepis foetida L.,Crepis,Asteraceae
3,1355871,Hypochaeris glabra L.,Hypochaeris,Asteraceae
4,1355872,Hypochaeris radicata L.,Hypochaeris,Asteraceae


In [5]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def top_k_species(probabilities, top_k: int = 9):
    probs_tensor = torch.tensor(probabilities).to(device)
    top_probs, top_indices = torch.topk(probs_tensor, k=top_k)
    top_probs = top_probs.cpu().numpy()
    top_indices = top_indices.cpu().numpy()
    return [int(sorted_species_ids[i]) for i in top_indices]


# apply top-k per row
top_k_proba = 10
COL_NAME = "prior_probabilities"  # "probabilities"

cent_df["species_ids"] = cent_df[COL_NAME].apply(
    lambda probs: top_k_species(probs, top_k=top_k_proba)
)
cent_df.head(5)

Unnamed: 0,image_name,prior_probabilities,species_ids
0,2024-CEV3-20240602.jpg,"[0.00013247246, 0.0001367882, 0.00013204469, 0...","[1395100, 1360187, 1664494, 1360208, 1397879, ..."
1,2024-CEV3-20240602.jpg,"[0.00013578957, 0.0001351673, 0.0001316694, 0....","[1360187, 1395100, 1360208, 1664494, 1360223, ..."
2,2024-CEV3-20240602.jpg,"[0.00013204673, 0.00013252249, 0.00013235239, ...","[1450109, 1390795, 1360763, 1360203, 1360187, ..."
3,2024-CEV3-20240602.jpg,"[0.00013307533, 0.00013671836, 0.00013264263, ...","[1395135, 1395063, 1360208, 1395100, 1397879, ..."
4,2024-CEV3-20240602.jpg,"[0.00013457047, 0.00013656596, 0.00013702415, ...","[1393933, 1360763, 1400181, 1361604, 1361587, ..."


In [6]:
display(cent_df["species_ids"].iloc[0])
display(cent_df["species_ids"].iloc[1])

[1395100,
 1360187,
 1664494,
 1360208,
 1397879,
 1395063,
 1395101,
 1395047,
 1360193,
 1395089]

[1360187,
 1395100,
 1360208,
 1664494,
 1360223,
 1395101,
 1360193,
 1397879,
 1362363,
 1360203]

### top-K species in grid of tiles

In [7]:
def aggregate_top_k_species(df: pd.DataFrame, top_k: int = 10) -> pd.DataFrame:
    def process_group(group: pd.DataFrame):
        results = []
        for _, row in group.iterrows():
            species_ids = row["species_ids"]
            prior_probs = row["prior_probabilities"]
            result = [
                {
                    species_id: prior
                    for species_id, prior in zip(species_ids, prior_probs)
                }
            ]
            results.append(result)

        flattened_results = [item for tile in results for item in tile[:top_k]]
        sorted_preds = sorted(flattened_results, key=lambda x: -list(x.values())[0])

        return pd.Series({"species_ids": sorted_preds})

    grouped = df.groupby("image_name")
    return grouped.apply(
        lambda g: process_group(g.drop(columns="image_name"))
    ).reset_index()

In [8]:
topk_df = aggregate_top_k_species(cent_df, top_k=top_k_proba)
display(topk_df.head(5))

  return grouped.apply(


Unnamed: 0,image_name,species_ids
0,2024-CEV3-20240602.jpg,"[{1392662: 0.00013580674, 1362443: 0.000128382..."
1,CBN-PdlC-A1-20130807.jpg,"[{1361273: 0.00014881953, 1392407: 0.000137022..."
2,CBN-PdlC-A1-20130903.jpg,"[{1741880: 0.00014772902, 1396869: 0.000138318..."
3,CBN-PdlC-A1-20140721.jpg,"[{1392565: 0.00014735751, 1391331: 0.000142343..."
4,CBN-PdlC-A1-20140811.jpg,"[{1392407: 0.00014176566, 1742053: 0.000138991..."


In [9]:
def extract_top_k_species(logits: list, top_k: int = 9) -> list:
    """Extracts the top k species from the logits list."""
    top_logits = [list(item.keys())[0] for item in logits[:top_k]]
    set_logits = sorted(set(top_logits), key=top_logits.index)
    return set_logits


topk_df["species_ids"] = topk_df["species_ids"].apply(
    lambda x: extract_top_k_species(x, top_k=top_k_proba)
)
topk_df.head(5)

Unnamed: 0,image_name,species_ids
0,2024-CEV3-20240602.jpg,"[1392662, 1360187, 1363328, 1738679, 1393933, ..."
1,CBN-PdlC-A1-20130807.jpg,"[1361273, 1392407, 1392565, 1742052, 1743605, ..."
2,CBN-PdlC-A1-20130903.jpg,"[1741880, 1412857, 1392407, 1742052, 1395974, ..."
3,CBN-PdlC-A1-20140721.jpg,"[1392565, 1412857, 1743605, 1392407, 1741880]"
4,CBN-PdlC-A1-20140811.jpg,"[1392407, 1412857, 1395974, 1395807, 1667408]"


In [10]:
# preds_df = cent_df[["image_name", "species_ids"]]
preds_df = topk_df[["image_name", "species_ids"]]
# rename image_name to quadrat_id
preds_df = preds_df.rename(columns={"image_name": "quadrat_id"})
preds_df.head()

Unnamed: 0,quadrat_id,species_ids
0,2024-CEV3-20240602.jpg,"[1392662, 1360187, 1363328, 1738679, 1393933, ..."
1,CBN-PdlC-A1-20130807.jpg,"[1361273, 1392407, 1392565, 1742052, 1743605, ..."
2,CBN-PdlC-A1-20130903.jpg,"[1741880, 1412857, 1392407, 1742052, 1395974, ..."
3,CBN-PdlC-A1-20140721.jpg,"[1392565, 1412857, 1743605, 1392407, 1741880]"
4,CBN-PdlC-A1-20140811.jpg,"[1392407, 1412857, 1395974, 1395807, 1667408]"


In [11]:
def format_species_ids(species_ids: list) -> str:
    """Formats the species IDs in single square brackets, separated by commas."""
    formatted_ids = ", ".join(str(id) for id in species_ids)
    return f"[{formatted_ids}]"


def prepare_and_write_submission(
    pandas_df: pd.DataFrame,
    col: str = "image_name",
) -> pd.DataFrame:
    """Formats the Pandas DataFrame, and writes to PACE."""
    records = []
    for _, row in pandas_df.iterrows():
        logits = row["species_ids"]
        formatted_species = format_species_ids(logits)
        records.append({"quadrat_id": row[col], "species_ids": formatted_species})

    pandas_df = pd.DataFrame(records)
    # remove .jpg from quadrat_id in final_df
    pandas_df["quadrat_id"] = pandas_df["quadrat_id"].str.replace(
        ".jpg", "", regex=False
    )
    # sort by quadrat_id
    pandas_df = pandas_df.sort_values(by=["quadrat_id"])

    return pandas_df


final_df = prepare_and_write_submission(preds_df, col="quadrat_id")
final_df.head(10)

Unnamed: 0,quadrat_id,species_ids
0,2024-CEV3-20240602,"[1392662, 1360187, 1363328, 1738679, 1393933, ..."
1,CBN-PdlC-A1-20130807,"[1361273, 1392407, 1392565, 1742052, 1743605, ..."
2,CBN-PdlC-A1-20130903,"[1741880, 1412857, 1392407, 1742052, 1395974, ..."
3,CBN-PdlC-A1-20140721,"[1392565, 1412857, 1743605, 1392407, 1741880]"
4,CBN-PdlC-A1-20140811,"[1392407, 1412857, 1395974, 1395807, 1667408]"
5,CBN-PdlC-A1-20140901,"[1395974, 1392407, 1742052, 1394671, 1390793, ..."
6,CBN-PdlC-A1-20150701,"[1392407, 1391207, 1392565, 1741880, 1412857, ..."
7,CBN-PdlC-A1-20150720,"[1549015, 1412857, 1396869, 1395807, 1391499, ..."
8,CBN-PdlC-A1-20150831,"[1412857, 1742053, 1742052, 1396294, 1395807, ..."
9,CBN-PdlC-A1-20160705,"[1392407, 1412857, 1391331, 1397468, 1361284, ..."


In [12]:
len(final_df), final_df.shape, final_df.columns

(2105, (2105, 2), Index(['quadrat_id', 'species_ids'], dtype='object'))

In [13]:
import csv


def get_plantclef_dir() -> str:
    home_dir = Path(os.path.expanduser("~"))
    return f"{home_dir}/p-dsgt_clef2025-0/shared/plantclef"


def write_csv_to_pace(df, file_name: str, col: str = "quadrat_id") -> None:
    """Writes the Pandas DataFrame to a CSV file on PACE."""

    # prepare and write the submission
    submission_df = prepare_and_write_submission(df, col)
    project_dir = get_plantclef_dir()
    submission_path = f"{project_dir}/submissions/centroids"
    output_path = f"{submission_path}/{file_name}"
    # ensure directory exists before saving
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # write to CSV
    submission_df.to_csv(output_path, sep=",", index=False, quoting=csv.QUOTE_ALL)
    print(f"Submission file saved to: {output_path}")


file_name = f"dsgt_tile_grid_4x4_euclidean_scores_topk{top_k_proba}.csv"
write_csv_to_pace(preds_df, file_name)

Submission file saved to: /storage/home/hcoda1/9/mgustineli3/p-dsgt_clef2025-0/shared/plantclef/submissions/centroids/dsgt_tile_grid_4x4_euclidean_scores_topk10.csv


In [14]:
submission_path = (
    f"~/p-dsgt_clef2025-0/shared/plantclef/submissions/centroids/{file_name}"
)
df = pd.read_csv(submission_path)
df.head(10)

Unnamed: 0,quadrat_id,species_ids
0,2024-CEV3-20240602,"[1392662, 1360187, 1363328, 1738679, 1393933, ..."
1,CBN-PdlC-A1-20130807,"[1361273, 1392407, 1392565, 1742052, 1743605, ..."
2,CBN-PdlC-A1-20130903,"[1741880, 1412857, 1392407, 1742052, 1395974, ..."
3,CBN-PdlC-A1-20140721,"[1392565, 1412857, 1743605, 1392407, 1741880]"
4,CBN-PdlC-A1-20140811,"[1392407, 1412857, 1395974, 1395807, 1667408]"
5,CBN-PdlC-A1-20140901,"[1395974, 1392407, 1742052, 1394671, 1390793, ..."
6,CBN-PdlC-A1-20150701,"[1392407, 1391207, 1392565, 1741880, 1412857, ..."
7,CBN-PdlC-A1-20150720,"[1549015, 1412857, 1396869, 1395807, 1391499, ..."
8,CBN-PdlC-A1-20150831,"[1412857, 1742053, 1742052, 1396294, 1395807, ..."
9,CBN-PdlC-A1-20160705,"[1392407, 1412857, 1391331, 1397468, 1361284, ..."


In [15]:
df.shape, df.columns

((2105, 2), Index(['quadrat_id', 'species_ids'], dtype='object'))