# Non-Maximum Suppression Submission

Load the `detection/inference_outputs` dataframe, which holds the class probabilities produced from the detections, and build a submission file from it.

In [1]:
# Enable IPython's autoreload extension so edited modules are re-imported
# automatically before each cell executes (mode 2 = all modules).
%load_ext autoreload
%autoreload 2

In [2]:
from plantclef.spark import get_spark

# Obtain the Spark session from the project helper
# (cores=4 presumably limits local parallelism — confirm in plantclef.spark).
spark = get_spark(cores=4)
# Show the session object in the cell output
display(spark)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/04 22:50:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/04 22:50:13 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [3]:
import os
from pathlib import Path

# Resolve the current user's home directory; the data path below is built from it
root = Path(os.path.expanduser("~"))
# Print the current date/time so the run is timestamped in the output
! date

Sun May  4 10:50:16 PM EDT 2025


### Grounding DINO NMS probabilities

In [4]:
# Path and dataset names
data_path = f"{root}/p-dsgt_clef2025-0/shared/plantclef/data"
inference_path = f"{data_path}/detection/inference_outputs"
# Load the detection inference outputs (parquet) as a Spark DataFrame
df = spark.read.parquet(inference_path)
# Inspect the schema and the first two records (vertical layout, values cut at 80 chars)
df.printSchema()
df.show(n=2, vertical=True, truncate=80)

root
 |-- image_name: string (nullable = true)
 |-- probabilities: array (nullable = true)
 |    |-- element: double (containsNull = true)



                                                                                

-RECORD 0-----------------------------------------------------------------------------------------
 image_name    | CBN-Pla-C6-20200814.jpg                                                          
 probabilities | [2.406519342912361E-5, 1.1829448339994997E-5, 4.4492684537544847E-5, 7.187668... 
-RECORD 1-----------------------------------------------------------------------------------------
 image_name    | GUARDEN-CBNMed-19-5-15-44-20240429.jpg                                           
 probabilities | [3.263013422838412E-5, 3.4224908631586004E-6, 4.886940587311983E-5, 1.0735976... 
only showing top 2 rows



In [5]:
# Total number of rows in the inference output
df.count()

17053

In [6]:
import numpy as np
from plantclef.config import get_class_mappings_file


def load_class_mapping(class_mapping_file=None):
    """Load the class-index -> class-name mapping from a text file.

    The file is read line by line: the zero-based line number becomes the
    key and the whitespace-stripped line content becomes the value.

    Args:
        class_mapping_file: Path of the mapping file. When ``None``, the path
            returned by ``get_class_mappings_file()`` is used.

    Returns:
        dict[int, str]: line index mapped to the stripped line text.
    """
    if class_mapping_file is None:
        # The bare default used to be handed straight to open(), which fails
        # with an opaque TypeError; fall back to the project's mapping file.
        class_mapping_file = get_class_mappings_file()
    with open(class_mapping_file) as f:
        return {i: line.strip() for i, line in enumerate(f)}


# load class mappings: position in the mapping file -> stripped line text
# (used below to translate class indices into species ids)
class_mapping_file = get_class_mappings_file()
cid_to_spid = load_class_mapping(class_mapping_file)


# get top-N predictions for each row
def get_top_n_predictions(probabilities: list, n=5) -> list[str]:
    """Return the species ids of the ``n`` most probable classes.

    Args:
        probabilities: Per-class probabilities; position ``i`` is looked up
            as key ``i`` in the module-level ``cid_to_spid`` mapping.
        n: Number of predictions to return. ``n <= 0`` yields an empty list;
            ``n`` larger than the number of classes returns every class.

    Returns:
        Mapped ids ordered from most to least probable.
    """
    if n <= 0:
        # arr[-0:] is the *whole* array, so without this guard n=0 would
        # return every class instead of none.
        return []
    proba_arr = np.asarray(probabilities)
    # ascending argsort -> keep the last n indices -> reverse to descending
    top_n_indices = proba_arr.argsort()[-n:][::-1]
    return [cid_to_spid[int(i)] for i in top_n_indices]

In [7]:
# collect the Spark rows into a pandas DataFrame
pandas_df = df.toPandas()

# number of species kept per row
top_k = 1
# extra keyword arguments of Series.apply are forwarded to the function
pandas_df["species_ids"] = pandas_df["probabilities"].apply(
    get_top_n_predictions, n=top_k
)

                                                                                

In [8]:
# Preview the first ten rows, including the new species_ids column
pandas_df.head(10)

Unnamed: 0,image_name,probabilities,species_ids
0,CBN-Pla-C6-20200814.jpg,"[2.406519342912361e-05, 1.1829448339994997e-05...",[1390793]
1,GUARDEN-CBNMed-19-5-15-44-20240429.jpg,"[3.263013422838412e-05, 3.4224908631586004e-06...",[1396063]
2,CBN-Pla-C5-20140902.jpg,"[4.0722519770497456e-05, 2.578282328613568e-05...",[1394311]
3,CBN-Pla-C2-20180906.jpg,"[3.5486275464791106e-06, 8.165693543560337e-06...",[1361281]
4,CBN-PdlC-D6-20150701.jpg,"[1.8766439097817056e-05, 1.579140189278405e-05...",[1393679]
5,GUARDEN-CBNMed-14-4-9-49-20240429.jpg,"[0.0011196996783837676, 0.00026946881553158164...",[1741834]
6,CBN-PdlC-F2-20190909.jpg,"[2.3464181140298024e-06, 7.250819180626422e-06...",[1742052]
7,CBN-PdlC-E1-20140901.jpg,"[5.5319997045444325e-05, 1.1587068001972511e-0...",[1394311]
8,CBN-Pla-A4-20190814.jpg,"[8.435828021902125e-06, 1.4364024536916986e-05...",[1414270]
9,CBN-PdlC-F5-20140630.jpg,"[2.0208362911944278e-06, 4.054568307765294e-06...",[1664563]


### Group predictions by `image_name`

In [None]:
import pandas as pd
from itertools import chain
from collections import Counter


def aggregate_predictions(group, k: int = 10):
    """Rank the species ids of one group by how often they occur.

    Args:
        group: Rows belonging to a single group; its ``species_ids`` column
            holds one list of ids per row.
        k: Maximum number of ids to keep.

    Returns:
        pd.Series with a single ``species_ids`` entry: up to ``k`` ids,
        most frequent first.
    """
    # tally every id that appears in any row of the group
    tally = Counter(chain.from_iterable(group["species_ids"]))
    ranked = [species for species, _count in tally.most_common(k)]
    return pd.Series({"species_ids": ranked})


# group by image_name and aggregate predictions
TOP_K = 10
aggregated_df = (
    # Select the prediction column explicitly so apply() only receives
    # `species_ids`; applying on the full frame also passes the grouping
    # column, which pandas deprecates (DataFrameGroupBy.apply warning).
    pandas_df.groupby("image_name")[["species_ids"]]
    .apply(lambda group: aggregate_predictions(group, k=TOP_K))
    .reset_index()
    # the submission format names the image column `quadrat_id`
    .rename(columns={"image_name": "quadrat_id"})
)

aggregated_df.head()

  .apply(lambda group: aggregate_predictions(group, k=TOP_K))


Unnamed: 0,quadrat_id,species_ids
0,2024-CEV3-20240602.jpg,"[1360187, 1395111, 1450109, 1743159, 1398690, ..."
1,CBN-PdlC-A1-20130807.jpg,"[1392608, 1742100, 1392407, 1412857, 1362331, ..."
2,CBN-PdlC-A1-20130903.jpg,"[1392608, 1397468, 1412857, 1742052, 1394311, ..."
3,CBN-PdlC-A1-20140721.jpg,"[1394911, 1412857, 1361389, 1397449, 1654153, ..."
4,CBN-PdlC-A1-20140811.jpg,"[1392608, 1412857, 1394911, 1395807]"


: 

In [None]:
# read test data
test_path = f"{data_path}/parquet/test_2025"
test_df = spark.read.parquet(test_path).toPandas()
# rename to the key used by aggregated_df so both frames can be joined on quadrat_id
test_df = test_df.rename(columns={"image_name": "quadrat_id"})
test_df.head(3)



In [None]:
# outer join with aggregated_df so test quadrats without predictions still get a row
merged_df = pd.merge(
    test_df[["quadrat_id"]],
    aggregated_df,
    on="quadrat_id",
    how="outer",
)
# Build the final frame with assign(): it returns a new DataFrame, so nothing
# is written into the `merged_df[[...]]` selection (which can raise
# SettingWithCopyWarning) and the cell gives the same result when re-run.
final_df = merged_df[["quadrat_id", "species_ids"]].assign(
    # rows without a match carry NaN after the join -> use empty lists instead
    species_ids=lambda frame: frame["species_ids"].apply(
        lambda x: x if isinstance(x, list) else []
    ),
    # remove the .jpg extension only where it is the suffix of quadrat_id
    quadrat_id=lambda frame: frame["quadrat_id"].str.replace(
        r"\.jpg$", "", regex=True
    ),
)
final_df.head(5)

In [None]:
# Compare row counts: aggregated predictions vs. the final joined frame
len(aggregated_df), len(final_df)

In [None]:
import csv
from pathlib import Path


def format_species_ids(species_ids: list) -> str:
    """Render species IDs as one bracketed, comma-separated string.

    ``None`` entries are skipped, e.g. ``[1, None, 2]`` -> ``"[1, 2]"``.
    """
    kept = [str(species) for species in species_ids if species is not None]
    return "[" + ", ".join(kept) + "]"


def prepare_and_write_submission(pandas_df: pd.DataFrame) -> pd.DataFrame:
    """Build the submission table from a frame of per-quadrat predictions.

    Despite its name this function performs no I/O: it only formats the
    data and returns a new frame.

    Args:
        pandas_df: Frame with a ``quadrat_id`` column and a ``species_ids``
            column holding one list of ids per row.

    Returns:
        New DataFrame with the columns ``quadrat_id`` and ``species_ids``,
        where ``species_ids`` is the string built by ``format_species_ids``.
    """
    records = [
        {"quadrat_id": quadrat_id, "species_ids": format_species_ids(species_ids)}
        for quadrat_id, species_ids in zip(
            pandas_df["quadrat_id"], pandas_df["species_ids"]
        )
    ]
    # explicit columns keep the header in place even when there are no rows
    return pd.DataFrame(records, columns=["quadrat_id", "species_ids"])


def get_plantclef_dir() -> str:
    """Return the shared PlantCLEF project directory under the user's home.

    The returned string ends with a trailing slash.
    """
    return f"{Path(os.path.expanduser('~'))}/p-dsgt_clef2025-0/shared/plantclef/"


def write_csv_to_pace(df, file_name: str, testset_name: str):
    """Format ``df`` as a submission and save it as a CSV file on PACE.

    The file is written to
    ``<plantclef dir>/submissions/detection/<testset_name>/<file_name>``;
    missing parent directories are created and every field is quoted.
    """
    formatted = prepare_and_write_submission(df)

    # assemble the destination path step by step
    base_dir = get_plantclef_dir()
    target_dir = f"{base_dir}/submissions/detection/{testset_name}"
    output_path = f"{target_dir}/{file_name}"

    # make sure the folder that will hold the file exists
    parent_dir = os.path.dirname(output_path)
    os.makedirs(parent_dir, exist_ok=True)

    formatted.to_csv(output_path, sep=",", index=False, quoting=csv.QUOTE_ALL)
    print(f"Submission file saved to: {output_path}")


def main(df_final: pd.DataFrame, file_name: str, testset_name: str = "test_2025"):
    """Write ``df_final`` as a submission CSV by delegating to ``write_csv_to_pace``.

    Args:
        df_final: Frame passed on unchanged to ``write_csv_to_pace``.
        file_name: Name of the CSV file to create.
        testset_name: Sub-folder name for the test set; defaults to ``"test_2025"``.
    """
    # write CSV file to PACE
    write_csv_to_pace(df_final, file_name, testset_name)

In [None]:
# The file name records the per-image aggregation setting (TOP_K)
file_name = f"groundingdino_nms_topk{TOP_K}_dsgt.csv"
# Format final_df and write the submission CSV for the test_2025 set
main(final_df, file_name, testset_name="test_2025")