# Baseline Classification

Identifying the top 20 probabilites over all test data. Submit them as a baseline classification strategy.

In [3]:
%load_ext autoreload
%autoreload 2

In [1]:
from plantclef.spark import get_spark

spark = get_spark()
display(spark)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/16 23:30:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/04/16 23:30:46 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
25/04/16 23:30:47 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
import os
from pathlib import Path

# Set the root directory to your home directory
root = Path(os.path.expanduser("~"))
! date

Wed Apr 16 11:30:49 PM EDT 2025


In [4]:
# path and dataset names
data_path = f"{root}/p-dsgt_clef2025-0/shared/plantclef/data/embeddings/test_2025"

# define the path to the train and test parquet files
test_path = f"{data_path}/test_2025_embed_logits"

# read the parquet files into a spark DataFrame
test_df = spark.read.parquet(test_path)

# show the data
test_df.printSchema()
test_df.show(n=5)

                                                                                

root
 |-- image_name: string (nullable = true)
 |-- output: struct (nullable = true)
 |    |-- cls_token: array (nullable = true)
 |    |    |-- element: float (containsNull = true)
 |    |-- logits: array (nullable = true)
 |    |    |-- element: float (containsNull = true)
 |-- sample_id: integer (nullable = true)



                                                                                

+--------------------+--------------------+---------+
|          image_name|              output|sample_id|
+--------------------+--------------------+---------+
|CBN-Pla-A1-201908...|{[0.47354543, 1.5...|        0|
|CBN-Pla-D6-201908...|{[-0.39621377, 1....|        0|
|CBN-PdlC-C5-20140...|{[-0.5331654, 0.2...|        0|
|LISAH-BOU-0-37-20...|{[1.2480925, 0.47...|        0|
|CBN-Pla-E4-201308...|{[0.7065191, 1.70...|        0|
+--------------------+--------------------+---------+
only showing top 5 rows



In [12]:
import torch
import numpy as np

# average the logits over all images
logits = test_df.select("output.logits").rdd.flatMap(lambda x: x).collect()
logits = [list(map(float, x)) for x in logits]
probabilities = torch.softmax(torch.tensor(logits), dim=1).numpy()
probabilities = np.mean(logits, axis=0)
len(probabilities), probabilities.shape, probabilities[:5], probabilities[-5:]

                                                                                

(7806,
 (7806,),
 array([-0.37915118,  0.0137584 , -0.2863231 , -0.76206058, -0.2323433 ]),
 array([-0.21083864, -0.46454166, -0.31728752,  0.14438025,  0.36815306]))

In [13]:
import torch
from plantclef.config import get_class_mappings_file


def load_class_mapping(class_mapping_file):
    with open(class_mapping_file) as f:
        class_index_to_class_name = {i: line.strip() for i, line in enumerate(f)}
    return class_index_to_class_name


class_mapping_file = get_class_mappings_file()
cid_to_spid = load_class_mapping(class_mapping_file)


# map probabilities to species IDs
topk_proba = 20
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
probabilities = torch.tensor(probabilities).to(device)
top_probs, top_indices = torch.topk(probabilities, k=topk_proba)
top_probs = top_probs.cpu().detach().numpy()
top_indices = top_indices.cpu().detach().numpy()
result = [
    {cid_to_spid[int(index)]: float(prob)}
    for index, prob in zip(top_indices, top_probs)
]
result

[{'1361281': 4.56362702188186},
 {'1395807': 3.46720762469777},
 {'1741903': 3.4471948069093346},
 {'1392323': 3.316042777634007},
 {'1741880': 3.3124565287999754},
 {'1397535': 3.2924471835130182},
 {'1396717': 3.1959532355964893},
 {'1722501': 3.18848012438013},
 {'1390764': 3.1745123443085337},
 {'1397070': 3.1173199642936296},
 {'1392558': 3.0934792359304257},
 {'1418211': 3.0826037090209772},
 {'1388766': 2.9016784266947},
 {'1392407': 2.899715812093713},
 {'1394630': 2.829894659085965},
 {'1398779': 2.8217325143533762},
 {'1393824': 2.812430884726257},
 {'1358733': 2.793219553238564},
 {'1361275': 2.781753786895734},
 {'1394999': 2.7050276072248995}]

In [18]:
top_species_ids = [list(x.keys())[0] for x in result]
top_species_ids[:5]

['1361281', '1395807', '1741903', '1392323', '1741880']

In [17]:
from pyspark.sql import functions as F

# select image names from test DataFrame
image_names_df = test_df.select("image_name").orderBy(F.col("image_name"))
image_names = [row["image_name"] for row in image_names_df.collect()]
image_names[:10]

['2024-CEV3-20240602.jpg',
 'CBN-PdlC-A1-20130807.jpg',
 'CBN-PdlC-A1-20130903.jpg',
 'CBN-PdlC-A1-20140721.jpg',
 'CBN-PdlC-A1-20140811.jpg',
 'CBN-PdlC-A1-20140901.jpg',
 'CBN-PdlC-A1-20150701.jpg',
 'CBN-PdlC-A1-20150720.jpg',
 'CBN-PdlC-A1-20150831.jpg',
 'CBN-PdlC-A1-20160705.jpg']

In [19]:
import pandas as pd

# create pandas DataFrame with image names
pandas_dict = {
    "image_name": image_names,
    "species_ids": [top_species_ids] * len(image_names),
}
pandas_df = pd.DataFrame(pandas_dict, columns=["image_name", "species_ids"])
pandas_df.head(10)

Unnamed: 0,image_name,species_ids
0,2024-CEV3-20240602.jpg,"[1361281, 1395807, 1741903, 1392323, 1741880, ..."
1,CBN-PdlC-A1-20130807.jpg,"[1361281, 1395807, 1741903, 1392323, 1741880, ..."
2,CBN-PdlC-A1-20130903.jpg,"[1361281, 1395807, 1741903, 1392323, 1741880, ..."
3,CBN-PdlC-A1-20140721.jpg,"[1361281, 1395807, 1741903, 1392323, 1741880, ..."
4,CBN-PdlC-A1-20140811.jpg,"[1361281, 1395807, 1741903, 1392323, 1741880, ..."
5,CBN-PdlC-A1-20140901.jpg,"[1361281, 1395807, 1741903, 1392323, 1741880, ..."
6,CBN-PdlC-A1-20150701.jpg,"[1361281, 1395807, 1741903, 1392323, 1741880, ..."
7,CBN-PdlC-A1-20150720.jpg,"[1361281, 1395807, 1741903, 1392323, 1741880, ..."
8,CBN-PdlC-A1-20150831.jpg,"[1361281, 1395807, 1741903, 1392323, 1741880, ..."
9,CBN-PdlC-A1-20160705.jpg,"[1361281, 1395807, 1741903, 1392323, 1741880, ..."


In [20]:
def format_species_ids(species_ids: list) -> str:
    """Formats the species IDs in single square brackets, separated by commas."""
    formatted_ids = ", ".join(str(id) for id in species_ids)
    return f"[{formatted_ids}]"


def prepare_and_write_submission(
    pandas_df: pd.DataFrame, col: str = "image_name"
) -> pd.DataFrame:
    """Formats the Pandas DataFrame, and writes to PACE."""
    records = []
    for _, row in pandas_df.iterrows():
        logits = row["species_ids"]
        formatted_species = format_species_ids(logits)
        records.append({"quadrat_id": row[col], "species_ids": formatted_species})

    pandas_df = pd.DataFrame(records)
    # remove .jpg from quadrat_id in final_df
    pandas_df["quadrat_id"] = pandas_df["quadrat_id"].str.replace(
        ".jpg", "", regex=False
    )

    return pandas_df


final_df = prepare_and_write_submission(pandas_df)
final_df.head(10)

Unnamed: 0,quadrat_id,species_ids
0,2024-CEV3-20240602,"[1361281, 1395807, 1741903, 1392323, 1741880, ..."
1,CBN-PdlC-A1-20130807,"[1361281, 1395807, 1741903, 1392323, 1741880, ..."
2,CBN-PdlC-A1-20130903,"[1361281, 1395807, 1741903, 1392323, 1741880, ..."
3,CBN-PdlC-A1-20140721,"[1361281, 1395807, 1741903, 1392323, 1741880, ..."
4,CBN-PdlC-A1-20140811,"[1361281, 1395807, 1741903, 1392323, 1741880, ..."
5,CBN-PdlC-A1-20140901,"[1361281, 1395807, 1741903, 1392323, 1741880, ..."
6,CBN-PdlC-A1-20150701,"[1361281, 1395807, 1741903, 1392323, 1741880, ..."
7,CBN-PdlC-A1-20150720,"[1361281, 1395807, 1741903, 1392323, 1741880, ..."
8,CBN-PdlC-A1-20150831,"[1361281, 1395807, 1741903, 1392323, 1741880, ..."
9,CBN-PdlC-A1-20160705,"[1361281, 1395807, 1741903, 1392323, 1741880, ..."


In [21]:
len(final_df), final_df.shape, final_df.columns

(2105, (2105, 2), Index(['quadrat_id', 'species_ids'], dtype='object'))

In [23]:
import csv


def get_plantclef_dir() -> str:
    home_dir = Path(os.path.expanduser("~"))
    return f"{home_dir}/p-dsgt_clef2025-0/shared/plantclef"


def write_csv_to_pace(df, file_name: str):
    """Writes the Pandas DataFrame to a CSV file on PACE."""

    # prepare and write the submission
    submission_df = prepare_and_write_submission(df)
    project_dir = get_plantclef_dir()
    submission_path = f"{project_dir}/submissions/baseline_classification"
    output_path = f"{submission_path}/{file_name}"
    # ensure directory exists before saving
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # write to CSV
    submission_df.to_csv(output_path, sep=",", index=False, quoting=csv.QUOTE_ALL)
    print(f"Submission file saved to: {output_path}")


file_name = f"dsgt_baseline_topk{topk_proba}.csv"
write_csv_to_pace(pandas_df, file_name)

Submission file saved to: /storage/home/hcoda1/9/mgustineli3/p-dsgt_clef2025-0/shared/plantclef/submissions/baseline_classification/dsgt_baseline_topk20.csv


In [33]:
submission_name = "dsgt_run_topk_20_species_grid_4x4.csv"
path = "~/p-dsgt_clef2025-0/shared/plantclef/submissions/test_2025/topk_20_grid_4x4"
df = pd.read_csv(f"{path}/{submission_name}")
df.head(10)

Unnamed: 0,quadrat_id,species_ids
0,2024-CEV3-20240602,"[1362443, 1400181, 1392662, 1738679, 1360187, ..."
1,CBN-PdlC-A1-20130807,"[1392407, 1392608, 1392611, 1395807, 1741624, ..."
2,CBN-PdlC-A1-20130903,"[1395807, 1742052, 1362271, 1412857, 1397535, ..."
3,CBN-PdlC-A1-20140721,"[1412857, 1395807, 1396144, 1397535, 1392608, ..."
4,CBN-PdlC-A1-20140811,"[1395807, 1412857, 1392608, 1519650, 1742052, ..."
5,CBN-PdlC-A1-20140901,"[1361281, 1392608, 1742052, 1412857, 1391331, ..."
6,CBN-PdlC-A1-20150701,"[1392608, 1392535, 1412857, 1392407, 1394911, ..."
7,CBN-PdlC-A1-20150720,"[1392608, 1412857, 1361281, 1394554, 1395807, ..."
8,CBN-PdlC-A1-20150831,"[1742052, 1412857, 1392608, 1519650, 1392611, ..."
9,CBN-PdlC-A1-20160705,"[1412857, 1361281, 1394911, 1392608, 1392611, ..."


In [34]:
# convert species_ids to list of integers
def convert_species_ids_to_list(species_ids: str) -> list:
    # remove brackets and split by comma
    species_ids = species_ids.strip("[]").split(", ")
    # convert to list of integers
    return [int(id) for id in species_ids]


df["species_ids"] = df["species_ids"].apply(convert_species_ids_to_list)

In [35]:
# get average for number of species IDs
avg_species_ids = df["species_ids"].apply(len).mean()
avg_species_ids

10.319239904988123