# Baseline Classification

Identifying the top-K probabilites over each test data. Submit them as a baseline classification strategy for the ViT.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from plantclef.spark import get_spark

spark = get_spark()
display(spark)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/19 22:20:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/04/19 22:20:53 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
25/04/19 22:20:54 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/04/19 22:20:54 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/04/19 22:20:54 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [3]:
import os
from pathlib import Path

# Set the root directory to your home directory
root = Path(os.path.expanduser("~"))
! date

Sat Apr 19 10:20:56 PM EDT 2025


In [4]:
# path and dataset names
data_path = f"{root}/p-dsgt_clef2025-0/shared/plantclef/data/embeddings/test_2025"

# define the path to the train and test parquet files
test_path = f"{data_path}/test_2025_embed_logits"

# read the parquet files into a spark DataFrame
test_df = spark.read.parquet(test_path)

# show the data
test_df.printSchema()
test_df.show(n=5)

                                                                                

root
 |-- image_name: string (nullable = true)
 |-- output: struct (nullable = true)
 |    |-- cls_token: array (nullable = true)
 |    |    |-- element: float (containsNull = true)
 |    |-- logits: array (nullable = true)
 |    |    |-- element: float (containsNull = true)
 |-- sample_id: integer (nullable = true)

+--------------------+--------------------+---------+
|          image_name|              output|sample_id|
+--------------------+--------------------+---------+
|CBN-Pla-A1-201908...|{[0.47354543, 1.5...|        0|
|CBN-Pla-D6-201908...|{[-0.39621377, 1....|        0|
|CBN-PdlC-C5-20140...|{[-0.5331654, 0.2...|        0|
|LISAH-BOU-0-37-20...|{[1.2480925, 0.47...|        0|
|CBN-Pla-E4-201308...|{[0.7065191, 1.70...|        0|
+--------------------+--------------------+---------+
only showing top 5 rows



In [5]:
import torch
from plantclef.config import get_class_mappings_file


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def load_class_mapping(class_mapping_file):
    with open(class_mapping_file) as f:
        return [int(line.strip()) for line in f]


class_mapping_file = get_class_mappings_file()
sorted_species_ids = load_class_mapping(class_mapping_file)


def top_k_species(probabilities, top_k: int = 9):
    probs_tensor = torch.tensor(probabilities, dtype=torch.float32).to(device)
    probabilities = torch.softmax(probs_tensor, dim=0)
    top_probs, top_indices = torch.topk(probabilities, k=top_k)
    top_probs = top_probs.cpu().numpy()
    top_indices = top_indices.cpu().numpy()
    return [sorted_species_ids[i] for i in top_indices]


# apply top-k per row
top_k_proba = 9
COL_NAME = "cls_token"  # "probabilities"

test_pd = test_df.select("image_name", "output.cls_token").toPandas()
test_pd["species_ids"] = test_pd[COL_NAME].apply(
    lambda probs: top_k_species(probs, top_k=top_k_proba)
)
test_pd.head(5)

                                                                                

Unnamed: 0,image_name,cls_token,species_ids
0,CBN-Pla-A1-20190814.jpg,"[0.47354543, 1.5568701, -1.6330245, -1.3648611...","[1357904, 1357986, 1358259, 1357379, 1355903, ..."
1,CBN-Pla-D6-20190814.jpg,"[-0.39621377, 1.2026826, 0.27647698, -0.661421...","[1355898, 1357904, 1357061, 1358012, 1356619, ..."
2,CBN-PdlC-C5-20140901.jpg,"[-0.5331654, 0.21328913, -1.2809799, 0.1238243...","[1356307, 1357117, 1356568, 1357061, 1356830, ..."
3,LISAH-BOU-0-37-20230512.jpg,"[1.2480925, 0.4781976, 0.69301766, 0.4653994, ...","[1358164, 1356591, 1358203, 1356431, 1357988, ..."
4,CBN-Pla-E4-20130808.jpg,"[0.7065191, 1.7097996, -1.2477401, 1.3419615, ...","[1357117, 1357151, 1357904, 1357394, 1357446, ..."


In [6]:
len(test_pd["species_ids"].iloc[0])

9

In [7]:
import pandas as pd


def format_species_ids(species_ids: list) -> str:
    """Formats the species IDs in single square brackets, separated by commas."""
    formatted_ids = ", ".join(str(id) for id in species_ids)
    return f"[{formatted_ids}]"


def prepare_and_write_submission(
    pandas_df: pd.DataFrame, col: str = "image_name"
) -> pd.DataFrame:
    """Formats the Pandas DataFrame, and writes to PACE."""
    records = []
    for _, row in pandas_df.iterrows():
        logits = row["species_ids"]
        formatted_species = format_species_ids(logits)
        records.append({"quadrat_id": row[col], "species_ids": formatted_species})

    pandas_df = pd.DataFrame(records)
    # remove .jpg from quadrat_id in final_df
    pandas_df["quadrat_id"] = pandas_df["quadrat_id"].str.replace(
        ".jpg", "", regex=False
    )

    return pandas_df


final_df = prepare_and_write_submission(test_pd)
final_df.head(10)

Unnamed: 0,quadrat_id,species_ids
0,CBN-Pla-A1-20190814,"[1357904, 1357986, 1358259, 1357379, 1355903, ..."
1,CBN-Pla-D6-20190814,"[1355898, 1357904, 1357061, 1358012, 1356619, ..."
2,CBN-PdlC-C5-20140901,"[1356307, 1357117, 1356568, 1357061, 1356830, ..."
3,LISAH-BOU-0-37-20230512,"[1358164, 1356591, 1358203, 1356431, 1357988, ..."
4,CBN-Pla-E4-20130808,"[1357117, 1357151, 1357904, 1357394, 1357446, ..."
5,CBN-PdlC-D6-20150701,"[1357450, 1358012, 1356747, 1357198, 1357117, ..."
6,CBN-PdlC-F2-20170906,"[1357809, 1357553, 1356306, 1356033, 1357335, ..."
7,CBN-PdlC-A6-20180905,"[1356133, 1356634, 1356405, 1356528, 1357182, ..."
8,RNNB-3-12-20230512,"[1355871, 1357070, 1356069, 1358112, 1357508, ..."
9,CBN-PdlC-F4-20150810,"[1357799, 1357117, 1357646, 1356552, 1357151, ..."


In [8]:
len(final_df), final_df.shape, final_df.columns

(2105, (2105, 2), Index(['quadrat_id', 'species_ids'], dtype='object'))

In [9]:
import csv


def get_plantclef_dir() -> str:
    home_dir = Path(os.path.expanduser("~"))
    return f"{home_dir}/p-dsgt_clef2025-0/shared/plantclef"


def write_csv_to_pace(df, file_name: str):
    """Writes the Pandas DataFrame to a CSV file on PACE."""

    # prepare and write the submission
    submission_df = prepare_and_write_submission(df)
    project_dir = get_plantclef_dir()
    submission_path = f"{project_dir}/submissions/baseline_classification"
    output_path = f"{submission_path}/{file_name}"
    # ensure directory exists before saving
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # sort by quadrat_id
    submission_df = submission_df.sort_values(by=["quadrat_id"])
    # write to CSV
    submission_df.to_csv(output_path, sep=",", index=False, quoting=csv.QUOTE_ALL)
    print(f"Submission file saved to: {output_path}")


file_name = f"dsgt_baseline_topk{top_k_proba}_per_image.csv"
write_csv_to_pace(test_pd, file_name)

Submission file saved to: /storage/home/hcoda1/9/mgustineli3/p-dsgt_clef2025-0/shared/plantclef/submissions/baseline_classification/dsgt_baseline_topk9_per_image.csv


In [10]:
submission_path = f"~/p-dsgt_clef2025-0/shared/plantclef/submissions/baseline_classification/{file_name}"
df = pd.read_csv(submission_path)
df.head(10)

Unnamed: 0,quadrat_id,species_ids
0,2024-CEV3-20240602,"[1357553, 1355972, 1356373, 1356356, 1358132, ..."
1,CBN-PdlC-A1-20130807,"[1357441, 1357799, 1356576, 1357117, 1357646, ..."
2,CBN-PdlC-A1-20130903,"[1357061, 1357117, 1357441, 1356609, 1358062, ..."
3,CBN-PdlC-A1-20140721,"[1357799, 1357117, 1356985, 1356635, 1358012, ..."
4,CBN-PdlC-A1-20140811,"[1357799, 1357441, 1357117, 1356635, 1358140, ..."
5,CBN-PdlC-A1-20140901,"[1357117, 1356308, 1358132, 1357437, 1357441, ..."
6,CBN-PdlC-A1-20150701,"[1357799, 1357117, 1357441, 1356804, 1357198, ..."
7,CBN-PdlC-A1-20150720,"[1357799, 1356356, 1357437, 1357117, 1357646, ..."
8,CBN-PdlC-A1-20150831,"[1357437, 1357441, 1355937, 1358105, 1357117, ..."
9,CBN-PdlC-A1-20160705,"[1357799, 1357394, 1357441, 1356635, 1356306, ..."


In [11]:
# convert species_ids to list of integers
def convert_species_ids_to_list(species_ids: str) -> list:
    # remove brackets and split by comma
    species_ids = species_ids.strip("[]").split(", ")
    # convert to list of integers
    return [int(id) for id in species_ids]


df["species_ids"] = df["species_ids"].apply(convert_species_ids_to_list)

In [12]:
# get average for number of species IDs
avg_species_ids = df["species_ids"].apply(len).mean()
avg_species_ids

9.0