# Baseline Classification

Identifying the 10, 25, 50, 100 most common plant species. Submit them as a baseline classification strategy.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from plantclef.spark import get_spark

spark = get_spark()
display(spark)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/07 23:14:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/04/07 23:14:24 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [3]:
import os
from pathlib import Path

# Set the root directory to your home directory
root = Path(os.path.expanduser("~"))
! date

Mon Apr  7 11:14:26 PM EDT 2025


In [4]:
# path and dataset names
data_path = f"{root}/p-dsgt_clef2025-0/shared/plantclef/data/parquet"

# define the path to the train and test parquet files
train_path = f"{data_path}/train"
test_path = f"{data_path}/test_2025"

# read the parquet files into a spark DataFrame
train_df = spark.read.parquet(train_path)
test_df = spark.read.parquet(test_path)

# show the data
train_df.printSchema()
train_df.show(n=5)
test_df.printSchema()
test_df.show(n=5)

root
 |-- image_name: string (nullable = true)
 |-- path: string (nullable = true)
 |-- data: binary (nullable = true)
 |-- organ: string (nullable = true)
 |-- species_id: integer (nullable = true)
 |-- obs_id: long (nullable = true)
 |-- license: string (nullable = true)
 |-- partner: string (nullable = true)
 |-- author: string (nullable = true)
 |-- altitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- gbif_species_id: string (nullable = true)
 |-- species: string (nullable = true)
 |-- genus: string (nullable = true)
 |-- family: string (nullable = true)
 |-- dataset: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- references: string (nullable = true)
 |-- url: string (nullable = true)
 |-- learn_tag: string (nullable = true)
 |-- image_backup_url: string (nullable = true)



                                                                                

+--------------------+--------------------+--------------------+-----+----------+----------+--------------------+-------+----------------+--------+-----------------+------------------+---------------+--------------------+-------------+----------+--------+-----------+--------------------+--------------------+---------+--------------------+
|          image_name|                path|                data|organ|species_id|    obs_id|             license|partner|          author|altitude|         latitude|         longitude|gbif_species_id|             species|        genus|    family| dataset|  publisher|          references|                 url|learn_tag|    image_backup_url|
+--------------------+--------------------+--------------------+-----+----------+----------+--------------------+-------+----------------+--------+-----------------+------------------+---------------+--------------------+-------------+----------+--------+-----------+--------------------+--------------------+---------

                                                                                

+--------------------+--------------------+--------------------+
|          image_name|                path|                data|
+--------------------+--------------------+--------------------+
|CBN-Pla-B3-201907...|/test/data/PlantC...|[FF D8 FF E0 00 1...|
|CBN-PdlC-E5-20180...|/test/data/PlantC...|[FF D8 FF E0 00 1...|
|CBN-PdlC-B1-20140...|/test/data/PlantC...|[FF D8 FF E0 00 1...|
|CBN-Pla-D4-201507...|/test/data/PlantC...|[FF D8 FF E0 00 1...|
|CBN-PdlC-F3-20190...|/test/data/PlantC...|[FF D8 FF E0 00 1...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



In [5]:
from pyspark.sql import functions as F

# Get top species by number of images
grouped_train_df = (
    train_df.groupBy(["species", "species_id"])
    .agg(F.count("species_id").alias("n"))
    .orderBy(F.col("n").desc())
)

# Action
grouped_train_df.show()



+--------------------+----------+---+
|             species|species_id|  n|
+--------------------+----------+---+
|Styphnolobium jap...|   1369068|823|
|Frangula alnus Mill.|   1360257|793|
|Aria edulis (Will...|   1737559|770|
|Lathyrus oleraceu...|   1741625|700|
|  Bromus sterilis L.|   1414366|674|
|   Lotus hirsutus L.|   1743246|668|
|Scandosorbus inte...|   1737669|662|
|Oxalis dillenii J...|   1394359|653|
|     Hedera helix L.|   1363575|653|
|Calicotome spinos...|   1359160|642|
|Castanea sativa M...|   1358610|638|
|Buxus semperviren...|   1356729|636|
|    Arbutus unedo L.|   1358500|634|
|Pistacia lentiscu...|   1356290|632|
|   Ulmus minor Mill.|   1360607|630|
|Tecomaria capensi...|   1400151|627|
|     Quercus ilex L.|   1357416|626|
|Calamagrostis are...|   1722433|625|
|Pittosporum tobir...|   1394624|625|
|Cercis siliquastr...|   1359162|624|
+--------------------+----------+---+
only showing top 20 rows



                                                                                

In [6]:
def get_top_species_ids(train_df, top_k: int = 10):
    """
    Get the top K species IDs from the training DataFrame.

    Args:
        train_df (DataFrame): The training DataFrame.
        K (int): The number of top species to select.

    Returns:
        list: A list of top K species IDs.
    """
    # Group by species and count the number of images
    grouped_train_df = (
        train_df.groupBy(["species", "species_id"])
        .agg(F.count("species_id").alias("n"))
        .orderBy(F.col("n").desc())
    )

    # Select top K species into list
    subset_species = grouped_train_df.select("species_id").limit(top_k).collect()
    return [row["species_id"] for row in subset_species]


# Get top-K species IDs
top_k = 10
top_species_list = get_top_species_ids(train_df, top_k=top_k)
print(f"Top {top_k} species: {top_species_list}")



Top 10 species: [1369068, 1360257, 1737559, 1741625, 1414366, 1743246, 1737669, 1394359, 1363575, 1359160]


                                                                                

In [7]:
# top 25 species
top_k = 25
top_species_list = get_top_species_ids(train_df, top_k=top_k)
print(f"Top {top_k} species: {top_species_list}")



Top 25 species: [1369068, 1360257, 1737559, 1741625, 1414366, 1743246, 1737669, 1394359, 1363575, 1359160, 1358610, 1356729, 1358500, 1356290, 1360607, 1400151, 1357416, 1722433, 1394624, 1359162, 1391375, 1363988, 1359676, 1356308, 1737508]


                                                                                

In [8]:
# select image names from test DataFrame
image_names_df = test_df.select("image_name").orderBy(F.col("image_name"))
image_names = [row["image_name"] for row in image_names_df.collect()]
image_names[:10]

                                                                                

['2024-CEV3-20240602.jpg',
 'CBN-PdlC-A1-20130807.jpg',
 'CBN-PdlC-A1-20130903.jpg',
 'CBN-PdlC-A1-20140721.jpg',
 'CBN-PdlC-A1-20140811.jpg',
 'CBN-PdlC-A1-20140901.jpg',
 'CBN-PdlC-A1-20150701.jpg',
 'CBN-PdlC-A1-20150720.jpg',
 'CBN-PdlC-A1-20150831.jpg',
 'CBN-PdlC-A1-20160705.jpg']

In [9]:
import pandas as pd

# create pandas DataFrame with image names
pandas_dict = {
    "image_name": image_names,
    "species_ids": [top_species_list] * len(image_names),
}
pandas_df = pd.DataFrame(pandas_dict, columns=["image_name", "species_ids"])
pandas_df.head(10)

Unnamed: 0,image_name,species_ids
0,2024-CEV3-20240602.jpg,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
1,CBN-PdlC-A1-20130807.jpg,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
2,CBN-PdlC-A1-20130903.jpg,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
3,CBN-PdlC-A1-20140721.jpg,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
4,CBN-PdlC-A1-20140811.jpg,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
5,CBN-PdlC-A1-20140901.jpg,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
6,CBN-PdlC-A1-20150701.jpg,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
7,CBN-PdlC-A1-20150720.jpg,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
8,CBN-PdlC-A1-20150831.jpg,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
9,CBN-PdlC-A1-20160705.jpg,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."


In [28]:
def format_species_ids(species_ids: list) -> str:
    """Formats the species IDs in single square brackets, separated by commas."""
    formatted_ids = ", ".join(str(id) for id in species_ids)
    return f"[{formatted_ids}]"


def prepare_and_write_submission(
    pandas_df: pd.DataFrame, col: str = "image_name"
) -> pd.DataFrame:
    """Formats the Pandas DataFrame, and writes to PACE."""
    records = []
    for _, row in pandas_df.iterrows():
        logits = row["species_ids"]
        formatted_species = format_species_ids(logits)
        records.append({"quadrat_id": row[col], "species_ids": formatted_species})

    pandas_df = pd.DataFrame(records)
    # remove .jpg from quadrat_id in final_df
    pandas_df["quadrat_id"] = pandas_df["quadrat_id"].str.replace(
        ".jpg", "", regex=False
    )

    return pandas_df


final_df = prepare_and_write_submission(pandas_df)
final_df.head(10)

Unnamed: 0,quadrat_id,species_ids
0,2024-CEV3-20240602,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
1,CBN-PdlC-A1-20130807,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
2,CBN-PdlC-A1-20130903,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
3,CBN-PdlC-A1-20140721,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
4,CBN-PdlC-A1-20140811,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
5,CBN-PdlC-A1-20140901,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
6,CBN-PdlC-A1-20150701,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
7,CBN-PdlC-A1-20150720,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
8,CBN-PdlC-A1-20150831,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
9,CBN-PdlC-A1-20160705,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."


In [29]:
len(final_df)

2105

In [30]:
# Get top-K species IDs
top_k = 100
top_species_list = get_top_species_ids(train_df, top_k=top_k)
print(f"Top {top_k} species: {top_species_list}")



Top 100 species: [1369068, 1360257, 1737559, 1741625, 1414366, 1743246, 1737669, 1394359, 1363575, 1359160, 1358610, 1356729, 1358500, 1356290, 1360607, 1400151, 1357416, 1722433, 1394624, 1359162, 1391375, 1363988, 1359676, 1356308, 1737508, 1391368, 1356793, 1356609, 1356455, 1390952, 1363698, 1743245, 1397356, 1364046, 1358543, 1359616, 1363362, 1426015, 1395026, 1360449, 1722630, 1362009, 1668240, 1363142, 1356458, 1360547, 1359677, 1356576, 1363832, 1412585, 1396503, 1400126, 1388787, 1356208, 1363813, 1737588, 1397383, 1356567, 1361743, 1358613, 1361864, 1394952, 1361710, 1356608, 1360323, 1397307, 1391354, 1389550, 1359658, 1395193, 1362963, 1396802, 1358713, 1414356, 1359675, 1391434, 1738846, 1360324, 1741903, 1360262, 1743242, 1365253, 1363687, 1356457, 1743352, 1393365, 1391958, 1360260, 1360316, 1356792, 1357413, 1392236, 1360290, 1356460, 1360326, 1363624, 1722522, 1397301, 1393639, 1396797]


                                                                                

### compare with submission CSV file

In [31]:
# path and dataset names
submission_path = f"{root}/p-dsgt_clef2025-0/shared/plantclef/submissions"

# define the path to the train and test parquet files
submission_df = pd.read_csv(f"{submission_path}/naive_baseline/submission.csv")
submission_df.head(10)

Unnamed: 0,quadrat_id,species_ids
0,CBN-PdlC-E3-20130723,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
1,CBN-PdlC-E2-20130723,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
2,CBN-PdlC-E5-20130723,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
3,CBN-PdlC-E6-20130723,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
4,CBN-PdlC-E1-20130723,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
5,CBN-PdlC-F1-20130723,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
6,CBN-PdlC-F2-20130723,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
7,CBN-PdlC-F3-20130723,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
8,CBN-PdlC-F4-20130723,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
9,CBN-PdlC-F5-20130723,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."


In [32]:
# sort the DataFrame by 'quadrat_id'
submission_df = submission_df.sort_values(by="quadrat_id")
submission_df.head(10)

Unnamed: 0,quadrat_id,species_ids
2096,2024-CEV3-20240602,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
42,CBN-PdlC-A1-20130807,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
112,CBN-PdlC-A1-20130903,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
180,CBN-PdlC-A1-20140721,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
259,CBN-PdlC-A1-20140811,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
312,CBN-PdlC-A1-20140901,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
377,CBN-PdlC-A1-20150701,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
445,CBN-PdlC-A1-20150720,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
560,CBN-PdlC-A1-20150831,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."
649,CBN-PdlC-A1-20160705,"[1369068, 1360257, 1737559, 1741625, 1414366, ..."


In [33]:
# compare the two DataFrames
comparison_df = pd.merge(
    final_df,
    submission_df,
    on="quadrat_id",
    how="outer",
    suffixes=("_new", "_old"),
)
comparison_df.tail(10)

Unnamed: 0,quadrat_id,species_ids_new,species_ids_old
2095,RNNB-8-1-20240118,"[1369068, 1360257, 1737559, 1741625, 1414366, ...","[1369068, 1360257, 1737559, 1741625, 1414366, ..."
2096,RNNB-8-10-20240118,"[1369068, 1360257, 1737559, 1741625, 1414366, ...","[1369068, 1360257, 1737559, 1741625, 1414366, ..."
2097,RNNB-8-2-20240118,"[1369068, 1360257, 1737559, 1741625, 1414366, ...","[1369068, 1360257, 1737559, 1741625, 1414366, ..."
2098,RNNB-8-3-20240118,"[1369068, 1360257, 1737559, 1741625, 1414366, ...","[1369068, 1360257, 1737559, 1741625, 1414366, ..."
2099,RNNB-8-4-20240118,"[1369068, 1360257, 1737559, 1741625, 1414366, ...","[1369068, 1360257, 1737559, 1741625, 1414366, ..."
2100,RNNB-8-5-20240118,"[1369068, 1360257, 1737559, 1741625, 1414366, ...","[1369068, 1360257, 1737559, 1741625, 1414366, ..."
2101,RNNB-8-6-20240118,"[1369068, 1360257, 1737559, 1741625, 1414366, ...","[1369068, 1360257, 1737559, 1741625, 1414366, ..."
2102,RNNB-8-7-20240118,"[1369068, 1360257, 1737559, 1741625, 1414366, ...","[1369068, 1360257, 1737559, 1741625, 1414366, ..."
2103,RNNB-8-8-20240118,"[1369068, 1360257, 1737559, 1741625, 1414366, ...","[1369068, 1360257, 1737559, 1741625, 1414366, ..."
2104,RNNB-8-9-20240118,"[1369068, 1360257, 1737559, 1741625, 1414366, ...","[1369068, 1360257, 1737559, 1741625, 1414366, ..."
