In [1]:
import pandas as pd

submission_path = "~/p-dsgt_clef2025-0/shared/plantclef/submissions"
file_name = "topk_9_species_grid_6x6/dsgt_run_topk_9_species_grid_6x6.csv"
# Load the semicolon-delimited submission file. At this point species_ids is
# still the text form of a list (e.g. "[1392407, 1395807]"); it is parsed into
# real lists in a later cell. The former `df["species_ids"] = df["species_ids"]`
# self-assignment was a no-op and has been dropped.
df = pd.read_csv(f"{submission_path}/{file_name}", delimiter=";")
df.head(10)

Unnamed: 0,plot_id,species_ids
0,CBN-PdlC-A1-20130807,"[1392407, 1395807, 1392611, 1392535]"
1,CBN-PdlC-A1-20130903,"[1392608, 1397468, 1622901, 1742052, 1392407, ..."
2,CBN-PdlC-A1-20140721,"[1412857, 1392608, 1392407, 1392535]"
3,CBN-PdlC-A1-20140811,"[1392608, 1412857, 1395807, 1388793]"
4,CBN-PdlC-A1-20140901,"[1742052, 1392608, 1392407, 1396144]"
5,CBN-PdlC-A1-20150701,"[1392608, 1396144, 1397535, 1742052, 1392565, ..."
6,CBN-PdlC-A1-20150720,"[1392608, 1412857, 1394540, 1742052, 1392407, ..."
7,CBN-PdlC-A1-20150831,"[1742052, 1392608, 1412857]"
8,CBN-PdlC-A1-20160705,"[1397535, 1392608, 1412857, 1394911, 1359921]"
9,CBN-PdlC-A1-20160726,"[1549015, 1742052, 1412857, 1394908, 1392608]"


In [2]:
import re


# function to extract the base plot_id without date
def extract_base_plot_id(plot_id):
    """Return ``plot_id`` with its trailing numeric suffix removed.

    Three identifier layouts are recognised (CBN, OPTMix and RNNB). The first
    layout that matches wins and its captured prefix is returned. An identifier
    that matches none of them is returned unchanged.
    """
    known_layouts = (
        r"^(CBN-[A-Za-z]+-[A-Z]\d+)-\d{8}$",  # CBN: prefix followed by 8 digits
        r"^(OPTMix-\d{3}-P\d+)-\d+$",  # OPTMix: prefix followed by digits
        r"^(RNNB-\d+-\d+)-\d+$",  # RNNB: prefix followed by digits
    )
    # Lazily try each layout in order; stop at the first successful match.
    attempts = (re.match(layout, plot_id) for layout in known_layouts)
    return next((hit.group(1) for hit in attempts if hit), plot_id)

In [3]:
# Derive the date-less site identifier for every row so that repeated
# observations of the same plot can be grouped together.
base_plot_ids = df["plot_id"].map(extract_base_plot_id)
df["base_plot_id"] = base_plot_ids
df.head()

Unnamed: 0,plot_id,species_ids,base_plot_id
0,CBN-PdlC-A1-20130807,"[1392407, 1395807, 1392611, 1392535]",CBN-PdlC-A1
1,CBN-PdlC-A1-20130903,"[1392608, 1397468, 1622901, 1742052, 1392407, ...",CBN-PdlC-A1
2,CBN-PdlC-A1-20140721,"[1412857, 1392608, 1392407, 1392535]",CBN-PdlC-A1
3,CBN-PdlC-A1-20140811,"[1392608, 1412857, 1395807, 1388793]",CBN-PdlC-A1
4,CBN-PdlC-A1-20140901,"[1742052, 1392608, 1392407, 1396144]",CBN-PdlC-A1


In [4]:
import ast

# Convert species_ids from its CSV text form (e.g. "[1392407, 1395807]") into a
# list of ints. ast.literal_eval only accepts Python literals, so unlike eval it
# cannot execute code that happens to be embedded in the file. Values that are
# already lists skip parsing, which keeps this cell safe to re-run.
df["species_ids"] = df["species_ids"].apply(
    lambda x: list(map(int, ast.literal_eval(x) if isinstance(x, str) else x))
)
df.head()

Unnamed: 0,plot_id,species_ids,base_plot_id
0,CBN-PdlC-A1-20130807,"[1392407, 1395807, 1392611, 1392535]",CBN-PdlC-A1
1,CBN-PdlC-A1-20130903,"[1392608, 1397468, 1622901, 1742052, 1392407, ...",CBN-PdlC-A1
2,CBN-PdlC-A1-20140721,"[1412857, 1392608, 1392407, 1392535]",CBN-PdlC-A1
3,CBN-PdlC-A1-20140811,"[1392608, 1412857, 1395807, 1388793]",CBN-PdlC-A1
4,CBN-PdlC-A1-20140901,"[1742052, 1392608, 1392407, 1396144]",CBN-PdlC-A1


In [5]:
from collections import Counter


def union_agg(x):
    """Return the distinct items found across all the iterables in ``x``.

    The result is a list built from a set, so its order carries no meaning.
    """
    distinct = set()
    for id_list in x:
        for species_id in id_list:
            distinct.add(species_id)
    return list(distinct)


# Function to aggregate species and sort them by frequency
def union_agg_sorted(x):
    """Return every distinct item across the iterables in ``x``, most frequent first.

    Items with equal counts keep the order in which they were first seen.
    """
    tally = Counter()
    for id_list in x:
        for species_id in id_list:
            tally[species_id] += 1
    # A stable descending sort on the count keeps first-seen order among ties.
    return sorted(tally, key=tally.__getitem__, reverse=True)


# Collapse all rows sharing a base_plot_id into one frequency-ranked species list.
species_by_site = df.groupby("base_plot_id")["species_ids"]
df_union = species_by_site.apply(union_agg_sorted).reset_index()
df_union.head()

Unnamed: 0,base_plot_id,species_ids
0,CBN-PdlC-A1,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."
1,CBN-PdlC-A2,"[1395974, 1412857, 1742052, 1395807, 1392535, ..."
2,CBN-PdlC-A3,"[1412857, 1392608, 1742052, 1397535, 1392407, ..."
3,CBN-PdlC-A4,"[1392608, 1393200, 1741661, 1392611, 1360591, ..."
4,CBN-PdlC-A5,"[1395807, 1397475, 1396253, 1397535, 1392407, ..."


In [6]:
# Attach each base_plot_id's aggregated species list to every original row.
# Both frames carry a species_ids column, so the suffixes tell them apart.
df_merged = pd.merge(
    df,
    df_union,
    how="left",
    on="base_plot_id",
    suffixes=("_original", "_aggregated"),
)

# Keep the original plot_id next to the aggregated list, then restore the
# column name expected in the submission.
df_final = df_merged.loc[:, ["plot_id", "species_ids_aggregated"]]
df_final = df_final.rename(columns={"species_ids_aggregated": "species_ids"})
df_final.head(10)

Unnamed: 0,plot_id,species_ids
0,CBN-PdlC-A1-20130807,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."
1,CBN-PdlC-A1-20130903,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."
2,CBN-PdlC-A1-20140721,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."
3,CBN-PdlC-A1-20140811,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."
4,CBN-PdlC-A1-20140901,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."
5,CBN-PdlC-A1-20150701,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."
6,CBN-PdlC-A1-20150720,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."
7,CBN-PdlC-A1-20150831,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."
8,CBN-PdlC-A1-20160705,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."
9,CBN-PdlC-A1-20160726,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."


In [7]:
import os
import csv
from pathlib import Path


def format_species_ids(species_ids: list) -> str:
    """Render the IDs as one bracketed, comma-and-space separated string.

    For example ``[1, 2, 3]`` becomes ``"[1, 2, 3]"`` and an empty list
    becomes ``"[]"``.
    """
    joined = ", ".join(map(str, species_ids))
    return "[" + joined + "]"


def prepare_and_write_submission(pandas_df: pd.DataFrame) -> pd.DataFrame:
    """Build the submission frame: ``plot_id`` plus ``species_ids`` rendered as text.

    Despite its name, this function performs no Spark conversion and writes
    nothing; it only formats. Each row's ``species_ids`` sequence is turned
    into a bracketed string by ``format_species_ids``.

    Args:
        pandas_df: Frame with a ``plot_id`` column and a ``species_ids`` column
            holding one sequence of IDs per row. Other columns are ignored.

    Returns:
        A new frame with exactly the columns ``plot_id`` and ``species_ids``
        (in that order) and a fresh 0..n-1 index. The two columns are present
        even when the input has no rows, so a CSV written from it always has
        its header.

    Raises:
        KeyError: If either required column is missing.
    """
    # Format column-wise instead of materialising every row with iterrows().
    formatted_species = [format_species_ids(ids) for ids in pandas_df["species_ids"]]
    return pd.DataFrame(
        {
            "plot_id": pandas_df["plot_id"].tolist(),
            "species_ids": formatted_species,
        },
        columns=["plot_id", "species_ids"],
    )


def get_plantclef_dir() -> str:
    """Return the shared plantclef project directory under the user's home.

    The returned string ends with a trailing slash.
    """
    user_home = os.path.expanduser("~")
    return f"{Path(user_home)}/p-dsgt_clef2025-0/shared/plantclef/"


def write_csv_to_pace(df, file_name: str, folder_name: str = "aggregation_seasons"):
    """Write ``df`` as a semicolon-separated CSV under the project's submissions folder.

    The file is written to
    ``<plantclef dir>/submissions/<folder_name>/<file_name>`` on the local
    filesystem. Missing parent directories are created first.

    Args:
        df: Frame to write. The index is not written.
        file_name: Name of the CSV file, relative to the target folder.
        folder_name: Sub-folder of ``submissions`` to write into. Defaults to
            the previously hard-coded ``"aggregation_seasons"``.

    Note:
        Quoting is disabled (``csv.QUOTE_NONE``), so the bracketed species
        lists are written without surrounding quotes. Values must therefore
        not contain the ``;`` separator.
    """
    # Path joining collapses the trailing slash of the project directory, which
    # plain string concatenation turned into "plantclef//submissions".
    output_path = Path(get_plantclef_dir()) / "submissions" / folder_name / file_name

    # ensure directory exists before saving
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, sep=";", index=False, quoting=csv.QUOTE_NONE)


def main(df, file_name):
    """Format ``df`` for submission, preview it, and write it to ``file_name``.

    The formatted frame's first rows are shown with ``display`` before the
    CSV is written via ``write_csv_to_pace``.
    """
    submission_df = prepare_and_write_submission(df)
    preview = submission_df.head()
    display(preview)
    write_csv_to_pace(submission_df, file_name)

### normal aggregation

In [8]:
# Write the untruncated per-site aggregation as a submission file.
file_name = "agg_dsgt_run_topk_9_species_grid_6x6.csv"
main(df=df_final, file_name=file_name)

Unnamed: 0,plot_id,species_ids
0,CBN-PdlC-A1-20130807,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."
1,CBN-PdlC-A1-20130903,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."
2,CBN-PdlC-A1-20140721,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."
3,CBN-PdlC-A1-20140811,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."
4,CBN-PdlC-A1-20140901,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."


In [9]:
# function to aggregate species, sort them by frequency, and select top K species
def union_agg_topk(x, top_k=5):
    """Return the ``top_k`` most frequent items across the iterables in ``x``.

    Items are ordered from most to least frequent. Fewer than ``top_k`` items
    are returned when fewer distinct items exist.
    """
    tally = Counter(species_id for id_list in x for species_id in id_list)
    ranked = tally.most_common(top_k)
    return [pair[0] for pair in ranked]


# Number of species to keep per site (change as needed).
top_k = 5

# Rank species per base_plot_id by frequency and keep only the top_k of them.
species_by_site_topk = df.groupby("base_plot_id")["species_ids"]
df_union_topk = species_by_site_topk.apply(union_agg_topk, top_k=top_k).reset_index()
display(df_union_topk.head(3))

# Attach each base_plot_id's truncated list to every original row.
df_merged_topk = pd.merge(
    df,
    df_union_topk,
    how="left",
    on="base_plot_id",
    suffixes=("_original", "_aggregated"),
)

# Keep the original plot_id next to the aggregated list, then restore the
# column name expected in the submission.
df_final_topk = df_merged_topk.loc[:, ["plot_id", "species_ids_aggregated"]]
df_final_topk = df_final_topk.rename(columns={"species_ids_aggregated": "species_ids"})
df_final_topk.head()

Unnamed: 0,base_plot_id,species_ids
0,CBN-PdlC-A1,"[1392608, 1742052, 1412857, 1392407, 1392611]"
1,CBN-PdlC-A2,"[1395974, 1412857, 1742052, 1395807, 1392535]"
2,CBN-PdlC-A3,"[1412857, 1392608, 1742052, 1397535, 1392407]"


Unnamed: 0,plot_id,species_ids
0,CBN-PdlC-A1-20130807,"[1392608, 1742052, 1412857, 1392407, 1392611]"
1,CBN-PdlC-A1-20130903,"[1392608, 1742052, 1412857, 1392407, 1392611]"
2,CBN-PdlC-A1-20140721,"[1392608, 1742052, 1412857, 1392407, 1392611]"
3,CBN-PdlC-A1-20140811,"[1392608, 1742052, 1412857, 1392407, 1392611]"
4,CBN-PdlC-A1-20140901,"[1392608, 1742052, 1412857, 1392407, 1392611]"


### top K aggregation

In [10]:
# Write the top-K aggregation; the K in use is embedded in the file name.
file_name = f"agg_topk{top_k}_dsgt_run_topk_9_species_grid_6x6.csv"
main(df=df_final_topk, file_name=file_name)

Unnamed: 0,plot_id,species_ids
0,CBN-PdlC-A1-20130807,"[1392608, 1742052, 1412857, 1392407, 1392611]"
1,CBN-PdlC-A1-20130903,"[1392608, 1742052, 1412857, 1392407, 1392611]"
2,CBN-PdlC-A1-20140721,"[1392608, 1742052, 1412857, 1392407, 1392611]"
3,CBN-PdlC-A1-20140811,"[1392608, 1742052, 1412857, 1392407, 1392611]"
4,CBN-PdlC-A1-20140901,"[1392608, 1742052, 1412857, 1392407, 1392611]"


In [11]:
# Show the top-K frame from first to last row; the rendered output below
# elides the middle rows.
display(df_final_topk)

Unnamed: 0,plot_id,species_ids
0,CBN-PdlC-A1-20130807,"[1392608, 1742052, 1412857, 1392407, 1392611]"
1,CBN-PdlC-A1-20130903,"[1392608, 1742052, 1412857, 1392407, 1392611]"
2,CBN-PdlC-A1-20140721,"[1392608, 1742052, 1412857, 1392407, 1392611]"
3,CBN-PdlC-A1-20140811,"[1392608, 1742052, 1412857, 1392407, 1392611]"
4,CBN-PdlC-A1-20140901,"[1392608, 1742052, 1412857, 1392407, 1392611]"
...,...,...
1690,RNNB-8-5-20240118,"[1359344, 1357700, 1390761, 1363302]"
1691,RNNB-8-6-20240118,"[1359344, 1361703, 1359297]"
1692,RNNB-8-7-20240118,"[1359344, 1357227]"
1693,RNNB-8-8-20240118,"[1359344, 1357227]"
