# Submission to the competition

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell executes, so edits to local .py files are picked up.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd
from pathlib import Path

# Set the root directory to your home directory
# NOTE(review): `root` is not referenced by any later cell visible here.
root = Path(os.path.expanduser("~"))
# Shell escape: print the current system date/time
! date

Sat Apr 12 01:37:01 AM EDT 2025


In [10]:
# import aggregation classification data
# NOTE: the leading "~" is not expanded here; the path string is passed
# as-is to pd.read_csv below.
project_path = "~/p-dsgt_clef2025-0/shared/plantclef"
file_name = "agg_topk10_dsgt_run_topk_9_species_grid_6x6.csv"
submission_path = f"{project_path}/submissions/aggregation_seasons/test_2025"
input_path = f"{submission_path}/{file_name}"
sub_df = pd.read_csv(input_path)
print(f"Submission DF shape {sub_df.shape}")
# last expression: preview the first rows as the cell output
sub_df.head()

Submission DF shape (2105, 2)


Unnamed: 0,quadrat_id,species_ids
0,2024-CEV3-20240602,"[1360187, 1392662, 1395045, 1738679, 1362443, ..."
1,CBN-PdlC-A1-20130807,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."
2,CBN-PdlC-A1-20130903,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."
3,CBN-PdlC-A1-20140721,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."
4,CBN-PdlC-A1-20140811,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."


In [8]:
# import filtered data based on geolocation of France
# (reuses `project_path` defined in the submission-loading cell)
france_path = f"{project_path}/data/france_geodata_species.csv"
france_df = pd.read_csv(france_path)
# last expression: preview the first 10 rows as the cell output
france_df.head(10)

Unnamed: 0,species_id
0,1409031
1,1409132
2,1409146
3,1409148
4,1409180
5,1409275
6,1737420
7,1737422
8,1737424
9,1737425


In [12]:
import ast


def filter_species_by_country(
    filtered_df: pd.DataFrame,
    sub_df: pd.DataFrame,
) -> pd.DataFrame:
    """Keep only the species IDs that appear in an allow-list.

    Args:
        filtered_df: DataFrame with a ``species_id`` column listing the
            species IDs that are allowed to remain.
        sub_df: DataFrame with a ``species_ids`` column. Each value is either
            the string form of a Python list (e.g. ``"[1, 2, 3]"``, as read
            back from CSV) or an already-parsed list.

    Returns:
        A copy of ``sub_df`` whose ``species_ids`` column holds lists
        restricted to the allowed IDs, in their original order. Rows are
        never dropped, so a row may end up with an empty list. The input
        frame is not modified.
    """
    sub_df = sub_df.copy()
    # Parse string-encoded lists; leave already-parsed values untouched so the
    # function can safely be re-run on its own output.
    sub_df["species_ids"] = sub_df["species_ids"].apply(
        lambda value: ast.literal_eval(value) if isinstance(value, str) else value
    )

    # A set gives constant-time membership tests instead of scanning the full
    # allow-list once per species ID.
    valid_species_ids = set(filtered_df["species_id"].tolist())

    # filter each list of species_ids in sub_df
    sub_df["species_ids"] = sub_df["species_ids"].apply(
        lambda id_list: [
            species_id for species_id in id_list if species_id in valid_species_ids
        ]
    )
    return sub_df


# Apply the France species allow-list to the submission and preview the result
final_df = filter_species_by_country(france_df, sub_df)
print(f"Filtered DF shape {final_df.shape}")
final_df.head()

Filtered DF shape (2105, 2)


Unnamed: 0,quadrat_id,species_ids
0,2024-CEV3-20240602,"[1360187, 1392662, 1395045, 1738679, 1362443, ..."
1,CBN-PdlC-A1-20130807,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."
2,CBN-PdlC-A1-20130903,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."
3,CBN-PdlC-A1-20140721,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."
4,CBN-PdlC-A1-20140811,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."


In [14]:
def analyze_species_filtering(original_df: pd.DataFrame, filtered_df: pd.DataFrame):
    """Print species-count statistics per quadrat before and after filtering.

    Args:
        original_df: Frame whose ``species_ids`` column holds string-encoded
            lists; they are parsed here with ``ast.literal_eval``.
        filtered_df: Frame whose ``species_ids`` column already holds lists.

    Returns:
        None. The summary is written to stdout; neither input is modified.
    """

    def _print_stats(counts: pd.Series) -> None:
        # Shared four-line summary used for both the before and after sections.
        print(f"Total Species IDs: {counts.sum()}")
        print(f"Mean: {counts.mean():.2f}")
        print(f"Min: {counts.min()}")
        print(f"Max: {counts.max()}")

    # List length per row, before (parsed from strings) and after filtering
    counts_before = original_df["species_ids"].apply(ast.literal_eval).apply(len)
    counts_after = filtered_df["species_ids"].apply(len)

    print("Species Count Per Quadrat (Before Filtering):")
    _print_stats(counts_before)

    print("\nSpecies Count Per Quadrat (After Filtering by France):")
    _print_stats(counts_after)

    total_removed = counts_before.sum() - counts_after.sum()
    mean_removed = (counts_before - counts_after).mean()
    print("\nReduction in Species IDs:")
    print(f"Total Reduction: {total_removed}")
    print(f"Avg Reduction per Quadrat: {mean_removed:.2f}")

In [15]:
# `sub_df` still holds species_ids as strings (analyze_species_filtering parses
# them with ast.literal_eval itself); `final_df` already holds parsed lists.
original_agg_df = sub_df.copy(deep=True)
filtered_agg_df = final_df.copy(deep=True)

# Run analysis
analyze_species_filtering(original_agg_df, filtered_agg_df)

Species Count Per Quadrat (Before Filtering):
Total Species IDs: 18629
Mean: 8.85
Min: 1
Max: 10

Species Count Per Quadrat (After Filtering by France):
Total Species IDs: 15618
Mean: 7.42
Min: 0
Max: 10

Reduction in Species IDs:
Total Reduction: 3011
Avg Reduction per Quadrat: 1.43


In [17]:
import csv
from pathlib import Path


def format_species_ids(species_ids: list) -> str:
    """Render species IDs as one bracketed, comma-separated string.

    For example ``[1, 2, 3]`` becomes ``"[1, 2, 3]"`` and an empty list
    becomes ``"[]"``. Each element is converted with ``str``.
    """
    joined = ", ".join(map(str, species_ids))
    return "[" + joined + "]"


def prepare_and_write_submission(pandas_df: pd.DataFrame) -> pd.DataFrame:
    """Build the two-column submission frame from a predictions frame.

    Despite the name, nothing is written here; the caller does the writing.

    Args:
        pandas_df: Frame with a ``quadrat_id`` column and a ``species_ids``
            column holding lists of species IDs. Other columns are ignored.

    Returns:
        A new frame with a fresh 0..n-1 index and exactly the columns
        ``quadrat_id`` and ``species_ids``, where ``species_ids`` is the
        string produced by ``format_species_ids``. Both columns are present
        even when the input has no rows, so a written CSV keeps its header.
    """
    return pd.DataFrame(
        {
            "quadrat_id": pandas_df["quadrat_id"].tolist(),
            "species_ids": [
                format_species_ids(ids) for ids in pandas_df["species_ids"]
            ],
        },
        # Explicit columns keep the schema stable for empty input.
        columns=["quadrat_id", "species_ids"],
    )


def get_plantclef_dir() -> str:
    """Return the shared PlantCLEF project directory under the user's home.

    The returned string ends with a trailing slash.
    """
    user_home = Path(os.path.expanduser("~"))
    return str(user_home) + "/p-dsgt_clef2025-0/shared/plantclef/"


def write_csv_to_pace(df, file_name: str, testset_name: str):
    """Format a predictions frame and write it as a submission CSV on PACE.

    The file is written to
    ``<plantclef dir>/submissions/aggregation_seasons/<testset_name>/<file_name>``
    with every field quoted. Missing parent directories are created.

    Args:
        df: Frame with ``quadrat_id`` and ``species_ids`` columns; it is
            passed through ``prepare_and_write_submission`` before writing.
        file_name: Name of the CSV file to create.
        testset_name: Sub-folder under ``aggregation_seasons`` to write into.
    """
    # prepare the submission frame
    submission_df = prepare_and_write_submission(df)
    project_dir = get_plantclef_dir()
    # get_plantclef_dir() returns a path with a trailing slash, so a plain
    # join yields "plantclef//submissions"; normpath collapses that.
    output_path = os.path.normpath(
        f"{project_dir}/submissions/aggregation_seasons/{testset_name}/{file_name}"
    )
    # ensure directory exists before saving
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # write to CSV
    submission_df.to_csv(output_path, sep=",", index=False, quoting=csv.QUOTE_ALL)
    print(f"Submission file saved to: {output_path}")


def main(df_final: pd.DataFrame, folder_name: str, testset_name: str = "test_2025"):
    """Derive the submission file name for a run and write the CSV.

    The file is named ``france_geo_agg_topk10_dsgt_run_<folder_name>.csv`` and
    written via ``write_csv_to_pace`` into the given test-set folder.
    """
    run_file = f"dsgt_run_{folder_name}.csv"
    write_csv_to_pace(df_final, f"france_geo_agg_topk10_{run_file}", testset_name)

In [18]:
# Write the France-filtered submission for this run into the test_2025 folder
folder_name = "topk_9_species_grid_6x6"
main(filtered_agg_df, folder_name, testset_name="test_2025")

Submission file saved to: /storage/home/hcoda1/9/mgustineli3/p-dsgt_clef2025-0/shared/plantclef//submissions/aggregation_seasons/test_2025/france_geo_agg_topk10_dsgt_run_topk_9_species_grid_6x6.csv
