# GBIF classification using JSON

In [1]:
%load_ext autoreload
%autoreload 2

In [8]:
import os
import json
import pandas as pd


def load_species_country_counts_df(
    base_path: str,
    folder_name: str = "country_counts",
) -> pd.DataFrame:
    """
    Load per-species country data from a folder of JSON files.

    Every ``*.json`` file directly inside
    ``{base_path}/data/genai/02_gbif/{folder_name}`` (``~`` is expanded) is
    parsed as a JSON object keyed by species name.

    Args:
        base_path: Project root; may start with ``~``.
        folder_name: Sub-folder to read. It also selects the row layout:

            * ``"country_counts"`` - each species maps to a
              ``{country: count}`` object; the result has one row per
              (species, country, count).
            * ``"countries"`` - the value of each species is stored as-is in a
              ``countries`` column; the result has one row per species entry.

            Any other value still reads the files but produces no rows.

    Returns:
        A DataFrame built from the collected rows (empty when nothing matched).
    """
    json_dir = os.path.expanduser(f"{base_path}/data/genai/02_gbif/{folder_name}")
    rows = []

    # os.listdir returns entries in arbitrary order; sort so that the row order
    # (and therefore the DataFrame index) is reproducible between runs.
    for filename in sorted(os.listdir(json_dir)):
        if not filename.endswith(".json"):
            continue

        filepath = os.path.join(json_dir, filename)
        # JSON text is UTF-8; do not depend on the platform's locale encoding
        # (country names such as "Türkiye" are not ASCII).
        with open(filepath, "r", encoding="utf-8") as f:
            content = json.load(f)

        for species, countries in content.items():
            if folder_name == "countries":
                rows.append({"species": species, "countries": countries})
            elif folder_name == "country_counts":
                rows.extend(
                    {"species": species, "country": country, "count": count}
                    for country, count in countries.items()
                )

    return pd.DataFrame(rows)


# Root of the shared project; the leading "~" is expanded inside the loader.
project_path = "~/p-dsgt_clef2025-0/shared/plantclef"
folder_name = "country_counts"  # folder_name: "countries" or "country_counts"
# "country_counts" layout: one row per (species, country, count).
df = load_species_country_counts_df(base_path=project_path, folder_name=folder_name)
df.head()

Unnamed: 0,species,country,count
0,Omalotheca supina (L.) DC.,United Kingdom of Great Britain and Northern I...,4
1,Omalotheca supina (L.) DC.,Sweden,61
2,Omalotheca supina (L.) DC.,Norway,85
3,Omalotheca supina (L.) DC.,Germany,3
4,Omalotheca supina (L.) DC.,Ukraine,3


In [9]:
# number of (species, country, count) rows in df
len(df)

83988

In [10]:
# per species: collect the distinct countries and sum the occurrence counts
grouped_df = (
    df.groupby("species").agg({"country": "unique", "count": "sum"}).reset_index()
)
grouped_df.head()

# NOTE: `df` is rebound here to the "countries" layout (species + countries).
df = load_species_country_counts_df(project_path, folder_name="countries")
# keep only the rows whose `countries` value contains "France"
france_df = df[df["countries"].apply(lambda x: "France" in x)]
num_species_france = france_df.shape[0]
print(f"Number of species with occurrences in France: {num_species_france}")
france_df.head()

Number of species with occurrences in France: 5185


Unnamed: 0,species,countries
0,Omalotheca supina (L.) DC.,"[Germany, Sweden, United Kingdom of Great Brit..."
1,"Myriolimon ferulaceum (L.) Lledó, Erben & M.B....","[Spain, Portugal, France]"
6,Ranunculus muricatus L.,"[Chile, Türkiye, China, Bermuda, Croatia, Fran..."
7,Galium anglicum Huds.,"[Germany, Morocco, France, Italy, United State..."
10,Sisymbrium irio L.,"[Unknown, Ecuador, Algeria, United Kingdom of ..."


In [11]:
# add species_id to france_df
metadata_path = f"{project_path}/data/species_metadata.csv"
metadata_df = pd.read_csv(metadata_path)
# left-join metadata_df onto france_df by species name, so every france_df row
# is kept even when the metadata has no matching species
joined_df = france_df.merge(metadata_df, on="species", how="left")
print(f"Number of rows: {len(joined_df)}")
joined_df.head()

Number of rows: 5185


Unnamed: 0,species,countries,species_id,genus,family
0,Omalotheca supina (L.) DC.,"[Germany, Sweden, United Kingdom of Great Brit...",1412857,Omalotheca,Asteraceae
1,"Myriolimon ferulaceum (L.) Lledó, Erben & M.B....","[Spain, Portugal, France]",1389796,Myriolimon,Plumbaginaceae
2,Ranunculus muricatus L.,"[Chile, Türkiye, China, Bermuda, Croatia, Fran...",1363972,Ranunculus,Ranunculaceae
3,Galium anglicum Huds.,"[Germany, Morocco, France, Italy, United State...",1738582,Galium,Rubiaceae
4,Sisymbrium irio L.,"[Unknown, Ecuador, Algeria, United Kingdom of ...",1363896,Sisymbrium,Brassicaceae


In [12]:
# import aggregation classification data (one row per quadrat_id);
# species_ids is parsed later with ast.literal_eval, so it is kept as read here
file_name = "agg_topk10_dsgt_run_topk_9_species_grid_6x6.csv"
submission_path = f"{project_path}/submissions/aggregation_seasons/test_2025"
input_path = f"{submission_path}/{file_name}"
agg_df = pd.read_csv(input_path)
agg_df.head()

Unnamed: 0,quadrat_id,species_ids
0,2024-CEV3-20240602,"[1360187, 1392662, 1395045, 1738679, 1362443, ..."
1,CBN-PdlC-A1-20130807,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."
2,CBN-PdlC-A1-20130903,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."
3,CBN-PdlC-A1-20140721,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."
4,CBN-PdlC-A1-20140811,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."


In [13]:
import ast


def filter_species_by_country(
    joined_df: pd.DataFrame,
    agg_df: pd.DataFrame,
) -> pd.DataFrame:
    """
    Drop predicted species IDs that are not listed in ``joined_df``.

    Args:
        joined_df: Frame whose ``species_id`` column holds the allowed IDs.
        agg_df: Frame with a ``species_ids`` column. Each cell is either a
            string holding a Python list literal (as read back from CSV) or an
            already-parsed list; strings are parsed with ``ast.literal_eval``.

    Returns:
        A copy of ``agg_df`` whose ``species_ids`` cells are lists containing
        only the allowed IDs, in their original order. ``agg_df`` itself is
        not modified.
    """
    # A set gives O(1) membership tests; the original list lookup rescanned
    # every allowed ID for each predicted ID.
    valid_species_ids = set(joined_df["species_id"])

    def _keep_valid(raw_ids):
        # Accept both the raw CSV string and an already-parsed list so the
        # function can be re-applied to its own output.
        id_list = ast.literal_eval(raw_ids) if isinstance(raw_ids, str) else raw_ids
        return [species_id for species_id in id_list if species_id in valid_species_ids]

    agg_df = agg_df.copy()
    agg_df["species_ids"] = agg_df["species_ids"].apply(_keep_valid)

    return agg_df


# keep only the predicted species IDs that appear in joined_df["species_id"]
final_df = filter_species_by_country(joined_df, agg_df)
final_df.head()

Unnamed: 0,quadrat_id,species_ids
0,2024-CEV3-20240602,"[1360187, 1392662, 1395045, 1738679, 1362443, ..."
1,CBN-PdlC-A1-20130807,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."
2,CBN-PdlC-A1-20130903,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."
3,CBN-PdlC-A1-20140721,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."
4,CBN-PdlC-A1-20140811,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."


In [14]:
def analyze_species_filtering(original_df: pd.DataFrame, filtered_df: pd.DataFrame):
    """
    Print species-count statistics per row before and after filtering.

    ``original_df["species_ids"]`` is parsed with ``ast.literal_eval`` on a
    copy of the frame, so it must hold list literals as strings.
    ``filtered_df["species_ids"]`` is measured with ``len`` as-is.
    Nothing is returned; the report goes to stdout.
    """
    parsed_df = original_df.copy()
    parsed_df["species_ids"] = parsed_df["species_ids"].apply(ast.literal_eval)

    # number of species IDs per row, before and after
    before = parsed_df["species_ids"].apply(len)
    after = filtered_df["species_ids"].apply(len)

    sections = (
        ("Species Count Per Quadrat (Before Filtering):", before),
        ("\nSpecies Count Per Quadrat (After Filtering by France):", after),
    )
    for heading, counts in sections:
        print(heading)
        print(f"Total Species IDs: {counts.sum()}")
        print(f"Mean: {counts.mean():.2f}")
        print(f"Min: {counts.min()}")
        print(f"Max: {counts.max()}")

    removed_per_row = before - after
    print("\nReduction in Species IDs:")
    print(f"Total Reduction: {before.sum() - after.sum()}")
    print(f"Avg Reduction per Quadrat: {removed_per_row.mean():.2f}")

In [15]:
# agg_df must still hold the raw string lists here (NOT pre-parsed):
# analyze_species_filtering runs ast.literal_eval on the "before" frame itself.
original_agg_df = agg_df.copy(deep=True)
filtered_agg_df = filter_species_by_country(joined_df, agg_df)

# Run analysis
analyze_species_filtering(original_agg_df, filtered_agg_df)

Species Count Per Quadrat (Before Filtering):
Total Species IDs: 18629
Mean: 8.85
Min: 1
Max: 10

Species Count Per Quadrat (After Filtering by France):
Total Species IDs: 15626
Mean: 7.42
Min: 0
Max: 10

Reduction in Species IDs:
Total Reduction: 3003
Avg Reduction per Quadrat: 1.43


In [16]:
# preview the filtered per-quadrat species lists
filtered_agg_df.head()

Unnamed: 0,quadrat_id,species_ids
0,2024-CEV3-20240602,"[1360187, 1392662, 1395045, 1738679, 1362443, ..."
1,CBN-PdlC-A1-20130807,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."
2,CBN-PdlC-A1-20130903,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."
3,CBN-PdlC-A1-20140721,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."
4,CBN-PdlC-A1-20140811,"[1392608, 1742052, 1412857, 1392407, 1392611, ..."


In [17]:
import csv
from pathlib import Path


def format_species_ids(species_ids: list) -> str:
    """Render the IDs as one bracketed string, separated by a comma and a space."""
    joined_ids = ", ".join(map(str, species_ids))
    return "[" + joined_ids + "]"


def prepare_and_write_submission(pandas_df: pd.DataFrame) -> pd.DataFrame:
    """
    Build the submission table with ``species_ids`` rendered as strings.

    Despite its name, this function writes nothing: it only reads the
    ``quadrat_id`` and ``species_ids`` columns and returns a new frame.

    Args:
        pandas_df: Frame with ``quadrat_id`` and ``species_ids`` columns, where
            each ``species_ids`` cell is an iterable of IDs.

    Returns:
        A new DataFrame with exactly the columns ``quadrat_id`` and
        ``species_ids`` (the latter formatted by ``format_species_ids``),
        one row per input row, with a fresh default index.
    """
    columns = ["quadrat_id", "species_ids"]
    records = [
        {"quadrat_id": quadrat_id, "species_ids": format_species_ids(species_ids)}
        for quadrat_id, species_ids in zip(
            pandas_df["quadrat_id"], pandas_df["species_ids"]
        )
    ]

    # Pass the columns explicitly so an empty input still yields both headers
    # (a column-less frame would be written out as a header-less CSV).
    return pd.DataFrame(records, columns=columns)


def get_plantclef_dir() -> str:
    """Return the shared project directory under the user's home, with a trailing slash."""
    user_home = Path(os.path.expanduser("~"))
    project_suffix = "p-dsgt_clef2025-0/shared/plantclef/"
    return str(user_home) + "/" + project_suffix


def write_csv_to_pace(df: pd.DataFrame, file_name: str, testset_name: str) -> None:
    """
    Format ``df`` as a submission and write it to a CSV file.

    The file is written to
    ``<project dir>/submissions/aggregation_seasons/<testset_name>/<file_name>``,
    where the project directory comes from ``get_plantclef_dir()``. Missing
    parent directories are created. The final path is printed.

    Args:
        df: Frame passed to ``prepare_and_write_submission``.
        file_name: Name of the CSV file to create.
        testset_name: Sub-folder of ``aggregation_seasons`` to write into.
    """
    # prepare the submission
    submission_df = prepare_and_write_submission(df)
    # get_plantclef_dir() ends with "/"; strip it so the joined path does not
    # contain a doubled separator ("plantclef//submissions").
    project_dir = get_plantclef_dir().rstrip("/")
    submission_path = f"{project_dir}/submissions/aggregation_seasons/{testset_name}"
    output_path = f"{submission_path}/{file_name}"
    # ensure directory exists before saving
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # write to CSV; every field is quoted
    submission_df.to_csv(output_path, sep=",", index=False, quoting=csv.QUOTE_ALL)
    print(f"Submission file saved to: {output_path}")


def main(df_final: pd.DataFrame, folder_name: str, testset_name: str = "test_2025"):
    """Write ``df_final`` as ``france_agg_topk10_dsgt_run_<folder_name>.csv`` for the test set."""
    # run-specific part of the file name
    run_file_name = f"dsgt_run_{folder_name}.csv"
    write_csv_to_pace(df_final, f"france_agg_topk10_{run_file_name}", testset_name)

In [18]:
# run tag that main() embeds in the output file name
# (rebinds the `folder_name` used earlier for the GBIF data folder)
folder_name = "topk_9_species_grid_6x6"
main(filtered_agg_df, folder_name, testset_name="test_2025")

Submission file saved to: /storage/home/hcoda1/9/mgustineli3/p-dsgt_clef2025-0/shared/plantclef//submissions/aggregation_seasons/test_2025/france_agg_topk10_dsgt_run_topk_9_species_grid_6x6.csv
