In [16]:
import os
import numpy as np
from numpy.typing import ArrayLike
import pandas as pd
import csv

In [17]:
# Upper bound on the number of species kept per quadrat; when more species
# pass the threshold, only the ones with the largest summed probability stay.
MAX_PREDICTIONS = 10
# A tile's top-scoring species is only counted if its probability is not
# below this value.
MIN_PROBABILITY = 0.02

In [18]:
# Directories containing the .npy probability files, one directory per
# prediction run. The runs are averaged together in a later cell.
directories = [
'./predictions/probabilities_submission_5h1l_tile_4_5_overlaps_0_0_use_gf_crop_010',
'./predictions/probabilities_submission_hydra_5h1l_s_5h2l_gf_tile_45_00_usegf_crop10',
]

# One {quadrat_id: probabilities} mapping per directory, in `directories` order.
all_quadrat_probs: list[dict[str, ArrayLike]] = []

for directory in directories:
    # Arrays loaded from this directory, keyed by quadrat id.
    quadrat_probs: dict[str, ArrayLike] = {}

    # os.listdir() yields entries in arbitrary, filesystem-dependent order.
    # Sorting makes the dict insertion order (and therefore the row order of
    # the CSV written at the end of the notebook) reproducible.
    for filename in sorted(os.listdir(directory)):
        if filename.endswith('.npy'):
            file_path = os.path.join(directory, filename)
            # The quadrat id is the part of the file name before the first dot.
            quadrat_id = filename.split('.')[0]
            tile_probabilities = np.load(file_path)

            quadrat_probs[quadrat_id] = tile_probabilities

    all_quadrat_probs.append(quadrat_probs)

In [19]:
from src import data

# Location of the training data whose image metadata defines the species set.
train_data_dir = os.path.join(
    "/mnt/storage1/shared_data/plant_clef_2025/",
    "data/",
    "plant_clef_train_281gb/",
)
plant_data_image_info, rare_species = data.get_plant_data_image_info(train_data_dir)

# Distinct species ids in ascending order; a species' position in this list
# is its index. The reverse map is used later to turn predicted indices back
# into species ids.
# NOTE(review): assumes this ordering matches the column order of the saved
# probability arrays - confirm against the code that produced them.
sorted_species_ids = sorted({info.species_id for info in plant_data_image_info})
species_id_to_index = {sid: idx for idx, sid in enumerate(sorted_species_ids)}
species_index_to_id = dict(enumerate(sorted_species_ids))

In [20]:
# Final per-quadrat predictions, stored as species *indices* at this stage.
image_predictions: dict[str, list[int]] = {}

# Every run must cover exactly the same quadrats before they can be averaged.
reference_ids = all_quadrat_probs[0].keys()
assert all(
    run_probs.keys() == reference_ids for run_probs in all_quadrat_probs[1:]
), "All quadrat probabilities should have the same keys"

# Ensemble: equal-weight average of the runs, computed per quadrat.
n_runs = len(all_quadrat_probs)
quadrat_probs = {
    quadrat_id: sum([run_probs[quadrat_id] / n_runs for run_probs in all_quadrat_probs])
    for quadrat_id in reference_ids
}

for quadrat_id, tile_probabilities in quadrat_probs.items():
    # Each tile nominates its single most probable species, unless that
    # probability is below MIN_PROBABILITY.
    nominated = set()
    for tile_probs in tile_probabilities:
        best = tile_probs.argmax()
        if not (tile_probs[best] < MIN_PROBABILITY):
            nominated.add(best)

    candidates = list(nominated)
    if not candidates:
        # No tile was confident enough: fall back to the species with the
        # largest probability summed over all tiles.
        candidates = [np.sum(tile_probabilities, axis=0).argmax()]
    elif len(candidates) > MAX_PREDICTIONS:
        # Too many nominees: keep the MAX_PREDICTIONS with the largest
        # summed probability (resulting order is ascending by that sum).
        species_totals = np.sum(tile_probabilities, axis=0)
        candidate_totals = [species_totals[idx] for idx in candidates]
        keep = np.argsort(candidate_totals)[-MAX_PREDICTIONS:]
        candidates = [candidates[pos] for pos in keep]

    image_predictions[quadrat_id] = candidates

In [21]:
# Translate predicted species indices into species ids, in place.
# Values are replaced for existing keys only, so iterating over items() while
# assigning is safe. Re-running this cell would apply the mapping a second time.
for quadrat_id, species_indices in image_predictions.items():
    image_predictions[quadrat_id] = list(map(species_index_to_id.__getitem__, species_indices))

In [22]:
# Build the submission table: one row per quadrat, with the predicted species
# list rendered as its Python string representation (e.g. "[1, 2, 3]").
df_run = pd.DataFrame(
    {
        "quadrat_id": list(image_predictions.keys()),
        "species_ids": [str(species) for species in image_predictions.values()],
    }
)

# Write every field quoted, comma-separated, without the index column.
df_run.to_csv("./predictions/submission.csv", sep=",", index=False, quoting=csv.QUOTE_ALL)