In [19]:
import os
import numpy as np
from numpy.typing import ArrayLike
import pandas as pd
import csv

In [20]:
# Directory containing the .npy files
directory = './predictions/probabilities'

# Loaded arrays keyed by quadrat ID (the file name without its ".npy" extension)
quadrat_probs: dict[str, ArrayLike] = {}

# os.listdir returns entries in arbitrary order; sorting keeps the dict's
# insertion order reproducible from one run to the next.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.npy'):
        continue

    # Strip only the final extension so that IDs containing dots are kept
    # whole instead of being truncated at the first dot.
    quadrat_id = os.path.splitext(filename)[0]
    quadrat_probs[quadrat_id] = np.load(os.path.join(directory, filename))

In [21]:
from src import data

# NOTE(review): absolute, machine-specific dataset location — consider moving
# it to a configurable constant.
train_data_dir = os.path.join(
    "/mnt/storage1/shared_data/plant_clef_2025/",
    "data/",
    "plant_clef_train_281gb/",
)
plant_data_image_info, rare_species = data.get_plant_data_image_info(
    train_data_dir,
)

# Distinct species IDs in ascending order; an ID's position in this list is
# its contiguous index (presumably the model's output column — TODO confirm).
sorted_species_ids = sorted({info.species_id for info in plant_data_image_info})

# Forward (species ID -> index) and reverse (index -> species ID) lookups.
species_id_to_index = {
    species_id: position for position, species_id in enumerate(sorted_species_ids)
}
species_index_to_id = dict(enumerate(sorted_species_ids))

In [None]:
# A column is predicted for a quadrat when its probability, summed along
# axis 0 of the quadrat's array, exceeds this value.
SPECIES_SUM_THRESHOLD = 0.05

# Predicted column indices per quadrat ID.
image_predictions: dict[str, list[int]] = {}

for quadrat_id, tile_probabilities in quadrat_probs.items():
    # Collapse axis 0 (presumably the tile axis, leaving one total per class
    # — TODO confirm the array layout).
    column_sums = np.asarray(tile_probabilities).sum(axis=0)

    selected_indices = np.where(column_sums > SPECIES_SUM_THRESHOLD)[0].tolist()
    if not selected_indices:
        # Nothing cleared the threshold: fall back to the single best column.
        # int() converts the NumPy scalar so every entry is a plain Python
        # int, matching what .tolist() yields in the non-empty case.
        selected_indices = [int(column_sums.argmax())]

    image_predictions[quadrat_id] = selected_indices

In [23]:
# Replace each quadrat's list of indices with the corresponding species IDs.
# The values of image_predictions are overwritten in place, so this cell must
# run exactly once after the indices are computed: running it again would
# feed species IDs back into the index lookup.
for quadrat_id, predicted_indices in image_predictions.items():
    # Only existing keys are reassigned, so the dict's size is unchanged
    # while it is being iterated.
    image_predictions[quadrat_id] = [
        species_index_to_id[index] for index in predicted_indices
    ]

In [24]:
# One row per quadrat. The species list is stored as its Python string form
# (e.g. "[1, 2]") so it lands in a single CSV field.
df_run = pd.DataFrame(
    {
        "quadrat_id": list(image_predictions.keys()),
        "species_ids": [
            str(species_ids) for species_ids in image_predictions.values()
        ],
    }
)

# Write the run file with every field quoted and without the index column.
df_run.to_csv(
    "./predictions/submission.csv",
    sep=",",
    index=False,
    quoting=csv.QUOTE_ALL,
)