In [1]:
import warnings
warnings.filterwarnings("ignore")
import os
import numpy as np
from tqdm.notebook import tqdm
import pandas as pd
import rasterio
import random
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_fscore_support

import torch
from torch.utils.data import Dataset, DataLoader

In [2]:
# --- Run configuration ---------------------------------------------------------
# Root directory of the GeoLifeCLEF 2025 data. The original absolute location is
# kept as the default; set the GLC25_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
path_data = os.environ.get("GLC25_DATA_DIR", "/home/gt/DATA/geolifeclef-2025")

# nc, ns, nu, n_samples and thin are encoded into the prediction/result file
# names below. NOTE(review): their meaning is defined by the upstream pipeline
# that wrote the prediction CSV — confirm before changing any of them.
nc = 69
ns = 2519
nu = 100
nf = 10  # not referenced by any of the visible cells
n_samples = 100
thin = 1000

# Identifier shared by the prediction file that is read here and the result
# file that is written at the end of the notebook.
model_type_string = f"nc{nc:04d}_ns{ns:04d}_np{nu:04d}"
pred_filename = f"pred_{model_type_string}_sam{n_samples:04d}_thin{thin:04d}.csv"

# One row per test sample, one column per class (the table is later fed
# row by row to torch.logit, so values are treated as probabilities).
test_pred = pd.read_csv(os.path.join(path_data, "hmsc", "pred", pred_filename))

In [3]:
def f1_score(outputs, targets=None, M=400, mult=1, offset=0, device=torch.device("cpu")):
    """Select, per row, how many top-ranked classes to predict so that expected F1 is maximal.

    Classes are ranked by descending logit. ``M`` binary label vectors are drawn
    per row from independent Bernoulli distributions with probabilities
    ``sigmoid(outputs)``. For every cut-off ``k`` ("predict the ``k`` top-ranked
    classes") the F1 against each sampled label vector is computed and averaged
    over the ``M`` draws; the ``k`` with the highest mean wins. That count is
    then rescaled to ``round(mult * k + offset)`` and clamped to
    ``[0, num_classes]``.

    The procedure is stochastic: call ``torch.manual_seed`` beforehand for
    reproducible results. It materialises tensors of shape
    ``(M, batch, num_classes + 1)``.

    Args:
        outputs: 2-D tensor of logits, shape ``(batch, num_classes)``.
        targets: Optional tensor with the same shape as ``outputs``; non-zero
            entries mark the true classes. When given, the function scores the
            selected prediction instead of returning it.
        M: Number of Monte-Carlo label draws per row.
        mult: Multiplicative correction applied to the selected count.
        offset: Additive correction applied to the selected count.
        device: Retained for backward compatibility. All intermediate tensors
            are created on ``outputs.device`` so that the call cannot fail with
            a device mismatch when this argument is left at its default.

    Returns:
        If ``targets`` is None: a list with one 1-D index tensor per row holding
        the selected class indices in descending-logit order.
        Otherwise: a 1-D tensor with the per-row F1,
        ``2 * TP / (n_predicted + n_true)``. A row with neither predicted nor
        true classes yields NaN (0/0).
    """
    # The inputs decide where the scratch tensors live (see ``device`` above).
    device = outputs.device
    batch = outputs.shape[0]
    num_classes = outputs.shape[-1]

    # Rank classes from most to least likely and reorder the logits accordingly.
    prob_ord = torch.argsort(outputs, dim=-1, descending=True)
    sorted_logits = torch.gather(outputs, -1, prob_ord)

    # M sampled "ground truths" per row, in ranked order: (M, batch, num_classes).
    sample = torch.bernoulli(torch.sigmoid(sorted_logits).repeat(M, 1, 1))

    # For a cut-off k: cum_sum[..., k] counts sampled positives inside the top k
    # (true positives) and rev_cum_sum[..., k] those outside it (false negatives).
    pad = sample.new_zeros([M, batch, 1])
    cum_sum = torch.cat([pad, torch.cumsum(sample, -1)], -1)
    rev_cum_sum = torch.cat([torch.flip(torch.cumsum(torch.flip(sample, [-1]), -1), [-1]), pad], -1)

    # F1 = TP / (TP + 0.5 * (FP + FN)) with FP = k - TP.
    cutoffs = torch.arange(num_classes + 1, device=device)
    f1_values = cum_sum / (cum_sum + 0.5 * (cutoffs - cum_sum) + 0.5 * rev_cum_sum)
    # k = 0 is 0/0 whenever a draw has no positives; score it as 0.
    f1_values[:, :, 0] = 0
    f1_expected = torch.nanmean(f1_values, 0)

    pred_num = torch.argmax(f1_expected, -1)
    # Cast to float before rounding so the call does not depend on integer
    # support in torch.round; then keep the count inside the valid range.
    pred_num = torch.clamp(torch.round(mult * pred_num.float() + offset), 0, num_classes).int()

    if targets is None:
        return [prob_ord[i, :pred_num[i]] for i in range(batch)]

    # Work in ranked order: the prediction is simply "the first pred_num entries".
    targets_ordered = torch.gather(targets, -1, prob_ord).bool()
    ranks = torch.arange(num_classes, device=device)
    pred_ordered = ranks.unsqueeze(0) < pred_num.unsqueeze(-1)

    true_pos = torch.sum(targets_ordered & pred_ordered, -1)
    n_true = torch.sum(targets_ordered, -1)
    n_pred = torch.sum(pred_ordered, -1)
    # F1 = 2*TP / (|pred| + |true|). Intersection-over-union would be the
    # Jaccard index, which is strictly smaller than F1 for imperfect predictions.
    f1 = 2 * true_pos / (n_pred + n_true)
    return f1

In [4]:
# Species with fewer than this many training records are pooled into class -1.
pa_presence_threshold = 1

# Training-data locations, one per input modality.
train_path_sentinel = os.path.join(path_data, "SatelitePatches/PA-train")
train_path_landsat = os.path.join(path_data, "SateliteTimeSeries-Landsat/cubes/PA-train")
train_path_bioclim = os.path.join(path_data, "BioclimTimeSeries/cubes/PA-train")

# Training records: drop rows without a species, store ids as integers and
# keep an untouched copy of the original id in "speciesIdOrig".
train_metadata = (
    pd.read_csv(os.path.join(path_data, "GLC25_PA_metadata_train.csv"))
    .dropna(subset="speciesId")
    .reset_index(drop=True)
    .assign(
        speciesId=lambda frame: frame["speciesId"].astype(int),
        speciesIdOrig=lambda frame: frame["speciesId"],
    )
)

# Boolean Series indexed by species id: True where the species is frequent enough.
tmp = train_metadata["speciesId"].value_counts() >= pa_presence_threshold
frequent_species = tmp[tmp].index
is_rare = ~train_metadata["speciesId"].isin(frequent_species)
train_metadata.loc[is_rare, "speciesId"] = -1

# Re-encode species ids as contiguous category codes; the categorical itself
# is kept so that codes can be mapped back to ids via ``.categories``.
sp_categorical = train_metadata["speciesId"].astype("category").values
num_classes = len(sp_categorical.categories)
train_metadata["speciesId"] = sp_categorical.codes

# Test metadata, indexed and sorted by surveyId (the column itself is kept too).
test_metadata = (
    pd.read_csv(os.path.join(path_data, "GLC25_PA_metadata_test.csv"))
    .set_index("surveyId", drop=False)
    .sort_index()
)

In [5]:
# Number of prediction rows processed together in one batch.
batch_size = 64


class PredDataset(Dataset):
    """Map-style dataset that serves one row of a prediction table per item.

    Args:
        pred: ``pandas.DataFrame`` with one sample per row. It stays available
            as ``self.pred``.
    """

    def __init__(self, pred):
        self.pred = pred
        # Convert once up front: looking rows up with ``DataFrame.iloc[idx]``
        # builds a new Series on every access, which dominates the cost of
        # iterating over a wide table item by item.
        self._rows = pred.to_numpy()

    def __len__(self):
        """Return the number of rows (samples)."""
        return self._rows.shape[0]

    def __getitem__(self, idx):
        """Return row ``idx`` as a 1-D NumPy array."""
        return self._rows[idx]

# Unshuffled batches, so predictions come out in the row order of test_pred.
test_loader = DataLoader(
    dataset=PredDataset(test_pred),
    batch_size=batch_size,
    shuffle=False,
    num_workers=1,
)

In [6]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")
if cuda_available:
    print("DEVICE = CUDA")

DEVICE = CUDA


In [7]:
# f1_score draws Bernoulli samples, so fix the RNG state to make the selected
# species sets (and therefore the submission file) reproducible across runs.
PREDICTION_SEED = 0
torch.manual_seed(PREDICTION_SEED)

# One sorted array of species ids per test row, in loader order.
top_indices = []
with torch.no_grad():
    for outputs in tqdm(test_loader, total=len(test_loader), desc="prediction"):
        # f1_score applies a sigmoid to its input, so the loaded values are
        # mapped to logit space first (0 -> -inf, 1 -> +inf).
        outputs = torch.logit(outputs.to(device))
        top_batch_list_orig = f1_score(outputs, None, device=device)
        # Translate selected column positions into species ids via the training categories.
        top_batch_list_proc = [np.sort(sp_categorical.categories[pred.cpu().numpy()]) for pred in top_batch_list_orig]
        top_indices.extend(top_batch_list_proc)

prediction:   0%|          | 0/231 [00:00<?, ?it/s]

In [8]:
# Debug view: logits of the last batch, left over from the prediction loop.
# -inf entries are logit(0), i.e. classes whose input value was exactly 0.
outputs

tensor([[-6.9068,    -inf, -2.7166,  ...,    -inf,    -inf,    -inf],
        [   -inf,    -inf, -6.9068,  ...,    -inf,    -inf,    -inf],
        [   -inf,    -inf,    -inf,  ...,    -inf,    -inf,    -inf],
        ...,
        [   -inf,    -inf,    -inf,  ...,    -inf,    -inf,    -inf],
        [   -inf,    -inf,    -inf,  ...,    -inf,    -inf,    -inf],
        [   -inf,    -inf,    -inf,  ...,    -inf,    -inf,    -inf]],
       device='cuda:0', dtype=torch.float64)

In [9]:
# One space-separated string of species ids per test row.
data_concatenated = [" ".join(str(species_id) for species_id in row) for row in top_indices]
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Submission table: predictions are paired positionally with the sorted survey ids.
res = pd.DataFrame(
    {
        "surveyId": test_metadata.surveyId.values,
        "predictions": data_concatenated,
    }
)

In [10]:
# Results go to <path_data>/hmsc/result; the folder is created on first use.
result_dir = os.path.join(path_data, "hmsc", "result")
os.makedirs(result_dir, exist_ok=True)

# Prefix the file name with month/day and time so repeated runs do not overwrite each other.
timestamp = datetime.now().strftime("%m%d_%H%M%S")
res_filename = f"{timestamp}_res_{model_type_string}_sam{n_samples:04d}_thin{thin:04d}.csv"
res.to_csv(os.path.join(result_dir, res_filename), index=False)

In [11]:
# Visual check of the submission table (columns: surveyId, predictions).
res

Unnamed: 0,surveyId,predictions
0,642,53 254 300 791 843 958 963 976 1264 1707 1888 ...
1,1792,262 351 392 462 1018 1051 1092 1254 1712 2492 ...
2,3256,4 96 1678 2184 2210 2630 4842 5055 6053 6407 6...
3,3855,4 1875 2184 2630 3211 4109 4590 4686 6053 6407...
4,4889,53 254 651 843 958 963 1539 1910 1964 2025 282...
...,...,...
14779,5010108,305 441 543 623 735 875 896 1063 1180 1214 142...
14780,5010109,305 623 875 1014 1063 1924 2004 2257 2471 2608...
14781,5010110,305 623 1014 1063 1214 1924 2004 2257 2471 260...
14782,5010111,305 623 846 1014 1063 1794 1924 2004 2257 2471...


In [1]:
# NOTE(review): scratch arithmetic; presumably converts an elapsed time of
# 13173.1 seconds into hours — confirm where the figure comes from.
13173.1 / 3600

3.6591944444444446